1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#define	AMD64_NPT_AWARE
80
81#include <sys/cdefs.h>
82__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/pmap.c 306558 2016-10-01 19:30:28Z alc $");
83
84/*
85 *	Manages physical address maps.
86 *
87 *	Since the information managed by this module is
88 *	also stored by the logical address mapping module,
89 *	this module may throw away valid virtual-to-physical
90 *	mappings at almost any time.  However, invalidations
91 *	of virtual-to-physical mappings must be done as
92 *	requested.
93 *
94 *	In order to cope with hardware architectures which
95 *	make virtual-to-physical map invalidations expensive,
96 *	this module may delay invalidation or reduced-protection
97 *	operations until such time as they are actually
98 *	necessary.  This module is given full information as
99 *	to which processors are currently using which maps,
100 *	and as to when physical maps must be made correct.
101 */
102
103#include "opt_pmap.h"
104#include "opt_vm.h"
105
106#include <sys/param.h>
107#include <sys/bitstring.h>
108#include <sys/bus.h>
109#include <sys/systm.h>
110#include <sys/kernel.h>
111#include <sys/ktr.h>
112#include <sys/lock.h>
113#include <sys/malloc.h>
114#include <sys/mman.h>
115#include <sys/mutex.h>
116#include <sys/proc.h>
117#include <sys/rwlock.h>
118#include <sys/sx.h>
119#include <sys/turnstile.h>
120#include <sys/vmem.h>
121#include <sys/vmmeter.h>
122#include <sys/sched.h>
123#include <sys/sysctl.h>
124#include <sys/smp.h>
125
126#include <vm/vm.h>
127#include <vm/vm_param.h>
128#include <vm/vm_kern.h>
129#include <vm/vm_page.h>
130#include <vm/vm_map.h>
131#include <vm/vm_object.h>
132#include <vm/vm_extern.h>
133#include <vm/vm_pageout.h>
134#include <vm/vm_pager.h>
135#include <vm/vm_phys.h>
136#include <vm/vm_radix.h>
137#include <vm/vm_reserv.h>
138#include <vm/uma.h>
139
140#include <machine/intr_machdep.h>
141#include <x86/apicvar.h>
142#include <machine/cpu.h>
143#include <machine/cputypes.h>
144#include <machine/md_var.h>
145#include <machine/pcb.h>
146#include <machine/specialreg.h>
147#ifdef SMP
148#include <machine/smp.h>
149#endif
150
151static __inline boolean_t
152pmap_type_guest(pmap_t pmap)
153{
154
155	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
156}
157
158static __inline boolean_t
159pmap_emulate_ad_bits(pmap_t pmap)
160{
161
162	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
163}
164
165static __inline pt_entry_t
166pmap_valid_bit(pmap_t pmap)
167{
168	pt_entry_t mask;
169
170	switch (pmap->pm_type) {
171	case PT_X86:
172	case PT_RVI:
173		mask = X86_PG_V;
174		break;
175	case PT_EPT:
176		if (pmap_emulate_ad_bits(pmap))
177			mask = EPT_PG_EMUL_V;
178		else
179			mask = EPT_PG_READ;
180		break;
181	default:
182		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
183	}
184
185	return (mask);
186}
187
188static __inline pt_entry_t
189pmap_rw_bit(pmap_t pmap)
190{
191	pt_entry_t mask;
192
193	switch (pmap->pm_type) {
194	case PT_X86:
195	case PT_RVI:
196		mask = X86_PG_RW;
197		break;
198	case PT_EPT:
199		if (pmap_emulate_ad_bits(pmap))
200			mask = EPT_PG_EMUL_RW;
201		else
202			mask = EPT_PG_WRITE;
203		break;
204	default:
205		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
206	}
207
208	return (mask);
209}
210
211static __inline pt_entry_t
212pmap_global_bit(pmap_t pmap)
213{
214	pt_entry_t mask;
215
216	switch (pmap->pm_type) {
217	case PT_X86:
218		mask = X86_PG_G;
219		break;
220	case PT_RVI:
221	case PT_EPT:
222		mask = 0;
223		break;
224	default:
225		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
226	}
227
228	return (mask);
229}
230
231static __inline pt_entry_t
232pmap_accessed_bit(pmap_t pmap)
233{
234	pt_entry_t mask;
235
236	switch (pmap->pm_type) {
237	case PT_X86:
238	case PT_RVI:
239		mask = X86_PG_A;
240		break;
241	case PT_EPT:
242		if (pmap_emulate_ad_bits(pmap))
243			mask = EPT_PG_READ;
244		else
245			mask = EPT_PG_A;
246		break;
247	default:
248		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
249	}
250
251	return (mask);
252}
253
254static __inline pt_entry_t
255pmap_modified_bit(pmap_t pmap)
256{
257	pt_entry_t mask;
258
259	switch (pmap->pm_type) {
260	case PT_X86:
261	case PT_RVI:
262		mask = X86_PG_M;
263		break;
264	case PT_EPT:
265		if (pmap_emulate_ad_bits(pmap))
266			mask = EPT_PG_WRITE;
267		else
268			mask = EPT_PG_M;
269		break;
270	default:
271		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
272	}
273
274	return (mask);
275}
276
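/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * per-type accessors above are normally latched into locals at function
 * entry and then used in place of the raw X86_PG_* constants, so that
 * the same logic serves PT_X86, PT_RVI, and PT_EPT page tables, e.g.:
 *
 *	static __inline boolean_t
 *	example_pte_is_writable(pmap_t pmap, pt_entry_t pte)
 *	{
 *		pt_entry_t PG_RW, PG_V;
 *
 *		PG_V = pmap_valid_bit(pmap);
 *		PG_RW = pmap_rw_bit(pmap);
 *		return ((pte & (PG_V | PG_RW)) == (PG_V | PG_RW));
 *	}
 */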
277extern	struct pcpu __pcpu[];
278
279#if !defined(DIAGNOSTIC)
280#ifdef __GNUC_GNU_INLINE__
281#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
282#else
283#define PMAP_INLINE	extern inline
284#endif
285#else
286#define PMAP_INLINE
287#endif
288
289#ifdef PV_STATS
290#define PV_STAT(x)	do { x ; } while (0)
291#else
292#define PV_STAT(x)	do { } while (0)
293#endif
294
295#define	pa_index(pa)	((pa) >> PDRSHIFT)
296#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
297
298#define	NPV_LIST_LOCKS	MAXCPU
299
300#define	PHYS_TO_PV_LIST_LOCK(pa)	\
301			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
302
303#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
304	struct rwlock **_lockp = (lockp);		\
305	struct rwlock *_new_lock;			\
306							\
307	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
308	if (_new_lock != *_lockp) {			\
309		if (*_lockp != NULL)			\
310			rw_wunlock(*_lockp);		\
311		*_lockp = _new_lock;			\
312		rw_wlock(*_lockp);			\
313	}						\
314} while (0)
315
316#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
317			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
318
319#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
320	struct rwlock **_lockp = (lockp);		\
321							\
322	if (*_lockp != NULL) {				\
323		rw_wunlock(*_lockp);			\
324		*_lockp = NULL;				\
325	}						\
326} while (0)
327
328#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
329			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
330
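/*
 * Illustrative sketch (hypothetical usage, mirroring the callers later in
 * this file): the macros above implement lock striping keyed on the 2MB
 * frame containing a physical address.  A caller starts with no lock
 * held, switches buckets as the page of interest changes, and releases
 * whatever lock it still holds when finished:
 *
 *	struct rwlock *lock;
 *
 *	lock = NULL;
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... manipulate m's PV list ...
 *	if (lock != NULL)
 *		rw_wunlock(lock);
 */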
331struct pmap kernel_pmap_store;
332
333vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
334vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
335
336int nkpt;
337SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
338    "Number of kernel page table pages allocated on bootup");
339
340static int ndmpdp;
341vm_paddr_t dmaplimit;
342vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
343pt_entry_t pg_nx;
344
345static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
346
347static int pat_works = 1;
348SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
349    "Is page attribute table fully functional?");
350
351static int pg_ps_enabled = 1;
352SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
353    &pg_ps_enabled, 0, "Are large page mappings enabled?");
354
355#define	PAT_INDEX_SIZE	8
356static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
357
358static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
359static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
360u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
361u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
362
363static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
364static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
365static int		ndmpdpphys;	/* number of DMPDPphys pages */
366
367/*
368 * pmap_mapdev() support prior to pmap initialization (e.g., for the console)
369 */
370#define	PMAP_PREINIT_MAPPING_COUNT	8
371static struct pmap_preinit_mapping {
372	vm_paddr_t	pa;
373	vm_offset_t	va;
374	vm_size_t	sz;
375	int		mode;
376} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
377static int pmap_initialized;
378
379/*
380 * Data for the pv entry allocation mechanism.
381 * Updates to pv_invl_gen are protected by the pv_list_locks[]
382 * elements, but reads are not.
383 */
384static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
385static struct mtx pv_chunks_mutex;
386static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
387static u_long pv_invl_gen[NPV_LIST_LOCKS];
388static struct md_page *pv_table;
389static struct md_page pv_dummy;
390
391/*
392 * All those kernel PT submaps that BSD is so fond of
393 */
394pt_entry_t *CMAP1 = 0;
395caddr_t CADDR1 = 0;
396static vm_offset_t qframe = 0;
397static struct mtx qframe_mtx;
398
399static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
400
401int pmap_pcid_enabled = 1;
402SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
403    &pmap_pcid_enabled, 0, "Is TLB Context ID enabled?");
404int invpcid_works = 0;
405SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
406    "Is the invpcid instruction available ?");
407
408static int
409pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
410{
411	int i;
412	uint64_t res;
413
414	res = 0;
415	CPU_FOREACH(i) {
416		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
417	}
418	return (sysctl_handle_64(oidp, &res, 0, req));
419}
420SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
421    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
422    "Count of saved TLB context on switch");
423
424static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
425    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
426static struct mtx invl_gen_mtx;
427static u_long pmap_invl_gen = 0;
428/* Fake lock object to satisfy turnstiles interface. */
429static struct lock_object invl_gen_ts = {
430	.lo_name = "invlts",
431};
432
433#define	PMAP_ASSERT_NOT_IN_DI() \
434    KASSERT(curthread->td_md.md_invl_gen.gen == 0, ("DI already started"))
435
436/*
437 * Start a new Delayed Invalidation (DI) block of code, executed by
438 * the current thread.  Within a DI block, the current thread may
439 * destroy both the page table and PV list entries for a mapping and
440 * then release the corresponding PV list lock before ensuring that
441 * the mapping is flushed from the TLBs of any processors with the
442 * pmap active.
443 */
444static void
445pmap_delayed_invl_started(void)
446{
447	struct pmap_invl_gen *invl_gen;
448	u_long currgen;
449
450	invl_gen = &curthread->td_md.md_invl_gen;
451	PMAP_ASSERT_NOT_IN_DI();
452	mtx_lock(&invl_gen_mtx);
453	if (LIST_EMPTY(&pmap_invl_gen_tracker))
454		currgen = pmap_invl_gen;
455	else
456		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
457	invl_gen->gen = currgen + 1;
458	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
459	mtx_unlock(&invl_gen_mtx);
460}
461
462/*
463 * Finish the DI block, previously started by the current thread.  All
464 * required TLB flushes for the pages marked by
465 * pmap_delayed_invl_page() must be finished before this function is
466 * called.
467 *
468 * This function works by bumping the global DI generation number to
469 * the generation number of the current thread's DI, unless there is a
470 * pending DI that started earlier.  In the latter case, bumping the
471 * global DI generation number would incorrectly signal that the
472 * earlier DI had finished.  Instead, this function bumps the earlier
473 * DI's generation number to match the generation number of the
474 * current thread's DI.
475 */
476static void
477pmap_delayed_invl_finished(void)
478{
479	struct pmap_invl_gen *invl_gen, *next;
480	struct turnstile *ts;
481
482	invl_gen = &curthread->td_md.md_invl_gen;
483	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
484	mtx_lock(&invl_gen_mtx);
485	next = LIST_NEXT(invl_gen, link);
486	if (next == NULL) {
487		turnstile_chain_lock(&invl_gen_ts);
488		ts = turnstile_lookup(&invl_gen_ts);
489		pmap_invl_gen = invl_gen->gen;
490		if (ts != NULL) {
491			turnstile_broadcast(ts, TS_SHARED_QUEUE);
492			turnstile_unpend(ts, TS_SHARED_LOCK);
493		}
494		turnstile_chain_unlock(&invl_gen_ts);
495	} else {
496		next->gen = invl_gen->gen;
497	}
498	LIST_REMOVE(invl_gen, link);
499	mtx_unlock(&invl_gen_mtx);
500	invl_gen->gen = 0;
501}
502
503#ifdef PV_STATS
504static long invl_wait;
505SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
506    "Number of times DI invalidation blocked pmap_remove_all/write");
507#endif
508
509static u_long *
510pmap_delayed_invl_genp(vm_page_t m)
511{
512
513	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
514}
515
516/*
517 * Ensure that all currently executing DI blocks that need to flush the
518 * TLB for the given page m have actually flushed the TLB by the time
519 * this function returns.  If the page m has an empty PV list and we call
520 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
521 * valid mapping for the page m in either its page table or TLB.
522 *
523 * This function works by blocking until the global DI generation
524 * number catches up with the generation number associated with the
525 * given page m and its PV list.  Since this function's callers
526 * typically own an object lock and sometimes own a page lock, it
527 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
528 * processor.
529 */
530static void
531pmap_delayed_invl_wait(vm_page_t m)
532{
533	struct thread *td;
534	struct turnstile *ts;
535	u_long *m_gen;
536#ifdef PV_STATS
537	bool accounted = false;
538#endif
539
540	td = curthread;
541	m_gen = pmap_delayed_invl_genp(m);
542	while (*m_gen > pmap_invl_gen) {
543#ifdef PV_STATS
544		if (!accounted) {
545			atomic_add_long(&invl_wait, 1);
546			accounted = true;
547		}
548#endif
549		ts = turnstile_trywait(&invl_gen_ts);
550		if (*m_gen > pmap_invl_gen)
551			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
552		else
553			turnstile_cancel(ts);
554	}
555}
556
557/*
558 * Mark the page m's PV list as participating in the current thread's
559 * DI block.  Any threads concurrently using m's PV list to remove or
560 * restrict all mappings to m will wait for the current thread's DI
561 * block to complete before proceeding.
562 *
563 * The function works by setting the DI generation number for m's PV
564 * list to at least the DI generation number of the current thread.
565 * This forces a caller of pmap_delayed_invl_wait() to block until
566 * the current thread calls pmap_delayed_invl_finished().
567 */
568static void
569pmap_delayed_invl_page(vm_page_t m)
570{
571	u_long gen, *m_gen;
572
573	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
574	gen = curthread->td_md.md_invl_gen.gen;
575	if (gen == 0)
576		return;
577	m_gen = pmap_delayed_invl_genp(m);
578	if (*m_gen < gen)
579		*m_gen = gen;
580}
581
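/*
 * Illustrative sketch (approximate outline, not the exact code): callers
 * that destroy mappings under a DI block, such as pmap_remove(), follow
 * roughly this shape:
 *
 *	pmap_delayed_invl_started();
 *	PMAP_LOCK(pmap);
 *	... clear PTEs, free PV entries, and call
 *	    pmap_delayed_invl_page(m) for each page m whose PV list
 *	    was updated ...
 *	pmap_invalidate_range(pmap, sva, eva);
 *	PMAP_UNLOCK(pmap);
 *	pmap_delayed_invl_finished();
 *
 * A concurrent pmap_delayed_invl_wait(m) then blocks until the TLB
 * shootdown above has completed.
 */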
582/*
583 * Crashdump maps.
584 */
585static caddr_t crashdumpmap;
586
587static void	free_pv_chunk(struct pv_chunk *pc);
588static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
589static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
590static int	popcnt_pc_map_pq(uint64_t *map);
591static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
592static void	reserve_pv_entries(pmap_t pmap, int needed,
593		    struct rwlock **lockp);
594static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
595		    struct rwlock **lockp);
596static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
597		    struct rwlock **lockp);
598static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
599		    struct rwlock **lockp);
600static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
601static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
602		    vm_offset_t va);
603
604static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
605static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
606static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
607    vm_offset_t va, struct rwlock **lockp);
608static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
609    vm_offset_t va);
610static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
611    vm_prot_t prot, struct rwlock **lockp);
612static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
613    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
614static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
615static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
616static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
617static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
618static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
619static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
620    struct rwlock **lockp);
621static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
622    vm_prot_t prot);
623static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
624static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
625    struct spglist *free, struct rwlock **lockp);
626static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
627    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
628static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
629static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
630    struct spglist *free);
631static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
632    vm_page_t m, struct rwlock **lockp);
633static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
634    pd_entry_t newpde);
635static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
636
637static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
638		struct rwlock **lockp);
639static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
640		struct rwlock **lockp);
641static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
642		struct rwlock **lockp);
643
644static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
645    struct spglist *free);
646static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
647static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
648
649/*
650 * Move the kernel virtual free pointer to the next
651 * 2MB.  This is used to help improve performance
652 * by using a large (2MB) page for much of the kernel
653 * (.text, .data, .bss)
654 */
655static vm_offset_t
656pmap_kmem_choose(vm_offset_t addr)
657{
658	vm_offset_t newaddr = addr;
659
660	newaddr = roundup2(addr, NBPDR);
661	return (newaddr);
662}
663
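/*
 * Worked example (illustrative address): roundup2(addr, NBPDR) rounds the
 * kernel virtual free pointer up to the next 2MB boundary, e.g. an addr
 * of 0xffffffff80a1b000 becomes 0xffffffff80c00000, so that the region
 * below it can remain mapped with 2MB pages.
 */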
664/********************/
665/* Inline functions */
666/********************/
667
668/* Return a non-clipped PD index for a given VA */
669static __inline vm_pindex_t
670pmap_pde_pindex(vm_offset_t va)
671{
672	return (va >> PDRSHIFT);
673}
674
675
676/* Return a pointer to the PML4 slot that corresponds to a VA */
677static __inline pml4_entry_t *
678pmap_pml4e(pmap_t pmap, vm_offset_t va)
679{
680
681	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
682}
683
684/* Return a pointer to the PDP slot that corresponds to a VA */
685static __inline pdp_entry_t *
686pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
687{
688	pdp_entry_t *pdpe;
689
690	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
691	return (&pdpe[pmap_pdpe_index(va)]);
692}
693
694/* Return a pointer to the PDP slot that corresponds to a VA */
695static __inline pdp_entry_t *
696pmap_pdpe(pmap_t pmap, vm_offset_t va)
697{
698	pml4_entry_t *pml4e;
699	pt_entry_t PG_V;
700
701	PG_V = pmap_valid_bit(pmap);
702	pml4e = pmap_pml4e(pmap, va);
703	if ((*pml4e & PG_V) == 0)
704		return (NULL);
705	return (pmap_pml4e_to_pdpe(pml4e, va));
706}
707
708/* Return a pointer to the PD slot that corresponds to a VA */
709static __inline pd_entry_t *
710pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
711{
712	pd_entry_t *pde;
713
714	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
715	return (&pde[pmap_pde_index(va)]);
716}
717
718/* Return a pointer to the PD slot that corresponds to a VA */
719static __inline pd_entry_t *
720pmap_pde(pmap_t pmap, vm_offset_t va)
721{
722	pdp_entry_t *pdpe;
723	pt_entry_t PG_V;
724
725	PG_V = pmap_valid_bit(pmap);
726	pdpe = pmap_pdpe(pmap, va);
727	if (pdpe == NULL || (*pdpe & PG_V) == 0)
728		return (NULL);
729	return (pmap_pdpe_to_pde(pdpe, va));
730}
731
732/* Return a pointer to the PT slot that corresponds to a VA */
733static __inline pt_entry_t *
734pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
735{
736	pt_entry_t *pte;
737
738	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
739	return (&pte[pmap_pte_index(va)]);
740}
741
742/* Return a pointer to the PT slot that corresponds to a VA */
743static __inline pt_entry_t *
744pmap_pte(pmap_t pmap, vm_offset_t va)
745{
746	pd_entry_t *pde;
747	pt_entry_t PG_V;
748
749	PG_V = pmap_valid_bit(pmap);
750	pde = pmap_pde(pmap, va);
751	if (pde == NULL || (*pde & PG_V) == 0)
752		return (NULL);
753	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
754		return ((pt_entry_t *)pde);
755	return (pmap_pde_to_pte(pde, va));
756}
757
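/*
 * Illustrative summary (constants taken from the amd64 4-level paging
 * layout): the walkers above peel one 9-bit index off the virtual
 * address per level, 512 entries per page table page:
 *
 *	PML4 index = (va >> 39) & 0x1ff		selects a PDP page
 *	PDP  index = (va >> 30) & 0x1ff		selects a PD page
 *	PD   index = (va >> 21) & 0x1ff		selects a PT page
 *	PT   index = (va >> 12) & 0x1ff		selects a 4KB frame
 *
 * pmap_pte() chains pmap_pml4e(), pmap_pml4e_to_pdpe(),
 * pmap_pdpe_to_pde(), and pmap_pde_to_pte(), returning NULL when a level
 * is not valid and the PDE itself when it maps a 2MB superpage.
 */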
758static __inline void
759pmap_resident_count_inc(pmap_t pmap, int count)
760{
761
762	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
763	pmap->pm_stats.resident_count += count;
764}
765
766static __inline void
767pmap_resident_count_dec(pmap_t pmap, int count)
768{
769
770	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
771	KASSERT(pmap->pm_stats.resident_count >= count,
772	    ("pmap %p resident count underflow %ld %d", pmap,
773	    pmap->pm_stats.resident_count, count));
774	pmap->pm_stats.resident_count -= count;
775}
776
777PMAP_INLINE pt_entry_t *
778vtopte(vm_offset_t va)
779{
780	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
781
782	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
783
784	return (PTmap + ((va >> PAGE_SHIFT) & mask));
785}
786
787static __inline pd_entry_t *
788vtopde(vm_offset_t va)
789{
790	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
791
792	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
793
794	return (PDmap + ((va >> PDRSHIFT) & mask));
795}
796
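/*
 * Illustrative note (values derived from the definitions above): the
 * recursive PML4 slot PML4PML4I, installed by create_pagetables() below,
 * makes every PTE visible as a linear array at the fixed address PTmap,
 * so vtopte() is pure index arithmetic:
 *
 *	pte = PTmap + ((va >> PAGE_SHIFT) & mask);
 *
 * where mask keeps the low 36 index bits (9 bits for each of the four
 * levels).  vtopde() does the same with PDmap, PDRSHIFT, and a 27-bit
 * mask.
 */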
797static u_int64_t
798allocpages(vm_paddr_t *firstaddr, int n)
799{
800	u_int64_t ret;
801
802	ret = *firstaddr;
803	bzero((void *)ret, n * PAGE_SIZE);
804	*firstaddr += n * PAGE_SIZE;
805	return (ret);
806}
807
808CTASSERT(powerof2(NDMPML4E));
809
810/* number of kernel PDP slots */
811#define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)
812
813static void
814nkpt_init(vm_paddr_t addr)
815{
816	int pt_pages;
817
818#ifdef NKPT
819	pt_pages = NKPT;
820#else
821	pt_pages = howmany(addr, 1 << PDRSHIFT);
822	pt_pages += NKPDPE(pt_pages);
823
824	/*
825	 * Add some slop beyond the bare minimum required for bootstrapping
826	 * the kernel.
827	 *
828	 * This is quite important when allocating KVA for kernel modules.
829	 * The modules are required to be linked in the negative 2GB of
830	 * the address space.  If we run out of KVA in this region then
831	 * pmap_growkernel() will need to allocate page table pages to map
832	 * the entire 512GB of KVA space which is an unnecessary tax on
833	 * physical memory.
834	 *
835	 * Secondly, device memory mapped as part of setting up the low-
836	 * level console(s) is taken from KVA, starting at virtual_avail.
837	 * This is because cninit() is called after pmap_bootstrap() but
838	 * before vm_init() and pmap_init(). 20MB for a frame buffer is
839	 * not uncommon.
840	 */
841	pt_pages += 32;		/* 64MB additional slop. */
842#endif
843	nkpt = pt_pages;
844}
845
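/*
 * Worked example (hypothetical figure): if the early allocations end at
 * addr = 128MB, then pt_pages = howmany(128MB, 2MB) = 64 page table
 * pages, plus NKPDPE(64) = 1 page directory page to map them, plus the
 * 32 pages of slop (each extra PT page maps another 2MB of KVA, so the
 * slop adds 64MB of headroom), giving nkpt = 97.
 */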
846static void
847create_pagetables(vm_paddr_t *firstaddr)
848{
849	int i, j, ndm1g, nkpdpe;
850	pt_entry_t *pt_p;
851	pd_entry_t *pd_p;
852	pdp_entry_t *pdp_p;
853	pml4_entry_t *p4_p;
854
855	/* Allocate page table pages for the direct map */
856	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
857	if (ndmpdp < 4)		/* Minimum 4GB of direct map */
858		ndmpdp = 4;
859	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
860	if (ndmpdpphys > NDMPML4E) {
861		/*
862		 * Each NDMPML4E allows 512 GB, so limit to that,
863		 * and then readjust ndmpdp and ndmpdpphys.
864		 */
865		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
866		Maxmem = atop(NDMPML4E * NBPML4);
867		ndmpdpphys = NDMPML4E;
868		ndmpdp = NDMPML4E * NPDEPG;
869	}
870	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
871	ndm1g = 0;
872	if ((amd_feature & AMDID_PAGE1GB) != 0)
873		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
874	if (ndm1g < ndmpdp)
875		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
876	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
877
878	/* Allocate pages */
879	KPML4phys = allocpages(firstaddr, 1);
880	KPDPphys = allocpages(firstaddr, NKPML4E);
881
882	/*
883	 * Allocate the initial number of kernel page table pages required to
884	 * bootstrap.  We defer this until after all memory-size dependent
885	 * allocations are done (e.g. direct map), so that we don't have to
886	 * build in too much slop in our estimate.
887	 *
888	 * Note that when NKPML4E > 1, we have an empty page underneath
889	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
890	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
891	 */
892	nkpt_init(*firstaddr);
893	nkpdpe = NKPDPE(nkpt);
894
895	KPTphys = allocpages(firstaddr, nkpt);
896	KPDphys = allocpages(firstaddr, nkpdpe);
897
898	/* Fill in the underlying page table pages */
899	/* Nominally read-only (but really R/W) from zero to physfree */
900	/* XXX not fully used, underneath 2M pages */
901	pt_p = (pt_entry_t *)KPTphys;
902	for (i = 0; ptoa(i) < *firstaddr; i++)
903		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
904
905	/* Now map the page tables at their location within PTmap */
906	pd_p = (pd_entry_t *)KPDphys;
907	for (i = 0; i < nkpt; i++)
908		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
909
910	/* Map from zero to end of allocations under 2M pages */
911	/* This replaces some of the KPTphys entries above */
912	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
913		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
914		    X86_PG_G;
915
916	/* And connect up the PD to the PDP (leaving room for L4 pages) */
917	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
918	for (i = 0; i < nkpdpe; i++)
919		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
920		    PG_U;
921
922	/*
923	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
924	 * the end of physical memory is not aligned to a 1GB page boundary,
925	 * then the residual physical memory is mapped with 2MB pages.  Later,
926	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
927	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
928	 * that are partially used.
929	 */
930	pd_p = (pd_entry_t *)DMPDphys;
931	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
932		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
933		/* Preset PG_M and PG_A because demotion expects it. */
934		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
935		    X86_PG_M | X86_PG_A;
936	}
937	pdp_p = (pdp_entry_t *)DMPDPphys;
938	for (i = 0; i < ndm1g; i++) {
939		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
940		/* Preset PG_M and PG_A because demotion expects it. */
941		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
942		    X86_PG_M | X86_PG_A;
943	}
944	for (j = 0; i < ndmpdp; i++, j++) {
945		pdp_p[i] = DMPDphys + ptoa(j);
946		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
947	}
948
949	/* And recursively map PML4 to itself in order to get PTmap */
950	p4_p = (pml4_entry_t *)KPML4phys;
951	p4_p[PML4PML4I] = KPML4phys;
952	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
953
954	/* Connect the Direct Map slot(s) up to the PML4. */
955	for (i = 0; i < ndmpdpphys; i++) {
956		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
957		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
958	}
959
960	/* Connect the KVA slots up to the PML4 */
961	for (i = 0; i < NKPML4E; i++) {
962		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
963		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
964	}
965}
966
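/*
 * Worked example (hypothetical 16GB machine): with ptoa(Maxmem) = 16GB,
 * ndmpdp = howmany(16GB, NBPDP = 1GB) = 16 and ndmpdpphys =
 * howmany(16, NPDPEPG = 512) = 1, so the direct map needs one PDP page.
 * If the CPU supports 1GB pages (AMDID_PAGE1GB), ndm1g = 16 and all 16
 * PDP entries become 1GB mappings with no DMPD pages allocated;
 * otherwise one PD page per GB is filled with 2MB mappings.  Either way,
 * dmaplimit ends up at 16GB.
 */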
967/*
968 *	Bootstrap the system enough to run with virtual memory.
969 *
970 *	On amd64 this is called after mapping has already been enabled
971 *	and just syncs the pmap module with what has already been done.
972 *	[We can't call it easily with mapping off since the kernel is not
973 *	mapped with PA == VA, hence we would have to relocate every address
974 *	from the linked base (virtual) address "KERNBASE" to the actual
975 *	(physical) address starting relative to 0]
976 */
977void
978pmap_bootstrap(vm_paddr_t *firstaddr)
979{
980	vm_offset_t va;
981	pt_entry_t *pte;
982	int i;
983
984	/*
985	 * Create an initial set of page tables to run the kernel in.
986	 */
987	create_pagetables(firstaddr);
988
989	/*
990	 * Add a physical memory segment (vm_phys_seg) corresponding to the
991	 * preallocated kernel page table pages so that vm_page structures
992	 * representing these pages will be created.  The vm_page structures
993	 * are required for promotion of the corresponding kernel virtual
994	 * addresses to superpage mappings.
995	 */
996	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
997
998	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
999	virtual_avail = pmap_kmem_choose(virtual_avail);
1000
1001	virtual_end = VM_MAX_KERNEL_ADDRESS;
1002
1003
1004	/* XXX do %cr0 as well */
1005	load_cr4(rcr4() | CR4_PGE);
1006	load_cr3(KPML4phys);
1007	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
1008		load_cr4(rcr4() | CR4_SMEP);
1009
1010	/*
1011	 * Initialize the kernel pmap (which is statically allocated).
1012	 */
1013	PMAP_LOCK_INIT(kernel_pmap);
1014	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
1015	kernel_pmap->pm_cr3 = KPML4phys;
1016	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
1017	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1018	kernel_pmap->pm_flags = pmap_flags;
1019
1020	/*
1021	 * Initialize the TLB invalidations generation number lock.
1022	 */
1023	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
1024
1025	/*
1026	 * Reserve some special page table entries/VA space for temporary
1027	 * mapping of pages.
1028	 */
1029#define	SYSMAP(c, p, v, n)	\
1030	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
1031
1032	va = virtual_avail;
1033	pte = vtopte(va);
1034
1035	/*
1036	 * Crashdump maps.  The first page is reused as CMAP1 for the
1037	 * memory test.
1038	 */
1039	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
1040	CADDR1 = crashdumpmap;
1041
1042	virtual_avail = va;
1043
1044	/* Initialize the PAT MSR. */
1045	pmap_init_pat();
1046
1047	/* Initialize TLB Context Id. */
1048	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1049	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1050		/* Check for INVPCID support */
1051		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
1052		    != 0;
1053		for (i = 0; i < MAXCPU; i++) {
1054			kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
1055			kernel_pmap->pm_pcids[i].pm_gen = 1;
1056		}
1057		__pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1;
1058		__pcpu[0].pc_pcid_gen = 1;
1059		/*
1060		 * pcpu area for APs is zeroed during AP startup.
1061		 * pc_pcid_next and pc_pcid_gen are initialized by AP
1062		 * during pcpu setup.
1063		 */
1064		load_cr4(rcr4() | CR4_PCIDE);
1065	} else {
1066		pmap_pcid_enabled = 0;
1067	}
1068}
1069
1070/*
1071 * Setup the PAT MSR.
1072 */
1073void
1074pmap_init_pat(void)
1075{
1076	int pat_table[PAT_INDEX_SIZE];
1077	uint64_t pat_msr;
1078	u_long cr0, cr4;
1079	int i;
1080
1081	/* Bail if this CPU doesn't implement PAT. */
1082	if ((cpu_feature & CPUID_PAT) == 0)
1083		panic("no PAT??");
1084
1085	/* Set default PAT index table. */
1086	for (i = 0; i < PAT_INDEX_SIZE; i++)
1087		pat_table[i] = -1;
1088	pat_table[PAT_WRITE_BACK] = 0;
1089	pat_table[PAT_WRITE_THROUGH] = 1;
1090	pat_table[PAT_UNCACHEABLE] = 3;
1091	pat_table[PAT_WRITE_COMBINING] = 3;
1092	pat_table[PAT_WRITE_PROTECTED] = 3;
1093	pat_table[PAT_UNCACHED] = 3;
1094
1095	/* Initialize default PAT entries. */
1096	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
1097	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
1098	    PAT_VALUE(2, PAT_UNCACHED) |
1099	    PAT_VALUE(3, PAT_UNCACHEABLE) |
1100	    PAT_VALUE(4, PAT_WRITE_BACK) |
1101	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
1102	    PAT_VALUE(6, PAT_UNCACHED) |
1103	    PAT_VALUE(7, PAT_UNCACHEABLE);
1104
1105	if (pat_works) {
1106		/*
1107		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
1108		 * Program 5 and 6 as WP and WC.
1109		 * Leave 4 and 7 as WB and UC.
1110		 */
1111		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
1112		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
1113		    PAT_VALUE(6, PAT_WRITE_COMBINING);
1114		pat_table[PAT_UNCACHED] = 2;
1115		pat_table[PAT_WRITE_PROTECTED] = 5;
1116		pat_table[PAT_WRITE_COMBINING] = 6;
1117	} else {
1118		/*
1119		 * Just replace PAT Index 2 with WC instead of UC-.
1120		 */
1121		pat_msr &= ~PAT_MASK(2);
1122		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
1123		pat_table[PAT_WRITE_COMBINING] = 2;
1124	}
1125
1126	/* Disable PGE. */
1127	cr4 = rcr4();
1128	load_cr4(cr4 & ~CR4_PGE);
1129
1130	/* Disable caches (CD = 1, NW = 0). */
1131	cr0 = rcr0();
1132	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1133
1134	/* Flushes caches and TLBs. */
1135	wbinvd();
1136	invltlb();
1137
1138	/* Update PAT and index table. */
1139	wrmsr(MSR_PAT, pat_msr);
1140	for (i = 0; i < PAT_INDEX_SIZE; i++)
1141		pat_index[i] = pat_table[i];
1142
1143	/* Flush caches and TLBs again. */
1144	wbinvd();
1145	invltlb();
1146
1147	/* Restore caches and PGE. */
1148	load_cr0(cr0);
1149	load_cr4(cr4);
1150}
1151
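/*
 * Resulting layout (illustrative summary of the pat_works case):
 *
 *	PAT entry:	 0    1    2    3    4    5    6    7
 *	memory type:	 WB   WT   UC-  UC   WB   WP   WC   UC
 *
 * pat_index[] then maps VM caching modes onto these entries, e.g.
 * pat_index[PAT_WRITE_COMBINING] == 6 and
 * pat_index[PAT_WRITE_PROTECTED] == 5.
 */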
1152/*
1153 *	Initialize a vm_page's machine-dependent fields.
1154 */
1155void
1156pmap_page_init(vm_page_t m)
1157{
1158
1159	TAILQ_INIT(&m->md.pv_list);
1160	m->md.pat_mode = PAT_WRITE_BACK;
1161}
1162
1163/*
1164 *	Initialize the pmap module.
1165 *	Called by vm_init, to initialize any structures that the pmap
1166 *	system needs to map virtual memory.
1167 */
1168void
1169pmap_init(void)
1170{
1171	struct pmap_preinit_mapping *ppim;
1172	vm_page_t mpte;
1173	vm_size_t s;
1174	int error, i, pv_npg;
1175
1176	/*
1177	 * Initialize the vm page array entries for the kernel pmap's
1178	 * page table pages.
1179	 */
1180	for (i = 0; i < nkpt; i++) {
1181		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1182		KASSERT(mpte >= vm_page_array &&
1183		    mpte < &vm_page_array[vm_page_array_size],
1184		    ("pmap_init: page table page is out of range"));
1185		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1186		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1187	}
1188
1189	/*
1190	 * If the kernel is running on a virtual machine, then it must assume
1191	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1192	 * be prepared for the hypervisor changing the vendor and family that
1193	 * are reported by CPUID.  Consequently, the workaround for AMD Family
1194	 * 10h Erratum 383 is enabled if the processor's feature set does not
1195	 * include at least one feature that is only supported by older Intel
1196	 * or newer AMD processors.
1197	 */
1198	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
1199	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1200	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1201	    AMDID2_FMA4)) == 0)
1202		workaround_erratum383 = 1;
1203
1204	/*
1205	 * Are large page mappings enabled?
1206	 */
1207	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1208	if (pg_ps_enabled) {
1209		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1210		    ("pmap_init: can't assign to pagesizes[1]"));
1211		pagesizes[1] = NBPDR;
1212	}
1213
1214	/*
1215	 * Initialize the pv chunk list mutex.
1216	 */
1217	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1218
1219	/*
1220	 * Initialize the pool of pv list locks.
1221	 */
1222	for (i = 0; i < NPV_LIST_LOCKS; i++)
1223		rw_init(&pv_list_locks[i], "pmap pv list");
1224
1225	/*
1226	 * Calculate the size of the pv head table for superpages.
1227	 */
1228	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
1229
1230	/*
1231	 * Allocate memory for the pv head table for superpages.
1232	 */
1233	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1234	s = round_page(s);
1235	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
1236	    M_WAITOK | M_ZERO);
1237	for (i = 0; i < pv_npg; i++)
1238		TAILQ_INIT(&pv_table[i].pv_list);
1239	TAILQ_INIT(&pv_dummy.pv_list);
1240
1241	pmap_initialized = 1;
1242	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
1243		ppim = pmap_preinit_mapping + i;
1244		if (ppim->va == 0)
1245			continue;
1246		/* Make the direct map consistent */
1247		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
1248			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
1249			    ppim->sz, ppim->mode);
1250		}
1251		if (!bootverbose)
1252			continue;
1253		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
1254		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
1255	}
1256
1257	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
1258	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
1259	    (vmem_addr_t *)&qframe);
1260	if (error != 0)
1261		panic("qframe allocation failed");
1262}
1263
1264static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1265    "2MB page mapping counters");
1266
1267static u_long pmap_pde_demotions;
1268SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1269    &pmap_pde_demotions, 0, "2MB page demotions");
1270
1271static u_long pmap_pde_mappings;
1272SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1273    &pmap_pde_mappings, 0, "2MB page mappings");
1274
1275static u_long pmap_pde_p_failures;
1276SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1277    &pmap_pde_p_failures, 0, "2MB page promotion failures");
1278
1279static u_long pmap_pde_promotions;
1280SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1281    &pmap_pde_promotions, 0, "2MB page promotions");
1282
1283static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1284    "1GB page mapping counters");
1285
1286static u_long pmap_pdpe_demotions;
1287SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1288    &pmap_pdpe_demotions, 0, "1GB page demotions");
1289
1290/***************************************************
1291 * Low level helper routines.....
1292 ***************************************************/
1293
1294static pt_entry_t
1295pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1296{
1297	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1298
1299	switch (pmap->pm_type) {
1300	case PT_X86:
1301	case PT_RVI:
1302		/* Verify that both PAT bits are not set at the same time */
1303		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1304		    ("Invalid PAT bits in entry %#lx", entry));
1305
1306		/* Swap the PAT bits if one of them is set */
1307		if ((entry & x86_pat_bits) != 0)
1308			entry ^= x86_pat_bits;
1309		break;
1310	case PT_EPT:
1311		/*
1312		 * Nothing to do - the memory attributes are represented
1313		 * the same way for regular pages and superpages.
1314		 */
1315		break;
1316	default:
1317		panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
1318	}
1319
1320	return (entry);
1321}
1322
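/*
 * Illustrative note (constant values stated for clarity): in a 4KB PTE
 * the PAT selector is bit 7 (X86_PG_PTE_PAT == 0x080), while in a 2MB
 * PDE it is bit 12 (X86_PG_PDE_PAT == 0x1000), because bit 7 of a PDE is
 * PG_PS.  XOR-ing a set PAT bit with both masks moves it from one
 * position to the other, which is what promotion and demotion need when
 * converting between the two entry formats:
 *
 *	entry ^= (X86_PG_PTE_PAT | X86_PG_PDE_PAT);
 */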
1323/*
1324 * Determine the appropriate bits to set in a PTE or PDE for a specified
1325 * caching mode.
1326 */
1327int
1328pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1329{
1330	int cache_bits, pat_flag, pat_idx;
1331
1332	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
1333		panic("Unknown caching mode %d\n", mode);
1334
1335	switch (pmap->pm_type) {
1336	case PT_X86:
1337	case PT_RVI:
1338		/* The PAT bit is different for PTE's and PDE's. */
1339		/* The PAT bit is different for PTEs and PDEs. */
1340
1341		/* Map the caching mode to a PAT index. */
1342		pat_idx = pat_index[mode];
1343
1344		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1345		cache_bits = 0;
1346		if (pat_idx & 0x4)
1347			cache_bits |= pat_flag;
1348		if (pat_idx & 0x2)
1349			cache_bits |= PG_NC_PCD;
1350		if (pat_idx & 0x1)
1351			cache_bits |= PG_NC_PWT;
1352		break;
1353
1354	case PT_EPT:
1355		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1356		break;
1357
1358	default:
1359		panic("unsupported pmap type %d", pmap->pm_type);
1360	}
1361
1362	return (cache_bits);
1363}
1364
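/*
 * Worked example (illustrative): with pat_works set, pmap_init_pat()
 * leaves pat_index[PAT_WRITE_COMBINING] == 6.  For a 4KB PTE in a PT_X86
 * pmap, pat_idx = 6 = 0b110 yields cache_bits = X86_PG_PTE_PAT |
 * PG_NC_PCD with PG_NC_PWT clear, i.e. the PTE selects PAT entry 6,
 * which was programmed as write-combining.
 */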
1365static int
1366pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1367{
1368	int mask;
1369
1370	switch (pmap->pm_type) {
1371	case PT_X86:
1372	case PT_RVI:
1373		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1374		break;
1375	case PT_EPT:
1376		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1377		break;
1378	default:
1379		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1380	}
1381
1382	return (mask);
1383}
1384
1385static __inline boolean_t
1386pmap_ps_enabled(pmap_t pmap)
1387{
1388
1389	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1390}
1391
1392static void
1393pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1394{
1395
1396	switch (pmap->pm_type) {
1397	case PT_X86:
1398		break;
1399	case PT_RVI:
1400	case PT_EPT:
1401		/*
1402		 * XXX
1403		 * This is a little bogus since the generation number is
1404		 * supposed to be bumped up when a region of the address
1405		 * space is invalidated in the page tables.
1406		 *
1407		 * In this case the old PDE entry is valid but yet we want
1408		 * to make sure that any mappings using the old entry are
1409		 * invalidated in the TLB.
1410		 *
1411		 * The reason this works as expected is because we rendezvous
1412		 * "all" host cpus and force any vcpu context to exit as a
1413		 * side-effect.
1414		 */
1415		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1416		break;
1417	default:
1418		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1419	}
1420	pde_store(pde, newpde);
1421}
1422
1423/*
1424 * After changing the page size for the specified virtual address in the page
1425 * table, flush the corresponding entries from the processor's TLB.  Only the
1426 * calling processor's TLB is affected.
1427 *
1428 * The calling thread must be pinned to a processor.
1429 */
1430static void
1431pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1432{
1433	pt_entry_t PG_G;
1434
1435	if (pmap_type_guest(pmap))
1436		return;
1437
1438	KASSERT(pmap->pm_type == PT_X86,
1439	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1440
1441	PG_G = pmap_global_bit(pmap);
1442
1443	if ((newpde & PG_PS) == 0)
1444		/* Demotion: flush a specific 2MB page mapping. */
1445		invlpg(va);
1446	else if ((newpde & PG_G) == 0)
1447		/*
1448		 * Promotion: flush every 4KB page mapping from the TLB
1449		 * because there are too many to flush individually.
1450		 */
1451		invltlb();
1452	else {
1453		/*
1454		 * Promotion: flush every 4KB page mapping from the TLB,
1455		 * including any global (PG_G) mappings.
1456		 */
1457		invltlb_glob();
1458	}
1459}
1460#ifdef SMP
1461
1462/*
1463 * For SMP, these functions have to use the IPI mechanism for coherence.
1464 *
1465 * N.B.: Before calling any of the following TLB invalidation functions,
1466 * the calling processor must ensure that all stores updating a non-
1467 * kernel page table are globally performed.  Otherwise, another
1468 * processor could cache an old, pre-update entry without being
1469 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1470 * active on another processor after its pm_active field is checked by
1471 * one of the following functions but before a store updating the page
1472 * table is globally performed. (2) The pmap becomes active on another
1473 * processor before its pm_active field is checked but due to
1474 * speculative loads one of the following functions still reads the
1475 * pmap as inactive on the other processor.
1476 *
1477 * The kernel page table is exempt because its pm_active field is
1478 * immutable.  The kernel page table is always active on every
1479 * processor.
1480 */
1481
1482/*
1483 * Interrupt the cpus that are executing in the guest context.
1484 * This will force the vcpu to exit and the cached EPT mappings
1485 * will be invalidated by the host before the next vmresume.
1486 */
1487static __inline void
1488pmap_invalidate_ept(pmap_t pmap)
1489{
1490	int ipinum;
1491
1492	sched_pin();
1493	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1494	    ("pmap_invalidate_ept: absurd pm_active"));
1495
1496	/*
1497	 * The TLB mappings associated with a vcpu context are not
1498	 * flushed each time a different vcpu is chosen to execute.
1499	 *
1500	 * This is in contrast with a process's vtop mappings that
1501	 * are flushed from the TLB on each context switch.
1502	 *
1503	 * Therefore we need to do more than just a TLB shootdown on
1504	 * the active cpus in 'pmap->pm_active'. To do this we keep
1505	 * track of the number of invalidations performed on this pmap.
1506	 *
1507	 * Each vcpu keeps a cache of this counter and compares it
1508	 * just before a vmresume. If the counter is out-of-date an
1509	 * invept will be done to flush stale mappings from the TLB.
1510	 */
1511	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1512
1513	/*
1514	 * Force the vcpu to exit and trap back into the hypervisor.
1515	 */
1516	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1517	ipi_selected(pmap->pm_active, ipinum);
1518	sched_unpin();
1519}
1520
1521void
1522pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1523{
1524	cpuset_t *mask;
1525	u_int cpuid, i;
1526
1527	if (pmap_type_guest(pmap)) {
1528		pmap_invalidate_ept(pmap);
1529		return;
1530	}
1531
1532	KASSERT(pmap->pm_type == PT_X86,
1533	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1534
1535	sched_pin();
1536	if (pmap == kernel_pmap) {
1537		invlpg(va);
1538		mask = &all_cpus;
1539	} else {
1540		cpuid = PCPU_GET(cpuid);
1541		if (pmap == PCPU_GET(curpmap))
1542			invlpg(va);
1543		else if (pmap_pcid_enabled)
1544			pmap->pm_pcids[cpuid].pm_gen = 0;
1545		if (pmap_pcid_enabled) {
1546			CPU_FOREACH(i) {
1547				if (cpuid != i)
1548					pmap->pm_pcids[i].pm_gen = 0;
1549			}
1550		}
1551		mask = &pmap->pm_active;
1552	}
1553	smp_masked_invlpg(*mask, va);
1554	sched_unpin();
1555}
1556
1557/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
1558#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
1559
1560void
1561pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1562{
1563	cpuset_t *mask;
1564	vm_offset_t addr;
1565	u_int cpuid, i;
1566
1567	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
1568		pmap_invalidate_all(pmap);
1569		return;
1570	}
1571
1572	if (pmap_type_guest(pmap)) {
1573		pmap_invalidate_ept(pmap);
1574		return;
1575	}
1576
1577	KASSERT(pmap->pm_type == PT_X86,
1578	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1579
1580	sched_pin();
1581	cpuid = PCPU_GET(cpuid);
1582	if (pmap == kernel_pmap) {
1583		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1584			invlpg(addr);
1585		mask = &all_cpus;
1586	} else {
1587		if (pmap == PCPU_GET(curpmap)) {
1588			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1589				invlpg(addr);
1590		} else if (pmap_pcid_enabled) {
1591			pmap->pm_pcids[cpuid].pm_gen = 0;
1592		}
1593		if (pmap_pcid_enabled) {
1594			CPU_FOREACH(i) {
1595				if (cpuid != i)
1596					pmap->pm_pcids[i].pm_gen = 0;
1597			}
1598		}
1599		mask = &pmap->pm_active;
1600	}
1601	smp_masked_invlpg_range(*mask, sva, eva);
1602	sched_unpin();
1603}
1604
1605void
1606pmap_invalidate_all(pmap_t pmap)
1607{
1608	cpuset_t *mask;
1609	struct invpcid_descr d;
1610	u_int cpuid, i;
1611
1612	if (pmap_type_guest(pmap)) {
1613		pmap_invalidate_ept(pmap);
1614		return;
1615	}
1616
1617	KASSERT(pmap->pm_type == PT_X86,
1618	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1619
1620	sched_pin();
1621	if (pmap == kernel_pmap) {
1622		if (pmap_pcid_enabled && invpcid_works) {
1623			bzero(&d, sizeof(d));
1624			invpcid(&d, INVPCID_CTXGLOB);
1625		} else {
1626			invltlb_glob();
1627		}
1628		mask = &all_cpus;
1629	} else {
1630		cpuid = PCPU_GET(cpuid);
1631		if (pmap == PCPU_GET(curpmap)) {
1632			if (pmap_pcid_enabled) {
1633				if (invpcid_works) {
1634					d.pcid = pmap->pm_pcids[cpuid].pm_pcid;
1635					d.pad = 0;
1636					d.addr = 0;
1637					invpcid(&d, INVPCID_CTX);
1638				} else {
1639					load_cr3(pmap->pm_cr3 | pmap->pm_pcids
1640					    [PCPU_GET(cpuid)].pm_pcid);
1641				}
1642			} else {
1643				invltlb();
1644			}
1645		} else if (pmap_pcid_enabled) {
1646			pmap->pm_pcids[cpuid].pm_gen = 0;
1647		}
1648		if (pmap_pcid_enabled) {
1649			CPU_FOREACH(i) {
1650				if (cpuid != i)
1651					pmap->pm_pcids[i].pm_gen = 0;
1652			}
1653		}
1654		mask = &pmap->pm_active;
1655	}
1656	smp_masked_invltlb(*mask, pmap);
1657	sched_unpin();
1658}
1659
1660void
1661pmap_invalidate_cache(void)
1662{
1663
1664	sched_pin();
1665	wbinvd();
1666	smp_cache_flush();
1667	sched_unpin();
1668}
1669
1670struct pde_action {
1671	cpuset_t invalidate;	/* processors that invalidate their TLB */
1672	pmap_t pmap;
1673	vm_offset_t va;
1674	pd_entry_t *pde;
1675	pd_entry_t newpde;
1676	u_int store;		/* processor that updates the PDE */
1677};
1678
1679static void
1680pmap_update_pde_action(void *arg)
1681{
1682	struct pde_action *act = arg;
1683
1684	if (act->store == PCPU_GET(cpuid))
1685		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1686}
1687
1688static void
1689pmap_update_pde_teardown(void *arg)
1690{
1691	struct pde_action *act = arg;
1692
1693	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1694		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1695}
1696
1697/*
1698 * Change the page size for the specified virtual address in a way that
1699 * prevents any possibility of the TLB ever having two entries that map the
1700 * same virtual address using different page sizes.  This is the recommended
1701 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1702 * machine check exception for a TLB state that is improperly diagnosed as a
1703 * hardware error.
1704 */
1705static void
1706pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1707{
1708	struct pde_action act;
1709	cpuset_t active, other_cpus;
1710	u_int cpuid;
1711
1712	sched_pin();
1713	cpuid = PCPU_GET(cpuid);
1714	other_cpus = all_cpus;
1715	CPU_CLR(cpuid, &other_cpus);
1716	if (pmap == kernel_pmap || pmap_type_guest(pmap))
1717		active = all_cpus;
1718	else {
1719		active = pmap->pm_active;
1720	}
1721	if (CPU_OVERLAP(&active, &other_cpus)) {
1722		act.store = cpuid;
1723		act.invalidate = active;
1724		act.va = va;
1725		act.pmap = pmap;
1726		act.pde = pde;
1727		act.newpde = newpde;
1728		CPU_SET(cpuid, &active);
1729		smp_rendezvous_cpus(active,
1730		    smp_no_rendevous_barrier, pmap_update_pde_action,
1731		    pmap_update_pde_teardown, &act);
1732	} else {
1733		pmap_update_pde_store(pmap, pde, newpde);
1734		if (CPU_ISSET(cpuid, &active))
1735			pmap_update_pde_invalidate(pmap, va, newpde);
1736	}
1737	sched_unpin();
1738}
1739#else /* !SMP */
1740/*
1741 * Normal, non-SMP, invalidation functions.
1742 */
1743void
1744pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1745{
1746
1747	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
1748		pmap->pm_eptgen++;
1749		return;
1750	}
1751	KASSERT(pmap->pm_type == PT_X86,
1752	    ("pmap_invalidate_page: unknown type %d", pmap->pm_type));
1753
1754	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
1755		invlpg(va);
1756	else if (pmap_pcid_enabled)
1757		pmap->pm_pcids[0].pm_gen = 0;
1758}
1759
1760void
1761pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1762{
1763	vm_offset_t addr;
1764
1765	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
1766		pmap->pm_eptgen++;
1767		return;
1768	}
1769	KASSERT(pmap->pm_type == PT_X86,
1770	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
1771
1772	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
1773		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1774			invlpg(addr);
1775	} else if (pmap_pcid_enabled) {
1776		pmap->pm_pcids[0].pm_gen = 0;
1777	}
1778}
1779
1780void
1781pmap_invalidate_all(pmap_t pmap)
1782{
1783	struct invpcid_descr d;
1784
1785	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
1786		pmap->pm_eptgen++;
1787		return;
1788	}
1789	KASSERT(pmap->pm_type == PT_X86,
1790	    ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
1791
1792	if (pmap == kernel_pmap) {
1793		if (pmap_pcid_enabled && invpcid_works) {
1794			bzero(&d, sizeof(d));
1795			invpcid(&d, INVPCID_CTXGLOB);
1796		} else {
1797			invltlb_glob();
1798		}
1799	} else if (pmap == PCPU_GET(curpmap)) {
1800		if (pmap_pcid_enabled) {
1801			if (invpcid_works) {
1802				d.pcid = pmap->pm_pcids[0].pm_pcid;
1803				d.pad = 0;
1804				d.addr = 0;
1805				invpcid(&d, INVPCID_CTX);
1806			} else {
1807				load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0].
1808				    pm_pcid);
1809			}
1810		} else {
1811			invltlb();
1812		}
1813	} else if (pmap_pcid_enabled) {
1814		pmap->pm_pcids[0].pm_gen = 0;
1815	}
1816}
1817
1818PMAP_INLINE void
1819pmap_invalidate_cache(void)
1820{
1821
1822	wbinvd();
1823}
1824
1825static void
1826pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1827{
1828
1829	pmap_update_pde_store(pmap, pde, newpde);
1830	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
1831		pmap_update_pde_invalidate(pmap, va, newpde);
1832	else
1833		pmap->pm_pcids[0].pm_gen = 0;
1834}
1835#endif /* !SMP */
1836
1837#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
1838
1839void
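/*
 * Flush the data cache for the given kernel virtual address range.
 * When "force" is FALSE, the range must be page aligned and the flush
 * may be skipped entirely on CPUs that advertise self-snoop; when
 * "force" is TRUE, the flush is always performed and "sva" may be
 * unaligned.  Ranges of PMAP_CLFLUSH_THRESHOLD (2MB) or more, or CPUs
 * without CLFLUSH/CLFLUSHOPT, fall back to a global cache invalidation.
 */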
1840pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1841{
1842
1843	if (force) {
1844		sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
1845	} else {
1846		KASSERT((sva & PAGE_MASK) == 0,
1847		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1848		KASSERT((eva & PAGE_MASK) == 0,
1849		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1850	}
1851
1852	if ((cpu_feature & CPUID_SS) != 0 && !force)
1853		; /* If "Self Snoop" is supported and allowed, do nothing. */
1854	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
1855	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1856		/*
1857		 * XXX: Some CPUs fault, hang, or trash the local APIC
1858		 * registers if we use CLFLUSH on the local APIC
1859		 * range.  The local APIC is always uncached, so we
1860		 * don't need to flush for that range anyway.
1861		 */
1862		if (pmap_kextract(sva) == lapic_paddr)
1863			return;
1864
1865		/*
1866		 * Otherwise, do a per-cache-line flush.  Use the mfence
1867		 * instruction to ensure that previous stores are
1868		 * included in the write-back.  The processor
1869		 * propagates flush to other processors in the cache
1870		 * coherence domain.
1871		 */
1872		mfence();
1873		for (; sva < eva; sva += cpu_clflush_line_size)
1874			clflushopt(sva);
1875		mfence();
1876	} else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1877	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1878		if (pmap_kextract(sva) == lapic_paddr)
1879			return;
1880		/*
1881		 * Writes are ordered by CLFLUSH on Intel CPUs.
1882		 */
1883		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1884			mfence();
1885		for (; sva < eva; sva += cpu_clflush_line_size)
1886			clflush(sva);
1887		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1888			mfence();
1889	} else {
1890
1891		/*
1892		 * No targeted cache flush methods are supported by CPU,
1893		 * or the supplied range is bigger than 2MB.
1894		 * Globally invalidate cache.
1895		 */
1896		pmap_invalidate_cache();
1897	}
1898}
1899
1900/*
1901 * Remove the specified set of pages from the data and instruction caches.
1902 *
1903 * In contrast to pmap_invalidate_cache_range(), this function does not
1904 * rely on the CPU's self-snoop feature, because it is intended for use
1905 * when moving pages into a different cache domain.
1906 */
1907void
1908pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1909{
1910	vm_offset_t daddr, eva;
1911	int i;
1912	bool useclflushopt;
1913
1914	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
1915	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1916	    ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
1917		pmap_invalidate_cache();
1918	else {
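		/*
		 * CLFLUSHOPT is not ordered with respect to stores, so it
		 * always requires fences; CLFLUSH is ordered by writes on
		 * Intel CPUs only, so other vendors get the fences as well.
		 */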
1919		if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL)
1920			mfence();
1921		for (i = 0; i < count; i++) {
1922			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1923			eva = daddr + PAGE_SIZE;
1924			for (; daddr < eva; daddr += cpu_clflush_line_size) {
1925				if (useclflushopt)
1926					clflushopt(daddr);
1927				else
1928					clflush(daddr);
1929			}
1930		}
1931		if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL)
1932			mfence();
1933	}
1934}
1935
1936/*
1937 *	Routine:	pmap_extract
1938 *	Function:
1939 *		Extract the physical page address associated
1940 *		with the given map/virtual_address pair.
1941 */
1942vm_paddr_t
1943pmap_extract(pmap_t pmap, vm_offset_t va)
1944{
1945	pdp_entry_t *pdpe;
1946	pd_entry_t *pde;
1947	pt_entry_t *pte, PG_V;
1948	vm_paddr_t pa;
1949
1950	pa = 0;
1951	PG_V = pmap_valid_bit(pmap);
1952	PMAP_LOCK(pmap);
1953	pdpe = pmap_pdpe(pmap, va);
1954	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1955		if ((*pdpe & PG_PS) != 0)
1956			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1957		else {
1958			pde = pmap_pdpe_to_pde(pdpe, va);
1959			if ((*pde & PG_V) != 0) {
1960				if ((*pde & PG_PS) != 0) {
1961					pa = (*pde & PG_PS_FRAME) |
1962					    (va & PDRMASK);
1963				} else {
1964					pte = pmap_pde_to_pte(pde, va);
1965					pa = (*pte & PG_FRAME) |
1966					    (va & PAGE_MASK);
1967				}
1968			}
1969		}
1970	}
1971	PMAP_UNLOCK(pmap);
1972	return (pa);
1973}
1974
1975/*
1976 *	Routine:	pmap_extract_and_hold
1977 *	Function:
1978 *		Atomically extract and hold the physical page
1979 *		with the given pmap and virtual address pair
1980 *		if that mapping permits the given protection.
1981 */
1982vm_page_t
1983pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1984{
1985	pd_entry_t pde, *pdep;
1986	pt_entry_t pte, PG_RW, PG_V;
1987	vm_paddr_t pa;
1988	vm_page_t m;
1989
1990	pa = 0;
1991	m = NULL;
1992	PG_RW = pmap_rw_bit(pmap);
1993	PG_V = pmap_valid_bit(pmap);
1994	PMAP_LOCK(pmap);
1995retry:
1996	pdep = pmap_pde(pmap, va);
1997	if (pdep != NULL && (pde = *pdep)) {
1998		if (pde & PG_PS) {
1999			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
2000				if (vm_page_pa_tryrelock(pmap, (pde &
2001				    PG_PS_FRAME) | (va & PDRMASK), &pa))
2002					goto retry;
2003				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
2004				    (va & PDRMASK));
2005				vm_page_hold(m);
2006			}
2007		} else {
2008			pte = *pmap_pde_to_pte(pdep, va);
2009			if ((pte & PG_V) &&
2010			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
2011				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
2012				    &pa))
2013					goto retry;
2014				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
2015				vm_page_hold(m);
2016			}
2017		}
2018	}
2019	PA_UNLOCK_COND(pa);
2020	PMAP_UNLOCK(pmap);
2021	return (m);
2022}
2023
2024vm_paddr_t
2025pmap_kextract(vm_offset_t va)
2026{
2027	pd_entry_t pde;
2028	vm_paddr_t pa;
2029
2030	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
2031		pa = DMAP_TO_PHYS(va);
2032	} else {
2033		pde = *vtopde(va);
2034		if (pde & PG_PS) {
2035			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
2036		} else {
2037			/*
2038			 * Beware of a concurrent promotion that changes the
2039			 * PDE at this point!  For example, vtopte() must not
2040			 * be used to access the PTE because it would use the
2041			 * new PDE.  It is, however, safe to use the old PDE
2042			 * because the page table page is preserved by the
2043			 * promotion.
2044			 */
2045			pa = *pmap_pde_to_pte(&pde, va);
2046			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
2047		}
2048	}
2049	return (pa);
2050}
2051
2052/***************************************************
2053 * Low level mapping routines.....
2054 ***************************************************/
2055
2056/*
2057 * Add a wired page to the kva.
2058 * Note: not SMP coherent.
2059 */
2060PMAP_INLINE void
2061pmap_kenter(vm_offset_t va, vm_paddr_t pa)
2062{
2063	pt_entry_t *pte;
2064
2065	pte = vtopte(va);
2066	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
2067}
2068
2069static __inline void
2070pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
2071{
2072	pt_entry_t *pte;
2073	int cache_bits;
2074
2075	pte = vtopte(va);
2076	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
2077	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
2078}
2079
2080/*
2081 * Remove a page from the kernel pagetables.
2082 * Note: not SMP coherent.
2083 */
2084PMAP_INLINE void
2085pmap_kremove(vm_offset_t va)
2086{
2087	pt_entry_t *pte;
2088
2089	pte = vtopte(va);
2090	pte_clear(pte);
2091}
2092
2093/*
2094 *	Used to map a range of physical addresses into kernel
2095 *	virtual address space.
2096 *
2097 *	The value passed in '*virt' is a suggested virtual address for
2098 *	the mapping. Architectures which can support a direct-mapped
2099 *	physical to virtual region can return the appropriate address
2100 *	within that region, leaving '*virt' unchanged. Other
2101 *	architectures should map the pages starting at '*virt' and
2102 *	update '*virt' with the first usable address after the mapped
2103 *	region.
2104 */
2105vm_offset_t
2106pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2107{
2108	return (PHYS_TO_DMAP(start));
2109}
2110
2111
2112/*
2113 * Add a list of wired pages to the kva
2114 * this routine is only used for temporary
2115 * kernel mappings that do not need to have
2116 * page modification or references recorded.
2117 * Note that old mappings are simply written
2118 * over.  The page *must* be wired.
2119 * Note: SMP coherent.  Uses a ranged shootdown IPI.
2120 */
2121void
2122pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2123{
2124	pt_entry_t *endpte, oldpte, pa, *pte;
2125	vm_page_t m;
2126	int cache_bits;
2127
2128	oldpte = 0;
2129	pte = vtopte(sva);
2130	endpte = pte + count;
2131	while (pte < endpte) {
2132		m = *ma++;
2133		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
2134		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
2135		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
2136			oldpte |= *pte;
2137			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
2138		}
2139		pte++;
2140	}
2141	if (__predict_false((oldpte & X86_PG_V) != 0))
2142		pmap_invalidate_range(kernel_pmap, sva, sva + count *
2143		    PAGE_SIZE);
2144}
2145
2146/*
2147 * This routine tears out page mappings from the
2148 * kernel -- it is meant only for temporary mappings.
2149 * Note: SMP coherent.  Uses a ranged shootdown IPI.
2150 */
2151void
2152pmap_qremove(vm_offset_t sva, int count)
2153{
2154	vm_offset_t va;
2155
2156	va = sva;
2157	while (count-- > 0) {
2158		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2159		pmap_kremove(va);
2160		va += PAGE_SIZE;
2161	}
2162	pmap_invalidate_range(kernel_pmap, sva, va);
2163}
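
/*
 * A typical use of the two routines above is a short-lived mapping of
 * wired pages into preallocated KVA, for example:
 *
 *	vm_offset_t kva = kva_alloc(npages * PAGE_SIZE);
 *
 *	pmap_qenter(kva, ma, npages);
 *	... access the pages through (void *)kva ...
 *	pmap_qremove(kva, npages);
 *
 * where "ma" is an array of "npages" wired vm_page_t pointers; the
 * names "kva", "ma", and "npages" are illustrative only.
 */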
2164
2165/***************************************************
2166 * Page table page management routines.....
2167 ***************************************************/
2168static __inline void
2169pmap_free_zero_pages(struct spglist *free)
2170{
2171	vm_page_t m;
2172
2173	while ((m = SLIST_FIRST(free)) != NULL) {
2174		SLIST_REMOVE_HEAD(free, plinks.s.ss);
2175		/* Preserve the page's PG_ZERO setting. */
2176		vm_page_free_toq(m);
2177	}
2178}
2179
2180/*
2181 * Schedule the specified unused page table page to be freed.  Specifically,
2182 * add the page to the specified list of pages that will be released to the
2183 * physical memory manager after the TLB has been updated.
2184 */
2185static __inline void
2186pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2187    boolean_t set_PG_ZERO)
2188{
2189
2190	if (set_PG_ZERO)
2191		m->flags |= PG_ZERO;
2192	else
2193		m->flags &= ~PG_ZERO;
2194	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2195}
2196
2197/*
2198 * Inserts the specified page table page into the specified pmap's collection
2199 * of idle page table pages.  Each of a pmap's page table pages is responsible
2200 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2201 * ordered by this virtual address range.
2202 */
2203static __inline int
2204pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2205{
2206
2207	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2208	return (vm_radix_insert(&pmap->pm_root, mpte));
2209}
2210
2211/*
2212 * Looks for a page table page mapping the specified virtual address in the
2213 * specified pmap's collection of idle page table pages.  Returns NULL if there
2214 * is no page table page corresponding to the specified virtual address.
2215 */
2216static __inline vm_page_t
2217pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
2218{
2219
2220	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2221	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
2222}
2223
2224/*
2225 * Removes the specified page table page from the specified pmap's collection
2226 * of idle page table pages.  The specified page table page must be a member of
2227 * the pmap's collection.
2228 */
2229static __inline void
2230pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
2231{
2232
2233	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2234	vm_radix_remove(&pmap->pm_root, mpte->pindex);
2235}
2236
2237/*
2238 * Decrements a page table page's wire count, which is used to record the
2239 * number of valid page table entries within the page.  If the wire count
2240 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2241 * page table page was unmapped and FALSE otherwise.
2242 */
2243static inline boolean_t
2244pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2245{
2246
2247	--m->wire_count;
2248	if (m->wire_count == 0) {
2249		_pmap_unwire_ptp(pmap, va, m, free);
2250		return (TRUE);
2251	} else
2252		return (FALSE);
2253}
2254
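/*
 * Unmap the page table page "m" from the paging hierarchy and schedule
 * it to be freed once the TLB shootdown completes.  The page's pindex
 * identifies its level: indices below NUPDE are page table (PTE)
 * pages, indices from NUPDE to NUPDE + NUPDPE - 1 are page directory
 * pages, and higher indices are PDP pages.  Dropping a lower-level
 * page also releases a reference on its parent, which may cascade
 * further up the tree.
 */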
2255static void
2256_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2257{
2258
2259	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2260	/*
2261	 * unmap the page table page
2262	 */
2263	if (m->pindex >= (NUPDE + NUPDPE)) {
2264		/* PDP page */
2265		pml4_entry_t *pml4;
2266		pml4 = pmap_pml4e(pmap, va);
2267		*pml4 = 0;
2268	} else if (m->pindex >= NUPDE) {
2269		/* PD page */
2270		pdp_entry_t *pdp;
2271		pdp = pmap_pdpe(pmap, va);
2272		*pdp = 0;
2273	} else {
2274		/* PTE page */
2275		pd_entry_t *pd;
2276		pd = pmap_pde(pmap, va);
2277		*pd = 0;
2278	}
2279	pmap_resident_count_dec(pmap, 1);
2280	if (m->pindex < NUPDE) {
2281		/* We just released a PT, unhold the matching PD */
2282		vm_page_t pdpg;
2283
2284		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2285		pmap_unwire_ptp(pmap, va, pdpg, free);
2286	}
2287	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2288		/* We just released a PD, unhold the matching PDP */
2289		vm_page_t pdppg;
2290
2291		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2292		pmap_unwire_ptp(pmap, va, pdppg, free);
2293	}
2294
2295	/*
2296	 * This is a release store so that the ordinary store unmapping
2297	 * the page table page is globally performed before TLB shoot-
2298	 * down is begun.
2299	 */
2300	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
2301
2302	/*
2303	 * Put page on a list so that it is released after
2304	 * *ALL* TLB shootdown is done
2305	 */
2306	pmap_add_delayed_free_list(m, free, TRUE);
2307}
2308
2309/*
2310 * After removing a page table entry, this routine is used to
2311 * conditionally free the page, and manage the hold/wire counts.
2312 */
2313static int
2314pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2315    struct spglist *free)
2316{
2317	vm_page_t mpte;
2318
2319	if (va >= VM_MAXUSER_ADDRESS)
2320		return (0);
2321	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2322	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2323	return (pmap_unwire_ptp(pmap, va, mpte, free));
2324}
2325
2326void
2327pmap_pinit0(pmap_t pmap)
2328{
2329	int i;
2330
2331	PMAP_LOCK_INIT(pmap);
2332	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2333	pmap->pm_cr3 = KPML4phys;
2334	pmap->pm_root.rt_root = 0;
2335	CPU_ZERO(&pmap->pm_active);
2336	TAILQ_INIT(&pmap->pm_pvchunk);
2337	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2338	pmap->pm_flags = pmap_flags;
2339	CPU_FOREACH(i) {
2340		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
2341		pmap->pm_pcids[i].pm_gen = 0;
2342	}
2343	PCPU_SET(curpmap, kernel_pmap);
2344	pmap_activate(curthread);
2345	CPU_FILL(&kernel_pmap->pm_active);
2346}
2347
2348void
2349pmap_pinit_pml4(vm_page_t pml4pg)
2350{
2351	pml4_entry_t *pm_pml4;
2352	int i;
2353
2354	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
2355
2356	/* Wire in kernel global address entries. */
2357	for (i = 0; i < NKPML4E; i++) {
2358		pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
2359		    X86_PG_V | PG_U;
2360	}
2361	for (i = 0; i < ndmpdpphys; i++) {
2362		pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
2363		    X86_PG_V | PG_U;
2364	}
2365
2366	/* install self-referential address mapping entry(s) */
2367	pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
2368	    X86_PG_A | X86_PG_M;
2369}
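
/*
 * The self-referential PML4PML4I slot installed above maps the PML4
 * page onto itself.  That recursive entry exposes the current address
 * space's page table pages at a fixed range of virtual addresses,
 * which vtopte() and vtopde() use to reach individual PTEs and PDEs
 * without going through the direct map.
 */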
2370
2371/*
2372 * Initialize a preallocated and zeroed pmap structure,
2373 * such as one in a vmspace structure.
2374 */
2375int
2376pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2377{
2378	vm_page_t pml4pg;
2379	vm_paddr_t pml4phys;
2380	int i;
2381
2382	/*
2383	 * allocate the page directory page
2384	 */
2385	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2386	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
2387		VM_WAIT;
2388
2389	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2390	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2391	CPU_FOREACH(i) {
2392		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
2393		pmap->pm_pcids[i].pm_gen = 0;
2394	}
2395	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
2396
2397	if ((pml4pg->flags & PG_ZERO) == 0)
2398		pagezero(pmap->pm_pml4);
2399
2400	/*
2401	 * Do not install the host kernel mappings in the nested page
2402	 * tables. These mappings are meaningless in the guest physical
2403	 * address space.
2404	 */
2405	if ((pmap->pm_type = pm_type) == PT_X86) {
2406		pmap->pm_cr3 = pml4phys;
2407		pmap_pinit_pml4(pml4pg);
2408	}
2409
2410	pmap->pm_root.rt_root = 0;
2411	CPU_ZERO(&pmap->pm_active);
2412	TAILQ_INIT(&pmap->pm_pvchunk);
2413	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2414	pmap->pm_flags = flags;
2415	pmap->pm_eptgen = 0;
2416
2417	return (1);
2418}
2419
2420int
2421pmap_pinit(pmap_t pmap)
2422{
2423
2424	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2425}
2426
2427/*
2428 * This routine is called if the desired page table page does not exist.
2429 *
2430 * If page table page allocation fails, this routine may sleep before
2431 * returning NULL.  It sleeps only if a lock pointer was given.
2432 *
2433 * Note: If a page allocation fails at page table level two or three,
2434 * one or two pages may be held during the wait, only to be released
2435 * afterwards.  This conservative approach is easily argued to avoid
2436 * race conditions.
2437 */
2438static vm_page_t
2439_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2440{
2441	vm_page_t m, pdppg, pdpg;
2442	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2443
2444	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2445
2446	PG_A = pmap_accessed_bit(pmap);
2447	PG_M = pmap_modified_bit(pmap);
2448	PG_V = pmap_valid_bit(pmap);
2449	PG_RW = pmap_rw_bit(pmap);
2450
2451	/*
2452	 * Allocate a page table page.
2453	 */
2454	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2455	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2456		if (lockp != NULL) {
2457			RELEASE_PV_LIST_LOCK(lockp);
2458			PMAP_UNLOCK(pmap);
2459			PMAP_ASSERT_NOT_IN_DI();
2460			VM_WAIT;
2461			PMAP_LOCK(pmap);
2462		}
2463
2464		/*
2465		 * Indicate the need to retry.  While waiting, the page table
2466		 * page may have been allocated.
2467		 */
2468		return (NULL);
2469	}
2470	if ((m->flags & PG_ZERO) == 0)
2471		pmap_zero_page(m);
2472
2473	/*
2474	 * Map the pagetable page into the process address space, if
2475	 * it isn't already there.
2476	 */
2477
2478	if (ptepindex >= (NUPDE + NUPDPE)) {
2479		pml4_entry_t *pml4;
2480		vm_pindex_t pml4index;
2481
2482		/* Wire up a new PDPE page */
2483		pml4index = ptepindex - (NUPDE + NUPDPE);
2484		pml4 = &pmap->pm_pml4[pml4index];
2485		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2486
2487	} else if (ptepindex >= NUPDE) {
2488		vm_pindex_t pml4index;
2489		vm_pindex_t pdpindex;
2490		pml4_entry_t *pml4;
2491		pdp_entry_t *pdp;
2492
2493		/* Wire up a new PDE page */
2494		pdpindex = ptepindex - NUPDE;
2495		pml4index = pdpindex >> NPML4EPGSHIFT;
2496
2497		pml4 = &pmap->pm_pml4[pml4index];
2498		if ((*pml4 & PG_V) == 0) {
2499			/* Have to allocate a new pdp, recurse */
2500			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2501			    lockp) == NULL) {
2502				--m->wire_count;
2503				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2504				vm_page_free_zero(m);
2505				return (NULL);
2506			}
2507		} else {
2508			/* Add reference to pdp page */
2509			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2510			pdppg->wire_count++;
2511		}
2512		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2513
2514		/* Now find the pdp page */
2515		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2516		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2517
2518	} else {
2519		vm_pindex_t pml4index;
2520		vm_pindex_t pdpindex;
2521		pml4_entry_t *pml4;
2522		pdp_entry_t *pdp;
2523		pd_entry_t *pd;
2524
2525		/* Wire up a new PTE page */
2526		pdpindex = ptepindex >> NPDPEPGSHIFT;
2527		pml4index = pdpindex >> NPML4EPGSHIFT;
2528
2529		/* First, find the pdp and check that it's valid. */
2530		pml4 = &pmap->pm_pml4[pml4index];
2531		if ((*pml4 & PG_V) == 0) {
2532			/* Have to allocate a new pd, recurse */
2533			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2534			    lockp) == NULL) {
2535				--m->wire_count;
2536				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2537				vm_page_free_zero(m);
2538				return (NULL);
2539			}
2540			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2541			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2542		} else {
2543			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2544			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2545			if ((*pdp & PG_V) == 0) {
2546				/* Have to allocate a new pd, recurse */
2547				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2548				    lockp) == NULL) {
2549					--m->wire_count;
2550					atomic_subtract_int(&vm_cnt.v_wire_count,
2551					    1);
2552					vm_page_free_zero(m);
2553					return (NULL);
2554				}
2555			} else {
2556				/* Add reference to the pd page */
2557				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2558				pdpg->wire_count++;
2559			}
2560		}
2561		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2562
2563		/* Now we know where the page directory page is */
2564		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2565		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2566	}
2567
2568	pmap_resident_count_inc(pmap, 1);
2569
2570	return (m);
2571}
2572
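/*
 * Return the page directory page that maps "va", allocating it if
 * necessary.  If the containing PDP entry is already valid, the
 * existing page directory page simply gains a reference; otherwise the
 * page, and any missing upper levels, is allocated through
 * _pmap_allocpte().  When a lock pointer is supplied, a failed
 * allocation sleeps and is retried rather than returning NULL.
 */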
2573static vm_page_t
2574pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2575{
2576	vm_pindex_t pdpindex, ptepindex;
2577	pdp_entry_t *pdpe, PG_V;
2578	vm_page_t pdpg;
2579
2580	PG_V = pmap_valid_bit(pmap);
2581
2582retry:
2583	pdpe = pmap_pdpe(pmap, va);
2584	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2585		/* Add a reference to the pd page. */
2586		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2587		pdpg->wire_count++;
2588	} else {
2589		/* Allocate a pd page. */
2590		ptepindex = pmap_pde_pindex(va);
2591		pdpindex = ptepindex >> NPDPEPGSHIFT;
2592		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2593		if (pdpg == NULL && lockp != NULL)
2594			goto retry;
2595	}
2596	return (pdpg);
2597}
2598
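/*
 * Return the page table page that maps "va", allocating it and any
 * missing intermediate levels if necessary.  An existing 2MB mapping
 * covering "va" is demoted first.  As with pmap_allocpde(), the
 * allocation is retried after sleeping when a lock pointer is
 * supplied; otherwise NULL is returned on failure.
 */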
2599static vm_page_t
2600pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2601{
2602	vm_pindex_t ptepindex;
2603	pd_entry_t *pd, PG_V;
2604	vm_page_t m;
2605
2606	PG_V = pmap_valid_bit(pmap);
2607
2608	/*
2609	 * Calculate pagetable page index
2610	 */
2611	ptepindex = pmap_pde_pindex(va);
2612retry:
2613	/*
2614	 * Get the page directory entry
2615	 */
2616	pd = pmap_pde(pmap, va);
2617
2618	/*
2619	 * This supports switching from a 2MB page to a
2620	 * normal 4K page.
2621	 */
2622	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2623		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2624			/*
2625			 * Invalidation of the 2MB page mapping may have caused
2626			 * the deallocation of the underlying PD page.
2627			 */
2628			pd = NULL;
2629		}
2630	}
2631
2632	/*
2633	 * If the page table page is mapped, we just increment the
2634	 * hold count, and activate it.
2635	 */
2636	if (pd != NULL && (*pd & PG_V) != 0) {
2637		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2638		m->wire_count++;
2639	} else {
2640		/*
2641		 * Here if the pte page isn't mapped, or if it has been
2642		 * deallocated.
2643		 */
2644		m = _pmap_allocpte(pmap, ptepindex, lockp);
2645		if (m == NULL && lockp != NULL)
2646			goto retry;
2647	}
2648	return (m);
2649}
2650
2651
2652/***************************************************
2653 * Pmap allocation/deallocation routines.
2654 ***************************************************/
2655
2656/*
2657 * Release any resources held by the given physical map.
2658 * Called when a pmap initialized by pmap_pinit is being released.
2659 * Should only be called if the map contains no valid mappings.
2660 */
2661void
2662pmap_release(pmap_t pmap)
2663{
2664	vm_page_t m;
2665	int i;
2666
2667	KASSERT(pmap->pm_stats.resident_count == 0,
2668	    ("pmap_release: pmap resident count %ld != 0",
2669	    pmap->pm_stats.resident_count));
2670	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2671	    ("pmap_release: pmap has reserved page table page(s)"));
2672	KASSERT(CPU_EMPTY(&pmap->pm_active),
2673	    ("releasing active pmap %p", pmap));
2674
2675	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2676
2677	for (i = 0; i < NKPML4E; i++)	/* KVA */
2678		pmap->pm_pml4[KPML4BASE + i] = 0;
2679	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2680		pmap->pm_pml4[DMPML4I + i] = 0;
2681	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
2682
2683	m->wire_count--;
2684	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2685	vm_page_free_zero(m);
2686}
2687
2688static int
2689kvm_size(SYSCTL_HANDLER_ARGS)
2690{
2691	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2692
2693	return sysctl_handle_long(oidp, &ksize, 0, req);
2694}
2695SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2696    0, 0, kvm_size, "LU", "Size of KVM");
2697
2698static int
2699kvm_free(SYSCTL_HANDLER_ARGS)
2700{
2701	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2702
2703	return sysctl_handle_long(oidp, &kfree, 0, req);
2704}
2705SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2706    0, 0, kvm_free, "LU", "Amount of KVM free");
2707
2708/*
2709 * grow the number of kernel page table entries, if needed
2710 */
2711void
2712pmap_growkernel(vm_offset_t addr)
2713{
2714	vm_paddr_t paddr;
2715	vm_page_t nkpg;
2716	pd_entry_t *pde, newpdir;
2717	pdp_entry_t *pdpe;
2718
2719	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2720
2721	/*
2722	 * Return if "addr" is within the range of kernel page table pages
2723	 * that were preallocated during pmap bootstrap.  Moreover, leave
2724	 * "kernel_vm_end" and the kernel page table as they were.
2725	 *
2726	 * The correctness of this action is based on the following
2727	 * argument: vm_map_insert() allocates contiguous ranges of the
2728	 * kernel virtual address space.  It calls this function if a range
2729	 * ends after "kernel_vm_end".  If the kernel is mapped between
2730	 * "kernel_vm_end" and "addr", then the range cannot begin at
2731	 * "kernel_vm_end".  In fact, its beginning address cannot be less
2732	 * than the kernel.  Thus, there is no immediate need to allocate
2733	 * any new kernel page table pages between "kernel_vm_end" and
2734	 * "KERNBASE".
2735	 */
2736	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2737		return;
2738
2739	addr = roundup2(addr, NBPDR);
2740	if (addr - 1 >= kernel_map->max_offset)
2741		addr = kernel_map->max_offset;
2742	while (kernel_vm_end < addr) {
2743		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2744		if ((*pdpe & X86_PG_V) == 0) {
2745			/* We need a new PDP entry */
2746			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2747			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2748			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2749			if (nkpg == NULL)
2750				panic("pmap_growkernel: no memory to grow kernel");
2751			if ((nkpg->flags & PG_ZERO) == 0)
2752				pmap_zero_page(nkpg);
2753			paddr = VM_PAGE_TO_PHYS(nkpg);
2754			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2755			    X86_PG_A | X86_PG_M);
2756			continue; /* try again */
2757		}
2758		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2759		if ((*pde & X86_PG_V) != 0) {
2760			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2761			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2762				kernel_vm_end = kernel_map->max_offset;
2763				break;
2764			}
2765			continue;
2766		}
2767
2768		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2769		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2770		    VM_ALLOC_ZERO);
2771		if (nkpg == NULL)
2772			panic("pmap_growkernel: no memory to grow kernel");
2773		if ((nkpg->flags & PG_ZERO) == 0)
2774			pmap_zero_page(nkpg);
2775		paddr = VM_PAGE_TO_PHYS(nkpg);
2776		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2777		pde_store(pde, newpdir);
2778
2779		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2780		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2781			kernel_vm_end = kernel_map->max_offset;
2782			break;
2783		}
2784	}
2785}
2786
2787
2788/***************************************************
2789 * page management routines.
2790 ***************************************************/
2791
2792CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2793CTASSERT(_NPCM == 3);
2794CTASSERT(_NPCPV == 168);
2795
2796static __inline struct pv_chunk *
2797pv_to_chunk(pv_entry_t pv)
2798{
2799
2800	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2801}
2802
2803#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2804
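/*
 * Each pv_chunk provides _NPCPV (168) pv entries, tracked by a free
 * bitmap of _NPCM (3) 64-bit words in which a set bit denotes a free
 * entry.  Words 0 and 1 cover entries 0-127; word 2 covers only
 * entries 128-167, which is why PC_FREE2 has just its low 40 bits set.
 */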
2805#define	PC_FREE0	0xfffffffffffffffful
2806#define	PC_FREE1	0xfffffffffffffffful
2807#define	PC_FREE2	0x000000fffffffffful
2808
2809static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2810
2811#ifdef PV_STATS
2812static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2813
2814SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2815	"Current number of pv entry chunks");
2816SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2817	"Current number of pv entry chunks allocated");
2818SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2819	"Current number of pv entry chunks frees");
2820SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2821	"Number of times tried to get a chunk page but failed.");
2822
2823static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2824static int pv_entry_spare;
2825
2826SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2827	"Current number of pv entry frees");
2828SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2829	"Current number of pv entry allocs");
2830SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2831	"Current number of pv entries");
2832SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2833	"Current number of spare pv entries");
2834#endif
2835
2836/*
2837 * We are in a serious low memory condition.  Resort to
2838 * drastic measures to free some pages so we can allocate
2839 * another pv entry chunk.
2840 *
2841 * Returns NULL if PV entries were reclaimed from the specified pmap.
2842 *
2843 * We do not, however, unmap 2mpages because subsequent accesses will
2844 * allocate per-page pv entries until repromotion occurs, thereby
2845 * exacerbating the shortage of free pv entries.
2846 */
2847static vm_page_t
2848reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2849{
2850	struct pch new_tail;
2851	struct pv_chunk *pc;
2852	struct md_page *pvh;
2853	pd_entry_t *pde;
2854	pmap_t pmap;
2855	pt_entry_t *pte, tpte;
2856	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2857	pv_entry_t pv;
2858	vm_offset_t va;
2859	vm_page_t m, m_pc;
2860	struct spglist free;
2861	uint64_t inuse;
2862	int bit, field, freed;
2863
2864	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2865	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2866	pmap = NULL;
2867	m_pc = NULL;
2868	PG_G = PG_A = PG_M = PG_RW = 0;
2869	SLIST_INIT(&free);
2870	TAILQ_INIT(&new_tail);
2871	pmap_delayed_invl_started();
2872	mtx_lock(&pv_chunks_mutex);
2873	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
2874		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2875		mtx_unlock(&pv_chunks_mutex);
2876		if (pmap != pc->pc_pmap) {
2877			if (pmap != NULL) {
2878				pmap_invalidate_all(pmap);
2879				if (pmap != locked_pmap)
2880					PMAP_UNLOCK(pmap);
2881			}
2882			pmap_delayed_invl_finished();
2883			pmap_delayed_invl_started();
2884			pmap = pc->pc_pmap;
2885			/* Avoid deadlock and lock recursion. */
2886			if (pmap > locked_pmap) {
2887				RELEASE_PV_LIST_LOCK(lockp);
2888				PMAP_LOCK(pmap);
2889			} else if (pmap != locked_pmap &&
2890			    !PMAP_TRYLOCK(pmap)) {
2891				pmap = NULL;
2892				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2893				mtx_lock(&pv_chunks_mutex);
2894				continue;
2895			}
2896			PG_G = pmap_global_bit(pmap);
2897			PG_A = pmap_accessed_bit(pmap);
2898			PG_M = pmap_modified_bit(pmap);
2899			PG_RW = pmap_rw_bit(pmap);
2900		}
2901
2902		/*
2903		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2904		 */
2905		freed = 0;
2906		for (field = 0; field < _NPCM; field++) {
2907			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2908			    inuse != 0; inuse &= ~(1UL << bit)) {
2909				bit = bsfq(inuse);
2910				pv = &pc->pc_pventry[field * 64 + bit];
2911				va = pv->pv_va;
2912				pde = pmap_pde(pmap, va);
2913				if ((*pde & PG_PS) != 0)
2914					continue;
2915				pte = pmap_pde_to_pte(pde, va);
2916				if ((*pte & PG_W) != 0)
2917					continue;
2918				tpte = pte_load_clear(pte);
2919				if ((tpte & PG_G) != 0)
2920					pmap_invalidate_page(pmap, va);
2921				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2922				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2923					vm_page_dirty(m);
2924				if ((tpte & PG_A) != 0)
2925					vm_page_aflag_set(m, PGA_REFERENCED);
2926				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2927				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2928				m->md.pv_gen++;
2929				if (TAILQ_EMPTY(&m->md.pv_list) &&
2930				    (m->flags & PG_FICTITIOUS) == 0) {
2931					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2932					if (TAILQ_EMPTY(&pvh->pv_list)) {
2933						vm_page_aflag_clear(m,
2934						    PGA_WRITEABLE);
2935					}
2936				}
2937				pmap_delayed_invl_page(m);
2938				pc->pc_map[field] |= 1UL << bit;
2939				pmap_unuse_pt(pmap, va, *pde, &free);
2940				freed++;
2941			}
2942		}
2943		if (freed == 0) {
2944			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2945			mtx_lock(&pv_chunks_mutex);
2946			continue;
2947		}
2948		/* Every freed mapping is for a 4 KB page. */
2949		pmap_resident_count_dec(pmap, freed);
2950		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2951		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2952		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2953		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2954		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2955		    pc->pc_map[2] == PC_FREE2) {
2956			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2957			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2958			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2959			/* Entire chunk is free; return it. */
2960			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2961			dump_drop_page(m_pc->phys_addr);
2962			mtx_lock(&pv_chunks_mutex);
2963			break;
2964		}
2965		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2966		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2967		mtx_lock(&pv_chunks_mutex);
2968		/* One freed pv entry in locked_pmap is sufficient. */
2969		if (pmap == locked_pmap)
2970			break;
2971	}
2972	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2973	mtx_unlock(&pv_chunks_mutex);
2974	if (pmap != NULL) {
2975		pmap_invalidate_all(pmap);
2976		if (pmap != locked_pmap)
2977			PMAP_UNLOCK(pmap);
2978	}
2979	pmap_delayed_invl_finished();
2980	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2981		m_pc = SLIST_FIRST(&free);
2982		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2983		/* Recycle a freed page table page. */
2984		m_pc->wire_count = 1;
2985		atomic_add_int(&vm_cnt.v_wire_count, 1);
2986	}
2987	pmap_free_zero_pages(&free);
2988	return (m_pc);
2989}
2990
2991/*
2992 * free the pv_entry back to the free list
2993 */
2994static void
2995free_pv_entry(pmap_t pmap, pv_entry_t pv)
2996{
2997	struct pv_chunk *pc;
2998	int idx, field, bit;
2999
3000	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3001	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3002	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3003	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3004	pc = pv_to_chunk(pv);
3005	idx = pv - &pc->pc_pventry[0];
3006	field = idx / 64;
3007	bit = idx % 64;
3008	pc->pc_map[field] |= 1ul << bit;
3009	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
3010	    pc->pc_map[2] != PC_FREE2) {
3011		/* 98% of the time, pc is already at the head of the list. */
3012		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3013			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3014			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3015		}
3016		return;
3017	}
3018	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3019	free_pv_chunk(pc);
3020}
3021
3022static void
3023free_pv_chunk(struct pv_chunk *pc)
3024{
3025	vm_page_t m;
3026
3027	mtx_lock(&pv_chunks_mutex);
3028	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3029	mtx_unlock(&pv_chunks_mutex);
3030	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3031	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3032	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3033	/* entire chunk is free, return it */
3034	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3035	dump_drop_page(m->phys_addr);
3036	vm_page_unwire(m, PQ_NONE);
3037	vm_page_free(m);
3038}
3039
3040/*
3041 * Returns a new PV entry, allocating a new PV chunk from the system when
3042 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3043 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3044 * returned.
3045 *
3046 * The given PV list lock may be released.
3047 */
3048static pv_entry_t
3049get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3050{
3051	int bit, field;
3052	pv_entry_t pv;
3053	struct pv_chunk *pc;
3054	vm_page_t m;
3055
3056	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3057	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3058retry:
3059	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3060	if (pc != NULL) {
3061		for (field = 0; field < _NPCM; field++) {
3062			if (pc->pc_map[field]) {
3063				bit = bsfq(pc->pc_map[field]);
3064				break;
3065			}
3066		}
3067		if (field < _NPCM) {
3068			pv = &pc->pc_pventry[field * 64 + bit];
3069			pc->pc_map[field] &= ~(1ul << bit);
3070			/* If this was the last item, move it to tail */
3071			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
3072			    pc->pc_map[2] == 0) {
3073				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3074				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3075				    pc_list);
3076			}
3077			PV_STAT(atomic_add_long(&pv_entry_count, 1));
3078			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3079			return (pv);
3080		}
3081	}
3082	/* No free items, allocate another chunk */
3083	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3084	    VM_ALLOC_WIRED);
3085	if (m == NULL) {
3086		if (lockp == NULL) {
3087			PV_STAT(pc_chunk_tryfail++);
3088			return (NULL);
3089		}
3090		m = reclaim_pv_chunk(pmap, lockp);
3091		if (m == NULL)
3092			goto retry;
3093	}
3094	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3095	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3096	dump_add_page(m->phys_addr);
3097	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3098	pc->pc_pmap = pmap;
3099	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
3100	pc->pc_map[1] = PC_FREE1;
3101	pc->pc_map[2] = PC_FREE2;
3102	mtx_lock(&pv_chunks_mutex);
3103	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3104	mtx_unlock(&pv_chunks_mutex);
3105	pv = &pc->pc_pventry[0];
3106	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3107	PV_STAT(atomic_add_long(&pv_entry_count, 1));
3108	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3109	return (pv);
3110}
3111
3112/*
3113 * Returns the number of one bits within the given PV chunk map.
3114 *
3115 * The errata for Intel processors state that "POPCNT Instruction May
3116 * Take Longer to Execute Than Expected".  It is believed that the
3117 * issue is the spurious dependency on the destination register.
3118 * Provide a hint to the register rename logic that the destination
3119 * value is overwritten, by clearing it, as suggested in the
3120 * optimization manual.  It should be cheap for unaffected processors
3121 * as well.
3122 *
3123 * Reference numbers for the errata are
3124 * 4th Gen Core: HSD146
3125 * 5th Gen Core: BDM85
3126 * 6th Gen Core: SKL029
3127 */
3128static int
3129popcnt_pc_map_pq(uint64_t *map)
3130{
3131	u_long result, tmp;
3132
3133	__asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
3134	    "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
3135	    "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
3136	    : "=&r" (result), "=&r" (tmp)
3137	    : "m" (map[0]), "m" (map[1]), "m" (map[2]));
3138	return (result);
3139}
3140
3141/*
3142 * Ensure that the number of spare PV entries in the specified pmap meets or
3143 * exceeds the given count, "needed".
3144 *
3145 * The given PV list lock may be released.
3146 */
3147static void
3148reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3149{
3150	struct pch new_tail;
3151	struct pv_chunk *pc;
3152	int avail, free;
3153	vm_page_t m;
3154
3155	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3156	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3157
3158	/*
3159	 * Newly allocated PV chunks must be stored in a private list until
3160	 * the required number of PV chunks have been allocated.  Otherwise,
3161	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3162	 * contrast, these chunks must be added to the pmap upon allocation.
3163	 */
3164	TAILQ_INIT(&new_tail);
3165retry:
3166	avail = 0;
3167	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3168#ifndef __POPCNT__
3169		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
3170			bit_count((bitstr_t *)pc->pc_map, 0,
3171			    sizeof(pc->pc_map) * NBBY, &free);
3172		else
3173#endif
3174		free = popcnt_pc_map_pq(pc->pc_map);
3175		if (free == 0)
3176			break;
3177		avail += free;
3178		if (avail >= needed)
3179			break;
3180	}
3181	for (; avail < needed; avail += _NPCPV) {
3182		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3183		    VM_ALLOC_WIRED);
3184		if (m == NULL) {
3185			m = reclaim_pv_chunk(pmap, lockp);
3186			if (m == NULL)
3187				goto retry;
3188		}
3189		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3190		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3191		dump_add_page(m->phys_addr);
3192		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3193		pc->pc_pmap = pmap;
3194		pc->pc_map[0] = PC_FREE0;
3195		pc->pc_map[1] = PC_FREE1;
3196		pc->pc_map[2] = PC_FREE2;
3197		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3198		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3199		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3200	}
3201	if (!TAILQ_EMPTY(&new_tail)) {
3202		mtx_lock(&pv_chunks_mutex);
3203		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3204		mtx_unlock(&pv_chunks_mutex);
3205	}
3206}
3207
3208/*
3209 * First find and then remove the pv entry for the specified pmap and virtual
3210 * address from the specified pv list.  Returns the pv entry if found and NULL
3211 * otherwise.  This operation can be performed on pv lists for either 4KB or
3212 * 2MB page mappings.
3213 */
3214static __inline pv_entry_t
3215pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3216{
3217	pv_entry_t pv;
3218
3219	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3220		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3221			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3222			pvh->pv_gen++;
3223			break;
3224		}
3225	}
3226	return (pv);
3227}
3228
3229/*
3230 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3231 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3232 * entries for each of the 4KB page mappings.
3233 */
3234static void
3235pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3236    struct rwlock **lockp)
3237{
3238	struct md_page *pvh;
3239	struct pv_chunk *pc;
3240	pv_entry_t pv;
3241	vm_offset_t va_last;
3242	vm_page_t m;
3243	int bit, field;
3244
3245	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3246	KASSERT((pa & PDRMASK) == 0,
3247	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3248	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3249
3250	/*
3251	 * Transfer the 2mpage's pv entry for this mapping to the first
3252	 * page's pv list.  Once this transfer begins, the pv list lock
3253	 * must not be released until the last pv entry is reinstantiated.
3254	 */
3255	pvh = pa_to_pvh(pa);
3256	va = trunc_2mpage(va);
3257	pv = pmap_pvh_remove(pvh, pmap, va);
3258	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3259	m = PHYS_TO_VM_PAGE(pa);
3260	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3261	m->md.pv_gen++;
3262	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3263	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3264	va_last = va + NBPDR - PAGE_SIZE;
3265	for (;;) {
3266		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3267		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3268		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3269		for (field = 0; field < _NPCM; field++) {
3270			while (pc->pc_map[field]) {
3271				bit = bsfq(pc->pc_map[field]);
3272				pc->pc_map[field] &= ~(1ul << bit);
3273				pv = &pc->pc_pventry[field * 64 + bit];
3274				va += PAGE_SIZE;
3275				pv->pv_va = va;
3276				m++;
3277				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3278			    ("pmap_pv_demote_pde: page %p is not managed", m));
3279				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3280				m->md.pv_gen++;
3281				if (va == va_last)
3282					goto out;
3283			}
3284		}
3285		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3286		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3287	}
3288out:
3289	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3290		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3291		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3292	}
3293	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3294	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3295}
3296
3297/*
3298 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3299 * replace the many pv entries for the 4KB page mappings by a single pv entry
3300 * for the 2MB page mapping.
3301 */
3302static void
3303pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3304    struct rwlock **lockp)
3305{
3306	struct md_page *pvh;
3307	pv_entry_t pv;
3308	vm_offset_t va_last;
3309	vm_page_t m;
3310
3311	KASSERT((pa & PDRMASK) == 0,
3312	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3313	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3314
3315	/*
3316	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3317	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3318	 * a transfer avoids the possibility that get_pv_entry() calls
3319	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3320	 * mappings that is being promoted.
3321	 */
3322	m = PHYS_TO_VM_PAGE(pa);
3323	va = trunc_2mpage(va);
3324	pv = pmap_pvh_remove(&m->md, pmap, va);
3325	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3326	pvh = pa_to_pvh(pa);
3327	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3328	pvh->pv_gen++;
3329	/* Free the remaining NPTEPG - 1 pv entries. */
3330	va_last = va + NBPDR - PAGE_SIZE;
3331	do {
3332		m++;
3333		va += PAGE_SIZE;
3334		pmap_pvh_free(&m->md, pmap, va);
3335	} while (va < va_last);
3336}
3337
3338/*
3339 * First find and then destroy the pv entry for the specified pmap and virtual
3340 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3341 * page mappings.
3342 */
3343static void
3344pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3345{
3346	pv_entry_t pv;
3347
3348	pv = pmap_pvh_remove(pvh, pmap, va);
3349	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3350	free_pv_entry(pmap, pv);
3351}
3352
3353/*
3354 * Conditionally create the PV entry for a 4KB page mapping if the required
3355 * memory can be allocated without resorting to reclamation.
3356 */
3357static boolean_t
3358pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3359    struct rwlock **lockp)
3360{
3361	pv_entry_t pv;
3362
3363	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3364	/* Pass NULL instead of the lock pointer to disable reclamation. */
3365	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3366		pv->pv_va = va;
3367		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3368		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3369		m->md.pv_gen++;
3370		return (TRUE);
3371	} else
3372		return (FALSE);
3373}
3374
3375/*
3376 * Conditionally create the PV entry for a 2MB page mapping if the required
3377 * memory can be allocated without resorting to reclamation.
3378 */
3379static boolean_t
3380pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3381    struct rwlock **lockp)
3382{
3383	struct md_page *pvh;
3384	pv_entry_t pv;
3385
3386	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3387	/* Pass NULL instead of the lock pointer to disable reclamation. */
3388	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3389		pv->pv_va = va;
3390		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3391		pvh = pa_to_pvh(pa);
3392		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3393		pvh->pv_gen++;
3394		return (TRUE);
3395	} else
3396		return (FALSE);
3397}
3398
3399/*
3400 * Fills a page table page with mappings to consecutive physical pages.
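 * All NPTEPG (512) entries are written, with the physical address
 * advancing by PAGE_SIZE for each, so the resulting 4KB mappings
 * collectively cover the same 2MB of physical memory as the superpage
 * being demoted.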
3401 */
3402static void
3403pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3404{
3405	pt_entry_t *pte;
3406
3407	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3408		*pte = newpte;
3409		newpte += PAGE_SIZE;
3410	}
3411}
3412
3413/*
3414 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3415 * mapping is invalidated.
3416 */
3417static boolean_t
3418pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3419{
3420	struct rwlock *lock;
3421	boolean_t rv;
3422
3423	lock = NULL;
3424	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3425	if (lock != NULL)
3426		rw_wunlock(lock);
3427	return (rv);
3428}
3429
3430static boolean_t
3431pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3432    struct rwlock **lockp)
3433{
3434	pd_entry_t newpde, oldpde;
3435	pt_entry_t *firstpte, newpte;
3436	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3437	vm_paddr_t mptepa;
3438	vm_page_t mpte;
3439	struct spglist free;
3440	int PG_PTE_CACHE;
3441
3442	PG_G = pmap_global_bit(pmap);
3443	PG_A = pmap_accessed_bit(pmap);
3444	PG_M = pmap_modified_bit(pmap);
3445	PG_RW = pmap_rw_bit(pmap);
3446	PG_V = pmap_valid_bit(pmap);
3447	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3448
3449	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3450	oldpde = *pde;
3451	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3452	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3453	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
3454	    NULL)
3455		pmap_remove_pt_page(pmap, mpte);
3456	else {
3457		KASSERT((oldpde & PG_W) == 0,
3458		    ("pmap_demote_pde: page table page for a wired mapping"
3459		    " is missing"));
3460
3461		/*
3462		 * Invalidate the 2MB page mapping and return "failure" if the
3463		 * mapping was never accessed or the allocation of the new
3464		 * page table page fails.  If the 2MB page mapping belongs to
3465		 * the direct map region of the kernel's address space, then
3466		 * the page allocation request specifies the highest possible
3467		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3468		 * normal.  Page table pages are preallocated for every other
3469		 * part of the kernel address space, so the direct map region
3470		 * is the only part of the kernel address space that must be
3471		 * handled here.
3472		 */
3473		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3474		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3475		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3476		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3477			SLIST_INIT(&free);
3478			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
3479			    lockp);
3480			pmap_invalidate_page(pmap, trunc_2mpage(va));
3481			pmap_free_zero_pages(&free);
3482			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3483			    " in pmap %p", va, pmap);
3484			return (FALSE);
3485		}
3486		if (va < VM_MAXUSER_ADDRESS)
3487			pmap_resident_count_inc(pmap, 1);
3488	}
3489	mptepa = VM_PAGE_TO_PHYS(mpte);
3490	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3491	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3492	KASSERT((oldpde & PG_A) != 0,
3493	    ("pmap_demote_pde: oldpde is missing PG_A"));
3494	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3495	    ("pmap_demote_pde: oldpde is missing PG_M"));
3496	newpte = oldpde & ~PG_PS;
3497	newpte = pmap_swap_pat(pmap, newpte);
3498
3499	/*
3500	 * If the page table page is new, initialize it.
3501	 */
3502	if (mpte->wire_count == 1) {
3503		mpte->wire_count = NPTEPG;
3504		pmap_fill_ptp(firstpte, newpte);
3505	}
3506	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3507	    ("pmap_demote_pde: firstpte and newpte map different physical"
3508	    " addresses"));
3509
3510	/*
3511	 * If the mapping has changed attributes, update the page table
3512	 * entries.
3513	 */
3514	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3515		pmap_fill_ptp(firstpte, newpte);
3516
3517	/*
3518	 * The spare PV entries must be reserved prior to demoting the
3519	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
3520	 * of the PDE and the PV lists will be inconsistent, which can result
3521	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3522	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3523	 * PV entry for the 2MB page mapping that is being demoted.
3524	 */
3525	if ((oldpde & PG_MANAGED) != 0)
3526		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3527
3528	/*
3529	 * Demote the mapping.  This pmap is locked.  The old PDE has
3530	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3531	 * set.  Thus, there is no danger of a race with another
3532	 * processor changing the setting of PG_A and/or PG_M between
3533	 * the read above and the store below.
3534	 */
3535	if (workaround_erratum383)
3536		pmap_update_pde(pmap, va, pde, newpde);
3537	else
3538		pde_store(pde, newpde);
3539
3540	/*
3541	 * Invalidate a stale recursive mapping of the page table page.
3542	 */
3543	if (va >= VM_MAXUSER_ADDRESS)
3544		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3545
3546	/*
3547	 * Demote the PV entry.
3548	 */
3549	if ((oldpde & PG_MANAGED) != 0)
3550		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3551
3552	atomic_add_long(&pmap_pde_demotions, 1);
3553	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3554	    " in pmap %p", va, pmap);
3555	return (TRUE);
3556}
3557
3558/*
3559 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3560 */
3561static void
3562pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3563{
3564	pd_entry_t newpde;
3565	vm_paddr_t mptepa;
3566	vm_page_t mpte;
3567
3568	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3569	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3570	mpte = pmap_lookup_pt_page(pmap, va);
3571	if (mpte == NULL)
3572		panic("pmap_remove_kernel_pde: Missing pt page.");
3573
3574	pmap_remove_pt_page(pmap, mpte);
3575	mptepa = VM_PAGE_TO_PHYS(mpte);
3576	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3577
3578	/*
3579	 * Initialize the page table page.
3580	 */
3581	pagezero((void *)PHYS_TO_DMAP(mptepa));
3582
3583	/*
3584	 * Demote the mapping.
3585	 */
3586	if (workaround_erratum383)
3587		pmap_update_pde(pmap, va, pde, newpde);
3588	else
3589		pde_store(pde, newpde);
3590
3591	/*
3592	 * Invalidate a stale recursive mapping of the page table page.
3593	 */
3594	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3595}
3596
3597/*
3598 * pmap_remove_pde: unmap a 2MB superpage from a process address space
3599 */
3600static int
3601pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3602    struct spglist *free, struct rwlock **lockp)
3603{
3604	struct md_page *pvh;
3605	pd_entry_t oldpde;
3606	vm_offset_t eva, va;
3607	vm_page_t m, mpte;
3608	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3609
3610	PG_G = pmap_global_bit(pmap);
3611	PG_A = pmap_accessed_bit(pmap);
3612	PG_M = pmap_modified_bit(pmap);
3613	PG_RW = pmap_rw_bit(pmap);
3614
3615	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3616	KASSERT((sva & PDRMASK) == 0,
3617	    ("pmap_remove_pde: sva is not 2mpage aligned"));
3618	oldpde = pte_load_clear(pdq);
3619	if (oldpde & PG_W)
3620		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3621
3622	/*
3623	 * Machines that don't support invlpg also don't
3624	 * support PG_G.
3625	 */
3626	if (oldpde & PG_G)
3627		pmap_invalidate_page(kernel_pmap, sva);
3628	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3629	if (oldpde & PG_MANAGED) {
3630		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3631		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3632		pmap_pvh_free(pvh, pmap, sva);
3633		eva = sva + NBPDR;
3634		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3635		    va < eva; va += PAGE_SIZE, m++) {
3636			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3637				vm_page_dirty(m);
3638			if (oldpde & PG_A)
3639				vm_page_aflag_set(m, PGA_REFERENCED);
3640			if (TAILQ_EMPTY(&m->md.pv_list) &&
3641			    TAILQ_EMPTY(&pvh->pv_list))
3642				vm_page_aflag_clear(m, PGA_WRITEABLE);
3643			pmap_delayed_invl_page(m);
3644		}
3645	}
3646	if (pmap == kernel_pmap) {
3647		pmap_remove_kernel_pde(pmap, pdq, sva);
3648	} else {
3649		mpte = pmap_lookup_pt_page(pmap, sva);
3650		if (mpte != NULL) {
3651			pmap_remove_pt_page(pmap, mpte);
3652			pmap_resident_count_dec(pmap, 1);
3653			KASSERT(mpte->wire_count == NPTEPG,
3654			    ("pmap_remove_pde: pte page wire count error"));
3655			mpte->wire_count = 0;
3656			pmap_add_delayed_free_list(mpte, free, FALSE);
3657			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
3658		}
3659	}
3660	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3661}
3662
3663/*
3664 * pmap_remove_pte: Unmap a single 4KB page mapping in a process's pmap.
3665 */
3666static int
3667pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3668    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3669{
3670	struct md_page *pvh;
3671	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3672	vm_page_t m;
3673
3674	PG_A = pmap_accessed_bit(pmap);
3675	PG_M = pmap_modified_bit(pmap);
3676	PG_RW = pmap_rw_bit(pmap);
3677
3678	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3679	oldpte = pte_load_clear(ptq);
3680	if (oldpte & PG_W)
3681		pmap->pm_stats.wired_count -= 1;
3682	pmap_resident_count_dec(pmap, 1);
3683	if (oldpte & PG_MANAGED) {
3684		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3685		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3686			vm_page_dirty(m);
3687		if (oldpte & PG_A)
3688			vm_page_aflag_set(m, PGA_REFERENCED);
3689		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3690		pmap_pvh_free(&m->md, pmap, va);
3691		if (TAILQ_EMPTY(&m->md.pv_list) &&
3692		    (m->flags & PG_FICTITIOUS) == 0) {
3693			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3694			if (TAILQ_EMPTY(&pvh->pv_list))
3695				vm_page_aflag_clear(m, PGA_WRITEABLE);
3696		}
3697		pmap_delayed_invl_page(m);
3698	}
3699	return (pmap_unuse_pt(pmap, va, ptepde, free));
3700}
3701
3702/*
3703 * Remove a single page from a process address space
3704 */
3705static void
3706pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3707    struct spglist *free)
3708{
3709	struct rwlock *lock;
3710	pt_entry_t *pte, PG_V;
3711
3712	PG_V = pmap_valid_bit(pmap);
3713	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3714	if ((*pde & PG_V) == 0)
3715		return;
3716	pte = pmap_pde_to_pte(pde, va);
3717	if ((*pte & PG_V) == 0)
3718		return;
3719	lock = NULL;
3720	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3721	if (lock != NULL)
3722		rw_wunlock(lock);
3723	pmap_invalidate_page(pmap, va);
3724}
3725
3726/*
3727 *	Remove the given range of addresses from the specified map.
3728 *
3729 *	It is assumed that the start and end are properly
3730 *	rounded to the page size.
3731 */
3732void
3733pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3734{
3735	struct rwlock *lock;
3736	vm_offset_t va, va_next;
3737	pml4_entry_t *pml4e;
3738	pdp_entry_t *pdpe;
3739	pd_entry_t ptpaddr, *pde;
3740	pt_entry_t *pte, PG_G, PG_V;
3741	struct spglist free;
3742	int anyvalid;
3743
3744	PG_G = pmap_global_bit(pmap);
3745	PG_V = pmap_valid_bit(pmap);
3746
3747	/*
3748	 * An unsynchronized read is safe here: zero means nothing to remove.
3749	 */
3750	if (pmap->pm_stats.resident_count == 0)
3751		return;
3752
3753	anyvalid = 0;
3754	SLIST_INIT(&free);
3755
3756	pmap_delayed_invl_started();
3757	PMAP_LOCK(pmap);
3758
3759	/*
3760	 * Special handling for removing a single page: it is a very
3761	 * common operation, and we can easily short-circuit some of
3762	 * the code below.
3763	 */
3764	if (sva + PAGE_SIZE == eva) {
3765		pde = pmap_pde(pmap, sva);
3766		if (pde && (*pde & PG_PS) == 0) {
3767			pmap_remove_page(pmap, sva, pde, &free);
3768			goto out;
3769		}
3770	}
3771
3772	lock = NULL;
3773	for (; sva < eva; sva = va_next) {
3774
3775		if (pmap->pm_stats.resident_count == 0)
3776			break;
3777
3778		pml4e = pmap_pml4e(pmap, sva);
3779		if ((*pml4e & PG_V) == 0) {
3780			va_next = (sva + NBPML4) & ~PML4MASK;
3781			if (va_next < sva)
3782				va_next = eva;
3783			continue;
3784		}
3785
3786		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3787		if ((*pdpe & PG_V) == 0) {
3788			va_next = (sva + NBPDP) & ~PDPMASK;
3789			if (va_next < sva)
3790				va_next = eva;
3791			continue;
3792		}
3793
3794		/*
3795		 * Calculate index for next page table.
3796		 */
3797		va_next = (sva + NBPDR) & ~PDRMASK;
3798		if (va_next < sva)
3799			va_next = eva;
3800
3801		pde = pmap_pdpe_to_pde(pdpe, sva);
3802		ptpaddr = *pde;
3803
3804		/*
3805		 * Weed out invalid mappings.
3806		 */
3807		if (ptpaddr == 0)
3808			continue;
3809
3810		/*
3811		 * Check for large page.
3812		 */
3813		if ((ptpaddr & PG_PS) != 0) {
3814			/*
3815			 * Are we removing the entire large page?  If not,
3816			 * demote the mapping and fall through.
3817			 */
3818			if (sva + NBPDR == va_next && eva >= va_next) {
3819				/*
3820				 * The TLB entry for a PG_G mapping is
3821				 * invalidated by pmap_remove_pde().
3822				 */
3823				if ((ptpaddr & PG_G) == 0)
3824					anyvalid = 1;
3825				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3826				continue;
3827			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3828			    &lock)) {
3829				/* The large page mapping was destroyed. */
3830				continue;
3831			} else
3832				ptpaddr = *pde;
3833		}
3834
3835		/*
3836		 * Limit our scan to either the end of the va represented
3837		 * by the current page table page, or to the end of the
3838		 * range being removed.
3839		 */
3840		if (va_next > eva)
3841			va_next = eva;
3842
3843		va = va_next;
3844		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3845		    sva += PAGE_SIZE) {
3846			if (*pte == 0) {
3847				if (va != va_next) {
3848					pmap_invalidate_range(pmap, va, sva);
3849					va = va_next;
3850				}
3851				continue;
3852			}
3853			if ((*pte & PG_G) == 0)
3854				anyvalid = 1;
3855			else if (va == va_next)
3856				va = sva;
3857			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3858			    &lock)) {
3859				sva += PAGE_SIZE;
3860				break;
3861			}
3862		}
3863		if (va != va_next)
3864			pmap_invalidate_range(pmap, va, sva);
3865	}
3866	if (lock != NULL)
3867		rw_wunlock(lock);
3868out:
3869	if (anyvalid)
3870		pmap_invalidate_all(pmap);
3871	PMAP_UNLOCK(pmap);
3872	pmap_delayed_invl_finished();
3873	pmap_free_zero_pages(&free);
3874}
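
/*
 * Illustrative sketch (hypothetical helper, never compiled): the boundary
 * arithmetic used by the removal loop above.  "sva" is rounded up to the
 * next 2MB (NBPDR) boundary above it; because the unsigned addition can
 * wrap at the top of the address space, the result is clamped to "eva".
 */
#if 0
static vm_offset_t
next_2m_boundary_sketch(vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t va_next;

	va_next = (sva + NBPDR) & ~PDRMASK;
	if (va_next < sva)
		va_next = eva;
	return (va_next);
}
#endif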
3875
3876/*
3877 *	Routine:	pmap_remove_all
3878 *	Function:
3879 *		Removes this physical page from
3880 *		all physical maps in which it resides.
3881 *		Reflects back modify bits to the pager.
3882 *
3883 *	Notes:
3884 *		Original versions of this routine were very
3885 *		inefficient because they iteratively called
3886 *		pmap_remove (slow...)
3887 */
3888
3889void
3890pmap_remove_all(vm_page_t m)
3891{
3892	struct md_page *pvh;
3893	pv_entry_t pv;
3894	pmap_t pmap;
3895	struct rwlock *lock;
3896	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
3897	pd_entry_t *pde;
3898	vm_offset_t va;
3899	struct spglist free;
3900	int pvh_gen, md_gen;
3901
3902	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3903	    ("pmap_remove_all: page %p is not managed", m));
3904	SLIST_INIT(&free);
3905	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3906	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
3907	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
3908retry:
3909	rw_wlock(lock);
3910	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3911		pmap = PV_PMAP(pv);
3912		if (!PMAP_TRYLOCK(pmap)) {
3913			pvh_gen = pvh->pv_gen;
3914			rw_wunlock(lock);
3915			PMAP_LOCK(pmap);
3916			rw_wlock(lock);
3917			if (pvh_gen != pvh->pv_gen) {
3918				rw_wunlock(lock);
3919				PMAP_UNLOCK(pmap);
3920				goto retry;
3921			}
3922		}
3923		va = pv->pv_va;
3924		pde = pmap_pde(pmap, va);
3925		(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
3926		PMAP_UNLOCK(pmap);
3927	}
3928	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3929		pmap = PV_PMAP(pv);
3930		if (!PMAP_TRYLOCK(pmap)) {
3931			pvh_gen = pvh->pv_gen;
3932			md_gen = m->md.pv_gen;
3933			rw_wunlock(lock);
3934			PMAP_LOCK(pmap);
3935			rw_wlock(lock);
3936			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3937				rw_wunlock(lock);
3938				PMAP_UNLOCK(pmap);
3939				goto retry;
3940			}
3941		}
3942		PG_A = pmap_accessed_bit(pmap);
3943		PG_M = pmap_modified_bit(pmap);
3944		PG_RW = pmap_rw_bit(pmap);
3945		pmap_resident_count_dec(pmap, 1);
3946		pde = pmap_pde(pmap, pv->pv_va);
3947		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3948		    " a 2mpage in page %p's pv list", m));
3949		pte = pmap_pde_to_pte(pde, pv->pv_va);
3950		tpte = pte_load_clear(pte);
3951		if (tpte & PG_W)
3952			pmap->pm_stats.wired_count--;
3953		if (tpte & PG_A)
3954			vm_page_aflag_set(m, PGA_REFERENCED);
3955
3956		/*
3957		 * Update the vm_page_t clean and reference bits.
3958		 */
3959		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3960			vm_page_dirty(m);
3961		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3962		pmap_invalidate_page(pmap, pv->pv_va);
3963		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3964		m->md.pv_gen++;
3965		free_pv_entry(pmap, pv);
3966		PMAP_UNLOCK(pmap);
3967	}
3968	vm_page_aflag_clear(m, PGA_WRITEABLE);
3969	rw_wunlock(lock);
3970	pmap_delayed_invl_wait(m);
3971	pmap_free_zero_pages(&free);
3972}
3973
3974/*
3975 * pmap_protect_pde: Apply the given protection to a 2MB (superpage) mapping.
3976 */
3977static boolean_t
3978pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3979{
3980	pd_entry_t newpde, oldpde;
3981	vm_offset_t eva, va;
3982	vm_page_t m;
3983	boolean_t anychanged;
3984	pt_entry_t PG_G, PG_M, PG_RW;
3985
3986	PG_G = pmap_global_bit(pmap);
3987	PG_M = pmap_modified_bit(pmap);
3988	PG_RW = pmap_rw_bit(pmap);
3989
3990	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3991	KASSERT((sva & PDRMASK) == 0,
3992	    ("pmap_protect_pde: sva is not 2mpage aligned"));
3993	anychanged = FALSE;
3994retry:
3995	oldpde = newpde = *pde;
3996	if (oldpde & PG_MANAGED) {
3997		eva = sva + NBPDR;
3998		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3999		    va < eva; va += PAGE_SIZE, m++)
4000			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
4001				vm_page_dirty(m);
4002	}
4003	if ((prot & VM_PROT_WRITE) == 0)
4004		newpde &= ~(PG_RW | PG_M);
4005	if ((prot & VM_PROT_EXECUTE) == 0)
4006		newpde |= pg_nx;
4007	if (newpde != oldpde) {
4008		if (!atomic_cmpset_long(pde, oldpde, newpde))
4009			goto retry;
4010		if (oldpde & PG_G)
4011			pmap_invalidate_page(pmap, sva);
4012		else
4013			anychanged = TRUE;
4014	}
4015	return (anychanged);
4016}
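
/*
 * Illustrative sketch (hypothetical helper, never compiled): the
 * compare-and-swap retry idiom used by pmap_protect_pde() above and
 * pmap_protect() below.  The entry is re-read and the update retried
 * until atomic_cmpset_long() succeeds, so a concurrent hardware update
 * of PG_A or PG_M is never lost.
 */
#if 0
static void
clear_rw_sketch(pt_entry_t *pte)
{
	pt_entry_t obits, pbits;

	do {
		obits = pbits = *pte;
		pbits &= ~(X86_PG_RW | X86_PG_M);
	} while (pbits != obits && !atomic_cmpset_long(pte, obits, pbits));
}
#endif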
4017
4018/*
4019 *	Set the physical protection on the
4020 *	specified range of this map as requested.
4021 */
4022void
4023pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4024{
4025	vm_offset_t va_next;
4026	pml4_entry_t *pml4e;
4027	pdp_entry_t *pdpe;
4028	pd_entry_t ptpaddr, *pde;
4029	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
4030	boolean_t anychanged;
4031
4032	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4033	if (prot == VM_PROT_NONE) {
4034		pmap_remove(pmap, sva, eva);
4035		return;
4036	}
4037
4038	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4039	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
4040		return;
4041
4042	PG_G = pmap_global_bit(pmap);
4043	PG_M = pmap_modified_bit(pmap);
4044	PG_V = pmap_valid_bit(pmap);
4045	PG_RW = pmap_rw_bit(pmap);
4046	anychanged = FALSE;
4047
4048	PMAP_LOCK(pmap);
4049	for (; sva < eva; sva = va_next) {
4050
4051		pml4e = pmap_pml4e(pmap, sva);
4052		if ((*pml4e & PG_V) == 0) {
4053			va_next = (sva + NBPML4) & ~PML4MASK;
4054			if (va_next < sva)
4055				va_next = eva;
4056			continue;
4057		}
4058
4059		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4060		if ((*pdpe & PG_V) == 0) {
4061			va_next = (sva + NBPDP) & ~PDPMASK;
4062			if (va_next < sva)
4063				va_next = eva;
4064			continue;
4065		}
4066
4067		va_next = (sva + NBPDR) & ~PDRMASK;
4068		if (va_next < sva)
4069			va_next = eva;
4070
4071		pde = pmap_pdpe_to_pde(pdpe, sva);
4072		ptpaddr = *pde;
4073
4074		/*
4075		 * Weed out invalid mappings.
4076		 */
4077		if (ptpaddr == 0)
4078			continue;
4079
4080		/*
4081		 * Check for large page.
4082		 */
4083		if ((ptpaddr & PG_PS) != 0) {
4084			/*
4085			 * Are we protecting the entire large page?  If not,
4086			 * demote the mapping and fall through.
4087			 */
4088			if (sva + NBPDR == va_next && eva >= va_next) {
4089				/*
4090				 * The TLB entry for a PG_G mapping is
4091				 * invalidated by pmap_protect_pde().
4092				 */
4093				if (pmap_protect_pde(pmap, pde, sva, prot))
4094					anychanged = TRUE;
4095				continue;
4096			} else if (!pmap_demote_pde(pmap, pde, sva)) {
4097				/*
4098				 * The large page mapping was destroyed.
4099				 */
4100				continue;
4101			}
4102		}
4103
4104		if (va_next > eva)
4105			va_next = eva;
4106
4107		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4108		    sva += PAGE_SIZE) {
4109			pt_entry_t obits, pbits;
4110			vm_page_t m;
4111
4112retry:
4113			obits = pbits = *pte;
4114			if ((pbits & PG_V) == 0)
4115				continue;
4116
4117			if ((prot & VM_PROT_WRITE) == 0) {
4118				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4119				    (PG_MANAGED | PG_M | PG_RW)) {
4120					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4121					vm_page_dirty(m);
4122				}
4123				pbits &= ~(PG_RW | PG_M);
4124			}
4125			if ((prot & VM_PROT_EXECUTE) == 0)
4126				pbits |= pg_nx;
4127
4128			if (pbits != obits) {
4129				if (!atomic_cmpset_long(pte, obits, pbits))
4130					goto retry;
4131				if (obits & PG_G)
4132					pmap_invalidate_page(pmap, sva);
4133				else
4134					anychanged = TRUE;
4135			}
4136		}
4137	}
4138	if (anychanged)
4139		pmap_invalidate_all(pmap);
4140	PMAP_UNLOCK(pmap);
4141}
4142
4143/*
4144 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4145 * single page table page (PTP) to a single 2MB page mapping.  For promotion
4146 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4147 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4148 * identical characteristics.
4149 */
4150static void
4151pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4152    struct rwlock **lockp)
4153{
4154	pd_entry_t newpde;
4155	pt_entry_t *firstpte, oldpte, pa, *pte;
4156	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
4157	vm_page_t mpte;
4158	int PG_PTE_CACHE;
4159
4160	PG_A = pmap_accessed_bit(pmap);
4161	PG_G = pmap_global_bit(pmap);
4162	PG_M = pmap_modified_bit(pmap);
4163	PG_V = pmap_valid_bit(pmap);
4164	PG_RW = pmap_rw_bit(pmap);
4165	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4166
4167	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4168
4169	/*
4170	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
4171	 * either invalid, unused, or does not map the first 4KB physical page
4172	 * within a 2MB page.
4173	 */
4174	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4175setpde:
4176	newpde = *firstpte;
4177	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4178		atomic_add_long(&pmap_pde_p_failures, 1);
4179		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4180		    " in pmap %p", va, pmap);
4181		return;
4182	}
4183	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4184		/*
4185		 * When PG_M is already clear, PG_RW can be cleared without
4186		 * a TLB invalidation.
4187		 */
4188		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4189			goto setpde;
4190		newpde &= ~PG_RW;
4191	}
4192
4193	/*
4194	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4195	 * PTE maps an unexpected 4KB physical page or does not have identical
4196	 * characteristics to the first PTE.
4197	 */
4198	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4199	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4200setpte:
4201		oldpte = *pte;
4202		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4203			atomic_add_long(&pmap_pde_p_failures, 1);
4204			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4205			    " in pmap %p", va, pmap);
4206			return;
4207		}
4208		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4209			/*
4210			 * When PG_M is already clear, PG_RW can be cleared
4211			 * without a TLB invalidation.
4212			 */
4213			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4214				goto setpte;
4215			oldpte &= ~PG_RW;
4216			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4217			    " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
4218			    (va & ~PDRMASK), pmap);
4219		}
4220		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4221			atomic_add_long(&pmap_pde_p_failures, 1);
4222			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4223			    " in pmap %p", va, pmap);
4224			return;
4225		}
4226		pa -= PAGE_SIZE;
4227	}
4228
4229	/*
4230	 * Save the page table page in its current state until the PDE
4231	 * mapping the superpage is demoted by pmap_demote_pde() or
4232	 * destroyed by pmap_remove_pde().
4233	 */
4234	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4235	KASSERT(mpte >= vm_page_array &&
4236	    mpte < &vm_page_array[vm_page_array_size],
4237	    ("pmap_promote_pde: page table page is out of range"));
4238	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4239	    ("pmap_promote_pde: page table page's pindex is wrong"));
4240	if (pmap_insert_pt_page(pmap, mpte)) {
4241		atomic_add_long(&pmap_pde_p_failures, 1);
4242		CTR2(KTR_PMAP,
4243		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4244		    pmap);
4245		return;
4246	}
4247
4248	/*
4249	 * Promote the pv entries.
4250	 */
4251	if ((newpde & PG_MANAGED) != 0)
4252		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4253
4254	/*
4255	 * Propagate the PAT index to its proper position.
4256	 */
4257	newpde = pmap_swap_pat(pmap, newpde);
4258
4259	/*
4260	 * Map the superpage.
4261	 */
4262	if (workaround_erratum383)
4263		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4264	else
4265		pde_store(pde, PG_PS | newpde);
4266
4267	atomic_add_long(&pmap_pde_promotions, 1);
4268	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4269	    " in pmap %p", va, pmap);
4270}
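
/*
 * Illustrative sketch (hypothetical helper, never compiled): a simplified
 * form of the promotion test above.  It requires every PTE in the PTP to
 * be bitwise identical to the first PTE except for a physical address
 * that advances by PAGE_SIZE, i.e. 2MB-aligned, physically contiguous
 * frames with identical attributes.  The real code above additionally
 * requires PG_A, tolerates a clean-but-writeable PTE by clearing PG_RW,
 * and compares attributes through the PG_PTE_PROMOTE mask rather than
 * requiring bitwise equality.
 */
#if 0
static boolean_t
ptp_is_promotable_sketch(pt_entry_t *firstpte)
{
	pt_entry_t expected;
	int i;

	/* The first PTE must be valid and map a 2MB-aligned frame. */
	if ((*firstpte & ((PG_FRAME & PDRMASK) | X86_PG_V)) != X86_PG_V)
		return (FALSE);
	expected = *firstpte;
	for (i = 1; i < NPTEPG; i++) {
		expected += PAGE_SIZE;
		if (firstpte[i] != expected)
			return (FALSE);
	}
	return (TRUE);
}
#endif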
4271
4272/*
4273 *	Insert the given physical page (p) at
4274 *	the specified virtual address (v) in the
4275 *	target physical map with the protection requested.
4276 *
4277 *	If specified, the page will be wired down, meaning
4278 *	that the related pte can not be reclaimed.
4279 *
4280 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4281 *	or lose information.  That is, this routine must actually
4282 *	insert this page into the given map NOW.
4283 *
4284 *	When destroying both a page table and PV entry, this function
4285 *	performs the TLB invalidation before releasing the PV list
4286 *	lock, so we do not need pmap_delayed_invl_page() calls here.
4287 */
4288int
4289pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4290    u_int flags, int8_t psind __unused)
4291{
4292	struct rwlock *lock;
4293	pd_entry_t *pde;
4294	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4295	pt_entry_t newpte, origpte;
4296	pv_entry_t pv;
4297	vm_paddr_t opa, pa;
4298	vm_page_t mpte, om;
4299	boolean_t nosleep;
4300
4301	PG_A = pmap_accessed_bit(pmap);
4302	PG_G = pmap_global_bit(pmap);
4303	PG_M = pmap_modified_bit(pmap);
4304	PG_V = pmap_valid_bit(pmap);
4305	PG_RW = pmap_rw_bit(pmap);
4306
4307	va = trunc_page(va);
4308	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4309	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4310	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4311	    va));
4312	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4313	    va >= kmi.clean_eva,
4314	    ("pmap_enter: managed mapping within the clean submap"));
4315	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4316		VM_OBJECT_ASSERT_LOCKED(m->object);
4317	pa = VM_PAGE_TO_PHYS(m);
4318	newpte = (pt_entry_t)(pa | PG_A | PG_V);
4319	if ((flags & VM_PROT_WRITE) != 0)
4320		newpte |= PG_M;
4321	if ((prot & VM_PROT_WRITE) != 0)
4322		newpte |= PG_RW;
4323	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4324	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
4325	if ((prot & VM_PROT_EXECUTE) == 0)
4326		newpte |= pg_nx;
4327	if ((flags & PMAP_ENTER_WIRED) != 0)
4328		newpte |= PG_W;
4329	if (va < VM_MAXUSER_ADDRESS)
4330		newpte |= PG_U;
4331	if (pmap == kernel_pmap)
4332		newpte |= PG_G;
4333	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
4334
4335	/*
4336	 * Set modified bit gratuitously for writeable mappings if
4337	 * the page is unmanaged. We do not want to take a fault
4338	 * to do the dirty bit accounting for these mappings.
4339	 */
4340	if ((m->oflags & VPO_UNMANAGED) != 0) {
4341		if ((newpte & PG_RW) != 0)
4342			newpte |= PG_M;
4343	}
4344
4345	mpte = NULL;
4346
4347	lock = NULL;
4348	PMAP_LOCK(pmap);
4349
4350	/*
4351	 * In the case that a page table page is not
4352	 * resident, we are creating it here.
4353	 */
4354retry:
4355	pde = pmap_pde(pmap, va);
4356	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4357	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4358		pte = pmap_pde_to_pte(pde, va);
4359		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4360			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4361			mpte->wire_count++;
4362		}
4363	} else if (va < VM_MAXUSER_ADDRESS) {
4364		/*
4365		 * Here if the pte page isn't mapped, or if it has been
4366		 * deallocated.
4367		 */
4368		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4369		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
4370		    nosleep ? NULL : &lock);
4371		if (mpte == NULL && nosleep) {
4372			if (lock != NULL)
4373				rw_wunlock(lock);
4374			PMAP_UNLOCK(pmap);
4375			return (KERN_RESOURCE_SHORTAGE);
4376		}
4377		goto retry;
4378	} else
4379		panic("pmap_enter: invalid page directory va=%#lx", va);
4380
4381	origpte = *pte;
4382
4383	/*
4384	 * Is the specified virtual address already mapped?
4385	 */
4386	if ((origpte & PG_V) != 0) {
4387		/*
4388		 * Wiring change, just update stats. We don't worry about
4389		 * wiring PT pages as they remain resident as long as there
4390		 * are valid mappings in them. Hence, if a user page is wired,
4391		 * the PT page will be also.
4392		 */
4393		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4394			pmap->pm_stats.wired_count++;
4395		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4396			pmap->pm_stats.wired_count--;
4397
4398		/*
4399		 * Remove the extra PT page reference.
4400		 */
4401		if (mpte != NULL) {
4402			mpte->wire_count--;
4403			KASSERT(mpte->wire_count > 0,
4404			    ("pmap_enter: missing reference to page table page,"
4405			     " va: 0x%lx", va));
4406		}
4407
4408		/*
4409		 * Has the physical page changed?
4410		 */
4411		opa = origpte & PG_FRAME;
4412		if (opa == pa) {
4413			/*
4414			 * No, might be a protection or wiring change.
4415			 */
4416			if ((origpte & PG_MANAGED) != 0) {
4417				newpte |= PG_MANAGED;
4418				if ((newpte & PG_RW) != 0)
4419					vm_page_aflag_set(m, PGA_WRITEABLE);
4420			}
4421			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4422				goto unchanged;
4423			goto validate;
4424		}
4425	} else {
4426		/*
4427		 * Increment the counters.
4428		 */
4429		if ((newpte & PG_W) != 0)
4430			pmap->pm_stats.wired_count++;
4431		pmap_resident_count_inc(pmap, 1);
4432	}
4433
4434	/*
4435	 * Enter on the PV list if part of our managed memory.
4436	 */
4437	if ((m->oflags & VPO_UNMANAGED) == 0) {
4438		newpte |= PG_MANAGED;
4439		pv = get_pv_entry(pmap, &lock);
4440		pv->pv_va = va;
4441		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4442		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4443		m->md.pv_gen++;
4444		if ((newpte & PG_RW) != 0)
4445			vm_page_aflag_set(m, PGA_WRITEABLE);
4446	}
4447
4448	/*
4449	 * Update the PTE.
4450	 */
4451	if ((origpte & PG_V) != 0) {
4452validate:
4453		origpte = pte_load_store(pte, newpte);
4454		opa = origpte & PG_FRAME;
4455		if (opa != pa) {
4456			if ((origpte & PG_MANAGED) != 0) {
4457				om = PHYS_TO_VM_PAGE(opa);
4458				if ((origpte & (PG_M | PG_RW)) == (PG_M |
4459				    PG_RW))
4460					vm_page_dirty(om);
4461				if ((origpte & PG_A) != 0)
4462					vm_page_aflag_set(om, PGA_REFERENCED);
4463				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4464				pmap_pvh_free(&om->md, pmap, va);
4465				if ((om->aflags & PGA_WRITEABLE) != 0 &&
4466				    TAILQ_EMPTY(&om->md.pv_list) &&
4467				    ((om->flags & PG_FICTITIOUS) != 0 ||
4468				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4469					vm_page_aflag_clear(om, PGA_WRITEABLE);
4470			}
4471		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4472		    PG_RW)) == (PG_M | PG_RW)) {
4473			if ((origpte & PG_MANAGED) != 0)
4474				vm_page_dirty(m);
4475
4476			/*
4477			 * Although the PTE may still have PG_RW set, TLB
4478			 * invalidation may nonetheless be required because
4479			 * the PTE no longer has PG_M set.
4480			 */
4481		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4482			/*
4483			 * This PTE change does not require TLB invalidation.
4484			 */
4485			goto unchanged;
4486		}
4487		if ((origpte & PG_A) != 0)
4488			pmap_invalidate_page(pmap, va);
4489	} else
4490		pte_store(pte, newpte);
4491
4492unchanged:
4493
4494	/*
4495	 * If both the page table page and the reservation are fully
4496	 * populated, then attempt promotion.
4497	 */
4498	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4499	    pmap_ps_enabled(pmap) &&
4500	    (m->flags & PG_FICTITIOUS) == 0 &&
4501	    vm_reserv_level_iffullpop(m) == 0)
4502		pmap_promote_pde(pmap, pde, va, &lock);
4503
4504	if (lock != NULL)
4505		rw_wunlock(lock);
4506	PMAP_UNLOCK(pmap);
4507	return (KERN_SUCCESS);
4508}
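
/*
 * Illustrative caller-side sketch (hypothetical, never compiled): with
 * PMAP_ENTER_NOSLEEP in "flags", pmap_enter() may return
 * KERN_RESOURCE_SHORTAGE instead of sleeping for a page table page, and
 * the caller is expected to retry later or fall back to a path that can
 * sleep.
 */
#if 0
static int
enter_nosleep_sketch(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	int rv;

	rv = pmap_enter(pmap, va, m, prot, prot | PMAP_ENTER_NOSLEEP, 0);
	if (rv == KERN_RESOURCE_SHORTAGE) {
		/* Retry later or take a path that can sleep. */
	}
	return (rv);
}
#endif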
4509
4510/*
4511 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
4512 * otherwise.  Fails if (1) a page table page cannot be allocated without
4513 * blocking, (2) a mapping already exists at the specified virtual address, or
4514 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
4515 */
4516static boolean_t
4517pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4518    struct rwlock **lockp)
4519{
4520	pd_entry_t *pde, newpde;
4521	pt_entry_t PG_V;
4522	vm_page_t mpde;
4523	struct spglist free;
4524
4525	PG_V = pmap_valid_bit(pmap);
4526	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4527
4528	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
4529		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4530		    " in pmap %p", va, pmap);
4531		return (FALSE);
4532	}
4533	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
4534	pde = &pde[pmap_pde_index(va)];
4535	if ((*pde & PG_V) != 0) {
4536		KASSERT(mpde->wire_count > 1,
4537		    ("pmap_enter_pde: mpde's wire count is too low"));
4538		mpde->wire_count--;
4539		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4540		    " in pmap %p", va, pmap);
4541		return (FALSE);
4542	}
4543	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4544	    PG_PS | PG_V;
4545	if ((m->oflags & VPO_UNMANAGED) == 0) {
4546		newpde |= PG_MANAGED;
4547
4548		/*
4549		 * Abort this mapping if its PV entry could not be created.
4550		 */
4551		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
4552		    lockp)) {
4553			SLIST_INIT(&free);
4554			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
4555				/*
4556				 * Although "va" is not mapped, paging-
4557				 * structure caches could nonetheless have
4558				 * entries that refer to the freed page table
4559				 * pages.  Invalidate those entries.
4560				 */
4561				pmap_invalidate_page(pmap, va);
4562				pmap_free_zero_pages(&free);
4563			}
4564			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4565			    " in pmap %p", va, pmap);
4566			return (FALSE);
4567		}
4568	}
4569	if ((prot & VM_PROT_EXECUTE) == 0)
4570		newpde |= pg_nx;
4571	if (va < VM_MAXUSER_ADDRESS)
4572		newpde |= PG_U;
4573
4574	/*
4575	 * Increment counters.
4576	 */
4577	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4578
4579	/*
4580	 * Map the superpage.
4581	 */
4582	pde_store(pde, newpde);
4583
4584	atomic_add_long(&pmap_pde_mappings, 1);
4585	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4586	    " in pmap %p", va, pmap);
4587	return (TRUE);
4588}
4589
4590/*
4591 * Maps a sequence of resident pages belonging to the same object.
4592 * The sequence begins with the given page m_start.  This page is
4593 * mapped at the given virtual address start.  Each subsequent page is
4594 * mapped at a virtual address that is offset from start by the same
4595 * amount as the page is offset from m_start within the object.  The
4596 * last page in the sequence is the page with the largest offset from
4597 * m_start that can be mapped at a virtual address less than the given
4598 * virtual address end.  Not every virtual page between start and end
4599 * is mapped; only those for which a resident page exists with the
4600 * corresponding offset from m_start are mapped.
4601 */
4602void
4603pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4604    vm_page_t m_start, vm_prot_t prot)
4605{
4606	struct rwlock *lock;
4607	vm_offset_t va;
4608	vm_page_t m, mpte;
4609	vm_pindex_t diff, psize;
4610
4611	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4612
4613	psize = atop(end - start);
4614	mpte = NULL;
4615	m = m_start;
4616	lock = NULL;
4617	PMAP_LOCK(pmap);
4618	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4619		va = start + ptoa(diff);
4620		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4621		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4622		    pmap_enter_pde(pmap, va, m, prot, &lock))
4623			m = &m[NBPDR / PAGE_SIZE - 1];
4624		else
4625			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4626			    mpte, &lock);
4627		m = TAILQ_NEXT(m, listq);
4628	}
4629	if (lock != NULL)
4630		rw_wunlock(lock);
4631	PMAP_UNLOCK(pmap);
4632}
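
/*
 * Illustrative sketch (hypothetical helper, never compiled): the test
 * used above to decide whether a 2MB mapping may be attempted at "va".
 * The virtual address must be 2MB aligned, the entire superpage must fit
 * below "end", and "m" must begin a fully populated 2MB reservation
 * (psind == 1).  The code above also requires pmap_ps_enabled().
 */
#if 0
static boolean_t
can_try_pde_sketch(vm_offset_t va, vm_offset_t end, vm_page_t m)
{

	return ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1);
}
#endif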
4633
4634/*
4635 * This code makes some *MAJOR* assumptions:
4636 * 1. The current pmap and the given pmap exist.
4637 * 2. Not wired.
4638 * 3. Read access.
4639 * 4. No page table pages.
4640 * but is *MUCH* faster than pmap_enter...
4641 */
4642
4643void
4644pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4645{
4646	struct rwlock *lock;
4647
4648	lock = NULL;
4649	PMAP_LOCK(pmap);
4650	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4651	if (lock != NULL)
4652		rw_wunlock(lock);
4653	PMAP_UNLOCK(pmap);
4654}
4655
4656static vm_page_t
4657pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4658    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4659{
4660	struct spglist free;
4661	pt_entry_t *pte, PG_V;
4662	vm_paddr_t pa;
4663
4664	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4665	    (m->oflags & VPO_UNMANAGED) != 0,
4666	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4667	PG_V = pmap_valid_bit(pmap);
4668	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4669
4670	/*
4671	 * In the case that a page table page is not
4672	 * resident, we are creating it here.
4673	 */
4674	if (va < VM_MAXUSER_ADDRESS) {
4675		vm_pindex_t ptepindex;
4676		pd_entry_t *ptepa;
4677
4678		/*
4679		 * Calculate pagetable page index
4680		 */
4681		ptepindex = pmap_pde_pindex(va);
4682		if (mpte && (mpte->pindex == ptepindex)) {
4683			mpte->wire_count++;
4684		} else {
4685			/*
4686			 * Get the page directory entry
4687			 */
4688			ptepa = pmap_pde(pmap, va);
4689
4690			/*
4691			 * If the page table page is mapped, we just increment
4692			 * the wire count, and activate it.  Otherwise, we
4693			 * attempt to allocate a page table page.  If this
4694			 * attempt fails, we don't retry.  Instead, we give up.
4695			 */
4696			if (ptepa && (*ptepa & PG_V) != 0) {
4697				if (*ptepa & PG_PS)
4698					return (NULL);
4699				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4700				mpte->wire_count++;
4701			} else {
4702				/*
4703				 * Pass NULL instead of the PV list lock
4704				 * pointer, because we don't intend to sleep.
4705				 */
4706				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4707				if (mpte == NULL)
4708					return (mpte);
4709			}
4710		}
4711		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4712		pte = &pte[pmap_pte_index(va)];
4713	} else {
4714		mpte = NULL;
4715		pte = vtopte(va);
4716	}
4717	if (*pte) {
4718		if (mpte != NULL) {
4719			mpte->wire_count--;
4720			mpte = NULL;
4721		}
4722		return (mpte);
4723	}
4724
4725	/*
4726	 * Enter on the PV list if part of our managed memory.
4727	 */
4728	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4729	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4730		if (mpte != NULL) {
4731			SLIST_INIT(&free);
4732			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4733				/*
4734				 * Although "va" is not mapped, paging-
4735				 * structure caches could nonetheless have
4736				 * entries that refer to the freed page table
4737				 * pages.  Invalidate those entries.
4738				 */
4739				pmap_invalidate_page(pmap, va);
4740				pmap_free_zero_pages(&free);
4741			}
4742			mpte = NULL;
4743		}
4744		return (mpte);
4745	}
4746
4747	/*
4748	 * Increment counters
4749	 */
4750	pmap_resident_count_inc(pmap, 1);
4751
4752	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4753	if ((prot & VM_PROT_EXECUTE) == 0)
4754		pa |= pg_nx;
4755
4756	/*
4757	 * Now validate mapping with RO protection
4758	 */
4759	if ((m->oflags & VPO_UNMANAGED) != 0)
4760		pte_store(pte, pa | PG_V | PG_U);
4761	else
4762		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4763	return (mpte);
4764}
4765
4766/*
4767 * Make a temporary mapping for a physical address.  This is only intended
4768 * to be used for panic dumps.
4769 */
4770void *
4771pmap_kenter_temporary(vm_paddr_t pa, int i)
4772{
4773	vm_offset_t va;
4774
4775	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4776	pmap_kenter(va, pa);
4777	invlpg(va);
4778	return ((void *)crashdumpmap);
4779}
4780
4781/*
4782 * This code maps large physical mmap regions into the
4783 * processor address space.  Some shortcuts are taken (the 2MB
4784 * mappings are created only as an optimization), but the code works.
4785 */
4786void
4787pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4788    vm_pindex_t pindex, vm_size_t size)
4789{
4790	pd_entry_t *pde;
4791	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4792	vm_paddr_t pa, ptepa;
4793	vm_page_t p, pdpg;
4794	int pat_mode;
4795
4796	PG_A = pmap_accessed_bit(pmap);
4797	PG_M = pmap_modified_bit(pmap);
4798	PG_V = pmap_valid_bit(pmap);
4799	PG_RW = pmap_rw_bit(pmap);
4800
4801	VM_OBJECT_ASSERT_WLOCKED(object);
4802	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4803	    ("pmap_object_init_pt: non-device object"));
4804	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
4805		if (!pmap_ps_enabled(pmap))
4806			return;
4807		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4808			return;
4809		p = vm_page_lookup(object, pindex);
4810		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4811		    ("pmap_object_init_pt: invalid page %p", p));
4812		pat_mode = p->md.pat_mode;
4813
4814		/*
4815		 * Abort the mapping if the first page is not physically
4816		 * aligned to a 2MB page boundary.
4817		 */
4818		ptepa = VM_PAGE_TO_PHYS(p);
4819		if (ptepa & (NBPDR - 1))
4820			return;
4821
4822		/*
4823		 * Skip the first page.  Abort the mapping if the rest of
4824		 * the pages are not physically contiguous or have differing
4825		 * memory attributes.
4826		 */
4827		p = TAILQ_NEXT(p, listq);
4828		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4829		    pa += PAGE_SIZE) {
4830			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4831			    ("pmap_object_init_pt: invalid page %p", p));
4832			if (pa != VM_PAGE_TO_PHYS(p) ||
4833			    pat_mode != p->md.pat_mode)
4834				return;
4835			p = TAILQ_NEXT(p, listq);
4836		}
4837
4838		/*
4839		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
4840		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
4841		 * will not affect the termination of this loop.
4842		 */
4843		PMAP_LOCK(pmap);
4844		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
4845		    pa < ptepa + size; pa += NBPDR) {
4846			pdpg = pmap_allocpde(pmap, addr, NULL);
4847			if (pdpg == NULL) {
4848				/*
4849				 * The creation of mappings below is only an
4850				 * optimization.  If a page directory page
4851				 * cannot be allocated without blocking,
4852				 * continue on to the next mapping rather than
4853				 * blocking.
4854				 */
4855				addr += NBPDR;
4856				continue;
4857			}
4858			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4859			pde = &pde[pmap_pde_index(addr)];
4860			if ((*pde & PG_V) == 0) {
4861				pde_store(pde, pa | PG_PS | PG_M | PG_A |
4862				    PG_U | PG_RW | PG_V);
4863				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4864				atomic_add_long(&pmap_pde_mappings, 1);
4865			} else {
4866				/* Continue on if the PDE is already valid. */
4867				pdpg->wire_count--;
4868				KASSERT(pdpg->wire_count > 0,
4869				    ("pmap_object_init_pt: missing reference "
4870				    "to page directory page, va: 0x%lx", addr));
4871			}
4872			addr += NBPDR;
4873		}
4874		PMAP_UNLOCK(pmap);
4875	}
4876}
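
/*
 * Illustrative sketch (hypothetical helper, never compiled): the alignment
 * preconditions checked above before a device object is mapped with 2MB
 * pages.  The virtual range, its size, and the first physical page must
 * all be 2MB (NBPDR) aligned; the remaining pages must additionally be
 * physically contiguous with identical memory attributes.
 */
#if 0
static boolean_t
range_is_2m_mappable_sketch(vm_offset_t addr, vm_size_t size, vm_paddr_t ptepa)
{

	return ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0 &&
	    (ptepa & (NBPDR - 1)) == 0);
}
#endif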
4877
4878/*
4879 *	Clear the wired attribute from the mappings for the specified range of
4880 *	addresses in the given pmap.  Every valid mapping within that range
4881 *	must have the wired attribute set.  In contrast, invalid mappings
4882 *	cannot have the wired attribute set, so they are ignored.
4883 *
4884 *	The wired attribute of the page table entry is not a hardware
4885 *	feature, so there is no need to invalidate any TLB entries.
4886 *	Since pmap_demote_pde() for the wired entry must never fail,
4887 *	pmap_delayed_invl_started()/finished() calls around the
4888 *	function are not needed.
4889 */
4890void
4891pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4892{
4893	vm_offset_t va_next;
4894	pml4_entry_t *pml4e;
4895	pdp_entry_t *pdpe;
4896	pd_entry_t *pde;
4897	pt_entry_t *pte, PG_V;
4898
4899	PG_V = pmap_valid_bit(pmap);
4900	PMAP_LOCK(pmap);
4901	for (; sva < eva; sva = va_next) {
4902		pml4e = pmap_pml4e(pmap, sva);
4903		if ((*pml4e & PG_V) == 0) {
4904			va_next = (sva + NBPML4) & ~PML4MASK;
4905			if (va_next < sva)
4906				va_next = eva;
4907			continue;
4908		}
4909		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4910		if ((*pdpe & PG_V) == 0) {
4911			va_next = (sva + NBPDP) & ~PDPMASK;
4912			if (va_next < sva)
4913				va_next = eva;
4914			continue;
4915		}
4916		va_next = (sva + NBPDR) & ~PDRMASK;
4917		if (va_next < sva)
4918			va_next = eva;
4919		pde = pmap_pdpe_to_pde(pdpe, sva);
4920		if ((*pde & PG_V) == 0)
4921			continue;
4922		if ((*pde & PG_PS) != 0) {
4923			if ((*pde & PG_W) == 0)
4924				panic("pmap_unwire: pde %#jx is missing PG_W",
4925				    (uintmax_t)*pde);
4926
4927			/*
4928			 * Are we unwiring the entire large page?  If not,
4929			 * demote the mapping and fall through.
4930			 */
4931			if (sva + NBPDR == va_next && eva >= va_next) {
4932				atomic_clear_long(pde, PG_W);
4933				pmap->pm_stats.wired_count -= NBPDR /
4934				    PAGE_SIZE;
4935				continue;
4936			} else if (!pmap_demote_pde(pmap, pde, sva))
4937				panic("pmap_unwire: demotion failed");
4938		}
4939		if (va_next > eva)
4940			va_next = eva;
4941		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4942		    sva += PAGE_SIZE) {
4943			if ((*pte & PG_V) == 0)
4944				continue;
4945			if ((*pte & PG_W) == 0)
4946				panic("pmap_unwire: pte %#jx is missing PG_W",
4947				    (uintmax_t)*pte);
4948
4949			/*
4950			 * PG_W must be cleared atomically.  Although the pmap
4951			 * lock synchronizes access to PG_W, another processor
4952			 * could be setting PG_M and/or PG_A concurrently.
4953			 */
4954			atomic_clear_long(pte, PG_W);
4955			pmap->pm_stats.wired_count--;
4956		}
4957	}
4958	PMAP_UNLOCK(pmap);
4959}
4960
4961/*
4962 *	Copy the range specified by src_addr/len
4963 *	from the source map to the range dst_addr/len
4964 *	in the destination map.
4965 *
4966 *	This routine is only advisory and need not do anything.
4967 */
4968
4969void
4970pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4971    vm_offset_t src_addr)
4972{
4973	struct rwlock *lock;
4974	struct spglist free;
4975	vm_offset_t addr;
4976	vm_offset_t end_addr = src_addr + len;
4977	vm_offset_t va_next;
4978	pt_entry_t PG_A, PG_M, PG_V;
4979
4980	if (dst_addr != src_addr)
4981		return;
4982
4983	if (dst_pmap->pm_type != src_pmap->pm_type)
4984		return;
4985
4986	/*
4987	 * EPT page table entries that require emulation of A/D bits are
4988	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
4989	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
4990	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
4991	 * implementations flag an EPT misconfiguration for exec-only
4992	 * mappings we skip this function entirely for emulated pmaps.
4993	 */
4994	if (pmap_emulate_ad_bits(dst_pmap))
4995		return;
4996
4997	lock = NULL;
4998	if (dst_pmap < src_pmap) {
4999		PMAP_LOCK(dst_pmap);
5000		PMAP_LOCK(src_pmap);
5001	} else {
5002		PMAP_LOCK(src_pmap);
5003		PMAP_LOCK(dst_pmap);
5004	}
5005
5006	PG_A = pmap_accessed_bit(dst_pmap);
5007	PG_M = pmap_modified_bit(dst_pmap);
5008	PG_V = pmap_valid_bit(dst_pmap);
5009
5010	for (addr = src_addr; addr < end_addr; addr = va_next) {
5011		pt_entry_t *src_pte, *dst_pte;
5012		vm_page_t dstmpde, dstmpte, srcmpte;
5013		pml4_entry_t *pml4e;
5014		pdp_entry_t *pdpe;
5015		pd_entry_t srcptepaddr, *pde;
5016
5017		KASSERT(addr < UPT_MIN_ADDRESS,
5018		    ("pmap_copy: invalid to pmap_copy page tables"));
5019
5020		pml4e = pmap_pml4e(src_pmap, addr);
5021		if ((*pml4e & PG_V) == 0) {
5022			va_next = (addr + NBPML4) & ~PML4MASK;
5023			if (va_next < addr)
5024				va_next = end_addr;
5025			continue;
5026		}
5027
5028		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
5029		if ((*pdpe & PG_V) == 0) {
5030			va_next = (addr + NBPDP) & ~PDPMASK;
5031			if (va_next < addr)
5032				va_next = end_addr;
5033			continue;
5034		}
5035
5036		va_next = (addr + NBPDR) & ~PDRMASK;
5037		if (va_next < addr)
5038			va_next = end_addr;
5039
5040		pde = pmap_pdpe_to_pde(pdpe, addr);
5041		srcptepaddr = *pde;
5042		if (srcptepaddr == 0)
5043			continue;
5044
5045		if (srcptepaddr & PG_PS) {
5046			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
5047				continue;
5048			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
5049			if (dstmpde == NULL)
5050				break;
5051			pde = (pd_entry_t *)
5052			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
5053			pde = &pde[pmap_pde_index(addr)];
5054			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
5055			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
5056			    PG_PS_FRAME, &lock))) {
5057				*pde = srcptepaddr & ~PG_W;
5058				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
5059				atomic_add_long(&pmap_pde_mappings, 1);
5060			} else
5061				dstmpde->wire_count--;
5062			continue;
5063		}
5064
5065		srcptepaddr &= PG_FRAME;
5066		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
5067		KASSERT(srcmpte->wire_count > 0,
5068		    ("pmap_copy: source page table page is unused"));
5069
5070		if (va_next > end_addr)
5071			va_next = end_addr;
5072
5073		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
5074		src_pte = &src_pte[pmap_pte_index(addr)];
5075		dstmpte = NULL;
5076		while (addr < va_next) {
5077			pt_entry_t ptetemp;
5078			ptetemp = *src_pte;
5079			/*
5080			 * We only virtual-copy managed pages.
5081			 */
5082			if ((ptetemp & PG_MANAGED) != 0) {
5083				if (dstmpte != NULL &&
5084				    dstmpte->pindex == pmap_pde_pindex(addr))
5085					dstmpte->wire_count++;
5086				else if ((dstmpte = pmap_allocpte(dst_pmap,
5087				    addr, NULL)) == NULL)
5088					goto out;
5089				dst_pte = (pt_entry_t *)
5090				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
5091				dst_pte = &dst_pte[pmap_pte_index(addr)];
5092				if (*dst_pte == 0 &&
5093				    pmap_try_insert_pv_entry(dst_pmap, addr,
5094				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
5095				    &lock)) {
5096					/*
5097					 * Clear the wired, modified, and
5098					 * accessed (referenced) bits
5099					 * during the copy.
5100					 */
5101					*dst_pte = ptetemp & ~(PG_W | PG_M |
5102					    PG_A);
5103					pmap_resident_count_inc(dst_pmap, 1);
5104				} else {
5105					SLIST_INIT(&free);
5106					if (pmap_unwire_ptp(dst_pmap, addr,
5107					    dstmpte, &free)) {
5108						/*
5109						 * Although "addr" is not
5110						 * mapped, paging-structure
5111						 * caches could nonetheless
5112						 * have entries that refer to
5113						 * the freed page table pages.
5114						 * Invalidate those entries.
5115						 */
5116						pmap_invalidate_page(dst_pmap,
5117						    addr);
5118						pmap_free_zero_pages(&free);
5119					}
5120					goto out;
5121				}
5122				if (dstmpte->wire_count >= srcmpte->wire_count)
5123					break;
5124			}
5125			addr += PAGE_SIZE;
5126			src_pte++;
5127		}
5128	}
5129out:
5130	if (lock != NULL)
5131		rw_wunlock(lock);
5132	PMAP_UNLOCK(src_pmap);
5133	PMAP_UNLOCK(dst_pmap);
5134}
5135
5136/*
5137 *	pmap_zero_page zeros the specified hardware page through the
5138 *	direct map, using pagezero to clear its contents.
5139 */
5140void
5141pmap_zero_page(vm_page_t m)
5142{
5143	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5144
5145	pagezero((void *)va);
5146}
5147
5148/*
5149 *	pmap_zero_page_area zeros part of the specified hardware page
5150 *	through the direct map, using pagezero or bzero to clear it.
5151 *
5152 *	off and size may not cover an area beyond a single hardware page.
5153 */
5154void
5155pmap_zero_page_area(vm_page_t m, int off, int size)
5156{
5157	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5158
5159	if (off == 0 && size == PAGE_SIZE)
5160		pagezero((void *)va);
5161	else
5162		bzero((char *)va + off, size);
5163}
5164
5165/*
5166 *	pmap_zero_page_idle zeros the specified hardware page through
5167 *	the direct map, using pagezero to clear its contents.  This
5168 *	is intended to be called from the vm_pagezero process only and
5169 *	outside of Giant.
5170 */
5171void
5172pmap_zero_page_idle(vm_page_t m)
5173{
5174	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5175
5176	pagezero((void *)va);
5177}
5178
5179/*
5180 *	pmap_copy_page copies the specified (machine independent)
5181 *	page by accessing both the source and destination through
5182 *	the direct map and using pagecopy, one machine dependent
5183 *	page at a time.
5184 */
5185void
5186pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5187{
5188	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5189	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5190
5191	pagecopy((void *)src, (void *)dst);
5192}
5193
5194int unmapped_buf_allowed = 1;
5195
5196void
5197pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5198    vm_offset_t b_offset, int xfersize)
5199{
5200	void *a_cp, *b_cp;
5201	vm_page_t pages[2];
5202	vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
5203	int cnt;
5204	boolean_t mapped;
5205
5206	while (xfersize > 0) {
5207		a_pg_offset = a_offset & PAGE_MASK;
5208		pages[0] = ma[a_offset >> PAGE_SHIFT];
5209		b_pg_offset = b_offset & PAGE_MASK;
5210		pages[1] = mb[b_offset >> PAGE_SHIFT];
5211		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5212		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5213		mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
5214		a_cp = (char *)vaddr[0] + a_pg_offset;
5215		b_cp = (char *)vaddr[1] + b_pg_offset;
5216		bcopy(a_cp, b_cp, cnt);
5217		if (__predict_false(mapped))
5218			pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
5219		a_offset += cnt;
5220		b_offset += cnt;
5221		xfersize -= cnt;
5222	}
5223}
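
/*
 * Illustrative sketch (hypothetical helper, never compiled): the chunk
 * length used by each iteration above.  A pass copies at most the bytes
 * remaining in the source page, in the destination page, and in the
 * request, so the bcopy() never crosses a page boundary on either side.
 */
#if 0
static int
copy_chunk_len_sketch(vm_offset_t a_offset, vm_offset_t b_offset, int xfersize)
{
	int cnt;

	cnt = min(xfersize, PAGE_SIZE - (a_offset & PAGE_MASK));
	cnt = min(cnt, PAGE_SIZE - (b_offset & PAGE_MASK));
	return (cnt);
}
#endif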
5224
5225/*
5226 * Returns true if the pmap's pv is one of the first
5227 * 16 pvs linked to from this page.  This count may
5228 * be changed upwards or downwards in the future; it
5229 * is only necessary that true be returned for a small
5230 * subset of pmaps for proper page aging.
5231 */
5232boolean_t
5233pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5234{
5235	struct md_page *pvh;
5236	struct rwlock *lock;
5237	pv_entry_t pv;
5238	int loops = 0;
5239	boolean_t rv;
5240
5241	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5242	    ("pmap_page_exists_quick: page %p is not managed", m));
5243	rv = FALSE;
5244	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5245	rw_rlock(lock);
5246	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5247		if (PV_PMAP(pv) == pmap) {
5248			rv = TRUE;
5249			break;
5250		}
5251		loops++;
5252		if (loops >= 16)
5253			break;
5254	}
5255	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5256		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5257		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5258			if (PV_PMAP(pv) == pmap) {
5259				rv = TRUE;
5260				break;
5261			}
5262			loops++;
5263			if (loops >= 16)
5264				break;
5265		}
5266	}
5267	rw_runlock(lock);
5268	return (rv);
5269}
5270
5271/*
5272 *	pmap_page_wired_mappings:
5273 *
5274 *	Return the number of managed mappings to the given physical page
5275 *	that are wired.
5276 */
5277int
5278pmap_page_wired_mappings(vm_page_t m)
5279{
5280	struct rwlock *lock;
5281	struct md_page *pvh;
5282	pmap_t pmap;
5283	pt_entry_t *pte;
5284	pv_entry_t pv;
5285	int count, md_gen, pvh_gen;
5286
5287	if ((m->oflags & VPO_UNMANAGED) != 0)
5288		return (0);
5289	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5290	rw_rlock(lock);
5291restart:
5292	count = 0;
5293	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5294		pmap = PV_PMAP(pv);
5295		if (!PMAP_TRYLOCK(pmap)) {
5296			md_gen = m->md.pv_gen;
5297			rw_runlock(lock);
5298			PMAP_LOCK(pmap);
5299			rw_rlock(lock);
5300			if (md_gen != m->md.pv_gen) {
5301				PMAP_UNLOCK(pmap);
5302				goto restart;
5303			}
5304		}
5305		pte = pmap_pte(pmap, pv->pv_va);
5306		if ((*pte & PG_W) != 0)
5307			count++;
5308		PMAP_UNLOCK(pmap);
5309	}
5310	if ((m->flags & PG_FICTITIOUS) == 0) {
5311		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5312		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5313			pmap = PV_PMAP(pv);
5314			if (!PMAP_TRYLOCK(pmap)) {
5315				md_gen = m->md.pv_gen;
5316				pvh_gen = pvh->pv_gen;
5317				rw_runlock(lock);
5318				PMAP_LOCK(pmap);
5319				rw_rlock(lock);
5320				if (md_gen != m->md.pv_gen ||
5321				    pvh_gen != pvh->pv_gen) {
5322					PMAP_UNLOCK(pmap);
5323					goto restart;
5324				}
5325			}
5326			pte = pmap_pde(pmap, pv->pv_va);
5327			if ((*pte & PG_W) != 0)
5328				count++;
5329			PMAP_UNLOCK(pmap);
5330		}
5331	}
5332	rw_runlock(lock);
5333	return (count);
5334}
5335
5336/*
5337 * Returns TRUE if the given page is mapped individually or as part of
5338 * a 2mpage.  Otherwise, returns FALSE.
5339 */
5340boolean_t
5341pmap_page_is_mapped(vm_page_t m)
5342{
5343	struct rwlock *lock;
5344	boolean_t rv;
5345
5346	if ((m->oflags & VPO_UNMANAGED) != 0)
5347		return (FALSE);
5348	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5349	rw_rlock(lock);
5350	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5351	    ((m->flags & PG_FICTITIOUS) == 0 &&
5352	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5353	rw_runlock(lock);
5354	return (rv);
5355}
5356
5357/*
5358 * Destroy all managed, non-wired mappings in the given user-space
5359 * pmap.  This pmap cannot be active on any processor besides the
5360 * caller.
5361 *
5362 * This function cannot be applied to the kernel pmap.  Moreover, it
5363 * is not intended for general use.  It is only to be used during
5364 * process termination.  Consequently, it can be implemented in ways
5365 * that make it faster than pmap_remove().  First, it can more quickly
5366 * destroy mappings by iterating over the pmap's collection of PV
5367 * entries, rather than searching the page table.  Second, it doesn't
5368 * have to test and clear the page table entries atomically, because
5369 * no processor is currently accessing the user address space.  In
5370 * particular, a page table entry's dirty bit won't change state once
5371 * this function starts.
5372 */
5373void
5374pmap_remove_pages(pmap_t pmap)
5375{
5376	pd_entry_t ptepde;
5377	pt_entry_t *pte, tpte;
5378	pt_entry_t PG_M, PG_RW, PG_V;
5379	struct spglist free;
5380	vm_page_t m, mpte, mt;
5381	pv_entry_t pv;
5382	struct md_page *pvh;
5383	struct pv_chunk *pc, *npc;
5384	struct rwlock *lock;
5385	int64_t bit;
5386	uint64_t inuse, bitmask;
5387	int allfree, field, freed, idx;
5388	boolean_t superpage;
5389	vm_paddr_t pa;
5390
5391	/*
5392	 * Assert that the given pmap is only active on the current
5393	 * CPU.  Unfortunately, we cannot block another CPU from
5394	 * activating the pmap while this function is executing.
5395	 */
5396	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5397#ifdef INVARIANTS
5398	{
5399		cpuset_t other_cpus;
5400
5401		other_cpus = all_cpus;
5402		critical_enter();
5403		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5404		CPU_AND(&other_cpus, &pmap->pm_active);
5405		critical_exit();
5406		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5407	}
5408#endif
5409
5410	lock = NULL;
5411	PG_M = pmap_modified_bit(pmap);
5412	PG_V = pmap_valid_bit(pmap);
5413	PG_RW = pmap_rw_bit(pmap);
5414
5415	SLIST_INIT(&free);
5416	PMAP_LOCK(pmap);
5417	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5418		allfree = 1;
5419		freed = 0;
5420		for (field = 0; field < _NPCM; field++) {
5421			inuse = ~pc->pc_map[field] & pc_freemask[field];
5422			while (inuse != 0) {
5423				bit = bsfq(inuse);
5424				bitmask = 1UL << bit;
5425				idx = field * 64 + bit;
5426				pv = &pc->pc_pventry[idx];
5427				inuse &= ~bitmask;
5428
5429				pte = pmap_pdpe(pmap, pv->pv_va);
5430				ptepde = *pte;
5431				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5432				tpte = *pte;
5433				if ((tpte & (PG_PS | PG_V)) == PG_V) {
5434					superpage = FALSE;
5435					ptepde = tpte;
5436					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5437					    PG_FRAME);
5438					pte = &pte[pmap_pte_index(pv->pv_va)];
5439					tpte = *pte;
5440				} else {
5441					/*
5442					 * Keep track whether 'tpte' is a
5443					 * superpage explicitly instead of
5444					 * relying on PG_PS being set.
5445					 *
5446					 * This is because PG_PS is numerically
5447					 * identical to PG_PTE_PAT and thus a
5448					 * regular page could be mistaken for
5449					 * a superpage.
5450					 */
5451					superpage = TRUE;
5452				}
5453
5454				if ((tpte & PG_V) == 0) {
5455					panic("bad pte va %lx pte %lx",
5456					    pv->pv_va, tpte);
5457				}
5458
5459/*
5460 * We cannot remove wired pages from a process's mapping at this time.
5461 */
5462				if (tpte & PG_W) {
5463					allfree = 0;
5464					continue;
5465				}
5466
5467				if (superpage)
5468					pa = tpte & PG_PS_FRAME;
5469				else
5470					pa = tpte & PG_FRAME;
5471
5472				m = PHYS_TO_VM_PAGE(pa);
5473				KASSERT(m->phys_addr == pa,
5474				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5475				    m, (uintmax_t)m->phys_addr,
5476				    (uintmax_t)tpte));
5477
5478				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5479				    m < &vm_page_array[vm_page_array_size],
5480				    ("pmap_remove_pages: bad tpte %#jx",
5481				    (uintmax_t)tpte));
5482
5483				pte_clear(pte);
5484
5485				/*
5486				 * Update the vm_page_t clean/reference bits.
5487				 */
5488				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5489					if (superpage) {
5490						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5491							vm_page_dirty(mt);
5492					} else
5493						vm_page_dirty(m);
5494				}
5495
5496				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5497
5498				/* Mark free */
5499				pc->pc_map[field] |= bitmask;
5500				if (superpage) {
5501					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5502					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5503					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5504					pvh->pv_gen++;
5505					if (TAILQ_EMPTY(&pvh->pv_list)) {
5506						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5507							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5508							    TAILQ_EMPTY(&mt->md.pv_list))
5509								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5510					}
5511					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
5512					if (mpte != NULL) {
5513						pmap_remove_pt_page(pmap, mpte);
5514						pmap_resident_count_dec(pmap, 1);
5515						KASSERT(mpte->wire_count == NPTEPG,
5516						    ("pmap_remove_pages: pte page wire count error"));
5517						mpte->wire_count = 0;
5518						pmap_add_delayed_free_list(mpte, &free, FALSE);
5519						atomic_subtract_int(&vm_cnt.v_wire_count, 1);
5520					}
5521				} else {
5522					pmap_resident_count_dec(pmap, 1);
5523					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5524					m->md.pv_gen++;
5525					if ((m->aflags & PGA_WRITEABLE) != 0 &&
5526					    TAILQ_EMPTY(&m->md.pv_list) &&
5527					    (m->flags & PG_FICTITIOUS) == 0) {
5528						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5529						if (TAILQ_EMPTY(&pvh->pv_list))
5530							vm_page_aflag_clear(m, PGA_WRITEABLE);
5531					}
5532				}
5533				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5534				freed++;
5535			}
5536		}
5537		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5538		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5539		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5540		if (allfree) {
5541			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5542			free_pv_chunk(pc);
5543		}
5544	}
5545	if (lock != NULL)
5546		rw_wunlock(lock);
5547	pmap_invalidate_all(pmap);
5548	PMAP_UNLOCK(pmap);
5549	pmap_free_zero_pages(&free);
5550}
5551
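/*
 * Returns TRUE if some mapping of the given page, in some pmap, has all of
 * the requested accessed and/or modified bits set, and FALSE otherwise.
 * Both 4KB and 2MB mappings of the page are examined.
 */
</grinsert_placeholder>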
5552static boolean_t
5553pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5554{
5555	struct rwlock *lock;
5556	pv_entry_t pv;
5557	struct md_page *pvh;
5558	pt_entry_t *pte, mask;
5559	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5560	pmap_t pmap;
5561	int md_gen, pvh_gen;
5562	boolean_t rv;
5563
5564	rv = FALSE;
5565	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5566	rw_rlock(lock);
5567restart:
5568	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5569		pmap = PV_PMAP(pv);
5570		if (!PMAP_TRYLOCK(pmap)) {
5571			md_gen = m->md.pv_gen;
5572			rw_runlock(lock);
5573			PMAP_LOCK(pmap);
5574			rw_rlock(lock);
5575			if (md_gen != m->md.pv_gen) {
5576				PMAP_UNLOCK(pmap);
5577				goto restart;
5578			}
5579		}
5580		pte = pmap_pte(pmap, pv->pv_va);
5581		mask = 0;
5582		if (modified) {
5583			PG_M = pmap_modified_bit(pmap);
5584			PG_RW = pmap_rw_bit(pmap);
5585			mask |= PG_RW | PG_M;
5586		}
5587		if (accessed) {
5588			PG_A = pmap_accessed_bit(pmap);
5589			PG_V = pmap_valid_bit(pmap);
5590			mask |= PG_V | PG_A;
5591		}
5592		rv = (*pte & mask) == mask;
5593		PMAP_UNLOCK(pmap);
5594		if (rv)
5595			goto out;
5596	}
5597	if ((m->flags & PG_FICTITIOUS) == 0) {
5598		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5599		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5600			pmap = PV_PMAP(pv);
5601			if (!PMAP_TRYLOCK(pmap)) {
5602				md_gen = m->md.pv_gen;
5603				pvh_gen = pvh->pv_gen;
5604				rw_runlock(lock);
5605				PMAP_LOCK(pmap);
5606				rw_rlock(lock);
5607				if (md_gen != m->md.pv_gen ||
5608				    pvh_gen != pvh->pv_gen) {
5609					PMAP_UNLOCK(pmap);
5610					goto restart;
5611				}
5612			}
5613			pte = pmap_pde(pmap, pv->pv_va);
5614			mask = 0;
5615			if (modified) {
5616				PG_M = pmap_modified_bit(pmap);
5617				PG_RW = pmap_rw_bit(pmap);
5618				mask |= PG_RW | PG_M;
5619			}
5620			if (accessed) {
5621				PG_A = pmap_accessed_bit(pmap);
5622				PG_V = pmap_valid_bit(pmap);
5623				mask |= PG_V | PG_A;
5624			}
5625			rv = (*pte & mask) == mask;
5626			PMAP_UNLOCK(pmap);
5627			if (rv)
5628				goto out;
5629		}
5630	}
5631out:
5632	rw_runlock(lock);
5633	return (rv);
5634}
5635
5636/*
5637 *	pmap_is_modified:
5638 *
5639 *	Return whether or not the specified physical page was modified
5640 *	in any physical maps.
5641 */
5642boolean_t
5643pmap_is_modified(vm_page_t m)
5644{
5645
5646	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5647	    ("pmap_is_modified: page %p is not managed", m));
5648
5649	/*
5650	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5651	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
5652	 * is clear, no PTEs can have PG_M set.
5653	 */
5654	VM_OBJECT_ASSERT_WLOCKED(m->object);
5655	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5656		return (FALSE);
5657	return (pmap_page_test_mappings(m, FALSE, TRUE));
5658}
5659
5660/*
5661 *	pmap_is_prefaultable:
5662 *
5663 *	Return whether or not the specified virtual address is eligible
5664 *	for prefault.
5665 */
5666boolean_t
5667pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5668{
5669	pd_entry_t *pde;
5670	pt_entry_t *pte, PG_V;
5671	boolean_t rv;
5672
5673	PG_V = pmap_valid_bit(pmap);
5674	rv = FALSE;
5675	PMAP_LOCK(pmap);
5676	pde = pmap_pde(pmap, addr);
5677	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5678		pte = pmap_pde_to_pte(pde, addr);
5679		rv = (*pte & PG_V) == 0;
5680	}
5681	PMAP_UNLOCK(pmap);
5682	return (rv);
5683}
5684
5685/*
5686 *	pmap_is_referenced:
5687 *
5688 *	Return whether or not the specified physical page was referenced
5689 *	in any physical maps.
5690 */
5691boolean_t
5692pmap_is_referenced(vm_page_t m)
5693{
5694
5695	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5696	    ("pmap_is_referenced: page %p is not managed", m));
5697	return (pmap_page_test_mappings(m, TRUE, FALSE));
5698}
5699
5700/*
5701 * Clear the write and modified bits in each of the given page's mappings.
5702 */
5703void
5704pmap_remove_write(vm_page_t m)
5705{
5706	struct md_page *pvh;
5707	pmap_t pmap;
5708	struct rwlock *lock;
5709	pv_entry_t next_pv, pv;
5710	pd_entry_t *pde;
5711	pt_entry_t oldpte, *pte, PG_M, PG_RW;
5712	vm_offset_t va;
5713	int pvh_gen, md_gen;
5714
5715	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5716	    ("pmap_remove_write: page %p is not managed", m));
5717
5718	/*
5719	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5720	 * set by another thread while the object is locked.  Thus,
5721	 * if PGA_WRITEABLE is clear, no page table entries need updating.
5722	 */
5723	VM_OBJECT_ASSERT_WLOCKED(m->object);
5724	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5725		return;
5726	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5727	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5728	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
5729retry_pv_loop:
5730	rw_wlock(lock);
5731	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5732		pmap = PV_PMAP(pv);
5733		if (!PMAP_TRYLOCK(pmap)) {
5734			pvh_gen = pvh->pv_gen;
5735			rw_wunlock(lock);
5736			PMAP_LOCK(pmap);
5737			rw_wlock(lock);
5738			if (pvh_gen != pvh->pv_gen) {
5739				PMAP_UNLOCK(pmap);
5740				rw_wunlock(lock);
5741				goto retry_pv_loop;
5742			}
5743		}
5744		PG_RW = pmap_rw_bit(pmap);
5745		va = pv->pv_va;
5746		pde = pmap_pde(pmap, va);
5747		if ((*pde & PG_RW) != 0)
5748			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5749		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5750		    ("inconsistent pv lock %p %p for page %p",
5751		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5752		PMAP_UNLOCK(pmap);
5753	}
5754	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5755		pmap = PV_PMAP(pv);
5756		if (!PMAP_TRYLOCK(pmap)) {
5757			pvh_gen = pvh->pv_gen;
5758			md_gen = m->md.pv_gen;
5759			rw_wunlock(lock);
5760			PMAP_LOCK(pmap);
5761			rw_wlock(lock);
5762			if (pvh_gen != pvh->pv_gen ||
5763			    md_gen != m->md.pv_gen) {
5764				PMAP_UNLOCK(pmap);
5765				rw_wunlock(lock);
5766				goto retry_pv_loop;
5767			}
5768		}
5769		PG_M = pmap_modified_bit(pmap);
5770		PG_RW = pmap_rw_bit(pmap);
5771		pde = pmap_pde(pmap, pv->pv_va);
5772		KASSERT((*pde & PG_PS) == 0,
5773		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
5774		    m));
5775		pte = pmap_pde_to_pte(pde, pv->pv_va);
5776retry:
5777		oldpte = *pte;
5778		if (oldpte & PG_RW) {
5779			if (!atomic_cmpset_long(pte, oldpte, oldpte &
5780			    ~(PG_RW | PG_M)))
5781				goto retry;
5782			if ((oldpte & PG_M) != 0)
5783				vm_page_dirty(m);
5784			pmap_invalidate_page(pmap, pv->pv_va);
5785		}
5786		PMAP_UNLOCK(pmap);
5787	}
5788	rw_wunlock(lock);
5789	vm_page_aflag_clear(m, PGA_WRITEABLE);
5790	pmap_delayed_invl_wait(m);
5791}
5792
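/*
 * Returns TRUE if the accessed bit in the given PTE may be cleared safely.
 * For pmaps that emulate the accessed and dirty bits (EPT), clearing
 * EPT_PG_READ on certain PTEs would create an invalid XWR combination, so
 * such PTEs are reported as unsafe and are handled by demotion or removal
 * instead.
 */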
5793static __inline boolean_t
5794safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
5795{
5796
5797	if (!pmap_emulate_ad_bits(pmap))
5798		return (TRUE);
5799
5800	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
5801
5802	/*
5803	 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration,
5804	 * so we don't allow the referenced (aka EPT_PG_READ) bit to be
5805	 * cleared if the EPT_PG_WRITE bit is set.
5806	 */
5807	if ((pte & EPT_PG_WRITE) != 0)
5808		return (FALSE);
5809
5810	/*
5811	 * XWR = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
5812	 */
5813	if ((pte & EPT_PG_EXECUTE) == 0 ||
5814	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
5815		return (TRUE);
5816	else
5817		return (FALSE);
5818}
5819
5820#define	PMAP_TS_REFERENCED_MAX	5
5821
5822/*
5823 *	pmap_ts_referenced:
5824 *
5825 *	Return a count of reference bits for a page, clearing those bits.
5826 *	It is not necessary for every reference bit to be cleared, but it
5827 *	is necessary that 0 only be returned when there are truly no
5828 *	reference bits set.
5829 *
5830 *	XXX: The exact number of bits to check and clear is a matter that
5831 *	should be tested and standardized at some point in the future for
5832 *	optimal aging of shared pages.
5833 *
5834 *	As an optimization, update the page's dirty field if a modified bit is
5835 *	found while counting reference bits.  This opportunistic update can be
5836 *	performed at low cost and can eliminate the need for some future calls
5837 *	to pmap_is_modified().  However, since this function stops after
5838 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
5839 *	dirty pages.  Those dirty pages will only be detected by a future call
5840 *	to pmap_is_modified().
5841 *
5842 *	A DI block is not needed within this function, because
5843 *	invalidations are performed before the PV list lock is
5844 *	released.
5845 */
5846int
5847pmap_ts_referenced(vm_page_t m)
5848{
5849	struct md_page *pvh;
5850	pv_entry_t pv, pvf;
5851	pmap_t pmap;
5852	struct rwlock *lock;
5853	pd_entry_t oldpde, *pde;
5854	pt_entry_t *pte, PG_A, PG_M, PG_RW;
5855	vm_offset_t va;
5856	vm_paddr_t pa;
5857	int cleared, md_gen, not_cleared, pvh_gen;
5858	struct spglist free;
5859	boolean_t demoted;
5860
5861	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5862	    ("pmap_ts_referenced: page %p is not managed", m));
5863	SLIST_INIT(&free);
5864	cleared = 0;
5865	pa = VM_PAGE_TO_PHYS(m);
5866	lock = PHYS_TO_PV_LIST_LOCK(pa);
5867	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
5868	rw_wlock(lock);
5869retry:
5870	not_cleared = 0;
5871	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5872		goto small_mappings;
5873	pv = pvf;
5874	do {
5875		if (pvf == NULL)
5876			pvf = pv;
5877		pmap = PV_PMAP(pv);
5878		if (!PMAP_TRYLOCK(pmap)) {
5879			pvh_gen = pvh->pv_gen;
5880			rw_wunlock(lock);
5881			PMAP_LOCK(pmap);
5882			rw_wlock(lock);
5883			if (pvh_gen != pvh->pv_gen) {
5884				PMAP_UNLOCK(pmap);
5885				goto retry;
5886			}
5887		}
5888		PG_A = pmap_accessed_bit(pmap);
5889		PG_M = pmap_modified_bit(pmap);
5890		PG_RW = pmap_rw_bit(pmap);
5891		va = pv->pv_va;
5892		pde = pmap_pde(pmap, pv->pv_va);
5893		oldpde = *pde;
5894		if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5895			/*
5896			 * Although "oldpde" maps a 2MB page, this function is
5897			 * called at a 4KB page granularity, so we only update
5898			 * the dirty field of the 4KB page under test.
5899			 */
5900			vm_page_dirty(m);
5901		}
5902		if ((*pde & PG_A) != 0) {
5903			/*
5904			 * Since this reference bit is shared by 512 4KB
5905			 * pages, it should not be cleared every time it is
5906			 * tested.  Apply a simple "hash" function on the
5907			 * physical page number, the virtual superpage number,
5908			 * and the pmap address to select one 4KB page out of
5909			 * the 512 on which testing the reference bit will
5910			 * result in clearing that reference bit.  This
5911			 * function is designed to avoid the selection of the
5912			 * same 4KB page for every 2MB page mapping.
5913			 *
5914			 * On demotion, a mapping that hasn't been referenced
5915			 * is simply destroyed.  To avoid the possibility of a
5916			 * subsequent page fault on a demoted wired mapping,
5917			 * always leave its reference bit set.  Moreover,
5918			 * since the superpage is wired, the current state of
5919			 * its reference bit won't affect page replacement.
5920			 */
5921			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
5922			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
5923			    (*pde & PG_W) == 0) {
5924				if (safe_to_clear_referenced(pmap, oldpde)) {
5925					atomic_clear_long(pde, PG_A);
5926					pmap_invalidate_page(pmap, pv->pv_va);
5927					demoted = FALSE;
5928				} else if (pmap_demote_pde_locked(pmap, pde,
5929				    pv->pv_va, &lock)) {
5930					/*
5931					 * Remove the mapping to a single page
5932					 * so that a subsequent access may
5933					 * repromote.  Since the underlying
5934					 * page table page is fully populated,
5935					 * this removal never frees a page
5936					 * table page.
5937					 */
5938					demoted = TRUE;
5939					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5940					    PG_PS_FRAME);
5941					pte = pmap_pde_to_pte(pde, va);
5942					pmap_remove_pte(pmap, pte, va, *pde,
5943					    NULL, &lock);
5944					pmap_invalidate_page(pmap, va);
5945				} else
5946					demoted = TRUE;
5947
5948				if (demoted) {
5949					/*
5950					 * The superpage mapping was removed
5951					 * entirely and therefore 'pv' is no
5952					 * longer valid.
5953					 */
5954					if (pvf == pv)
5955						pvf = NULL;
5956					pv = NULL;
5957				}
5958				cleared++;
5959				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5960				    ("inconsistent pv lock %p %p for page %p",
5961				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5962			} else
5963				not_cleared++;
5964		}
5965		PMAP_UNLOCK(pmap);
5966		/* Rotate the PV list if it has more than one entry. */
5967		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5968			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5969			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5970			pvh->pv_gen++;
5971		}
5972		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5973			goto out;
5974	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5975small_mappings:
5976	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5977		goto out;
5978	pv = pvf;
5979	do {
5980		if (pvf == NULL)
5981			pvf = pv;
5982		pmap = PV_PMAP(pv);
5983		if (!PMAP_TRYLOCK(pmap)) {
5984			pvh_gen = pvh->pv_gen;
5985			md_gen = m->md.pv_gen;
5986			rw_wunlock(lock);
5987			PMAP_LOCK(pmap);
5988			rw_wlock(lock);
5989			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5990				PMAP_UNLOCK(pmap);
5991				goto retry;
5992			}
5993		}
5994		PG_A = pmap_accessed_bit(pmap);
5995		PG_M = pmap_modified_bit(pmap);
5996		PG_RW = pmap_rw_bit(pmap);
5997		pde = pmap_pde(pmap, pv->pv_va);
5998		KASSERT((*pde & PG_PS) == 0,
5999		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
6000		    m));
6001		pte = pmap_pde_to_pte(pde, pv->pv_va);
6002		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6003			vm_page_dirty(m);
6004		if ((*pte & PG_A) != 0) {
6005			if (safe_to_clear_referenced(pmap, *pte)) {
6006				atomic_clear_long(pte, PG_A);
6007				pmap_invalidate_page(pmap, pv->pv_va);
6008				cleared++;
6009			} else if ((*pte & PG_W) == 0) {
6010				/*
6011				 * Wired pages cannot be paged out, so
6012				 * doing accessed bit emulation for
6013				 * them is wasted effort. We do the
6014				 * hard work for unwired pages only.
6015				 */
6016				pmap_remove_pte(pmap, pte, pv->pv_va,
6017				    *pde, &free, &lock);
6018				pmap_invalidate_page(pmap, pv->pv_va);
6019				cleared++;
6020				if (pvf == pv)
6021					pvf = NULL;
6022				pv = NULL;
6023				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6024				    ("inconsistent pv lock %p %p for page %p",
6025				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6026			} else
6027				not_cleared++;
6028		}
6029		PMAP_UNLOCK(pmap);
6030		/* Rotate the PV list if it has more than one entry. */
6031		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6032			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6033			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
6034			m->md.pv_gen++;
6035		}
6036	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
6037	    not_cleared < PMAP_TS_REFERENCED_MAX);
6038out:
6039	rw_wunlock(lock);
6040	pmap_free_zero_pages(&free);
6041	return (cleared + not_cleared);
6042}
6043
6044/*
6045 *	Apply the given advice to the specified range of addresses within the
6046 *	given pmap.  Depending on the advice, clear the referenced and/or
6047 *	modified flags in each mapping and set the mapped page's dirty field.
6048 */
6049void
6050pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
6051{
6052	struct rwlock *lock;
6053	pml4_entry_t *pml4e;
6054	pdp_entry_t *pdpe;
6055	pd_entry_t oldpde, *pde;
6056	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
6057	vm_offset_t va_next;
6058	vm_page_t m;
6059	boolean_t anychanged;
6060
6061	if (advice != MADV_DONTNEED && advice != MADV_FREE)
6062		return;
6063
6064	/*
6065	 * A/D bit emulation requires an alternate code path when clearing
6066	 * the modified and accessed bits below. Since this function is
6067	 * advisory in nature, we skip it entirely for pmaps that require
6068	 * A/D bit emulation.
6069	 */
6070	if (pmap_emulate_ad_bits(pmap))
6071		return;
6072
6073	PG_A = pmap_accessed_bit(pmap);
6074	PG_G = pmap_global_bit(pmap);
6075	PG_M = pmap_modified_bit(pmap);
6076	PG_V = pmap_valid_bit(pmap);
6077	PG_RW = pmap_rw_bit(pmap);
6078	anychanged = FALSE;
6079	pmap_delayed_invl_started();
6080	PMAP_LOCK(pmap);
6081	for (; sva < eva; sva = va_next) {
6082		pml4e = pmap_pml4e(pmap, sva);
6083		if ((*pml4e & PG_V) == 0) {
6084			va_next = (sva + NBPML4) & ~PML4MASK;
6085			if (va_next < sva)
6086				va_next = eva;
6087			continue;
6088		}
6089		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6090		if ((*pdpe & PG_V) == 0) {
6091			va_next = (sva + NBPDP) & ~PDPMASK;
6092			if (va_next < sva)
6093				va_next = eva;
6094			continue;
6095		}
6096		va_next = (sva + NBPDR) & ~PDRMASK;
6097		if (va_next < sva)
6098			va_next = eva;
6099		pde = pmap_pdpe_to_pde(pdpe, sva);
6100		oldpde = *pde;
6101		if ((oldpde & PG_V) == 0)
6102			continue;
6103		else if ((oldpde & PG_PS) != 0) {
6104			if ((oldpde & PG_MANAGED) == 0)
6105				continue;
6106			lock = NULL;
6107			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
6108				if (lock != NULL)
6109					rw_wunlock(lock);
6110
6111				/*
6112				 * The large page mapping was destroyed.
6113				 */
6114				continue;
6115			}
6116
6117			/*
6118			 * Unless the page mappings are wired, remove the
6119			 * mapping to a single page so that a subsequent
6120			 * access may repromote.  Since the underlying page
6121			 * table page is fully populated, this removal never
6122			 * frees a page table page.
6123			 */
6124			if ((oldpde & PG_W) == 0) {
6125				pte = pmap_pde_to_pte(pde, sva);
6126				KASSERT((*pte & PG_V) != 0,
6127				    ("pmap_advise: invalid PTE"));
6128				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
6129				    &lock);
6130				anychanged = TRUE;
6131			}
6132			if (lock != NULL)
6133				rw_wunlock(lock);
6134		}
6135		if (va_next > eva)
6136			va_next = eva;
6137		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6138		    sva += PAGE_SIZE) {
6139			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
6140			    PG_V))
6141				continue;
6142			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6143				if (advice == MADV_DONTNEED) {
6144					/*
6145					 * Future calls to pmap_is_modified()
6146					 * can be avoided by making the page
6147					 * dirty now.
6148					 */
6149					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6150					vm_page_dirty(m);
6151				}
6152				atomic_clear_long(pte, PG_M | PG_A);
6153			} else if ((*pte & PG_A) != 0)
6154				atomic_clear_long(pte, PG_A);
6155			else
6156				continue;
6157			if ((*pte & PG_G) != 0)
6158				pmap_invalidate_page(pmap, sva);
6159			else
6160				anychanged = TRUE;
6161		}
6162	}
6163	if (anychanged)
6164		pmap_invalidate_all(pmap);
6165	PMAP_UNLOCK(pmap);
6166	pmap_delayed_invl_finished();
6167}
6168
6169/*
6170 *	Clear the modify bits on the specified physical page.
6171 */
6172void
6173pmap_clear_modify(vm_page_t m)
6174{
6175	struct md_page *pvh;
6176	pmap_t pmap;
6177	pv_entry_t next_pv, pv;
6178	pd_entry_t oldpde, *pde;
6179	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
6180	struct rwlock *lock;
6181	vm_offset_t va;
6182	int md_gen, pvh_gen;
6183
6184	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6185	    ("pmap_clear_modify: page %p is not managed", m));
6186	VM_OBJECT_ASSERT_WLOCKED(m->object);
6187	KASSERT(!vm_page_xbusied(m),
6188	    ("pmap_clear_modify: page %p is exclusive busied", m));
6189
6190	/*
6191	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6192	 * If the object containing the page is locked and the page is not
6193	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6194	 */
6195	if ((m->aflags & PGA_WRITEABLE) == 0)
6196		return;
6197	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6198	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
6199	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6200	rw_wlock(lock);
6201restart:
6202	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6203		pmap = PV_PMAP(pv);
6204		if (!PMAP_TRYLOCK(pmap)) {
6205			pvh_gen = pvh->pv_gen;
6206			rw_wunlock(lock);
6207			PMAP_LOCK(pmap);
6208			rw_wlock(lock);
6209			if (pvh_gen != pvh->pv_gen) {
6210				PMAP_UNLOCK(pmap);
6211				goto restart;
6212			}
6213		}
6214		PG_M = pmap_modified_bit(pmap);
6215		PG_V = pmap_valid_bit(pmap);
6216		PG_RW = pmap_rw_bit(pmap);
6217		va = pv->pv_va;
6218		pde = pmap_pde(pmap, va);
6219		oldpde = *pde;
6220		if ((oldpde & PG_RW) != 0) {
6221			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6222				if ((oldpde & PG_W) == 0) {
6223					/*
6224					 * Write protect the mapping to a
6225					 * single page so that a subsequent
6226					 * write access may repromote.
6227					 */
6228					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6229					    PG_PS_FRAME);
6230					pte = pmap_pde_to_pte(pde, va);
6231					oldpte = *pte;
6232					if ((oldpte & PG_V) != 0) {
6233						while (!atomic_cmpset_long(pte,
6234						    oldpte,
6235						    oldpte & ~(PG_M | PG_RW)))
6236							oldpte = *pte;
6237						vm_page_dirty(m);
6238						pmap_invalidate_page(pmap, va);
6239					}
6240				}
6241			}
6242		}
6243		PMAP_UNLOCK(pmap);
6244	}
6245	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6246		pmap = PV_PMAP(pv);
6247		if (!PMAP_TRYLOCK(pmap)) {
6248			md_gen = m->md.pv_gen;
6249			pvh_gen = pvh->pv_gen;
6250			rw_wunlock(lock);
6251			PMAP_LOCK(pmap);
6252			rw_wlock(lock);
6253			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6254				PMAP_UNLOCK(pmap);
6255				goto restart;
6256			}
6257		}
6258		PG_M = pmap_modified_bit(pmap);
6259		PG_RW = pmap_rw_bit(pmap);
6260		pde = pmap_pde(pmap, pv->pv_va);
6261		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6262		    " a 2mpage in page %p's pv list", m));
6263		pte = pmap_pde_to_pte(pde, pv->pv_va);
6264		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6265			atomic_clear_long(pte, PG_M);
6266			pmap_invalidate_page(pmap, pv->pv_va);
6267		}
6268		PMAP_UNLOCK(pmap);
6269	}
6270	rw_wunlock(lock);
6271}
6272
6273/*
6274 * Miscellaneous support routines follow
6275 */
6276
6277/* Adjust the cache mode for a 4KB page mapped via a PTE. */
6278static __inline void
6279pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6280{
6281	u_int opte, npte;
6282
6283	/*
6284	 * The cache mode bits are all in the low 32 bits of the
6285	 * PTE, so we can just spin on updating the low 32 bits.
6286	 */
6287	do {
6288		opte = *(u_int *)pte;
6289		npte = opte & ~mask;
6290		npte |= cache_bits;
6291	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6292}
6293
6294/* Adjust the cache mode for a 2MB page mapped via a PDE. */
6295static __inline void
6296pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6297{
6298	u_int opde, npde;
6299
6300	/*
6301	 * The cache mode bits are all in the low 32 bits of the
6302	 * PDE, so we can just spin on updating the low 32 bits.
6303	 */
6304	do {
6305		opde = *(u_int *)pde;
6306		npde = opde & ~mask;
6307		npde |= cache_bits;
6308	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6309}
6310
6311/*
6312 * Map a set of physical memory pages into the kernel virtual
6313 * address space. Return a pointer to where they are mapped. This
6314 * routine is intended to be used for mapping device memory,
6315 * NOT real memory.
6316 */
6317void *
6318pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6319{
6320	struct pmap_preinit_mapping *ppim;
6321	vm_offset_t va, offset;
6322	vm_size_t tmpsize;
6323	int i;
6324
6325	offset = pa & PAGE_MASK;
6326	size = round_page(offset + size);
6327	pa = trunc_page(pa);
6328
6329	if (!pmap_initialized) {
6330		va = 0;
6331		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6332			ppim = pmap_preinit_mapping + i;
6333			if (ppim->va == 0) {
6334				ppim->pa = pa;
6335				ppim->sz = size;
6336				ppim->mode = mode;
6337				ppim->va = virtual_avail;
6338				virtual_avail += size;
6339				va = ppim->va;
6340				break;
6341			}
6342		}
6343		if (va == 0)
6344			panic("%s: too many preinit mappings", __func__);
6345	} else {
6346		/*
6347		 * If we have a preinit mapping, re-use it.
6348		 */
6349		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6350			ppim = pmap_preinit_mapping + i;
6351			if (ppim->pa == pa && ppim->sz == size &&
6352			    ppim->mode == mode)
6353				return ((void *)(ppim->va + offset));
6354		}
6355		/*
6356		 * If the specified range of physical addresses fits within
6357		 * the direct map window, use the direct map.
6358		 */
6359		if (pa < dmaplimit && pa + size < dmaplimit) {
6360			va = PHYS_TO_DMAP(pa);
6361			if (!pmap_change_attr(va, size, mode))
6362				return ((void *)(va + offset));
6363		}
6364		va = kva_alloc(size);
6365		if (va == 0)
6366			panic("%s: Couldn't allocate KVA", __func__);
6367	}
6368	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6369		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6370	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6371	pmap_invalidate_cache_range(va, va + tmpsize, FALSE);
6372	return ((void *)(va + offset));
6373}
6374
6375void *
6376pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6377{
6378
6379	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6380}
6381
6382void *
6383pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6384{
6385
6386	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6387}
6388
6389void
6390pmap_unmapdev(vm_offset_t va, vm_size_t size)
6391{
6392	struct pmap_preinit_mapping *ppim;
6393	vm_offset_t offset;
6394	int i;
6395
6396	/* If we gave out a direct map address in pmap_mapdev(), do nothing. */
6397	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6398		return;
6399	offset = va & PAGE_MASK;
6400	size = round_page(offset + size);
6401	va = trunc_page(va);
6402	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6403		ppim = pmap_preinit_mapping + i;
6404		if (ppim->va == va && ppim->sz == size) {
6405			if (pmap_initialized)
6406				return;
6407			ppim->pa = 0;
6408			ppim->va = 0;
6409			ppim->sz = 0;
6410			ppim->mode = 0;
6411			if (va + size == virtual_avail)
6412				virtual_avail = va;
6413			return;
6414		}
6415	}
6416	if (pmap_initialized)
6417		kva_free(va, size);
6418}
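
/*
 * Illustrative sketch only (not referenced by the kernel): a device driver
 * that needs an uncacheable mapping of a register window could pair
 * pmap_mapdev_attr() with pmap_unmapdev() as shown below.  The function
 * name, the register read, and the one-page size are made up for the
 * example.
 */
#if 0
static void
example_map_regs(vm_paddr_t bar_pa)
{
	volatile uint32_t *regs;

	/* Map one page of device registers with the UC memory type. */
	regs = pmap_mapdev_attr(bar_pa, PAGE_SIZE, PAT_UNCACHEABLE);

	/* Read the (hypothetical) device register at offset 0. */
	(void)regs[0];

	/* Release the mapping; a direct map address is simply ignored. */
	pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
}
#endif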
6419
6420/*
6421 * Tries to demote a 1GB page mapping.
6422 */
6423static boolean_t
6424pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6425{
6426	pdp_entry_t newpdpe, oldpdpe;
6427	pd_entry_t *firstpde, newpde, *pde;
6428	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6429	vm_paddr_t mpdepa;
6430	vm_page_t mpde;
6431
6432	PG_A = pmap_accessed_bit(pmap);
6433	PG_M = pmap_modified_bit(pmap);
6434	PG_V = pmap_valid_bit(pmap);
6435	PG_RW = pmap_rw_bit(pmap);
6436
6437	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6438	oldpdpe = *pdpe;
6439	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6440	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6441	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6442	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6443		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6444		    " in pmap %p", va, pmap);
6445		return (FALSE);
6446	}
6447	mpdepa = VM_PAGE_TO_PHYS(mpde);
6448	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
6449	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6450	KASSERT((oldpdpe & PG_A) != 0,
6451	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6452	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6453	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6454	newpde = oldpdpe;
6455
6456	/*
6457	 * Initialize the page directory page.
6458	 */
6459	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6460		*pde = newpde;
6461		newpde += NBPDR;
6462	}
6463
6464	/*
6465	 * Demote the mapping.
6466	 */
6467	*pdpe = newpdpe;
6468
6469	/*
6470	 * Invalidate a stale recursive mapping of the page directory page.
6471	 */
6472	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6473
6474	pmap_pdpe_demotions++;
6475	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6476	    " in pmap %p", va, pmap);
6477	return (TRUE);
6478}
6479
6480/*
6481 * Sets the memory attribute for the specified page.
6482 */
6483void
6484pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6485{
6486
6487	m->md.pat_mode = ma;
6488
6489	/*
6490	 * If "m" is a normal page, update its direct mapping.  This update
6491	 * can be relied upon to perform any cache operations that are
6492	 * required for data coherence.
6493	 */
6494	if ((m->flags & PG_FICTITIOUS) == 0 &&
6495	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6496	    m->md.pat_mode))
6497		panic("memory attribute change on the direct map failed");
6498}
6499
6500/*
6501 * Changes the specified virtual address range's memory type to that given by
6502 * the parameter "mode".  The specified virtual address range must be
6503 * completely contained within either the direct map or the kernel map.  If
6504 * the virtual address range is contained within the kernel map, then the
6505 * memory type for each of the corresponding ranges of the direct map is also
6506 * changed.  (The corresponding ranges of the direct map are those ranges that
6507 * map the same physical pages as the specified virtual address range.)  These
6508 * changes to the direct map are necessary because Intel describes the
6509 * behavior of their processors as "undefined" if two or more mappings to the
6510 * same physical page have different memory types.
6511 *
6512 * Returns zero if the change completed successfully, and either EINVAL or
6513 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6514 * of the virtual address range was not mapped, and ENOMEM is returned if
6515 * there was insufficient memory available to complete the change.  In the
6516 * latter case, the memory type may have been changed on some part of the
6517 * virtual address range or the direct map.
6518 */
6519int
6520pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6521{
6522	int error;
6523
6524	PMAP_LOCK(kernel_pmap);
6525	error = pmap_change_attr_locked(va, size, mode);
6526	PMAP_UNLOCK(kernel_pmap);
6527	return (error);
6528}
6529
6530static int
6531pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6532{
6533	vm_offset_t base, offset, tmpva;
6534	vm_paddr_t pa_start, pa_end, pa_end1;
6535	pdp_entry_t *pdpe;
6536	pd_entry_t *pde;
6537	pt_entry_t *pte;
6538	int cache_bits_pte, cache_bits_pde, error;
6539	boolean_t changed;
6540
6541	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6542	base = trunc_page(va);
6543	offset = va & PAGE_MASK;
6544	size = round_page(offset + size);
6545
6546	/*
6547	 * Only supported on kernel virtual addresses, including the direct
6548	 * map but excluding the recursive map.
6549	 */
6550	if (base < DMAP_MIN_ADDRESS)
6551		return (EINVAL);
6552
6553	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
6554	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6555	changed = FALSE;
6556
6557	/*
6558	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6559	 * into 4KB pages if required.
6560	 */
6561	for (tmpva = base; tmpva < base + size; ) {
6562		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6563		if (pdpe == NULL || *pdpe == 0)
6564			return (EINVAL);
6565		if (*pdpe & PG_PS) {
6566			/*
6567			 * If the current 1GB page already has the required
6568			 * memory type, then we need not demote this page. Just
6569			 * increment tmpva to the next 1GB page frame.
6570			 */
6571			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6572				tmpva = trunc_1gpage(tmpva) + NBPDP;
6573				continue;
6574			}
6575
6576			/*
6577			 * If the current offset aligns with a 1GB page frame
6578			 * and there is at least 1GB left within the range, then
6579			 * we need not break down this page into 2MB pages.
6580			 */
6581			if ((tmpva & PDPMASK) == 0 &&
6582			    tmpva + PDPMASK < base + size) {
6583				tmpva += NBPDP;
6584				continue;
6585			}
6586			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6587				return (ENOMEM);
6588		}
6589		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6590		if (*pde == 0)
6591			return (EINVAL);
6592		if (*pde & PG_PS) {
6593			/*
6594			 * If the current 2MB page already has the required
6595			 * memory type, then we need not demote this page. Just
6596			 * increment tmpva to the next 2MB page frame.
6597			 */
6598			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6599				tmpva = trunc_2mpage(tmpva) + NBPDR;
6600				continue;
6601			}
6602
6603			/*
6604			 * If the current offset aligns with a 2MB page frame
6605			 * and there is at least 2MB left within the range, then
6606			 * we need not break down this page into 4KB pages.
6607			 */
6608			if ((tmpva & PDRMASK) == 0 &&
6609			    tmpva + PDRMASK < base + size) {
6610				tmpva += NBPDR;
6611				continue;
6612			}
6613			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6614				return (ENOMEM);
6615		}
6616		pte = pmap_pde_to_pte(pde, tmpva);
6617		if (*pte == 0)
6618			return (EINVAL);
6619		tmpva += PAGE_SIZE;
6620	}
6621	error = 0;
6622
6623	/*
6624	 * Ok, all the pages exist, so run through them updating their
6625	 * cache mode if required.
6626	 */
6627	pa_start = pa_end = 0;
6628	for (tmpva = base; tmpva < base + size; ) {
6629		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6630		if (*pdpe & PG_PS) {
6631			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6632				pmap_pde_attr(pdpe, cache_bits_pde,
6633				    X86_PG_PDE_CACHE);
6634				changed = TRUE;
6635			}
6636			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6637			    (*pdpe & PG_PS_FRAME) < dmaplimit) {
6638				if (pa_start == pa_end) {
6639					/* Start physical address run. */
6640					pa_start = *pdpe & PG_PS_FRAME;
6641					pa_end = pa_start + NBPDP;
6642				} else if (pa_end == (*pdpe & PG_PS_FRAME))
6643					pa_end += NBPDP;
6644				else {
6645					/* Run ended, update direct map. */
6646					error = pmap_change_attr_locked(
6647					    PHYS_TO_DMAP(pa_start),
6648					    pa_end - pa_start, mode);
6649					if (error != 0)
6650						break;
6651					/* Start physical address run. */
6652					pa_start = *pdpe & PG_PS_FRAME;
6653					pa_end = pa_start + NBPDP;
6654				}
6655			}
6656			tmpva = trunc_1gpage(tmpva) + NBPDP;
6657			continue;
6658		}
6659		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6660		if (*pde & PG_PS) {
6661			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6662				pmap_pde_attr(pde, cache_bits_pde,
6663				    X86_PG_PDE_CACHE);
6664				changed = TRUE;
6665			}
6666			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6667			    (*pde & PG_PS_FRAME) < dmaplimit) {
6668				if (pa_start == pa_end) {
6669					/* Start physical address run. */
6670					pa_start = *pde & PG_PS_FRAME;
6671					pa_end = pa_start + NBPDR;
6672				} else if (pa_end == (*pde & PG_PS_FRAME))
6673					pa_end += NBPDR;
6674				else {
6675					/* Run ended, update direct map. */
6676					error = pmap_change_attr_locked(
6677					    PHYS_TO_DMAP(pa_start),
6678					    pa_end - pa_start, mode);
6679					if (error != 0)
6680						break;
6681					/* Start physical address run. */
6682					pa_start = *pde & PG_PS_FRAME;
6683					pa_end = pa_start + NBPDR;
6684				}
6685			}
6686			tmpva = trunc_2mpage(tmpva) + NBPDR;
6687		} else {
6688			pte = pmap_pde_to_pte(pde, tmpva);
6689			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6690				pmap_pte_attr(pte, cache_bits_pte,
6691				    X86_PG_PTE_CACHE);
6692				changed = TRUE;
6693			}
6694			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6695			    (*pte & PG_PS_FRAME) < dmaplimit) {
6696				if (pa_start == pa_end) {
6697					/* Start physical address run. */
6698					pa_start = *pte & PG_FRAME;
6699					pa_end = pa_start + PAGE_SIZE;
6700				} else if (pa_end == (*pte & PG_FRAME))
6701					pa_end += PAGE_SIZE;
6702				else {
6703					/* Run ended, update direct map. */
6704					error = pmap_change_attr_locked(
6705					    PHYS_TO_DMAP(pa_start),
6706					    pa_end - pa_start, mode);
6707					if (error != 0)
6708						break;
6709					/* Start physical address run. */
6710					pa_start = *pte & PG_FRAME;
6711					pa_end = pa_start + PAGE_SIZE;
6712				}
6713			}
6714			tmpva += PAGE_SIZE;
6715		}
6716	}
6717	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
6718		pa_end1 = MIN(pa_end, dmaplimit);
6719		if (pa_start != pa_end1)
6720			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6721			    pa_end1 - pa_start, mode);
6722	}
6723
6724	/*
6725	 * Flush CPU caches if required, to make sure that no stale data
6726	 * remains cached for the updated range.
6727	 */
6728	if (changed) {
6729		pmap_invalidate_range(kernel_pmap, base, tmpva);
6730		pmap_invalidate_cache_range(base, tmpva, FALSE);
6731	}
6732	return (error);
6733}
6734
6735/*
6736 * Demotes any mapping within the direct map region that covers more than the
6737 * specified range of physical addresses.  This range's size must be a power
6738 * of two and its starting address must be a multiple of its size.  Since the
6739 * demotion does not change any attributes of the mapping, a TLB invalidation
6740 * is not mandatory.  The caller may, however, request a TLB invalidation.
6741 */
6742void
6743pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6744{
6745	pdp_entry_t *pdpe;
6746	pd_entry_t *pde;
6747	vm_offset_t va;
6748	boolean_t changed;
6749
6750	if (len == 0)
6751		return;
6752	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6753	KASSERT((base & (len - 1)) == 0,
6754	    ("pmap_demote_DMAP: base is not a multiple of len"));
6755	if (len < NBPDP && base < dmaplimit) {
6756		va = PHYS_TO_DMAP(base);
6757		changed = FALSE;
6758		PMAP_LOCK(kernel_pmap);
6759		pdpe = pmap_pdpe(kernel_pmap, va);
6760		if ((*pdpe & X86_PG_V) == 0)
6761			panic("pmap_demote_DMAP: invalid PDPE");
6762		if ((*pdpe & PG_PS) != 0) {
6763			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6764				panic("pmap_demote_DMAP: PDPE failed");
6765			changed = TRUE;
6766		}
6767		if (len < NBPDR) {
6768			pde = pmap_pdpe_to_pde(pdpe, va);
6769			if ((*pde & X86_PG_V) == 0)
6770				panic("pmap_demote_DMAP: invalid PDE");
6771			if ((*pde & PG_PS) != 0) {
6772				if (!pmap_demote_pde(kernel_pmap, pde, va))
6773					panic("pmap_demote_DMAP: PDE failed");
6774				changed = TRUE;
6775			}
6776		}
6777		if (changed && invalidate)
6778			pmap_invalidate_page(kernel_pmap, va);
6779		PMAP_UNLOCK(kernel_pmap);
6780	}
6781}
6782
6783/*
6784 * Perform the pmap work for mincore(2).
6785 */
6786int
6787pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
6788{
6789	pd_entry_t *pdep;
6790	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
6791	vm_paddr_t pa;
6792	int val;
6793
6794	PG_A = pmap_accessed_bit(pmap);
6795	PG_M = pmap_modified_bit(pmap);
6796	PG_V = pmap_valid_bit(pmap);
6797	PG_RW = pmap_rw_bit(pmap);
6798
6799	PMAP_LOCK(pmap);
6800retry:
6801	pdep = pmap_pde(pmap, addr);
6802	if (pdep != NULL && (*pdep & PG_V)) {
6803		if (*pdep & PG_PS) {
6804			pte = *pdep;
6805			/* Compute the physical address of the 4KB page. */
6806			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
6807			    PG_FRAME;
6808			val = MINCORE_SUPER;
6809		} else {
6810			pte = *pmap_pde_to_pte(pdep, addr);
6811			pa = pte & PG_FRAME;
6812			val = 0;
6813		}
6814	} else {
6815		pte = 0;
6816		pa = 0;
6817		val = 0;
6818	}
6819	if ((pte & PG_V) != 0) {
6820		val |= MINCORE_INCORE;
6821		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6822			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6823		if ((pte & PG_A) != 0)
6824			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6825	}
6826	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6827	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
6828	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
6829		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
6830		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
6831			goto retry;
6832	} else
6833		PA_UNLOCK_COND(*locked_pa);
6834	PMAP_UNLOCK(pmap);
6835	return (val);
6836}
6837
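/*
 * Allocate a PCID for the given pmap on the given CPU.  Returns
 * CR3_PCID_SAVE if the pmap's PCID for this CPU is still valid (or the
 * pmap uses the kernel PCID), so that the TLB entries tagged with it may
 * be reused; otherwise a fresh PCID is assigned and 0 is returned, forcing
 * a TLB flush on the next CR3 load.  When the per-CPU PCID space is
 * exhausted, the per-CPU generation count is bumped, which implicitly
 * invalidates every pmap's cached PCID on this CPU.  The caller must be in
 * a critical section.
 */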
6838static uint64_t
6839pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
6840{
6841	uint32_t gen, new_gen, pcid_next;
6842
6843	CRITICAL_ASSERT(curthread);
6844	gen = PCPU_GET(pcid_gen);
6845	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
6846	    pmap->pm_pcids[cpuid].pm_gen == gen)
6847		return (CR3_PCID_SAVE);
6848	pcid_next = PCPU_GET(pcid_next);
6849	KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x",
6850	    cpuid, pcid_next));
6851	if (pcid_next == PMAP_PCID_OVERMAX) {
6852		new_gen = gen + 1;
6853		if (new_gen == 0)
6854			new_gen = 1;
6855		PCPU_SET(pcid_gen, new_gen);
6856		pcid_next = PMAP_PCID_KERN + 1;
6857	} else {
6858		new_gen = gen;
6859	}
6860	pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
6861	pmap->pm_pcids[cpuid].pm_gen = new_gen;
6862	PCPU_SET(pcid_next, pcid_next + 1);
6863	return (0);
6864}
6865
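/*
 * Switch the current CPU to the given thread's pmap: mark the pmap active
 * on this CPU, reload %cr3 (reusing the cached PCID when possible), and
 * update the per-CPU curpmap.  The caller must prevent preemption, e.g.
 * via critical_enter() as pmap_activate() does.
 */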
6866void
6867pmap_activate_sw(struct thread *td)
6868{
6869	pmap_t oldpmap, pmap;
6870	uint64_t cached, cr3;
6871	u_int cpuid;
6872
6873	oldpmap = PCPU_GET(curpmap);
6874	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6875	if (oldpmap == pmap)
6876		return;
6877	cpuid = PCPU_GET(cpuid);
6878#ifdef SMP
6879	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6880#else
6881	CPU_SET(cpuid, &pmap->pm_active);
6882#endif
6883	cr3 = rcr3();
6884	if (pmap_pcid_enabled) {
6885		cached = pmap_pcid_alloc(pmap, cpuid);
6886		KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 &&
6887		    pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
6888		    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
6889		    pmap->pm_pcids[cpuid].pm_pcid));
6890		KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
6891		    pmap == kernel_pmap,
6892		    ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x",
6893		    td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
6894		if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) {
6895			load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
6896			    cached);
6897			if (cached)
6898				PCPU_INC(pm_save_cnt);
6899		}
6900	} else if (cr3 != pmap->pm_cr3) {
6901		load_cr3(pmap->pm_cr3);
6902	}
6903	PCPU_SET(curpmap, pmap);
6904#ifdef SMP
6905	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6906#else
6907	CPU_CLR(cpuid, &oldpmap->pm_active);
6908#endif
6909}
6910
6911void
6912pmap_activate(struct thread *td)
6913{
6914
6915	critical_enter();
6916	pmap_activate_sw(td);
6917	critical_exit();
6918}
6919
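/*
 * The amd64 instruction cache is kept coherent with respect to stores by
 * the hardware, so there is nothing for pmap_sync_icache() to do.
 */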
6920void
6921pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
6922{
6923}
6924
6925/*
6926 *	Increase the starting virtual address of the given mapping if a
6927 *	different alignment might result in more superpage mappings.
6928 */
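/*
 *	For example (illustrative numbers only), with 2MB superpages: if
 *	"offset" is 0x12345000, then superpage_offset is 0x145000, and an
 *	"*addr" of 0x80003000 would be advanced to 0x80145000 so that the
 *	mapping's object offsets and virtual addresses share the same 2MB
 *	alignment (assuming "size" is large enough to span a superpage).
 */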
6929void
6930pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6931    vm_offset_t *addr, vm_size_t size)
6932{
6933	vm_offset_t superpage_offset;
6934
6935	if (size < NBPDR)
6936		return;
6937	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6938		offset += ptoa(object->pg_color);
6939	superpage_offset = offset & PDRMASK;
6940	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
6941	    (*addr & PDRMASK) == superpage_offset)
6942		return;
6943	if ((*addr & PDRMASK) < superpage_offset)
6944		*addr = (*addr & ~PDRMASK) + superpage_offset;
6945	else
6946		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
6947}
6948
6949#ifdef INVARIANTS
6950static unsigned long num_dirty_emulations;
6951SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
6952	     &num_dirty_emulations, 0, NULL);
6953
6954static unsigned long num_accessed_emulations;
6955SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
6956	     &num_accessed_emulations, 0, NULL);
6957
6958static unsigned long num_superpage_accessed_emulations;
6959SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
6960	     &num_superpage_accessed_emulations, 0, NULL);
6961
6962static unsigned long ad_emulation_superpage_promotions;
6963SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
6964	     &ad_emulation_superpage_promotions, 0, NULL);
6965#endif	/* INVARIANTS */
6966
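/*
 * Emulate the hardware updating of the accessed and dirty bits on behalf
 * of a pmap that requires it (an EPT pmap without A/D bit support).
 * Returns 0 if the fault was resolved by setting the appropriate bit(s)
 * in the PDE or PTE, possibly also promoting the mapping, and -1 if the
 * fault cannot be resolved here and must be handled by the normal page
 * fault path.
 */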
6967int
6968pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
6969{
6970	int rv;
6971	struct rwlock *lock;
6972	vm_page_t m, mpte;
6973	pd_entry_t *pde;
6974	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
6975
6976	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
6977	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
6978
6979	if (!pmap_emulate_ad_bits(pmap))
6980		return (-1);
6981
6982	PG_A = pmap_accessed_bit(pmap);
6983	PG_M = pmap_modified_bit(pmap);
6984	PG_V = pmap_valid_bit(pmap);
6985	PG_RW = pmap_rw_bit(pmap);
6986
6987	rv = -1;
6988	lock = NULL;
6989	PMAP_LOCK(pmap);
6990
6991	pde = pmap_pde(pmap, va);
6992	if (pde == NULL || (*pde & PG_V) == 0)
6993		goto done;
6994
6995	if ((*pde & PG_PS) != 0) {
6996		if (ftype == VM_PROT_READ) {
6997#ifdef INVARIANTS
6998			atomic_add_long(&num_superpage_accessed_emulations, 1);
6999#endif
7000			*pde |= PG_A;
7001			rv = 0;
7002		}
7003		goto done;
7004	}
7005
7006	pte = pmap_pde_to_pte(pde, va);
7007	if ((*pte & PG_V) == 0)
7008		goto done;
7009
7010	if (ftype == VM_PROT_WRITE) {
7011		if ((*pte & PG_RW) == 0)
7012			goto done;
7013		/*
7014		 * Set the modified and accessed bits simultaneously.
7015		 *
7016		 * Intel EPT PTEs that do software emulation of A/D bits map
7017		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
7018		 * An EPT misconfiguration is triggered if the PTE is writable
7019		 * but not readable (WR=10). This is avoided by setting PG_A
7020		 * and PG_M simultaneously.
7021		 */
7022		*pte |= PG_M | PG_A;
7023	} else {
7024		*pte |= PG_A;
7025	}
7026
7027	/* try to promote the mapping */
7028	if (va < VM_MAXUSER_ADDRESS)
7029		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7030	else
7031		mpte = NULL;
7032
7033	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
7034
7035	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
7036	    pmap_ps_enabled(pmap) &&
7037	    (m->flags & PG_FICTITIOUS) == 0 &&
7038	    vm_reserv_level_iffullpop(m) == 0) {
7039		pmap_promote_pde(pmap, pde, va, &lock);
7040#ifdef INVARIANTS
7041		atomic_add_long(&ad_emulation_superpage_promotions, 1);
7042#endif
7043	}
7044#ifdef INVARIANTS
7045	if (ftype == VM_PROT_WRITE)
7046		atomic_add_long(&num_dirty_emulations, 1);
7047	else
7048		atomic_add_long(&num_accessed_emulations, 1);
7049#endif
7050	rv = 0;		/* success */
7051done:
7052	if (lock != NULL)
7053		rw_wunlock(lock);
7054	PMAP_UNLOCK(pmap);
7055	return (rv);
7056}
7057
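/*
 * Copy the page table entries that translate "va" (the PML4E, PDPE, PDE,
 * and, for a 4KB mapping, the PTE) into "ptr", stopping early at an
 * invalid or superpage entry, and report the number of entries copied
 * via "num".
 */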
7058void
7059pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
7060{
7061	pml4_entry_t *pml4;
7062	pdp_entry_t *pdp;
7063	pd_entry_t *pde;
7064	pt_entry_t *pte, PG_V;
7065	int idx;
7066
7067	idx = 0;
7068	PG_V = pmap_valid_bit(pmap);
7069	PMAP_LOCK(pmap);
7070
7071	pml4 = pmap_pml4e(pmap, va);
7072	ptr[idx++] = *pml4;
7073	if ((*pml4 & PG_V) == 0)
7074		goto done;
7075
7076	pdp = pmap_pml4e_to_pdpe(pml4, va);
7077	ptr[idx++] = *pdp;
7078	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
7079		goto done;
7080
7081	pde = pmap_pdpe_to_pde(pdp, va);
7082	ptr[idx++] = *pde;
7083	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
7084		goto done;
7085
7086	pte = pmap_pde_to_pte(pde, va);
7087	ptr[idx++] = *pte;
7088
7089done:
7090	PMAP_UNLOCK(pmap);
7091	*num = idx;
7092}
7093
7094/**
7095 * Get the kernel virtual address of a set of physical pages. If there are
7096 * physical addresses not covered by the DMAP, perform a transient mapping
7097 * that will be removed when calling pmap_unmap_io_transient.
7098 *
7099 * \param page        The pages for which the caller wishes to obtain
7100 *                    kernel virtual addresses.
7101 * \param vaddr       On return contains the kernel virtual memory address
7102 *                    of the pages passed in the page parameter.
7103 * \param count       Number of pages passed in.
7104 * \param can_fault   TRUE if the thread using the mapped pages can take
7105 *                    page faults, FALSE otherwise.
7106 *
7107 * \returns TRUE if the caller must call pmap_unmap_io_transient when
7108 *          finished or FALSE otherwise.
7109 *
7110 */
7111boolean_t
7112pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7113    boolean_t can_fault)
7114{
7115	vm_paddr_t paddr;
7116	boolean_t needs_mapping;
7117	pt_entry_t *pte;
7118	int cache_bits, error, i;
7119
7120	/*
7121	 * Allocate any KVA space that we need; this is done in a separate
7122	 * loop to avoid calling vmem_alloc() while pinned.
7123	 */
7124	needs_mapping = FALSE;
7125	for (i = 0; i < count; i++) {
7126		paddr = VM_PAGE_TO_PHYS(page[i]);
7127		if (__predict_false(paddr >= dmaplimit)) {
7128			error = vmem_alloc(kernel_arena, PAGE_SIZE,
7129			    M_BESTFIT | M_WAITOK, &vaddr[i]);
7130			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
7131			needs_mapping = TRUE;
7132		} else {
7133			vaddr[i] = PHYS_TO_DMAP(paddr);
7134		}
7135	}
7136
7137	/* Exit early if everything is covered by the DMAP */
7138	if (!needs_mapping)
7139		return (FALSE);
7140
7141	/*
7142	 * NB:  The sequence of updating a page table followed by accesses
7143	 * to the corresponding pages used in the !DMAP case is subject to
7144	 * the situation described in the "AMD64 Architecture Programmer's
7145	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
7146	 * Coherency Considerations".  Therefore, issuing the INVLPG right
7147	 * after modifying the PTE bits is crucial.
7148	 */
7149	if (!can_fault)
7150		sched_pin();
7151	for (i = 0; i < count; i++) {
7152		paddr = VM_PAGE_TO_PHYS(page[i]);
7153		if (paddr >= dmaplimit) {
7154			if (can_fault) {
7155				/*
7156				 * Slow path: since we can take page faults
7157				 * while the mappings are active, don't pin
7158				 * the thread to the CPU; instead, add a
7159				 * global mapping visible to all CPUs.
7160				 */
7161				pmap_qenter(vaddr[i], &page[i], 1);
7162			} else {
7163				pte = vtopte(vaddr[i]);
7164				cache_bits = pmap_cache_bits(kernel_pmap,
7165				    page[i]->md.pat_mode, 0);
7166				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
7167				    cache_bits);
7168				invlpg(vaddr[i]);
7169			}
7170		}
7171	}
7172
7173	return (needs_mapping);
7174}
7175
7176void
7177pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7178    boolean_t can_fault)
7179{
7180	vm_paddr_t paddr;
7181	int i;
7182
7183	if (!can_fault)
7184		sched_unpin();
7185	for (i = 0; i < count; i++) {
7186		paddr = VM_PAGE_TO_PHYS(page[i]);
7187		if (paddr >= dmaplimit) {
7188			if (can_fault)
7189				pmap_qremove(vaddr[i], 1);
7190			vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
7191		}
7192	}
7193}
7194
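/*
 * Map a single page for temporary kernel access.  The direct map is used
 * when the page's physical address is below dmaplimit; otherwise the
 * single "qframe" KVA page is borrowed under a spin mutex, so the caller
 * must release the mapping promptly via pmap_quick_remove_page().
 */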
7195vm_offset_t
7196pmap_quick_enter_page(vm_page_t m)
7197{
7198	vm_paddr_t paddr;
7199
7200	paddr = VM_PAGE_TO_PHYS(m);
7201	if (paddr < dmaplimit)
7202		return (PHYS_TO_DMAP(paddr));
7203	mtx_lock_spin(&qframe_mtx);
7204	KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
7205	pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
7206	    X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
7207	return (qframe);
7208}
7209
7210void
7211pmap_quick_remove_page(vm_offset_t addr)
7212{
7213
7214	if (addr != qframe)
7215		return;
7216	pte_store(vtopte(qframe), 0);
7217	invlpg(qframe);
7218	mtx_unlock_spin(&qframe_mtx);
7219}
7220
7221#include "opt_ddb.h"
7222#ifdef DDB
7223#include <ddb/ddb.h>
7224
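/*
 * "show pte <addr>": print the PML4E, PDPE, PDE, and PTE that translate
 * the given virtual address in the current pmap.
 */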
7225DB_SHOW_COMMAND(pte, pmap_print_pte)
7226{
7227	pmap_t pmap;
7228	pml4_entry_t *pml4;
7229	pdp_entry_t *pdp;
7230	pd_entry_t *pde;
7231	pt_entry_t *pte, PG_V;
7232	vm_offset_t va;
7233
7234	if (have_addr) {
7235		va = (vm_offset_t)addr;
7236		pmap = PCPU_GET(curpmap); /* XXX */
7237	} else {
7238		db_printf("show pte addr\n");
7239		return;
7240	}
7241	PG_V = pmap_valid_bit(pmap);
7242	pml4 = pmap_pml4e(pmap, va);
7243	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
7244	if ((*pml4 & PG_V) == 0) {
7245		db_printf("\n");
7246		return;
7247	}
7248	pdp = pmap_pml4e_to_pdpe(pml4, va);
7249	db_printf(" pdpe %#016lx", *pdp);
7250	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
7251		db_printf("\n");
7252		return;
7253	}
7254	pde = pmap_pdpe_to_pde(pdp, va);
7255	db_printf(" pde %#016lx", *pde);
7256	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
7257		db_printf("\n");
7258		return;
7259	}
7260	pte = pmap_pde_to_pte(pde, va);
7261	db_printf(" pte %#016lx\n", *pte);
7262}
7263
7264DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
7265{
7266	vm_paddr_t a;
7267
7268	if (have_addr) {
7269		a = (vm_paddr_t)addr;
7270		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
7271	} else {
7272		db_printf("show phys2dmap addr\n");
7273	}
7274}
7275#endif
7276