/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/pmap.c 328386 2018-01-25 02:45:21Z pkelsey $");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

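/*
 * A pmap may use the native x86 page table format, AMD RVI (nested page
 * tables), or Intel EPT, and the bit layout of an entry differs among
 * them.  The helpers below return the per-type masks for the valid,
 * read/write, global, accessed, and modified bits, returning the
 * emulation-specific masks when the pmap emulates the accessed/dirty
 * bits (PMAP_EMULATE_AD_BITS).
 */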
static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{

	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
}

static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_G;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

extern	struct pcpu __pcpu[];

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

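/*
 * The pv head table (pv_table) and the pv list locks are indexed by 2MB
 * superpage frame number: pa_index() converts a physical address into
 * that index and PHYS_TO_PV_LIST_LOCK() hashes it onto one of
 * NPV_LIST_LOCKS rwlocks.
 */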
#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pg_ps_enabled, 0, "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */

static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
static int		ndmpdpphys;	/* number of DMPDPphys pages */

/*
 * pmap_mapdev support pre initialization (i.e. console)
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv_list_locks[]
 * elements, but reads are not.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = 0;
static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */

int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");

static int
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
{
	int i;
	uint64_t res;

	res = 0;
	CPU_FOREACH(i) {
		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
	}
	return (sysctl_handle_64(oidp, &res, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
    "Count of saved TLB context on switch");

static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
static u_long pmap_invl_gen = 0;
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
	.lo_name = "invlts",
};

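/*
 * Return true if the calling thread is not currently inside a delayed
 * invalidation (DI) block, i.e. its per-thread DI generation is zero.
 */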
static bool
pmap_not_in_di(void)
{

	return (curthread->td_md.md_invl_gen.gen == 0);
}

#define	PMAP_ASSERT_NOT_IN_DI() \
    KASSERT(pmap_not_in_di(), ("DI already started"))

/*
 * Start a new Delayed Invalidation (DI) block of code, executed by
 * the current thread.  Within a DI block, the current thread may
 * destroy both the page table and PV list entries for a mapping and
 * then release the corresponding PV list lock before ensuring that
 * the mapping is flushed from the TLBs of any processors with the
 * pmap active.
 */
static void
pmap_delayed_invl_started(void)
{
	struct pmap_invl_gen *invl_gen;
	u_long currgen;

	invl_gen = &curthread->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	mtx_lock(&invl_gen_mtx);
	if (LIST_EMPTY(&pmap_invl_gen_tracker))
		currgen = pmap_invl_gen;
	else
		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
	invl_gen->gen = currgen + 1;
	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
}

/*
 * Finish the DI block, previously started by the current thread.  All
 * required TLB flushes for the pages marked by
 * pmap_delayed_invl_page() must be finished before this function is
 * called.
 *
 * This function works by bumping the global DI generation number to
 * the generation number of the current thread's DI, unless there is a
 * pending DI that started earlier.  In the latter case, bumping the
 * global DI generation number would incorrectly signal that the
 * earlier DI had finished.  Instead, this function bumps the earlier
 * DI's generation number to match the generation number of the
 * current thread's DI.
 */
static void
pmap_delayed_invl_finished(void)
{
	struct pmap_invl_gen *invl_gen, *next;
	struct turnstile *ts;

	invl_gen = &curthread->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
	mtx_lock(&invl_gen_mtx);
	next = LIST_NEXT(invl_gen, link);
	if (next == NULL) {
		turnstile_chain_lock(&invl_gen_ts);
		ts = turnstile_lookup(&invl_gen_ts);
		pmap_invl_gen = invl_gen->gen;
		if (ts != NULL) {
			turnstile_broadcast(ts, TS_SHARED_QUEUE);
			turnstile_unpend(ts, TS_SHARED_LOCK);
		}
		turnstile_chain_unlock(&invl_gen_ts);
	} else {
		next->gen = invl_gen->gen;
	}
	LIST_REMOVE(invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
	invl_gen->gen = 0;
}

#ifdef PV_STATS
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
    "Number of times DI invalidation blocked pmap_remove_all/write");
#endif

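/*
 * Return a pointer to the DI generation slot for the page m.  The slot
 * is chosen with the same hash that selects m's pv list lock, so writes
 * to it are protected by that lock.
 */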
static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{

	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
}

/*
 * Ensure that all currently executing DI blocks, that need to flush
 * TLB for the given page m, actually flushed the TLB at the time the
 * function returned.  If the page m has an empty PV list and we call
 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
 * valid mapping for the page m in either its page table or TLB.
 *
 * This function works by blocking until the global DI generation
 * number catches up with the generation number associated with the
 * given page m and its PV list.  Since this function's callers
 * typically own an object lock and sometimes own a page lock, it
 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 * processor.
 */
static void
pmap_delayed_invl_wait(vm_page_t m)
{
	struct turnstile *ts;
	u_long *m_gen;
#ifdef PV_STATS
	bool accounted = false;
#endif

	m_gen = pmap_delayed_invl_genp(m);
	while (*m_gen > pmap_invl_gen) {
#ifdef PV_STATS
		if (!accounted) {
			atomic_add_long(&invl_wait, 1);
			accounted = true;
		}
#endif
		ts = turnstile_trywait(&invl_gen_ts);
		if (*m_gen > pmap_invl_gen)
			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
		else
			turnstile_cancel(ts);
	}
}

/*
 * Mark the page m's PV list as participating in the current thread's
 * DI block.  Any threads concurrently using m's PV list to remove or
 * restrict all mappings to m will wait for the current thread's DI
 * block to complete before proceeding.
 *
 * The function works by setting the DI generation number for m's PV
 * list to at least the DI generation number of the current thread.
 * This forces a caller of pmap_delayed_invl_wait() to block until
 * current thread calls pmap_delayed_invl_finished().
 */
static void
pmap_delayed_invl_page(vm_page_t m)
{
	u_long gen, *m_gen;

	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
	gen = curthread->td_md.md_invl_gen.gen;
	if (gen == 0)
		return;
	m_gen = pmap_delayed_invl_genp(m);
	if (*m_gen < gen)
		*m_gen = gen;
}

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_pq(uint64_t *map);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
		    u_int flags, struct rwlock **lockp);
#if VM_NRESERVLEVEL > 0
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
#endif
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    vm_prot_t prot, struct rwlock **lockp);
static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
		    pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
#if VM_NRESERVLEVEL > 0
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
		    pd_entry_t *pde, struct spglist *free,
		    struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = roundup2(addr, NBPDR);
	return (newaddr);
}

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}


/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

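/*
 * Adjust the pmap's count of resident pages.  The pmap lock must be
 * held; the decrement variant asserts that the count cannot underflow.
 */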
static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

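/*
 * vtopte() and vtopde() return the kernel PTE or PDE for a virtual
 * address by indexing the recursive page table mappings PTmap and
 * PDmap; the KASSERTs restrict them to kernel addresses.
 */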
PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

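/*
 * Bootstrap allocator: hand out n zeroed pages starting at *firstaddr
 * and advance *firstaddr past them.  Used by create_pagetables() during
 * early boot.
 */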
static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	u_int64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

CTASSERT(powerof2(NDMPML4E));

/* number of kernel PDP slots */
#define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)

static void
nkpt_init(vm_paddr_t addr)
{
	int pt_pages;

#ifdef NKPT
	pt_pages = NKPT;
#else
	pt_pages = howmany(addr, 1 << PDRSHIFT);
	pt_pages += NKPDPE(pt_pages);

	/*
	 * Add some slop beyond the bare minimum required for bootstrapping
	 * the kernel.
	 *
	 * This is quite important when allocating KVA for kernel modules.
	 * The modules are required to be linked in the negative 2GB of
	 * the address space.  If we run out of KVA in this region then
	 * pmap_growkernel() will need to allocate page table pages to map
	 * the entire 512GB of KVA space which is an unnecessary tax on
	 * physical memory.
	 *
	 * Secondly, device memory mapped as part of setting up the low-
	 * level console(s) is taken from KVA, starting at virtual_avail.
	 * This is because cninit() is called after pmap_bootstrap() but
	 * before vm_init() and pmap_init(). 20MB for a frame buffer is
	 * not uncommon.
	 */
	pt_pages += 32;		/* 64MB additional slop. */
#endif
	nkpt = pt_pages;
}

static void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i, j, ndm1g, nkpdpe;
	pt_entry_t *pt_p;
	pd_entry_t *pd_p;
	pdp_entry_t *pdp_p;
	pml4_entry_t *p4_p;

	/* Allocate page table pages for the direct map */
	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
	if (ndmpdpphys > NDMPML4E) {
		/*
		 * Each NDMPML4E allows 512 GB, so limit to that,
		 * and then readjust ndmpdp and ndmpdpphys.
		 */
		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
		Maxmem = atop(NDMPML4E * NBPML4);
		ndmpdpphys = NDMPML4E;
		ndmpdp = NDMPML4E * NPDEPG;
	}
	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
	ndm1g = 0;
	if ((amd_feature & AMDID_PAGE1GB) != 0)
		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
	if (ndm1g < ndmpdp)
		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Allocate pages */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);

	/*
	 * Allocate the initial number of kernel page table pages required to
	 * bootstrap.  We defer this until after all memory-size dependent
	 * allocations are done (e.g. direct map), so that we don't have to
	 * build in too much slop in our estimate.
	 *
	 * Note that when NKPML4E > 1, we have an empty page underneath
	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
	 */
	nkpt_init(*firstaddr);
	nkpdpe = NKPDPE(nkpt);

	KPTphys = allocpages(firstaddr, nkpt);
	KPDphys = allocpages(firstaddr, nkpdpe);

	/* Fill in the underlying page table pages */
	/* Nominally read-only (but really R/W) from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	pt_p = (pt_entry_t *)KPTphys;
	for (i = 0; ptoa(i) < *firstaddr; i++)
		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;

	/* Now map the page tables at their location within PTmap */
	pd_p = (pd_entry_t *)KPDphys;
	for (i = 0; i < nkpt; i++)
		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
		    X86_PG_G;

	/* And connect up the PD to the PDP (leaving room for L4 pages) */
	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
	for (i = 0; i < nkpdpe; i++)
		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
		    PG_U;

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	pd_p = (pd_entry_t *)DMPDphys;
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	pdp_p = (pdp_entry_t *)DMPDPphys;
	for (i = 0; i < ndm1g; i++) {
		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	for (j = 0; i < ndmpdp; i++, j++) {
		pdp_p[i] = DMPDphys + ptoa(j);
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	p4_p = (pml4_entry_t *)KPML4phys;
	p4_p[PML4PML4I] = KPML4phys;
	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;

	/* Connect the Direct Map slot(s) up to the PML4. */
	for (i = 0; i < ndmpdpphys; i++) {
		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* Connect the KVA slots up to the PML4 */
	for (i = 0; i < NKPML4E; i++) {
		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On amd64 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	int i;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
	virtual_avail = pmap_kmem_choose(virtual_avail);

	virtual_end = VM_MAX_KERNEL_ADDRESS;


	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE);
	load_cr3(KPML4phys);
	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
		load_cr4(rcr4() | CR4_SMEP);

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
	kernel_pmap->pm_cr3 = KPML4phys;
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	kernel_pmap->pm_flags = pmap_flags;

	/*
	 * Initialize the TLB invalidations generation number lock.
	 */
	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * Crashdump maps.  The first page is reused as CMAP1 for the
	 * memory test.
	 */
	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
	CADDR1 = crashdumpmap;

	virtual_avail = va;

	/*
	 * Initialize the PAT MSR.
	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
	 * side-effect, invalidates stale PG_G TLB entries that might
	 * have been created in our pre-boot environment.
	 */
	pmap_init_pat();

	/* Initialize TLB Context Id. */
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		/* Check for INVPCID support */
		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
		    != 0;
		for (i = 0; i < MAXCPU; i++) {
			kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
			kernel_pmap->pm_pcids[i].pm_gen = 1;
		}
		__pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1;
		__pcpu[0].pc_pcid_gen = 1;
		/*
		 * pcpu area for APs is zeroed during AP startup.
		 * pc_pcid_next and pc_pcid_gen are initialized by AP
		 * during pcpu setup.
		 */
		load_cr4(rcr4() | CR4_PCIDE);
	} else {
		pmap_pcid_enabled = 0;
	}
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0)
		panic("no PAT??");

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

/*
 *	Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t mpte;
	vm_size_t s;
	int error, i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < nkpt; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	pmap_initialized = 1;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		/* Make the direct map consistent */
		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
			    ppim->sz, ppim->mode);
		}
		if (!bootverbose)
			continue;
		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
	}

	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
	    (vmem_addr_t *)&qframe);
	if (error != 0)
		panic("qframe allocation failed");
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
    "1GB page mapping counters");

static u_long pmap_pdpe_demotions;
SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, 0, "1GB page demotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

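/*
 * The PAT selection bit occupies a different position in a 4KB PTE
 * (X86_PG_PTE_PAT) than in a 2MB/1GB entry (X86_PG_PDE_PAT), so it
 * must be moved when a mapping changes size.  pmap_swap_pat() performs
 * that exchange; EPT entries encode memory types directly and need no
 * adjustment.
 */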
static pt_entry_t
pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
{
	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* Verify that both PAT bits are not set at the same time */
		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
		    ("Invalid PAT bits in entry %#lx", entry));

		/* Swap the PAT bits if one of them is set */
		if ((entry & x86_pat_bits) != 0)
			entry ^= x86_pat_bits;
		break;
	case PT_EPT:
		/*
		 * Nothing to do - the memory attributes are represented
		 * the same way for regular pages and superpages.
		 */
		break;
	default:
		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
	}

	return (entry);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

		/* Map the caching mode to a PAT index. */
		pat_idx = pat_index[mode];

		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
		cache_bits = 0;
		if (pat_idx & 0x4)
			cache_bits |= pat_flag;
		if (pat_idx & 0x2)
			cache_bits |= PG_NC_PCD;
		if (pat_idx & 0x1)
			cache_bits |= PG_NC_PWT;
		break;

	case PT_EPT:
		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
		break;

	default:
		panic("unsupported pmap type %d", pmap->pm_type);
	}

	return (cache_bits);
}

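/*
 * Return the mask of PTE/PDE bits that encode the caching mode, i.e.
 * the bits that pmap_cache_bits() may set for the given pmap type.
 */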
static int
pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
{
	int mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
		break;
	case PT_EPT:
		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
		break;
	default:
		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

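/*
 * Return true if 2MB page mappings are allowed in the given pmap.
 */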
1411bool
1412pmap_ps_enabled(pmap_t pmap)
1413{
1414
1415	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1416}
1417
1418static void
1419pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1420{
1421
1422	switch (pmap->pm_type) {
1423	case PT_X86:
1424		break;
1425	case PT_RVI:
1426	case PT_EPT:
1427		/*
1428		 * XXX
1429		 * This is a little bogus since the generation number is
1430		 * supposed to be bumped up when a region of the address
1431		 * space is invalidated in the page tables.
1432		 *
1433		 * In this case the old PDE entry is valid but yet we want
1434		 * to make sure that any mappings using the old entry are
1435		 * invalidated in the TLB.
1436		 *
1437		 * The reason this works as expected is because we rendezvous
1438		 * "all" host cpus and force any vcpu context to exit as a
1439		 * side-effect.
1440		 */
1441		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1442		break;
1443	default:
1444		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1445	}
1446	pde_store(pde, newpde);
1447}
1448
1449/*
1450 * After changing the page size for the specified virtual address in the page
1451 * table, flush the corresponding entries from the processor's TLB.  Only the
1452 * calling processor's TLB is affected.
1453 *
1454 * The calling thread must be pinned to a processor.
1455 */
1456static void
1457pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1458{
1459	pt_entry_t PG_G;
1460
1461	if (pmap_type_guest(pmap))
1462		return;
1463
1464	KASSERT(pmap->pm_type == PT_X86,
1465	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1466
1467	PG_G = pmap_global_bit(pmap);
1468
1469	if ((newpde & PG_PS) == 0)
1470		/* Demotion: flush a specific 2MB page mapping. */
1471		invlpg(va);
1472	else if ((newpde & PG_G) == 0)
1473		/*
1474		 * Promotion: flush every 4KB page mapping from the TLB
1475		 * because there are too many to flush individually.
1476		 */
1477		invltlb();
1478	else {
1479		/*
1480		 * Promotion: flush every 4KB page mapping from the TLB,
1481		 * including any global (PG_G) mappings.
1482		 */
1483		invltlb_glob();
1484	}
1485}
1486#ifdef SMP
1487
1488/*
1489 * For SMP, these functions have to use the IPI mechanism for coherence.
1490 *
1491 * N.B.: Before calling any of the following TLB invalidation functions,
1492 * the calling processor must ensure that all stores updating a non-
1493 * kernel page table are globally performed.  Otherwise, another
1494 * processor could cache an old, pre-update entry without being
1495 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1496 * active on another processor after its pm_active field is checked by
1497 * one of the following functions but before a store updating the page
1498 * table is globally performed. (2) The pmap becomes active on another
1499 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
1501 * pmap as inactive on the other processor.
1502 *
1503 * The kernel page table is exempt because its pm_active field is
1504 * immutable.  The kernel page table is always active on every
1505 * processor.
1506 */
1507
1508/*
1509 * Interrupt the cpus that are executing in the guest context.
1510 * This will force the vcpu to exit and the cached EPT mappings
1511 * will be invalidated by the host before the next vmresume.
1512 */
1513static __inline void
1514pmap_invalidate_ept(pmap_t pmap)
1515{
1516	int ipinum;
1517
1518	sched_pin();
1519	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1520	    ("pmap_invalidate_ept: absurd pm_active"));
1521
1522	/*
1523	 * The TLB mappings associated with a vcpu context are not
1524	 * flushed each time a different vcpu is chosen to execute.
1525	 *
1526	 * This is in contrast with a process's vtop mappings that
1527	 * are flushed from the TLB on each context switch.
1528	 *
1529	 * Therefore we need to do more than just a TLB shootdown on
1530	 * the active cpus in 'pmap->pm_active'. To do this we keep
1531	 * track of the number of invalidations performed on this pmap.
1532	 *
1533	 * Each vcpu keeps a cache of this counter and compares it
1534	 * just before a vmresume. If the counter is out-of-date an
1535	 * invept will be done to flush stale mappings from the TLB.
1536	 */
1537	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1538
1539	/*
1540	 * Force the vcpu to exit and trap back into the hypervisor.
1541	 */
1542	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1543	ipi_selected(pmap->pm_active, ipinum);
1544	sched_unpin();
1545}
1546
1547void
1548pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1549{
1550	cpuset_t *mask;
1551	u_int cpuid, i;
1552
1553	if (pmap_type_guest(pmap)) {
1554		pmap_invalidate_ept(pmap);
1555		return;
1556	}
1557
1558	KASSERT(pmap->pm_type == PT_X86,
1559	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1560
1561	sched_pin();
1562	if (pmap == kernel_pmap) {
1563		invlpg(va);
1564		mask = &all_cpus;
1565	} else {
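		/*
		 * The ranged IPI issued below reaches only the CPUs in
		 * pm_active.  Any other CPU may still hold stale PCID-tagged
		 * TLB entries for this pmap, so zero its cached generation
		 * number; the next activation of the pmap on that CPU then
		 * performs a full flush instead of trusting those entries.
		 */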
1566		cpuid = PCPU_GET(cpuid);
1567		if (pmap == PCPU_GET(curpmap))
1568			invlpg(va);
1569		else if (pmap_pcid_enabled)
1570			pmap->pm_pcids[cpuid].pm_gen = 0;
1571		if (pmap_pcid_enabled) {
1572			CPU_FOREACH(i) {
1573				if (cpuid != i)
1574					pmap->pm_pcids[i].pm_gen = 0;
1575			}
1576		}
1577		mask = &pmap->pm_active;
1578	}
1579	smp_masked_invlpg(*mask, va);
1580	sched_unpin();
1581}
1582
1583/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
1584#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
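/*
 * With 4KB pages the threshold is 16MB, i.e. 4096 PTEs.  For ranges at
 * least this large, pmap_invalidate_range() falls back to
 * pmap_invalidate_all(), since invalidating more entries than the TLB
 * can hold one page at a time gains nothing over a full flush.
 */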
1585
1586void
1587pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1588{
1589	cpuset_t *mask;
1590	vm_offset_t addr;
1591	u_int cpuid, i;
1592
1593	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
1594		pmap_invalidate_all(pmap);
1595		return;
1596	}
1597
1598	if (pmap_type_guest(pmap)) {
1599		pmap_invalidate_ept(pmap);
1600		return;
1601	}
1602
1603	KASSERT(pmap->pm_type == PT_X86,
1604	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1605
1606	sched_pin();
1607	cpuid = PCPU_GET(cpuid);
1608	if (pmap == kernel_pmap) {
1609		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1610			invlpg(addr);
1611		mask = &all_cpus;
1612	} else {
1613		if (pmap == PCPU_GET(curpmap)) {
1614			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1615				invlpg(addr);
1616		} else if (pmap_pcid_enabled) {
1617			pmap->pm_pcids[cpuid].pm_gen = 0;
1618		}
1619		if (pmap_pcid_enabled) {
1620			CPU_FOREACH(i) {
1621				if (cpuid != i)
1622					pmap->pm_pcids[i].pm_gen = 0;
1623			}
1624		}
1625		mask = &pmap->pm_active;
1626	}
1627	smp_masked_invlpg_range(*mask, sva, eva);
1628	sched_unpin();
1629}
1630
1631void
1632pmap_invalidate_all(pmap_t pmap)
1633{
1634	cpuset_t *mask;
1635	struct invpcid_descr d;
1636	u_int cpuid, i;
1637
1638	if (pmap_type_guest(pmap)) {
1639		pmap_invalidate_ept(pmap);
1640		return;
1641	}
1642
1643	KASSERT(pmap->pm_type == PT_X86,
1644	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1645
1646	sched_pin();
1647	if (pmap == kernel_pmap) {
1648		if (pmap_pcid_enabled && invpcid_works) {
1649			bzero(&d, sizeof(d));
1650			invpcid(&d, INVPCID_CTXGLOB);
1651		} else {
1652			invltlb_glob();
1653		}
1654		mask = &all_cpus;
1655	} else {
1656		cpuid = PCPU_GET(cpuid);
1657		if (pmap == PCPU_GET(curpmap)) {
1658			if (pmap_pcid_enabled) {
1659				if (invpcid_works) {
1660					d.pcid = pmap->pm_pcids[cpuid].pm_pcid;
1661					d.pad = 0;
1662					d.addr = 0;
1663					invpcid(&d, INVPCID_CTX);
1664				} else {
1665					load_cr3(pmap->pm_cr3 | pmap->pm_pcids
1666					    [PCPU_GET(cpuid)].pm_pcid);
1667				}
1668			} else {
1669				invltlb();
1670			}
1671		} else if (pmap_pcid_enabled) {
1672			pmap->pm_pcids[cpuid].pm_gen = 0;
1673		}
1674		if (pmap_pcid_enabled) {
1675			CPU_FOREACH(i) {
1676				if (cpuid != i)
1677					pmap->pm_pcids[i].pm_gen = 0;
1678			}
1679		}
1680		mask = &pmap->pm_active;
1681	}
1682	smp_masked_invltlb(*mask, pmap);
1683	sched_unpin();
1684}
1685
1686void
1687pmap_invalidate_cache(void)
1688{
1689
1690	sched_pin();
1691	wbinvd();
1692	smp_cache_flush();
1693	sched_unpin();
1694}
1695
1696struct pde_action {
1697	cpuset_t invalidate;	/* processors that invalidate their TLB */
1698	pmap_t pmap;
1699	vm_offset_t va;
1700	pd_entry_t *pde;
1701	pd_entry_t newpde;
1702	u_int store;		/* processor that updates the PDE */
1703};
1704
1705static void
1706pmap_update_pde_action(void *arg)
1707{
1708	struct pde_action *act = arg;
1709
1710	if (act->store == PCPU_GET(cpuid))
1711		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1712}
1713
1714static void
1715pmap_update_pde_teardown(void *arg)
1716{
1717	struct pde_action *act = arg;
1718
1719	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1720		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1721}
1722
1723/*
1724 * Change the page size for the specified virtual address in a way that
1725 * prevents any possibility of the TLB ever having two entries that map the
1726 * same virtual address using different page sizes.  This is the recommended
1727 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1728 * machine check exception for a TLB state that is improperly diagnosed as a
1729 * hardware error.
1730 */
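/*
 * The update is performed from within smp_rendezvous_cpus(): the CPU
 * named in act.store writes the new PDE during the action phase, while
 * the other participating CPUs are held in the rendezvous, and every CPU
 * in act.invalidate then flushes the old translations in the teardown
 * phase.  Consequently, no TLB can end up holding a 4KB and a 2MB entry
 * for the same address at the same time.
 */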
1731static void
1732pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1733{
1734	struct pde_action act;
1735	cpuset_t active, other_cpus;
1736	u_int cpuid;
1737
1738	sched_pin();
1739	cpuid = PCPU_GET(cpuid);
1740	other_cpus = all_cpus;
1741	CPU_CLR(cpuid, &other_cpus);
1742	if (pmap == kernel_pmap || pmap_type_guest(pmap))
1743		active = all_cpus;
1744	else {
1745		active = pmap->pm_active;
1746	}
1747	if (CPU_OVERLAP(&active, &other_cpus)) {
1748		act.store = cpuid;
1749		act.invalidate = active;
1750		act.va = va;
1751		act.pmap = pmap;
1752		act.pde = pde;
1753		act.newpde = newpde;
1754		CPU_SET(cpuid, &active);
1755		smp_rendezvous_cpus(active,
1756		    smp_no_rendezvous_barrier, pmap_update_pde_action,
1757		    pmap_update_pde_teardown, &act);
1758	} else {
1759		pmap_update_pde_store(pmap, pde, newpde);
1760		if (CPU_ISSET(cpuid, &active))
1761			pmap_update_pde_invalidate(pmap, va, newpde);
1762	}
1763	sched_unpin();
1764}
1765#else /* !SMP */
1766/*
1767 * Normal, non-SMP, invalidation functions.
1768 */
1769void
1770pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1771{
1772
1773	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
1774		pmap->pm_eptgen++;
1775		return;
1776	}
1777	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_page: unknown type %d", pmap->pm_type));
1779
1780	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
1781		invlpg(va);
1782	else if (pmap_pcid_enabled)
1783		pmap->pm_pcids[0].pm_gen = 0;
1784}
1785
1786void
1787pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1788{
1789	vm_offset_t addr;
1790
1791	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
1792		pmap->pm_eptgen++;
1793		return;
1794	}
1795	KASSERT(pmap->pm_type == PT_X86,
1796	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
1797
1798	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
1799		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1800			invlpg(addr);
1801	} else if (pmap_pcid_enabled) {
1802		pmap->pm_pcids[0].pm_gen = 0;
1803	}
1804}
1805
1806void
1807pmap_invalidate_all(pmap_t pmap)
1808{
1809	struct invpcid_descr d;
1810
1811	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
1812		pmap->pm_eptgen++;
1813		return;
1814	}
1815	KASSERT(pmap->pm_type == PT_X86,
1816	    ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
1817
1818	if (pmap == kernel_pmap) {
1819		if (pmap_pcid_enabled && invpcid_works) {
1820			bzero(&d, sizeof(d));
1821			invpcid(&d, INVPCID_CTXGLOB);
1822		} else {
1823			invltlb_glob();
1824		}
1825	} else if (pmap == PCPU_GET(curpmap)) {
1826		if (pmap_pcid_enabled) {
1827			if (invpcid_works) {
1828				d.pcid = pmap->pm_pcids[0].pm_pcid;
1829				d.pad = 0;
1830				d.addr = 0;
1831				invpcid(&d, INVPCID_CTX);
1832			} else {
1833				load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0].
1834				    pm_pcid);
1835			}
1836		} else {
1837			invltlb();
1838		}
1839	} else if (pmap_pcid_enabled) {
1840		pmap->pm_pcids[0].pm_gen = 0;
1841	}
1842}
1843
1844PMAP_INLINE void
1845pmap_invalidate_cache(void)
1846{
1847
1848	wbinvd();
1849}
1850
1851static void
1852pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1853{
1854
1855	pmap_update_pde_store(pmap, pde, newpde);
1856	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
1857		pmap_update_pde_invalidate(pmap, va, newpde);
1858	else
1859		pmap->pm_pcids[0].pm_gen = 0;
1860}
1861#endif /* !SMP */
1862
1863static void
1864pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
1865{
1866
1867	/*
1868	 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
1869	 * by a promotion that did not invalidate the 512 4KB page mappings
1870	 * that might exist in the TLB.  Consequently, at this point, the TLB
1871	 * may hold both 4KB and 2MB page mappings for the address range [va,
1872	 * va + NBPDR).  Therefore, the entire range must be invalidated here.
1873	 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
1874	 * 4KB page mappings for the address range [va, va + NBPDR), and so a
1875	 * single INVLPG suffices to invalidate the 2MB page mapping from the
1876	 * TLB.
1877	 */
1878	if ((pde & PG_PROMOTED) != 0)
1879		pmap_invalidate_range(pmap, va, va + NBPDR - 1);
1880	else
1881		pmap_invalidate_page(pmap, va);
1882}
1883
1884#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
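/*
 * Ranges of at least this size (2MB) are flushed with a full cache
 * invalidation instead of CLFLUSH/CLFLUSHOPT, on the assumption that a
 * global write-back and invalidate is cheaper than walking that many
 * cache lines individually.
 */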
1885
1886void
1887pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1888{
1889
1890	if (force) {
1891		sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
1892	} else {
1893		KASSERT((sva & PAGE_MASK) == 0,
1894		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1895		KASSERT((eva & PAGE_MASK) == 0,
1896		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1897	}
1898
1899	if ((cpu_feature & CPUID_SS) != 0 && !force)
1900		; /* If "Self Snoop" is supported and allowed, do nothing. */
1901	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
1902	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1903		/*
1904		 * XXX: Some CPUs fault, hang, or trash the local APIC
1905		 * registers if we use CLFLUSH on the local APIC
1906		 * range.  The local APIC is always uncached, so we
1907		 * don't need to flush for that range anyway.
1908		 */
1909		if (pmap_kextract(sva) == lapic_paddr)
1910			return;
1911
1912		/*
1913		 * Otherwise, do per-cache line flush.  Use the sfence
		 * instruction to ensure that previous stores are
1915		 * included in the write-back.  The processor
1916		 * propagates flush to other processors in the cache
1917		 * coherence domain.
1918		 */
1919		sfence();
1920		for (; sva < eva; sva += cpu_clflush_line_size)
1921			clflushopt(sva);
1922		sfence();
1923	} else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1924	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1925		if (pmap_kextract(sva) == lapic_paddr)
1926			return;
1927		/*
1928		 * Writes are ordered by CLFLUSH on Intel CPUs.
1929		 */
1930		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1931			mfence();
1932		for (; sva < eva; sva += cpu_clflush_line_size)
1933			clflush(sva);
1934		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1935			mfence();
	} else {
		/*
		 * No targeted cache flush method is supported by the CPU,
		 * or the supplied range is bigger than 2MB.
		 * Globally invalidate the cache.
		 */
1943		pmap_invalidate_cache();
1944	}
1945}
1946
1947/*
1948 * Remove the specified set of pages from the data and instruction caches.
1949 *
1950 * In contrast to pmap_invalidate_cache_range(), this function does not
1951 * rely on the CPU's self-snoop feature, because it is intended for use
1952 * when moving pages into a different cache domain.
1953 */
1954void
1955pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1956{
1957	vm_offset_t daddr, eva;
1958	int i;
1959	bool useclflushopt;
1960
1961	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
1962	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1963	    ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
1964		pmap_invalidate_cache();
1965	else {
1966		if (useclflushopt)
1967			sfence();
1968		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
1969			mfence();
1970		for (i = 0; i < count; i++) {
1971			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1972			eva = daddr + PAGE_SIZE;
1973			for (; daddr < eva; daddr += cpu_clflush_line_size) {
1974				if (useclflushopt)
1975					clflushopt(daddr);
1976				else
1977					clflush(daddr);
1978			}
1979		}
1980		if (useclflushopt)
1981			sfence();
1982		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
1983			mfence();
1984	}
1985}
1986
1987/*
1988 *	Routine:	pmap_extract
1989 *	Function:
1990 *		Extract the physical page address associated
1991 *		with the given map/virtual_address pair.
1992 */
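/*
 * The lookup walks the paging hierarchy top-down: a PG_PS entry at the
 * PDPE level maps a 1GB page, a PG_PS entry at the PDE level maps a 2MB
 * page, and otherwise the 4KB PTE is consulted.  In every case the bits
 * of "va" below the page size supply the offset within the page.
 */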
1993vm_paddr_t
1994pmap_extract(pmap_t pmap, vm_offset_t va)
1995{
1996	pdp_entry_t *pdpe;
1997	pd_entry_t *pde;
1998	pt_entry_t *pte, PG_V;
1999	vm_paddr_t pa;
2000
2001	pa = 0;
2002	PG_V = pmap_valid_bit(pmap);
2003	PMAP_LOCK(pmap);
2004	pdpe = pmap_pdpe(pmap, va);
2005	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2006		if ((*pdpe & PG_PS) != 0)
2007			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
2008		else {
2009			pde = pmap_pdpe_to_pde(pdpe, va);
2010			if ((*pde & PG_V) != 0) {
2011				if ((*pde & PG_PS) != 0) {
2012					pa = (*pde & PG_PS_FRAME) |
2013					    (va & PDRMASK);
2014				} else {
2015					pte = pmap_pde_to_pte(pde, va);
2016					pa = (*pte & PG_FRAME) |
2017					    (va & PAGE_MASK);
2018				}
2019			}
2020		}
2021	}
2022	PMAP_UNLOCK(pmap);
2023	return (pa);
2024}
2025
2026/*
2027 *	Routine:	pmap_extract_and_hold
2028 *	Function:
2029 *		Atomically extract and hold the physical page
2030 *		with the given pmap and virtual address pair
2031 *		if that mapping permits the given protection.
2032 */
2033vm_page_t
2034pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
2035{
2036	pd_entry_t pde, *pdep;
2037	pt_entry_t pte, PG_RW, PG_V;
2038	vm_paddr_t pa;
2039	vm_page_t m;
2040
2041	pa = 0;
2042	m = NULL;
2043	PG_RW = pmap_rw_bit(pmap);
2044	PG_V = pmap_valid_bit(pmap);
2045	PMAP_LOCK(pmap);
2046retry:
2047	pdep = pmap_pde(pmap, va);
2048	if (pdep != NULL && (pde = *pdep)) {
2049		if (pde & PG_PS) {
2050			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
2051				if (vm_page_pa_tryrelock(pmap, (pde &
2052				    PG_PS_FRAME) | (va & PDRMASK), &pa))
2053					goto retry;
2054				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
2055				    (va & PDRMASK));
2056				vm_page_hold(m);
2057			}
2058		} else {
2059			pte = *pmap_pde_to_pte(pdep, va);
2060			if ((pte & PG_V) &&
2061			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
2062				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
2063				    &pa))
2064					goto retry;
2065				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
2066				vm_page_hold(m);
2067			}
2068		}
2069	}
2070	PA_UNLOCK_COND(pa);
2071	PMAP_UNLOCK(pmap);
2072	return (m);
2073}
2074
2075vm_paddr_t
2076pmap_kextract(vm_offset_t va)
2077{
2078	pd_entry_t pde;
2079	vm_paddr_t pa;
2080
2081	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
2082		pa = DMAP_TO_PHYS(va);
2083	} else {
2084		pde = *vtopde(va);
2085		if (pde & PG_PS) {
2086			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
2087		} else {
2088			/*
2089			 * Beware of a concurrent promotion that changes the
2090			 * PDE at this point!  For example, vtopte() must not
2091			 * be used to access the PTE because it would use the
2092			 * new PDE.  It is, however, safe to use the old PDE
2093			 * because the page table page is preserved by the
2094			 * promotion.
2095			 */
2096			pa = *pmap_pde_to_pte(&pde, va);
2097			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
2098		}
2099	}
2100	return (pa);
2101}
2102
2103/***************************************************
2104 * Low level mapping routines.....
2105 ***************************************************/
2106
2107/*
2108 * Add a wired page to the kva.
2109 * Note: not SMP coherent.
2110 */
2111PMAP_INLINE void
2112pmap_kenter(vm_offset_t va, vm_paddr_t pa)
2113{
2114	pt_entry_t *pte;
2115
2116	pte = vtopte(va);
2117	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
2118}
2119
2120static __inline void
2121pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
2122{
2123	pt_entry_t *pte;
2124	int cache_bits;
2125
2126	pte = vtopte(va);
2127	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
2128	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
2129}
2130
2131/*
2132 * Remove a page from the kernel pagetables.
2133 * Note: not SMP coherent.
2134 */
2135PMAP_INLINE void
2136pmap_kremove(vm_offset_t va)
2137{
2138	pt_entry_t *pte;
2139
2140	pte = vtopte(va);
2141	pte_clear(pte);
2142}
2143
2144/*
2145 *	Used to map a range of physical addresses into kernel
2146 *	virtual address space.
2147 *
2148 *	The value passed in '*virt' is a suggested virtual address for
2149 *	the mapping. Architectures which can support a direct-mapped
2150 *	physical to virtual region can return the appropriate address
2151 *	within that region, leaving '*virt' unchanged. Other
2152 *	architectures should map the pages starting at '*virt' and
2153 *	update '*virt' with the first usable address after the mapped
2154 *	region.
2155 */
2156vm_offset_t
2157pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2158{
2159	return PHYS_TO_DMAP(start);
2160}
2161
2162
2163/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The pages *must* be wired.
2170 * Note: SMP coherent.  Uses a ranged shootdown IPI.
2171 */
2172void
2173pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2174{
2175	pt_entry_t *endpte, oldpte, pa, *pte;
2176	vm_page_t m;
2177	int cache_bits;
2178
2179	oldpte = 0;
2180	pte = vtopte(sva);
2181	endpte = pte + count;
2182	while (pte < endpte) {
2183		m = *ma++;
2184		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
2185		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
2186		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
2187			oldpte |= *pte;
2188			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
2189		}
2190		pte++;
2191	}
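	/*
	 * A TLB shootdown is needed only if a previously valid mapping was
	 * overwritten above; a PTE that was invalid cannot have been cached
	 * by any TLB, so freshly created mappings need no invalidation.
	 */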
2192	if (__predict_false((oldpte & X86_PG_V) != 0))
2193		pmap_invalidate_range(kernel_pmap, sva, sva + count *
2194		    PAGE_SIZE);
2195}
2196
2197/*
2198 * This routine tears out page mappings from the
2199 * kernel -- it is meant only for temporary mappings.
2200 * Note: SMP coherent.  Uses a ranged shootdown IPI.
2201 */
2202void
2203pmap_qremove(vm_offset_t sva, int count)
2204{
2205	vm_offset_t va;
2206
2207	va = sva;
2208	while (count-- > 0) {
2209		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2210		pmap_kremove(va);
2211		va += PAGE_SIZE;
2212	}
2213	pmap_invalidate_range(kernel_pmap, sva, va);
2214}
2215
2216/***************************************************
2217 * Page table page management routines.....
2218 ***************************************************/
2219static __inline void
2220pmap_free_zero_pages(struct spglist *free)
2221{
2222	vm_page_t m;
2223	int count;
2224
2225	for (count = 0; (m = SLIST_FIRST(free)) != NULL; count++) {
2226		SLIST_REMOVE_HEAD(free, plinks.s.ss);
2227		/* Preserve the page's PG_ZERO setting. */
2228		vm_page_free_toq(m);
2229	}
2230	atomic_subtract_int(&vm_cnt.v_wire_count, count);
2231}
2232
2233/*
2234 * Schedule the specified unused page table page to be freed.  Specifically,
2235 * add the page to the specified list of pages that will be released to the
2236 * physical memory manager after the TLB has been updated.
2237 */
2238static __inline void
2239pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2240    boolean_t set_PG_ZERO)
2241{
2242
2243	if (set_PG_ZERO)
2244		m->flags |= PG_ZERO;
2245	else
2246		m->flags &= ~PG_ZERO;
2247	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2248}
2249
2250/*
2251 * Inserts the specified page table page into the specified pmap's collection
2252 * of idle page table pages.  Each of a pmap's page table pages is responsible
2253 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2254 * ordered by this virtual address range.
2255 */
2256static __inline int
2257pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2258{
2259
2260	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2261	return (vm_radix_insert(&pmap->pm_root, mpte));
2262}
2263
2264/*
2265 * Removes the page table page mapping the specified virtual address from the
2266 * specified pmap's collection of idle page table pages, and returns it.
2267 * Otherwise, returns NULL if there is no page table page corresponding to the
2268 * specified virtual address.
2269 */
2270static __inline vm_page_t
2271pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
2272{
2273
2274	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2275	return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
2276}
2277
2278/*
2279 * Decrements a page table page's wire count, which is used to record the
2280 * number of valid page table entries within the page.  If the wire count
2281 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2282 * page table page was unmapped and FALSE otherwise.
2283 */
2284static inline boolean_t
2285pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2286{
2287
2288	--m->wire_count;
2289	if (m->wire_count == 0) {
2290		_pmap_unwire_ptp(pmap, va, m, free);
2291		return (TRUE);
2292	} else
2293		return (FALSE);
2294}
2295
2296static void
2297_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2298{
2299
2300	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2301	/*
2302	 * unmap the page table page
2303	 */
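	/*
	 * The page's pindex encodes its level in the paging hierarchy:
	 * pindexes below NUPDE belong to page table (PTE) pages, those in
	 * [NUPDE, NUPDE + NUPDPE) to page directory pages, and higher
	 * values to PDP pages.
	 */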
2304	if (m->pindex >= (NUPDE + NUPDPE)) {
2305		/* PDP page */
2306		pml4_entry_t *pml4;
2307		pml4 = pmap_pml4e(pmap, va);
2308		*pml4 = 0;
2309	} else if (m->pindex >= NUPDE) {
2310		/* PD page */
2311		pdp_entry_t *pdp;
2312		pdp = pmap_pdpe(pmap, va);
2313		*pdp = 0;
2314	} else {
2315		/* PTE page */
2316		pd_entry_t *pd;
2317		pd = pmap_pde(pmap, va);
2318		*pd = 0;
2319	}
2320	pmap_resident_count_dec(pmap, 1);
2321	if (m->pindex < NUPDE) {
2322		/* We just released a PT, unhold the matching PD */
2323		vm_page_t pdpg;
2324
2325		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2326		pmap_unwire_ptp(pmap, va, pdpg, free);
2327	}
2328	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2329		/* We just released a PD, unhold the matching PDP */
2330		vm_page_t pdppg;
2331
2332		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2333		pmap_unwire_ptp(pmap, va, pdppg, free);
2334	}
2335
2336	/*
	 * Put the page on a list so that it is released only after
	 * *ALL* TLB shootdowns are done
2339	 */
2340	pmap_add_delayed_free_list(m, free, TRUE);
2341}
2342
2343/*
2344 * After removing a page table entry, this routine is used to
2345 * conditionally free the page, and manage the hold/wire counts.
2346 */
2347static int
2348pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2349    struct spglist *free)
2350{
2351	vm_page_t mpte;
2352
2353	if (va >= VM_MAXUSER_ADDRESS)
2354		return (0);
2355	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2356	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2357	return (pmap_unwire_ptp(pmap, va, mpte, free));
2358}
2359
2360void
2361pmap_pinit0(pmap_t pmap)
2362{
2363	int i;
2364
2365	PMAP_LOCK_INIT(pmap);
2366	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2367	pmap->pm_cr3 = KPML4phys;
2368	pmap->pm_root.rt_root = 0;
2369	CPU_ZERO(&pmap->pm_active);
2370	TAILQ_INIT(&pmap->pm_pvchunk);
2371	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2372	pmap->pm_flags = pmap_flags;
2373	CPU_FOREACH(i) {
2374		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
2375		pmap->pm_pcids[i].pm_gen = 0;
2376	}
2377	PCPU_SET(curpmap, kernel_pmap);
2378	pmap_activate(curthread);
2379	CPU_FILL(&kernel_pmap->pm_active);
2380}
2381
2382void
2383pmap_pinit_pml4(vm_page_t pml4pg)
2384{
2385	pml4_entry_t *pm_pml4;
2386	int i;
2387
2388	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
2389
2390	/* Wire in kernel global address entries. */
2391	for (i = 0; i < NKPML4E; i++) {
2392		pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
2393		    X86_PG_V | PG_U;
2394	}
2395	for (i = 0; i < ndmpdpphys; i++) {
2396		pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
2397		    X86_PG_V | PG_U;
2398	}
2399
	/* install the self-referential address mapping entry */
2401	pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
2402	    X86_PG_A | X86_PG_M;
2403}
2404
2405/*
2406 * Initialize a preallocated and zeroed pmap structure,
2407 * such as one in a vmspace structure.
2408 */
2409int
2410pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2411{
2412	vm_page_t pml4pg;
2413	vm_paddr_t pml4phys;
2414	int i;
2415
2416	/*
	 * allocate the top-level (PML4) page table page
2418	 */
2419	pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2420	    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
2421
2422	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2423	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2424	CPU_FOREACH(i) {
2425		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
2426		pmap->pm_pcids[i].pm_gen = 0;
2427	}
2428	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
2429
2430	if ((pml4pg->flags & PG_ZERO) == 0)
2431		pagezero(pmap->pm_pml4);
2432
2433	/*
2434	 * Do not install the host kernel mappings in the nested page
2435	 * tables. These mappings are meaningless in the guest physical
2436	 * address space.
2437	 */
2438	if ((pmap->pm_type = pm_type) == PT_X86) {
2439		pmap->pm_cr3 = pml4phys;
2440		pmap_pinit_pml4(pml4pg);
2441	}
2442
2443	pmap->pm_root.rt_root = 0;
2444	CPU_ZERO(&pmap->pm_active);
2445	TAILQ_INIT(&pmap->pm_pvchunk);
2446	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2447	pmap->pm_flags = flags;
2448	pmap->pm_eptgen = 0;
2449
2450	return (1);
2451}
2452
2453int
2454pmap_pinit(pmap_t pmap)
2455{
2456
2457	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2458}
2459
2460/*
2461 * This routine is called if the desired page table page does not exist.
2462 *
2463 * If page table page allocation fails, this routine may sleep before
2464 * returning NULL.  It sleeps only if a lock pointer was given.
2465 *
2466 * Note: If a page allocation fails at page table level two or three,
2467 * one or two pages may be held during the wait, only to be released
2468 * afterwards.  This conservative approach is easily argued to avoid
2469 * race conditions.
2470 */
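/*
 * The "ptepindex" argument uses the same encoding as the pindexes of the
 * page table pages themselves: values below NUPDE request a page table
 * (PTE) page, values in [NUPDE, NUPDE + NUPDPE) a page directory page,
 * and larger values a PDP page.  When an intermediate level is missing,
 * the function recurses on the corresponding higher-level index.
 */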
2471static vm_page_t
2472_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2473{
2474	vm_page_t m, pdppg, pdpg;
2475	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2476
2477	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2478
2479	PG_A = pmap_accessed_bit(pmap);
2480	PG_M = pmap_modified_bit(pmap);
2481	PG_V = pmap_valid_bit(pmap);
2482	PG_RW = pmap_rw_bit(pmap);
2483
2484	/*
2485	 * Allocate a page table page.
2486	 */
2487	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2488	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2489		if (lockp != NULL) {
2490			RELEASE_PV_LIST_LOCK(lockp);
2491			PMAP_UNLOCK(pmap);
2492			PMAP_ASSERT_NOT_IN_DI();
2493			VM_WAIT;
2494			PMAP_LOCK(pmap);
2495		}
2496
2497		/*
2498		 * Indicate the need to retry.  While waiting, the page table
2499		 * page may have been allocated.
2500		 */
2501		return (NULL);
2502	}
2503	if ((m->flags & PG_ZERO) == 0)
2504		pmap_zero_page(m);
2505
2506	/*
2507	 * Map the pagetable page into the process address space, if
2508	 * it isn't already there.
2509	 */
2510
2511	if (ptepindex >= (NUPDE + NUPDPE)) {
2512		pml4_entry_t *pml4;
2513		vm_pindex_t pml4index;
2514
2515		/* Wire up a new PDPE page */
2516		pml4index = ptepindex - (NUPDE + NUPDPE);
2517		pml4 = &pmap->pm_pml4[pml4index];
2518		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2519
2520	} else if (ptepindex >= NUPDE) {
2521		vm_pindex_t pml4index;
2522		vm_pindex_t pdpindex;
2523		pml4_entry_t *pml4;
2524		pdp_entry_t *pdp;
2525
2526		/* Wire up a new PDE page */
2527		pdpindex = ptepindex - NUPDE;
2528		pml4index = pdpindex >> NPML4EPGSHIFT;
2529
2530		pml4 = &pmap->pm_pml4[pml4index];
2531		if ((*pml4 & PG_V) == 0) {
2532			/* Have to allocate a new pdp, recurse */
2533			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2534			    lockp) == NULL) {
2535				--m->wire_count;
2536				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2537				vm_page_free_zero(m);
2538				return (NULL);
2539			}
2540		} else {
2541			/* Add reference to pdp page */
2542			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2543			pdppg->wire_count++;
2544		}
2545		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2546
2547		/* Now find the pdp page */
2548		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2549		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2550
2551	} else {
2552		vm_pindex_t pml4index;
2553		vm_pindex_t pdpindex;
2554		pml4_entry_t *pml4;
2555		pdp_entry_t *pdp;
2556		pd_entry_t *pd;
2557
2558		/* Wire up a new PTE page */
2559		pdpindex = ptepindex >> NPDPEPGSHIFT;
2560		pml4index = pdpindex >> NPML4EPGSHIFT;
2561
		/* First, find the pdp and check that it's valid. */
2563		pml4 = &pmap->pm_pml4[pml4index];
2564		if ((*pml4 & PG_V) == 0) {
2565			/* Have to allocate a new pd, recurse */
2566			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2567			    lockp) == NULL) {
2568				--m->wire_count;
2569				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2570				vm_page_free_zero(m);
2571				return (NULL);
2572			}
2573			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2574			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2575		} else {
2576			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2577			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2578			if ((*pdp & PG_V) == 0) {
2579				/* Have to allocate a new pd, recurse */
2580				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2581				    lockp) == NULL) {
2582					--m->wire_count;
2583					atomic_subtract_int(&vm_cnt.v_wire_count,
2584					    1);
2585					vm_page_free_zero(m);
2586					return (NULL);
2587				}
2588			} else {
2589				/* Add reference to the pd page */
2590				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2591				pdpg->wire_count++;
2592			}
2593		}
2594		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2595
2596		/* Now we know where the page directory page is */
2597		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2598		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2599	}
2600
2601	pmap_resident_count_inc(pmap, 1);
2602
2603	return (m);
2604}
2605
2606static vm_page_t
2607pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2608{
2609	vm_pindex_t pdpindex, ptepindex;
2610	pdp_entry_t *pdpe, PG_V;
2611	vm_page_t pdpg;
2612
2613	PG_V = pmap_valid_bit(pmap);
2614
2615retry:
2616	pdpe = pmap_pdpe(pmap, va);
2617	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2618		/* Add a reference to the pd page. */
2619		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2620		pdpg->wire_count++;
2621	} else {
2622		/* Allocate a pd page. */
2623		ptepindex = pmap_pde_pindex(va);
2624		pdpindex = ptepindex >> NPDPEPGSHIFT;
2625		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
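		/*
		 * A NULL return with a non-NULL "lockp" means that the
		 * allocation slept and the pmap lock was dropped; the PDPE
		 * may have appeared in the meantime, so redo the lookup
		 * rather than failing.
		 */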
2626		if (pdpg == NULL && lockp != NULL)
2627			goto retry;
2628	}
2629	return (pdpg);
2630}
2631
2632static vm_page_t
2633pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2634{
2635	vm_pindex_t ptepindex;
2636	pd_entry_t *pd, PG_V;
2637	vm_page_t m;
2638
2639	PG_V = pmap_valid_bit(pmap);
2640
2641	/*
2642	 * Calculate pagetable page index
2643	 */
2644	ptepindex = pmap_pde_pindex(va);
2645retry:
2646	/*
2647	 * Get the page directory entry
2648	 */
2649	pd = pmap_pde(pmap, va);
2650
2651	/*
2652	 * This supports switching from a 2MB page to a
2653	 * normal 4K page.
2654	 */
2655	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2656		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2657			/*
2658			 * Invalidation of the 2MB page mapping may have caused
2659			 * the deallocation of the underlying PD page.
2660			 */
2661			pd = NULL;
2662		}
2663	}
2664
2665	/*
2666	 * If the page table page is mapped, we just increment the
2667	 * hold count, and activate it.
2668	 */
2669	if (pd != NULL && (*pd & PG_V) != 0) {
2670		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2671		m->wire_count++;
2672	} else {
2673		/*
2674		 * Here if the pte page isn't mapped, or if it has been
2675		 * deallocated.
2676		 */
2677		m = _pmap_allocpte(pmap, ptepindex, lockp);
2678		if (m == NULL && lockp != NULL)
2679			goto retry;
2680	}
2681	return (m);
2682}
2683
2684
2685/***************************************************
2686 * Pmap allocation/deallocation routines.
2687 ***************************************************/
2688
2689/*
2690 * Release any resources held by the given physical map.
2691 * Called when a pmap initialized by pmap_pinit is being released.
2692 * Should only be called if the map contains no valid mappings.
2693 */
2694void
2695pmap_release(pmap_t pmap)
2696{
2697	vm_page_t m;
2698	int i;
2699
2700	KASSERT(pmap->pm_stats.resident_count == 0,
2701	    ("pmap_release: pmap resident count %ld != 0",
2702	    pmap->pm_stats.resident_count));
2703	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2704	    ("pmap_release: pmap has reserved page table page(s)"));
2705	KASSERT(CPU_EMPTY(&pmap->pm_active),
2706	    ("releasing active pmap %p", pmap));
2707
2708	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2709
2710	for (i = 0; i < NKPML4E; i++)	/* KVA */
2711		pmap->pm_pml4[KPML4BASE + i] = 0;
2712	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2713		pmap->pm_pml4[DMPML4I + i] = 0;
2714	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
2715
2716	m->wire_count--;
2717	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2718	vm_page_free_zero(m);
2719}
2720
2721static int
2722kvm_size(SYSCTL_HANDLER_ARGS)
2723{
2724	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2725
2726	return sysctl_handle_long(oidp, &ksize, 0, req);
2727}
2728SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2729    0, 0, kvm_size, "LU", "Size of KVM");
2730
2731static int
2732kvm_free(SYSCTL_HANDLER_ARGS)
2733{
2734	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2735
2736	return sysctl_handle_long(oidp, &kfree, 0, req);
2737}
2738SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2739    0, 0, kvm_free, "LU", "Amount of KVM free");
2740
2741/*
2742 * grow the number of kernel page table entries, if needed
2743 */
2744void
2745pmap_growkernel(vm_offset_t addr)
2746{
2747	vm_paddr_t paddr;
2748	vm_page_t nkpg;
2749	pd_entry_t *pde, newpdir;
2750	pdp_entry_t *pdpe;
2751
2752	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2753
2754	/*
2755	 * Return if "addr" is within the range of kernel page table pages
2756	 * that were preallocated during pmap bootstrap.  Moreover, leave
2757	 * "kernel_vm_end" and the kernel page table as they were.
2758	 *
2759	 * The correctness of this action is based on the following
2760	 * argument: vm_map_insert() allocates contiguous ranges of the
2761	 * kernel virtual address space.  It calls this function if a range
2762	 * ends after "kernel_vm_end".  If the kernel is mapped between
2763	 * "kernel_vm_end" and "addr", then the range cannot begin at
	 * "kernel_vm_end".  In fact, its beginning address cannot be less
	 * than "KERNBASE", where the kernel is mapped.  Thus, there is no
	 * immediate need to allocate any new kernel page table pages
	 * between "kernel_vm_end" and "KERNBASE".
2768	 */
2769	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2770		return;
2771
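	/*
	 * The kernel page table is grown one 2MB page directory entry at a
	 * time, so round the requested address up to a 2MB boundary.
	 */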
2772	addr = roundup2(addr, NBPDR);
2773	if (addr - 1 >= kernel_map->max_offset)
2774		addr = kernel_map->max_offset;
2775	while (kernel_vm_end < addr) {
2776		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2777		if ((*pdpe & X86_PG_V) == 0) {
2778			/* We need a new PDP entry */
2779			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2780			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2781			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2782			if (nkpg == NULL)
2783				panic("pmap_growkernel: no memory to grow kernel");
2784			if ((nkpg->flags & PG_ZERO) == 0)
2785				pmap_zero_page(nkpg);
2786			paddr = VM_PAGE_TO_PHYS(nkpg);
2787			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2788			    X86_PG_A | X86_PG_M);
2789			continue; /* try again */
2790		}
2791		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2792		if ((*pde & X86_PG_V) != 0) {
2793			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2794			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2795				kernel_vm_end = kernel_map->max_offset;
2796				break;
2797			}
2798			continue;
2799		}
2800
2801		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2802		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2803		    VM_ALLOC_ZERO);
2804		if (nkpg == NULL)
2805			panic("pmap_growkernel: no memory to grow kernel");
2806		if ((nkpg->flags & PG_ZERO) == 0)
2807			pmap_zero_page(nkpg);
2808		paddr = VM_PAGE_TO_PHYS(nkpg);
2809		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2810		pde_store(pde, newpdir);
2811
2812		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2813		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2814			kernel_vm_end = kernel_map->max_offset;
2815			break;
2816		}
2817	}
2818}
2819
2820
2821/***************************************************
2822 * page management routines.
2823 ***************************************************/
2824
2825CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2826CTASSERT(_NPCM == 3);
2827CTASSERT(_NPCPV == 168);
2828
2829static __inline struct pv_chunk *
2830pv_to_chunk(pv_entry_t pv)
2831{
2832
2833	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2834}
2835
2836#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2837
2838#define	PC_FREE0	0xfffffffffffffffful
2839#define	PC_FREE1	0xfffffffffffffffful
2840#define	PC_FREE2	0x000000fffffffffful
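/*
 * A chunk holds 168 pv entries, so its free bitmap spans 64 + 64 + 40
 * bits and only the low 40 bits of PC_FREE2 are set.  Together with the
 * chunk header, the entries fill exactly one 4KB page, which the
 * CTASSERTs above verify.
 */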
2841
2842static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2843
2844#ifdef PV_STATS
2845static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2846
2847SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2848	"Current number of pv entry chunks");
2849SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2850	"Current number of pv entry chunks allocated");
2851SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Total number of pv entry chunk frees");
2853SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2854	"Number of times tried to get a chunk page but failed.");
2855
2856static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2857static int pv_entry_spare;
2858
2859SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2860	"Current number of pv entry frees");
2861SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2862	"Current number of pv entry allocs");
2863SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2864	"Current number of pv entries");
2865SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2866	"Current number of spare pv entries");
2867#endif
2868
2869static void
2870reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
2871{
2872
2873	if (pmap == NULL)
2874		return;
2875	pmap_invalidate_all(pmap);
2876	if (pmap != locked_pmap)
2877		PMAP_UNLOCK(pmap);
2878	if (start_di)
2879		pmap_delayed_invl_finished();
2880}
2881
2882/*
2883 * We are in a serious low memory condition.  Resort to
2884 * drastic measures to free some pages so we can allocate
2885 * another pv entry chunk.
2886 *
2887 * Returns NULL if PV entries were reclaimed from the specified pmap.
2888 *
 * We do not, however, unmap 2mpages, because doing so would cause
 * subsequent accesses to allocate per-page pv entries until repromotion
 * occurs, thereby exacerbating the shortage of free pv entries.
2892 */
2893static vm_page_t
2894reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2895{
2896	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
2897	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
2898	struct md_page *pvh;
2899	pd_entry_t *pde;
2900	pmap_t next_pmap, pmap;
2901	pt_entry_t *pte, tpte;
2902	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2903	pv_entry_t pv;
2904	vm_offset_t va;
2905	vm_page_t m, m_pc;
2906	struct spglist free;
2907	uint64_t inuse;
2908	int bit, field, freed;
2909	bool start_di;
2910	static int active_reclaims = 0;
2911
2912	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2913	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2914	pmap = NULL;
2915	m_pc = NULL;
2916	PG_G = PG_A = PG_M = PG_RW = 0;
2917	SLIST_INIT(&free);
2918	bzero(&pc_marker_b, sizeof(pc_marker_b));
2919	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
2920	pc_marker = (struct pv_chunk *)&pc_marker_b;
2921	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
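	/*
	 * The two markers, recognizable by their NULL pc_pmap, bracket this
	 * invocation's traversal of the global pc_lru list: pc_marker
	 * records the scan position across drops of pv_chunks_mutex, and
	 * pc_marker_end bounds the scan so that chunks moved to the tail of
	 * the list during reclamation are not revisited.
	 */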
2922
2923	/*
2924	 * A delayed invalidation block should already be active if
2925	 * pmap_advise() or pmap_remove() called this function by way
2926	 * of pmap_demote_pde_locked().
2927	 */
2928	start_di = pmap_not_in_di();
2929
2930	mtx_lock(&pv_chunks_mutex);
2931	active_reclaims++;
2932	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
2933	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
2934	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
2935	    SLIST_EMPTY(&free)) {
2936		next_pmap = pc->pc_pmap;
2937		if (next_pmap == NULL) {
2938			/*
2939			 * The next chunk is a marker.  However, it is
2940			 * not our marker, so active_reclaims must be
2941			 * > 1.  Consequently, the next_chunk code
2942			 * will not rotate the pv_chunks list.
2943			 */
2944			goto next_chunk;
2945		}
2946		mtx_unlock(&pv_chunks_mutex);
2947
2948		/*
2949		 * A pv_chunk can only be removed from the pc_lru list
		 * when both pv_chunks_mutex is owned and the
2951		 * corresponding pmap is locked.
2952		 */
2953		if (pmap != next_pmap) {
2954			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
2955			    start_di);
2956			pmap = next_pmap;
2957			/* Avoid deadlock and lock recursion. */
2958			if (pmap > locked_pmap) {
2959				RELEASE_PV_LIST_LOCK(lockp);
2960				PMAP_LOCK(pmap);
2961				if (start_di)
2962					pmap_delayed_invl_started();
2963				mtx_lock(&pv_chunks_mutex);
2964				continue;
2965			} else if (pmap != locked_pmap) {
2966				if (PMAP_TRYLOCK(pmap)) {
2967					if (start_di)
2968						pmap_delayed_invl_started();
2969					mtx_lock(&pv_chunks_mutex);
2970					continue;
2971				} else {
2972					pmap = NULL; /* pmap is not locked */
2973					mtx_lock(&pv_chunks_mutex);
2974					pc = TAILQ_NEXT(pc_marker, pc_lru);
2975					if (pc == NULL ||
2976					    pc->pc_pmap != next_pmap)
2977						continue;
2978					goto next_chunk;
2979				}
2980			} else if (start_di)
2981				pmap_delayed_invl_started();
2982			PG_G = pmap_global_bit(pmap);
2983			PG_A = pmap_accessed_bit(pmap);
2984			PG_M = pmap_modified_bit(pmap);
2985			PG_RW = pmap_rw_bit(pmap);
2986		}
2987
2988		/*
2989		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2990		 */
2991		freed = 0;
2992		for (field = 0; field < _NPCM; field++) {
2993			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2994			    inuse != 0; inuse &= ~(1UL << bit)) {
2995				bit = bsfq(inuse);
2996				pv = &pc->pc_pventry[field * 64 + bit];
2997				va = pv->pv_va;
2998				pde = pmap_pde(pmap, va);
2999				if ((*pde & PG_PS) != 0)
3000					continue;
3001				pte = pmap_pde_to_pte(pde, va);
3002				if ((*pte & PG_W) != 0)
3003					continue;
3004				tpte = pte_load_clear(pte);
3005				if ((tpte & PG_G) != 0)
3006					pmap_invalidate_page(pmap, va);
3007				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3008				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3009					vm_page_dirty(m);
3010				if ((tpte & PG_A) != 0)
3011					vm_page_aflag_set(m, PGA_REFERENCED);
3012				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3013				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3014				m->md.pv_gen++;
3015				if (TAILQ_EMPTY(&m->md.pv_list) &&
3016				    (m->flags & PG_FICTITIOUS) == 0) {
3017					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3018					if (TAILQ_EMPTY(&pvh->pv_list)) {
3019						vm_page_aflag_clear(m,
3020						    PGA_WRITEABLE);
3021					}
3022				}
3023				pmap_delayed_invl_page(m);
3024				pc->pc_map[field] |= 1UL << bit;
3025				pmap_unuse_pt(pmap, va, *pde, &free);
3026				freed++;
3027			}
3028		}
3029		if (freed == 0) {
3030			mtx_lock(&pv_chunks_mutex);
3031			goto next_chunk;
3032		}
3033		/* Every freed mapping is for a 4 KB page. */
3034		pmap_resident_count_dec(pmap, freed);
3035		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3036		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3037		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3038		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3039		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
3040		    pc->pc_map[2] == PC_FREE2) {
3041			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3042			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3043			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3044			/* Entire chunk is free; return it. */
3045			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3046			dump_drop_page(m_pc->phys_addr);
3047			mtx_lock(&pv_chunks_mutex);
3048			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3049			break;
3050		}
3051		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3052		mtx_lock(&pv_chunks_mutex);
3053		/* One freed pv entry in locked_pmap is sufficient. */
3054		if (pmap == locked_pmap)
3055			break;
3056next_chunk:
3057		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
3058		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
3059		if (active_reclaims == 1 && pmap != NULL) {
3060			/*
3061			 * Rotate the pv chunks list so that we do not
3062			 * scan the same pv chunks that could not be
3063			 * freed (because they contained a wired
3064			 * and/or superpage mapping) on every
3065			 * invocation of reclaim_pv_chunk().
3066			 */
3067			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
3068				MPASS(pc->pc_pmap != NULL);
3069				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3070				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3071			}
3072		}
3073	}
3074	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
3075	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
3076	active_reclaims--;
3077	mtx_unlock(&pv_chunks_mutex);
3078	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
3079	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3080		m_pc = SLIST_FIRST(&free);
3081		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3082		/* Recycle a freed page table page. */
3083		m_pc->wire_count = 1;
3084	}
3085	pmap_free_zero_pages(&free);
3086	return (m_pc);
3087}
3088
3089/*
3090 * free the pv_entry back to the free list
3091 */
3092static void
3093free_pv_entry(pmap_t pmap, pv_entry_t pv)
3094{
3095	struct pv_chunk *pc;
3096	int idx, field, bit;
3097
3098	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3099	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3100	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3101	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3102	pc = pv_to_chunk(pv);
3103	idx = pv - &pc->pc_pventry[0];
3104	field = idx / 64;
3105	bit = idx % 64;
3106	pc->pc_map[field] |= 1ul << bit;
3107	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
3108	    pc->pc_map[2] != PC_FREE2) {
3109		/* 98% of the time, pc is already at the head of the list. */
3110		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3111			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3112			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3113		}
3114		return;
3115	}
3116	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3117	free_pv_chunk(pc);
3118}
3119
3120static void
3121free_pv_chunk(struct pv_chunk *pc)
3122{
3123	vm_page_t m;
3124
3125	mtx_lock(&pv_chunks_mutex);
3126 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3127	mtx_unlock(&pv_chunks_mutex);
3128	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3129	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3130	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3131	/* entire chunk is free, return it */
3132	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3133	dump_drop_page(m->phys_addr);
3134	vm_page_unwire(m, PQ_NONE);
3135	vm_page_free(m);
3136}
3137
3138/*
3139 * Returns a new PV entry, allocating a new PV chunk from the system when
3140 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3141 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3142 * returned.
3143 *
3144 * The given PV list lock may be released.
3145 */
3146static pv_entry_t
3147get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3148{
3149	int bit, field;
3150	pv_entry_t pv;
3151	struct pv_chunk *pc;
3152	vm_page_t m;
3153
3154	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3155	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3156retry:
3157	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3158	if (pc != NULL) {
3159		for (field = 0; field < _NPCM; field++) {
3160			if (pc->pc_map[field]) {
3161				bit = bsfq(pc->pc_map[field]);
3162				break;
3163			}
3164		}
3165		if (field < _NPCM) {
3166			pv = &pc->pc_pventry[field * 64 + bit];
3167			pc->pc_map[field] &= ~(1ul << bit);
3168			/* If this was the last item, move it to tail */
3169			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
3170			    pc->pc_map[2] == 0) {
3171				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3172				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3173				    pc_list);
3174			}
3175			PV_STAT(atomic_add_long(&pv_entry_count, 1));
3176			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3177			return (pv);
3178		}
3179	}
3180	/* No free items, allocate another chunk */
3181	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3182	    VM_ALLOC_WIRED);
3183	if (m == NULL) {
3184		if (lockp == NULL) {
3185			PV_STAT(pc_chunk_tryfail++);
3186			return (NULL);
3187		}
3188		m = reclaim_pv_chunk(pmap, lockp);
3189		if (m == NULL)
3190			goto retry;
3191	}
3192	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3193	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3194	dump_add_page(m->phys_addr);
3195	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3196	pc->pc_pmap = pmap;
3197	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
3198	pc->pc_map[1] = PC_FREE1;
3199	pc->pc_map[2] = PC_FREE2;
3200	mtx_lock(&pv_chunks_mutex);
3201	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3202	mtx_unlock(&pv_chunks_mutex);
3203	pv = &pc->pc_pventry[0];
3204	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3205	PV_STAT(atomic_add_long(&pv_entry_count, 1));
3206	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3207	return (pv);
3208}
3209
3210/*
3211 * Returns the number of one bits within the given PV chunk map.
3212 *
 * The errata for Intel processors state that "POPCNT Instruction May
3214 * Take Longer to Execute Than Expected".  It is believed that the
3215 * issue is the spurious dependency on the destination register.
3216 * Provide a hint to the register rename logic that the destination
3217 * value is overwritten, by clearing it, as suggested in the
3218 * optimization manual.  It should be cheap for unaffected processors
3219 * as well.
3220 *
 * Reference numbers for the errata are
3222 * 4th Gen Core: HSD146
3223 * 5th Gen Core: BDM85
3224 * 6th Gen Core: SKL029
3225 */
3226static int
3227popcnt_pc_map_pq(uint64_t *map)
3228{
3229	u_long result, tmp;
3230
3231	__asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
3232	    "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
3233	    "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
3234	    : "=&r" (result), "=&r" (tmp)
3235	    : "m" (map[0]), "m" (map[1]), "m" (map[2]));
3236	return (result);
3237}
3238
3239/*
3240 * Ensure that the number of spare PV entries in the specified pmap meets or
3241 * exceeds the given count, "needed".
3242 *
3243 * The given PV list lock may be released.
3244 */
3245static void
3246reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3247{
3248	struct pch new_tail;
3249	struct pv_chunk *pc;
3250	int avail, free;
3251	vm_page_t m;
3252
3253	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3254	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3255
3256	/*
3257	 * Newly allocated PV chunks must be stored in a private list until
3258	 * the required number of PV chunks have been allocated.  Otherwise,
3259	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3260	 * contrast, these chunks must be added to the pmap upon allocation.
3261	 */
3262	TAILQ_INIT(&new_tail);
3263retry:
3264	avail = 0;
3265	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3266#ifndef __POPCNT__
3267		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
3268			bit_count((bitstr_t *)pc->pc_map, 0,
3269			    sizeof(pc->pc_map) * NBBY, &free);
3270		else
3271#endif
3272		free = popcnt_pc_map_pq(pc->pc_map);
3273		if (free == 0)
3274			break;
3275		avail += free;
3276		if (avail >= needed)
3277			break;
3278	}
3279	for (; avail < needed; avail += _NPCPV) {
3280		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3281		    VM_ALLOC_WIRED);
3282		if (m == NULL) {
3283			m = reclaim_pv_chunk(pmap, lockp);
3284			if (m == NULL)
3285				goto retry;
3286		}
3287		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3288		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3289		dump_add_page(m->phys_addr);
3290		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3291		pc->pc_pmap = pmap;
3292		pc->pc_map[0] = PC_FREE0;
3293		pc->pc_map[1] = PC_FREE1;
3294		pc->pc_map[2] = PC_FREE2;
3295		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3296		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3297		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3298	}
3299	if (!TAILQ_EMPTY(&new_tail)) {
3300		mtx_lock(&pv_chunks_mutex);
3301		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3302		mtx_unlock(&pv_chunks_mutex);
3303	}
3304}
3305
3306/*
3307 * First find and then remove the pv entry for the specified pmap and virtual
3308 * address from the specified pv list.  Returns the pv entry if found and NULL
3309 * otherwise.  This operation can be performed on pv lists for either 4KB or
3310 * 2MB page mappings.
3311 */
3312static __inline pv_entry_t
3313pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3314{
3315	pv_entry_t pv;
3316
3317	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3318		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3319			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3320			pvh->pv_gen++;
3321			break;
3322		}
3323	}
3324	return (pv);
3325}
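
/*
 * Note that the pv_gen increment above allows a caller that must
 * temporarily drop the PV list lock (for example, pmap_remove_all())
 * to detect that the list changed while it was unlocked and restart
 * its scan.
 */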
3326
3327/*
3328 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3329 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3330 * entries for each of the 4KB page mappings.
3331 */
3332static void
3333pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3334    struct rwlock **lockp)
3335{
3336	struct md_page *pvh;
3337	struct pv_chunk *pc;
3338	pv_entry_t pv;
3339	vm_offset_t va_last;
3340	vm_page_t m;
3341	int bit, field;
3342
3343	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3344	KASSERT((pa & PDRMASK) == 0,
3345	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3346	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3347
3348	/*
3349	 * Transfer the 2mpage's pv entry for this mapping to the first
3350	 * page's pv list.  Once this transfer begins, the pv list lock
3351	 * must not be released until the last pv entry is reinstantiated.
3352	 */
3353	pvh = pa_to_pvh(pa);
3354	va = trunc_2mpage(va);
3355	pv = pmap_pvh_remove(pvh, pmap, va);
3356	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3357	m = PHYS_TO_VM_PAGE(pa);
3358	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3359	m->md.pv_gen++;
3360	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3361	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3362	va_last = va + NBPDR - PAGE_SIZE;
3363	for (;;) {
3364		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3365		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3366		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3367		for (field = 0; field < _NPCM; field++) {
3368			while (pc->pc_map[field]) {
3369				bit = bsfq(pc->pc_map[field]);
3370				pc->pc_map[field] &= ~(1ul << bit);
3371				pv = &pc->pc_pventry[field * 64 + bit];
3372				va += PAGE_SIZE;
3373				pv->pv_va = va;
3374				m++;
3375				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3376			    ("pmap_pv_demote_pde: page %p is not managed", m));
3377				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3378				m->md.pv_gen++;
3379				if (va == va_last)
3380					goto out;
3381			}
3382		}
3383		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3384		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3385	}
3386out:
3387	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3388		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3389		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3390	}
3391	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3392	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3393}
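
/*
 * Note on the counts above: a 2MB mapping spans NBPDR / PAGE_SIZE ==
 * NPTEPG == 512 4KB pages, so demotion reuses the existing pv entry for
 * the first 4KB page and instantiates NPTEPG - 1 == 511 new ones.
 * pmap_pv_promote_pde() below frees the same number.
 */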
3394
3395#if VM_NRESERVLEVEL > 0
3396/*
3397 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3398 * replace the many pv entries for the 4KB page mappings by a single pv entry
3399 * for the 2MB page mapping.
3400 */
3401static void
3402pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3403    struct rwlock **lockp)
3404{
3405	struct md_page *pvh;
3406	pv_entry_t pv;
3407	vm_offset_t va_last;
3408	vm_page_t m;
3409
3410	KASSERT((pa & PDRMASK) == 0,
3411	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3412	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3413
3414	/*
3415	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3416	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3417	 * a transfer avoids the possibility that get_pv_entry() calls
3418	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3419	 * mappings that is being promoted.
3420	 */
3421	m = PHYS_TO_VM_PAGE(pa);
3422	va = trunc_2mpage(va);
3423	pv = pmap_pvh_remove(&m->md, pmap, va);
3424	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3425	pvh = pa_to_pvh(pa);
3426	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3427	pvh->pv_gen++;
3428	/* Free the remaining NPTEPG - 1 pv entries. */
3429	va_last = va + NBPDR - PAGE_SIZE;
3430	do {
3431		m++;
3432		va += PAGE_SIZE;
3433		pmap_pvh_free(&m->md, pmap, va);
3434	} while (va < va_last);
3435}
3436#endif /* VM_NRESERVLEVEL > 0 */
3437
3438/*
3439 * First find and then destroy the pv entry for the specified pmap and virtual
3440 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3441 * page mappings.
3442 */
3443static void
3444pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3445{
3446	pv_entry_t pv;
3447
3448	pv = pmap_pvh_remove(pvh, pmap, va);
3449	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3450	free_pv_entry(pmap, pv);
3451}
3452
3453/*
3454 * Conditionally create the PV entry for a 4KB page mapping if the required
3455 * memory can be allocated without resorting to reclamation.
3456 */
3457static boolean_t
3458pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3459    struct rwlock **lockp)
3460{
3461	pv_entry_t pv;
3462
3463	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3464	/* Pass NULL instead of the lock pointer to disable reclamation. */
3465	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3466		pv->pv_va = va;
3467		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3468		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3469		m->md.pv_gen++;
3470		return (TRUE);
3471	} else
3472		return (FALSE);
3473}
3474
3475/*
3476 * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3477 * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3478 * false if the PV entry cannot be allocated without resorting to reclamation.
3479 */
3480static bool
3481pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
3482    struct rwlock **lockp)
3483{
3484	struct md_page *pvh;
3485	pv_entry_t pv;
3486	vm_paddr_t pa;
3487
3488	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3489	/* Pass NULL instead of the lock pointer to disable reclamation. */
3490	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3491	    NULL : lockp)) == NULL)
3492		return (false);
3493	pv->pv_va = va;
3494	pa = pde & PG_PS_FRAME;
3495	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3496	pvh = pa_to_pvh(pa);
3497	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3498	pvh->pv_gen++;
3499	return (true);
3500}
3501
3502/*
3503 * Fills a page table page with mappings to consecutive physical pages.
3504 */
3505static void
3506pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3507{
3508	pt_entry_t *pte;
3509
3510	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3511		*pte = newpte;
3512		newpte += PAGE_SIZE;
3513	}
3514}
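
/*
 * pmap_demote_pde_locked() uses this to populate a page table page: the
 * first PTE is derived from the old 2MB PDE with PG_PS cleared, and each
 * successive PTE maps the next 4KB physical page.
 */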
3515
3516/*
3517 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3518 * mapping is invalidated.
3519 */
3520static boolean_t
3521pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3522{
3523	struct rwlock *lock;
3524	boolean_t rv;
3525
3526	lock = NULL;
3527	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3528	if (lock != NULL)
3529		rw_wunlock(lock);
3530	return (rv);
3531}
3532
3533static boolean_t
3534pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3535    struct rwlock **lockp)
3536{
3537	pd_entry_t newpde, oldpde;
3538	pt_entry_t *firstpte, newpte;
3539	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3540	vm_paddr_t mptepa;
3541	vm_page_t mpte;
3542	struct spglist free;
3543	vm_offset_t sva;
3544	int PG_PTE_CACHE;
3545
3546	PG_G = pmap_global_bit(pmap);
3547	PG_A = pmap_accessed_bit(pmap);
3548	PG_M = pmap_modified_bit(pmap);
3549	PG_RW = pmap_rw_bit(pmap);
3550	PG_V = pmap_valid_bit(pmap);
3551	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3552
3553	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3554	oldpde = *pde;
3555	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3556	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3557	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
3558	    NULL) {
3559		KASSERT((oldpde & PG_W) == 0,
3560		    ("pmap_demote_pde: page table page for a wired mapping"
3561		    " is missing"));
3562
3563		/*
3564		 * Invalidate the 2MB page mapping and return "failure" if the
3565		 * mapping was never accessed or the allocation of the new
3566		 * page table page fails.  If the 2MB page mapping belongs to
3567		 * the direct map region of the kernel's address space, then
3568		 * the page allocation request specifies the highest possible
3569		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3570		 * normal.  Page table pages are preallocated for every other
3571		 * part of the kernel address space, so the direct map region
3572		 * is the only part of the kernel address space that must be
3573		 * handled here.
3574		 */
3575		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3576		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3577		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3578		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3579			SLIST_INIT(&free);
3580			sva = trunc_2mpage(va);
3581			pmap_remove_pde(pmap, pde, sva, &free, lockp);
3582			if ((oldpde & PG_G) == 0)
3583				pmap_invalidate_pde_page(pmap, sva, oldpde);
3584			pmap_free_zero_pages(&free);
3585			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3586			    " in pmap %p", va, pmap);
3587			return (FALSE);
3588		}
3589		if (va < VM_MAXUSER_ADDRESS)
3590			pmap_resident_count_inc(pmap, 1);
3591	}
3592	mptepa = VM_PAGE_TO_PHYS(mpte);
3593	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3594	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3595	KASSERT((oldpde & PG_A) != 0,
3596	    ("pmap_demote_pde: oldpde is missing PG_A"));
3597	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3598	    ("pmap_demote_pde: oldpde is missing PG_M"));
3599	newpte = oldpde & ~PG_PS;
3600	newpte = pmap_swap_pat(pmap, newpte);
3601
3602	/*
3603	 * If the page table page is new, initialize it.
3604	 */
3605	if (mpte->wire_count == 1) {
3606		mpte->wire_count = NPTEPG;
3607		pmap_fill_ptp(firstpte, newpte);
3608	}
3609	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3610	    ("pmap_demote_pde: firstpte and newpte map different physical"
3611	    " addresses"));
3612
3613	/*
3614	 * If the mapping has changed attributes, update the page table
3615	 * entries.
3616	 */
3617	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3618		pmap_fill_ptp(firstpte, newpte);
3619
3620	/*
3621	 * The spare PV entries must be reserved prior to demoting the
3622	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
3623	 * of the PDE and the PV lists will be inconsistent, which can result
3624	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3625	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3626	 * PV entry for the 2MB page mapping that is being demoted.
3627	 */
3628	if ((oldpde & PG_MANAGED) != 0)
3629		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3630
3631	/*
3632	 * Demote the mapping.  This pmap is locked.  The old PDE has
3633	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3634	 * set.  Thus, there is no danger of a race with another
3635	 * processor changing the setting of PG_A and/or PG_M between
3636	 * the read above and the store below.
3637	 */
3638	if (workaround_erratum383)
3639		pmap_update_pde(pmap, va, pde, newpde);
3640	else
3641		pde_store(pde, newpde);
3642
3643	/*
3644	 * Invalidate a stale recursive mapping of the page table page.
3645	 */
3646	if (va >= VM_MAXUSER_ADDRESS)
3647		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3648
3649	/*
3650	 * Demote the PV entry.
3651	 */
3652	if ((oldpde & PG_MANAGED) != 0)
3653		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3654
3655	atomic_add_long(&pmap_pde_demotions, 1);
3656	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3657	    " in pmap %p", va, pmap);
3658	return (TRUE);
3659}
3660
3661/*
3662 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3663 */
3664static void
3665pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3666{
3667	pd_entry_t newpde;
3668	vm_paddr_t mptepa;
3669	vm_page_t mpte;
3670
3671	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3672	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3673	mpte = pmap_remove_pt_page(pmap, va);
3674	if (mpte == NULL)
3675		panic("pmap_remove_kernel_pde: Missing pt page.");
3676
3677	mptepa = VM_PAGE_TO_PHYS(mpte);
3678	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3679
3680	/*
3681	 * Initialize the page table page.
3682	 */
3683	pagezero((void *)PHYS_TO_DMAP(mptepa));
3684
3685	/*
3686	 * Demote the mapping.
3687	 */
3688	if (workaround_erratum383)
3689		pmap_update_pde(pmap, va, pde, newpde);
3690	else
3691		pde_store(pde, newpde);
3692
3693	/*
3694	 * Invalidate a stale recursive mapping of the page table page.
3695	 */
3696	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3697}
3698
3699/*
3700 * pmap_remove_pde: Remove a 2MB superpage mapping from the given pmap.
3701 */
3702static int
3703pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3704    struct spglist *free, struct rwlock **lockp)
3705{
3706	struct md_page *pvh;
3707	pd_entry_t oldpde;
3708	vm_offset_t eva, va;
3709	vm_page_t m, mpte;
3710	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3711
3712	PG_G = pmap_global_bit(pmap);
3713	PG_A = pmap_accessed_bit(pmap);
3714	PG_M = pmap_modified_bit(pmap);
3715	PG_RW = pmap_rw_bit(pmap);
3716
3717	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3718	KASSERT((sva & PDRMASK) == 0,
3719	    ("pmap_remove_pde: sva is not 2mpage aligned"));
3720	oldpde = pte_load_clear(pdq);
3721	if (oldpde & PG_W)
3722		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3723	if ((oldpde & PG_G) != 0)
3724		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
3725	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3726	if (oldpde & PG_MANAGED) {
3727		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3728		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3729		pmap_pvh_free(pvh, pmap, sva);
3730		eva = sva + NBPDR;
3731		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3732		    va < eva; va += PAGE_SIZE, m++) {
3733			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3734				vm_page_dirty(m);
3735			if (oldpde & PG_A)
3736				vm_page_aflag_set(m, PGA_REFERENCED);
3737			if (TAILQ_EMPTY(&m->md.pv_list) &&
3738			    TAILQ_EMPTY(&pvh->pv_list))
3739				vm_page_aflag_clear(m, PGA_WRITEABLE);
3740			pmap_delayed_invl_page(m);
3741		}
3742	}
3743	if (pmap == kernel_pmap) {
3744		pmap_remove_kernel_pde(pmap, pdq, sva);
3745	} else {
3746		mpte = pmap_remove_pt_page(pmap, sva);
3747		if (mpte != NULL) {
3748			pmap_resident_count_dec(pmap, 1);
3749			KASSERT(mpte->wire_count == NPTEPG,
3750			    ("pmap_remove_pde: pte page wire count error"));
3751			mpte->wire_count = 0;
3752			pmap_add_delayed_free_list(mpte, free, FALSE);
3753		}
3754	}
3755	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3756}
3757
3758/*
3759 * pmap_remove_pte: Remove a 4KB page mapping from the given pmap.
3760 */
3761static int
3762pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3763    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3764{
3765	struct md_page *pvh;
3766	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3767	vm_page_t m;
3768
3769	PG_A = pmap_accessed_bit(pmap);
3770	PG_M = pmap_modified_bit(pmap);
3771	PG_RW = pmap_rw_bit(pmap);
3772
3773	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3774	oldpte = pte_load_clear(ptq);
3775	if (oldpte & PG_W)
3776		pmap->pm_stats.wired_count -= 1;
3777	pmap_resident_count_dec(pmap, 1);
3778	if (oldpte & PG_MANAGED) {
3779		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3780		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3781			vm_page_dirty(m);
3782		if (oldpte & PG_A)
3783			vm_page_aflag_set(m, PGA_REFERENCED);
3784		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3785		pmap_pvh_free(&m->md, pmap, va);
3786		if (TAILQ_EMPTY(&m->md.pv_list) &&
3787		    (m->flags & PG_FICTITIOUS) == 0) {
3788			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3789			if (TAILQ_EMPTY(&pvh->pv_list))
3790				vm_page_aflag_clear(m, PGA_WRITEABLE);
3791		}
3792		pmap_delayed_invl_page(m);
3793	}
3794	return (pmap_unuse_pt(pmap, va, ptepde, free));
3795}
3796
3797/*
3798 * Remove a single page from a process address space.
3799 */
3800static void
3801pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3802    struct spglist *free)
3803{
3804	struct rwlock *lock;
3805	pt_entry_t *pte, PG_V;
3806
3807	PG_V = pmap_valid_bit(pmap);
3808	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3809	if ((*pde & PG_V) == 0)
3810		return;
3811	pte = pmap_pde_to_pte(pde, va);
3812	if ((*pte & PG_V) == 0)
3813		return;
3814	lock = NULL;
3815	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3816	if (lock != NULL)
3817		rw_wunlock(lock);
3818	pmap_invalidate_page(pmap, va);
3819}
3820
3821/*
3822 * Removes the specified range of addresses from the page table page.
3823 */
3824static bool
3825pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
3826    pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
3827{
3828	pt_entry_t PG_G, *pte;
3829	vm_offset_t va;
3830	bool anyvalid;
3831
3832	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3833	PG_G = pmap_global_bit(pmap);
3834	anyvalid = false;
3835	va = eva;
3836	for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
3837	    sva += PAGE_SIZE) {
3838		if (*pte == 0) {
3839			if (va != eva) {
3840				pmap_invalidate_range(pmap, va, sva);
3841				va = eva;
3842			}
3843			continue;
3844		}
3845		if ((*pte & PG_G) == 0)
3846			anyvalid = true;
3847		else if (va == eva)
3848			va = sva;
3849		if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
3850			sva += PAGE_SIZE;
3851			break;
3852		}
3853	}
3854	if (va != eva)
3855		pmap_invalidate_range(pmap, va, sva);
3856	return (anyvalid);
3857}
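
/*
 * In the loop above, global (PG_G) mappings are invalidated immediately,
 * in ranges delimited by "va" and "sva", whereas the removal of a
 * non-global mapping merely sets the return value so that the caller can
 * finish with a single pmap_invalidate_all().
 */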
3858
3859/*
3860 *	Remove the given range of addresses from the specified map.
3861 *
3862 *	It is assumed that the start and end are properly
3863 *	rounded to the page size.
3864 */
3865void
3866pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3867{
3868	struct rwlock *lock;
3869	vm_offset_t va_next;
3870	pml4_entry_t *pml4e;
3871	pdp_entry_t *pdpe;
3872	pd_entry_t ptpaddr, *pde;
3873	pt_entry_t PG_G, PG_V;
3874	struct spglist free;
3875	int anyvalid;
3876
3877	PG_G = pmap_global_bit(pmap);
3878	PG_V = pmap_valid_bit(pmap);
3879
3880	/*
3881	 * Perform an unsynchronized read.  This is, however, safe.
3882	 */
3883	if (pmap->pm_stats.resident_count == 0)
3884		return;
3885
3886	anyvalid = 0;
3887	SLIST_INIT(&free);
3888
3889	pmap_delayed_invl_started();
3890	PMAP_LOCK(pmap);
3891
3892	/*
3893	 * Special case: removing a single page is a very common
3894	 * operation, and some of the code below can easily be
3895	 * short-circuited for it.
3896	 */
3897	if (sva + PAGE_SIZE == eva) {
3898		pde = pmap_pde(pmap, sva);
3899		if (pde && (*pde & PG_PS) == 0) {
3900			pmap_remove_page(pmap, sva, pde, &free);
3901			goto out;
3902		}
3903	}
3904
3905	lock = NULL;
3906	for (; sva < eva; sva = va_next) {
3907
3908		if (pmap->pm_stats.resident_count == 0)
3909			break;
3910
3911		pml4e = pmap_pml4e(pmap, sva);
3912		if ((*pml4e & PG_V) == 0) {
3913			va_next = (sva + NBPML4) & ~PML4MASK;
3914			if (va_next < sva)
3915				va_next = eva;
3916			continue;
3917		}
3918
3919		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3920		if ((*pdpe & PG_V) == 0) {
3921			va_next = (sva + NBPDP) & ~PDPMASK;
3922			if (va_next < sva)
3923				va_next = eva;
3924			continue;
3925		}
3926
3927		/*
3928		 * Calculate index for next page table.
3929		 */
3930		va_next = (sva + NBPDR) & ~PDRMASK;
3931		if (va_next < sva)
3932			va_next = eva;
3933
3934		pde = pmap_pdpe_to_pde(pdpe, sva);
3935		ptpaddr = *pde;
3936
3937		/*
3938		 * Weed out invalid mappings.
3939		 */
3940		if (ptpaddr == 0)
3941			continue;
3942
3943		/*
3944		 * Check for large page.
3945		 */
3946		if ((ptpaddr & PG_PS) != 0) {
3947			/*
3948			 * Are we removing the entire large page?  If not,
3949			 * demote the mapping and fall through.
3950			 */
3951			if (sva + NBPDR == va_next && eva >= va_next) {
3952				/*
3953				 * The TLB entry for a PG_G mapping is
3954				 * invalidated by pmap_remove_pde().
3955				 */
3956				if ((ptpaddr & PG_G) == 0)
3957					anyvalid = 1;
3958				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3959				continue;
3960			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3961			    &lock)) {
3962				/* The large page mapping was destroyed. */
3963				continue;
3964			} else
3965				ptpaddr = *pde;
3966		}
3967
3968		/*
3969		 * Limit our scan to either the end of the va represented
3970		 * by the current page table page, or to the end of the
3971		 * range being removed.
3972		 */
3973		if (va_next > eva)
3974			va_next = eva;
3975
3976		if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
3977			anyvalid = 1;
3978	}
3979	if (lock != NULL)
3980		rw_wunlock(lock);
3981out:
3982	if (anyvalid)
3983		pmap_invalidate_all(pmap);
3984	PMAP_UNLOCK(pmap);
3985	pmap_delayed_invl_finished();
3986	pmap_free_zero_pages(&free);
3987}
3988
3989/*
3990 *	Routine:	pmap_remove_all
3991 *	Function:
3992 *		Removes this physical page from
3993 *		all physical maps in which it resides.
3994 *		Reflects back modify bits to the pager.
3995 *
3996 *	Notes:
3997 *		Original versions of this routine were very
3998 *		inefficient because they iteratively called
3999 *		pmap_remove (slow...)
4000 */
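
/*
 * Implementation note: any 2MB mapping of the page is demoted first, and
 * then the remaining 4KB mappings are removed.  Because the pmap lock is
 * acquired before the PV list lock in this file, the PV list lock is
 * dropped and reacquired whenever PMAP_TRYLOCK() fails, and the
 * pv_gen/md_gen generation counts are used to detect concurrent changes
 * to the PV lists and restart the scan.
 */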
4001
4002void
4003pmap_remove_all(vm_page_t m)
4004{
4005	struct md_page *pvh;
4006	pv_entry_t pv;
4007	pmap_t pmap;
4008	struct rwlock *lock;
4009	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
4010	pd_entry_t *pde;
4011	vm_offset_t va;
4012	struct spglist free;
4013	int pvh_gen, md_gen;
4014
4015	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4016	    ("pmap_remove_all: page %p is not managed", m));
4017	SLIST_INIT(&free);
4018	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4019	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4020	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4021retry:
4022	rw_wlock(lock);
4023	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4024		pmap = PV_PMAP(pv);
4025		if (!PMAP_TRYLOCK(pmap)) {
4026			pvh_gen = pvh->pv_gen;
4027			rw_wunlock(lock);
4028			PMAP_LOCK(pmap);
4029			rw_wlock(lock);
4030			if (pvh_gen != pvh->pv_gen) {
4031				rw_wunlock(lock);
4032				PMAP_UNLOCK(pmap);
4033				goto retry;
4034			}
4035		}
4036		va = pv->pv_va;
4037		pde = pmap_pde(pmap, va);
4038		(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
4039		PMAP_UNLOCK(pmap);
4040	}
4041	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4042		pmap = PV_PMAP(pv);
4043		if (!PMAP_TRYLOCK(pmap)) {
4044			pvh_gen = pvh->pv_gen;
4045			md_gen = m->md.pv_gen;
4046			rw_wunlock(lock);
4047			PMAP_LOCK(pmap);
4048			rw_wlock(lock);
4049			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4050				rw_wunlock(lock);
4051				PMAP_UNLOCK(pmap);
4052				goto retry;
4053			}
4054		}
4055		PG_A = pmap_accessed_bit(pmap);
4056		PG_M = pmap_modified_bit(pmap);
4057		PG_RW = pmap_rw_bit(pmap);
4058		pmap_resident_count_dec(pmap, 1);
4059		pde = pmap_pde(pmap, pv->pv_va);
4060		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
4061		    " a 2mpage in page %p's pv list", m));
4062		pte = pmap_pde_to_pte(pde, pv->pv_va);
4063		tpte = pte_load_clear(pte);
4064		if (tpte & PG_W)
4065			pmap->pm_stats.wired_count--;
4066		if (tpte & PG_A)
4067			vm_page_aflag_set(m, PGA_REFERENCED);
4068
4069		/*
4070		 * Update the vm_page_t clean and reference bits.
4071		 */
4072		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4073			vm_page_dirty(m);
4074		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
4075		pmap_invalidate_page(pmap, pv->pv_va);
4076		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4077		m->md.pv_gen++;
4078		free_pv_entry(pmap, pv);
4079		PMAP_UNLOCK(pmap);
4080	}
4081	vm_page_aflag_clear(m, PGA_WRITEABLE);
4082	rw_wunlock(lock);
4083	pmap_delayed_invl_wait(m);
4084	pmap_free_zero_pages(&free);
4085}
4086
4087/*
4088 * pmap_protect_pde: Apply the given protection to a 2MB page mapping.
4089 */
4090static boolean_t
4091pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
4092{
4093	pd_entry_t newpde, oldpde;
4094	vm_offset_t eva, va;
4095	vm_page_t m;
4096	boolean_t anychanged;
4097	pt_entry_t PG_G, PG_M, PG_RW;
4098
4099	PG_G = pmap_global_bit(pmap);
4100	PG_M = pmap_modified_bit(pmap);
4101	PG_RW = pmap_rw_bit(pmap);
4102
4103	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4104	KASSERT((sva & PDRMASK) == 0,
4105	    ("pmap_protect_pde: sva is not 2mpage aligned"));
4106	anychanged = FALSE;
4107retry:
4108	oldpde = newpde = *pde;
4109	if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4110	    (PG_MANAGED | PG_M | PG_RW)) {
4111		eva = sva + NBPDR;
4112		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4113		    va < eva; va += PAGE_SIZE, m++)
4114			vm_page_dirty(m);
4115	}
4116	if ((prot & VM_PROT_WRITE) == 0)
4117		newpde &= ~(PG_RW | PG_M);
4118	if ((prot & VM_PROT_EXECUTE) == 0)
4119		newpde |= pg_nx;
4120	if (newpde != oldpde) {
4121		/*
4122		 * As an optimization to future operations on this PDE, clear
4123		 * PG_PROMOTED.  The impending invalidation will remove any
4124		 * lingering 4KB page mappings from the TLB.
4125		 */
4126		if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
4127			goto retry;
4128		if ((oldpde & PG_G) != 0)
4129			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
4130		else
4131			anychanged = TRUE;
4132	}
4133	return (anychanged);
4134}
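
/*
 * The return value above tells pmap_protect() whether a non-global
 * mapping was changed; changed global (PG_G) mappings are invalidated
 * here directly, so they do not depend on the caller's
 * pmap_invalidate_all().
 */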
4135
4136/*
4137 *	Set the physical protection on the
4138 *	specified range of this map as requested.
4139 */
4140void
4141pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4142{
4143	vm_offset_t va_next;
4144	pml4_entry_t *pml4e;
4145	pdp_entry_t *pdpe;
4146	pd_entry_t ptpaddr, *pde;
4147	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
4148	boolean_t anychanged;
4149
4150	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4151	if (prot == VM_PROT_NONE) {
4152		pmap_remove(pmap, sva, eva);
4153		return;
4154	}
4155
4156	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4157	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
4158		return;
4159
4160	PG_G = pmap_global_bit(pmap);
4161	PG_M = pmap_modified_bit(pmap);
4162	PG_V = pmap_valid_bit(pmap);
4163	PG_RW = pmap_rw_bit(pmap);
4164	anychanged = FALSE;
4165
4166	/*
4167	 * Although this function delays and batches the invalidation
4168	 * of stale TLB entries, it does not need to call
4169	 * pmap_delayed_invl_started() and
4170	 * pmap_delayed_invl_finished(), because it does not
4171	 * ordinarily destroy mappings.  Stale TLB entries from
4172	 * protection-only changes need only be invalidated before the
4173	 * pmap lock is released, because protection-only changes do
4174	 * not destroy PV entries.  Even operations that iterate over
4175	 * a physical page's PV list of mappings, like
4176	 * pmap_remove_write(), acquire the pmap lock for each
4177	 * mapping.  Consequently, for protection-only changes, the
4178	 * pmap lock suffices to synchronize both page table and TLB
4179	 * updates.
4180	 *
4181	 * This function only destroys a mapping if pmap_demote_pde()
4182	 * fails.  In that case, stale TLB entries are immediately
4183	 * invalidated.
4184	 */
4185
4186	PMAP_LOCK(pmap);
4187	for (; sva < eva; sva = va_next) {
4188
4189		pml4e = pmap_pml4e(pmap, sva);
4190		if ((*pml4e & PG_V) == 0) {
4191			va_next = (sva + NBPML4) & ~PML4MASK;
4192			if (va_next < sva)
4193				va_next = eva;
4194			continue;
4195		}
4196
4197		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4198		if ((*pdpe & PG_V) == 0) {
4199			va_next = (sva + NBPDP) & ~PDPMASK;
4200			if (va_next < sva)
4201				va_next = eva;
4202			continue;
4203		}
4204
4205		va_next = (sva + NBPDR) & ~PDRMASK;
4206		if (va_next < sva)
4207			va_next = eva;
4208
4209		pde = pmap_pdpe_to_pde(pdpe, sva);
4210		ptpaddr = *pde;
4211
4212		/*
4213		 * Weed out invalid mappings.
4214		 */
4215		if (ptpaddr == 0)
4216			continue;
4217
4218		/*
4219		 * Check for large page.
4220		 */
4221		if ((ptpaddr & PG_PS) != 0) {
4222			/*
4223			 * Are we protecting the entire large page?  If not,
4224			 * demote the mapping and fall through.
4225			 */
4226			if (sva + NBPDR == va_next && eva >= va_next) {
4227				/*
4228				 * The TLB entry for a PG_G mapping is
4229				 * invalidated by pmap_protect_pde().
4230				 */
4231				if (pmap_protect_pde(pmap, pde, sva, prot))
4232					anychanged = TRUE;
4233				continue;
4234			} else if (!pmap_demote_pde(pmap, pde, sva)) {
4235				/*
4236				 * The large page mapping was destroyed.
4237				 */
4238				continue;
4239			}
4240		}
4241
4242		if (va_next > eva)
4243			va_next = eva;
4244
4245		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4246		    sva += PAGE_SIZE) {
4247			pt_entry_t obits, pbits;
4248			vm_page_t m;
4249
4250retry:
4251			obits = pbits = *pte;
4252			if ((pbits & PG_V) == 0)
4253				continue;
4254
4255			if ((prot & VM_PROT_WRITE) == 0) {
4256				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4257				    (PG_MANAGED | PG_M | PG_RW)) {
4258					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4259					vm_page_dirty(m);
4260				}
4261				pbits &= ~(PG_RW | PG_M);
4262			}
4263			if ((prot & VM_PROT_EXECUTE) == 0)
4264				pbits |= pg_nx;
4265
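			/*
			 * Commit the new bits atomically; a concurrent
			 * hardware update of PG_A or PG_M makes the cmpset
			 * fail, in which case the bits are recomputed and
			 * the update is retried.
			 */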
4266			if (pbits != obits) {
4267				if (!atomic_cmpset_long(pte, obits, pbits))
4268					goto retry;
4269				if (obits & PG_G)
4270					pmap_invalidate_page(pmap, sva);
4271				else
4272					anychanged = TRUE;
4273			}
4274		}
4275	}
4276	if (anychanged)
4277		pmap_invalidate_all(pmap);
4278	PMAP_UNLOCK(pmap);
4279}
4280
4281#if VM_NRESERVLEVEL > 0
4282/*
4283 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4284 * single page table page (PTP) to a single 2MB page mapping.  For promotion
4285 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4286 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4287 * identical characteristics.
4288 */
4289static void
4290pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4291    struct rwlock **lockp)
4292{
4293	pd_entry_t newpde;
4294	pt_entry_t *firstpte, oldpte, pa, *pte;
4295	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
4296	vm_page_t mpte;
4297	int PG_PTE_CACHE;
4298
4299	PG_A = pmap_accessed_bit(pmap);
4300	PG_G = pmap_global_bit(pmap);
4301	PG_M = pmap_modified_bit(pmap);
4302	PG_V = pmap_valid_bit(pmap);
4303	PG_RW = pmap_rw_bit(pmap);
4304	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4305
4306	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4307
4308	/*
4309	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
4310	 * either invalid, unused, or does not map the first 4KB physical page
4311	 * within a 2MB page.
4312	 */
4313	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4314setpde:
4315	newpde = *firstpte;
4316	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4317		atomic_add_long(&pmap_pde_p_failures, 1);
4318		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4319		    " in pmap %p", va, pmap);
4320		return;
4321	}
4322	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4323		/*
4324		 * When PG_M is already clear, PG_RW can be cleared without
4325		 * a TLB invalidation.
4326		 */
4327		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4328			goto setpde;
4329		newpde &= ~PG_RW;
4330	}
4331
4332	/*
4333	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4334	 * PTE maps an unexpected 4KB physical page or does not have identical
4335	 * characteristics to the first PTE.
4336	 */
4337	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4338	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4339setpte:
4340		oldpte = *pte;
4341		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4342			atomic_add_long(&pmap_pde_p_failures, 1);
4343			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4344			    " in pmap %p", va, pmap);
4345			return;
4346		}
4347		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4348			/*
4349			 * When PG_M is already clear, PG_RW can be cleared
4350			 * without a TLB invalidation.
4351			 */
4352			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4353				goto setpte;
4354			oldpte &= ~PG_RW;
4355			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4356			    " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
4357			    (va & ~PDRMASK), pmap);
4358		}
4359		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4360			atomic_add_long(&pmap_pde_p_failures, 1);
4361			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4362			    " in pmap %p", va, pmap);
4363			return;
4364		}
4365		pa -= PAGE_SIZE;
4366	}
4367
4368	/*
4369	 * Save the page table page in its current state until the PDE
4370	 * mapping the superpage is demoted by pmap_demote_pde() or
4371	 * destroyed by pmap_remove_pde().
4372	 */
4373	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4374	KASSERT(mpte >= vm_page_array &&
4375	    mpte < &vm_page_array[vm_page_array_size],
4376	    ("pmap_promote_pde: page table page is out of range"));
4377	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4378	    ("pmap_promote_pde: page table page's pindex is wrong"));
4379	if (pmap_insert_pt_page(pmap, mpte)) {
4380		atomic_add_long(&pmap_pde_p_failures, 1);
4381		CTR2(KTR_PMAP,
4382		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4383		    pmap);
4384		return;
4385	}
4386
4387	/*
4388	 * Promote the pv entries.
4389	 */
4390	if ((newpde & PG_MANAGED) != 0)
4391		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4392
4393	/*
4394	 * Propagate the PAT index to its proper position.
4395	 */
4396	newpde = pmap_swap_pat(pmap, newpde);
4397
4398	/*
4399	 * Map the superpage.
4400	 */
4401	if (workaround_erratum383)
4402		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4403	else
4404		pde_store(pde, PG_PROMOTED | PG_PS | newpde);
4405
4406	atomic_add_long(&pmap_pde_promotions, 1);
4407	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4408	    " in pmap %p", va, pmap);
4409}
4410#endif /* VM_NRESERVLEVEL > 0 */
4411
4412/*
4413 *	Insert the given physical page (m) at
4414 *	the specified virtual address (va) in the
4415 *	target physical map with the protection requested.
4416 *
4417 *	If specified, the page will be wired down, meaning
4418 *	that the related pte can not be reclaimed.
4419 *
4420 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4421 *	or lose information.  That is, this routine must actually
4422 *	insert this page into the given map NOW.
4423 *
4424 *	When destroying both a page table and PV entry, this function
4425 *	performs the TLB invalidation before releasing the PV list
4426 *	lock, so we do not need pmap_delayed_invl_page() calls here.
4427 */
4428int
4429pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4430    u_int flags, int8_t psind)
4431{
4432	struct rwlock *lock;
4433	pd_entry_t *pde;
4434	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4435	pt_entry_t newpte, origpte;
4436	pv_entry_t pv;
4437	vm_paddr_t opa, pa;
4438	vm_page_t mpte, om;
4439	int rv;
4440	boolean_t nosleep;
4441
4442	PG_A = pmap_accessed_bit(pmap);
4443	PG_G = pmap_global_bit(pmap);
4444	PG_M = pmap_modified_bit(pmap);
4445	PG_V = pmap_valid_bit(pmap);
4446	PG_RW = pmap_rw_bit(pmap);
4447
4448	va = trunc_page(va);
4449	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4450	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4451	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4452	    va));
4453	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4454	    va >= kmi.clean_eva,
4455	    ("pmap_enter: managed mapping within the clean submap"));
4456	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4457		VM_OBJECT_ASSERT_LOCKED(m->object);
4458	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
4459	    ("pmap_enter: flags %u has reserved bits set", flags));
4460	pa = VM_PAGE_TO_PHYS(m);
4461	newpte = (pt_entry_t)(pa | PG_A | PG_V);
4462	if ((flags & VM_PROT_WRITE) != 0)
4463		newpte |= PG_M;
4464	if ((prot & VM_PROT_WRITE) != 0)
4465		newpte |= PG_RW;
4466	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4467	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
4468	if ((prot & VM_PROT_EXECUTE) == 0)
4469		newpte |= pg_nx;
4470	if ((flags & PMAP_ENTER_WIRED) != 0)
4471		newpte |= PG_W;
4472	if (va < VM_MAXUSER_ADDRESS)
4473		newpte |= PG_U;
4474	if (pmap == kernel_pmap)
4475		newpte |= PG_G;
4476	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
4477
4478	/*
4479	 * Set modified bit gratuitously for writeable mappings if
4480	 * the page is unmanaged. We do not want to take a fault
4481	 * to do the dirty bit accounting for these mappings.
4482	 */
4483	if ((m->oflags & VPO_UNMANAGED) != 0) {
4484		if ((newpte & PG_RW) != 0)
4485			newpte |= PG_M;
4486	} else
4487		newpte |= PG_MANAGED;
4488
4489	lock = NULL;
4490	PMAP_LOCK(pmap);
4491	if (psind == 1) {
4492		/* Assert the required virtual and physical alignment. */
4493		KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
4494		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
4495		rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
4496		goto out;
4497	}
4498	mpte = NULL;
4499
4500	/*
4501	 * In the case that a page table page is not
4502	 * resident, we are creating it here.
4503	 */
4504retry:
4505	pde = pmap_pde(pmap, va);
4506	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4507	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4508		pte = pmap_pde_to_pte(pde, va);
4509		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4510			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4511			mpte->wire_count++;
4512		}
4513	} else if (va < VM_MAXUSER_ADDRESS) {
4514		/*
4515		 * We get here if the pte page isn't mapped, or if it has
4516		 * been deallocated.
4517		 */
4518		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4519		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
4520		    nosleep ? NULL : &lock);
4521		if (mpte == NULL && nosleep) {
4522			rv = KERN_RESOURCE_SHORTAGE;
4523			goto out;
4524		}
4525		goto retry;
4526	} else
4527		panic("pmap_enter: invalid page directory va=%#lx", va);
4528
4529	origpte = *pte;
4530
4531	/*
4532	 * Is the specified virtual address already mapped?
4533	 */
4534	if ((origpte & PG_V) != 0) {
4535		/*
4536		 * Wiring change, just update stats. We don't worry about
4537		 * wiring PT pages as they remain resident as long as there
4538		 * are valid mappings in them. Hence, if a user page is wired,
4539		 * the PT page will be also.
4540		 */
4541		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4542			pmap->pm_stats.wired_count++;
4543		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4544			pmap->pm_stats.wired_count--;
4545
4546		/*
4547		 * Remove the extra PT page reference.
4548		 */
4549		if (mpte != NULL) {
4550			mpte->wire_count--;
4551			KASSERT(mpte->wire_count > 0,
4552			    ("pmap_enter: missing reference to page table page,"
4553			     " va: 0x%lx", va));
4554		}
4555
4556		/*
4557		 * Has the physical page changed?
4558		 */
4559		opa = origpte & PG_FRAME;
4560		if (opa == pa) {
4561			/*
4562			 * No, might be a protection or wiring change.
4563			 */
4564			if ((origpte & PG_MANAGED) != 0 &&
4565			    (newpte & PG_RW) != 0)
4566				vm_page_aflag_set(m, PGA_WRITEABLE);
4567			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4568				goto unchanged;
4569			goto validate;
4570		}
4571	} else {
4572		/*
4573		 * Increment the counters.
4574		 */
4575		if ((newpte & PG_W) != 0)
4576			pmap->pm_stats.wired_count++;
4577		pmap_resident_count_inc(pmap, 1);
4578	}
4579
4580	/*
4581	 * Enter on the PV list if part of our managed memory.
4582	 */
4583	if ((newpte & PG_MANAGED) != 0) {
4584		pv = get_pv_entry(pmap, &lock);
4585		pv->pv_va = va;
4586		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4587		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4588		m->md.pv_gen++;
4589		if ((newpte & PG_RW) != 0)
4590			vm_page_aflag_set(m, PGA_WRITEABLE);
4591	}
4592
4593	/*
4594	 * Update the PTE.
4595	 */
4596	if ((origpte & PG_V) != 0) {
4597validate:
4598		origpte = pte_load_store(pte, newpte);
4599		opa = origpte & PG_FRAME;
4600		if (opa != pa) {
4601			if ((origpte & PG_MANAGED) != 0) {
4602				om = PHYS_TO_VM_PAGE(opa);
4603				if ((origpte & (PG_M | PG_RW)) == (PG_M |
4604				    PG_RW))
4605					vm_page_dirty(om);
4606				if ((origpte & PG_A) != 0)
4607					vm_page_aflag_set(om, PGA_REFERENCED);
4608				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4609				pmap_pvh_free(&om->md, pmap, va);
4610				if ((om->aflags & PGA_WRITEABLE) != 0 &&
4611				    TAILQ_EMPTY(&om->md.pv_list) &&
4612				    ((om->flags & PG_FICTITIOUS) != 0 ||
4613				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4614					vm_page_aflag_clear(om, PGA_WRITEABLE);
4615			}
4616		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4617		    PG_RW)) == (PG_M | PG_RW)) {
4618			if ((origpte & PG_MANAGED) != 0)
4619				vm_page_dirty(m);
4620
4621			/*
4622			 * Although the PTE may still have PG_RW set, TLB
4623			 * invalidation may nonetheless be required because
4624			 * the PTE no longer has PG_M set.
4625			 */
4626		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4627			/*
4628			 * This PTE change does not require TLB invalidation.
4629			 */
4630			goto unchanged;
4631		}
4632		if ((origpte & PG_A) != 0)
4633			pmap_invalidate_page(pmap, va);
4634	} else
4635		pte_store(pte, newpte);
4636
4637unchanged:
4638
4639#if VM_NRESERVLEVEL > 0
4640	/*
4641	 * If both the page table page and the reservation are fully
4642	 * populated, then attempt promotion.
4643	 */
4644	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4645	    pmap_ps_enabled(pmap) &&
4646	    (m->flags & PG_FICTITIOUS) == 0 &&
4647	    vm_reserv_level_iffullpop(m) == 0)
4648		pmap_promote_pde(pmap, pde, va, &lock);
4649#endif
4650
4651	rv = KERN_SUCCESS;
4652out:
4653	if (lock != NULL)
4654		rw_wunlock(lock);
4655	PMAP_UNLOCK(pmap);
4656	return (rv);
4657}
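
/*
 * A minimal caller sketch (illustrative only, not taken from this file):
 * enter a wired, writeable 4KB mapping without sleeping, and retry later
 * if KERN_RESOURCE_SHORTAGE is returned because a page table page could
 * not be allocated:
 *
 *	rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_WRITE | PMAP_ENTER_WIRED | PMAP_ENTER_NOSLEEP, 0);
 *	if (rv == KERN_RESOURCE_SHORTAGE)
 *		... back off and retry without PMAP_ENTER_NOSLEEP ...
 */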
4658
4659/*
4660 * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
4661 * if successful.  Returns false if (1) a page table page cannot be allocated
4662 * without sleeping, (2) a mapping already exists at the specified virtual
4663 * address, or (3) a PV entry cannot be allocated without reclaiming another
4664 * PV entry.
4665 */
4666static bool
4667pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4668    struct rwlock **lockp)
4669{
4670	pd_entry_t newpde;
4671	pt_entry_t PG_V;
4672
4673	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4674	PG_V = pmap_valid_bit(pmap);
4675	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4676	    PG_PS | PG_V;
4677	if ((m->oflags & VPO_UNMANAGED) == 0)
4678		newpde |= PG_MANAGED;
4679	if ((prot & VM_PROT_EXECUTE) == 0)
4680		newpde |= pg_nx;
4681	if (va < VM_MAXUSER_ADDRESS)
4682		newpde |= PG_U;
4683	return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
4684	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
4685	    KERN_SUCCESS);
4686}
4687
4688/*
4689 * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
4690 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
4691 * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
4692 * a mapping already exists at the specified virtual address.  Returns
4693 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
4694 * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
4695 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
4696 *
4697 * The parameter "m" is only used when creating a managed, writeable mapping.
4698 */
4699static int
4700pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
4701    vm_page_t m, struct rwlock **lockp)
4702{
4703	struct spglist free;
4704	pd_entry_t oldpde, *pde;
4705	pt_entry_t PG_G, PG_RW, PG_V;
4706	vm_page_t mt, pdpg;
4707
4708	PG_G = pmap_global_bit(pmap);
4709	PG_RW = pmap_rw_bit(pmap);
4710	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
4711	    ("pmap_enter_pde: newpde is missing PG_M"));
4712	PG_V = pmap_valid_bit(pmap);
4713	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4714
4715	if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
4716	    NULL : lockp)) == NULL) {
4717		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4718		    " in pmap %p", va, pmap);
4719		return (KERN_RESOURCE_SHORTAGE);
4720	}
4721	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4722	pde = &pde[pmap_pde_index(va)];
4723	oldpde = *pde;
4724	if ((oldpde & PG_V) != 0) {
4725		KASSERT(pdpg->wire_count > 1,
4726		    ("pmap_enter_pde: pdpg's wire count is too low"));
4727		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
4728			pdpg->wire_count--;
4729			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4730			    " in pmap %p", va, pmap);
4731			return (KERN_FAILURE);
4732		}
4733		/* Break the existing mapping(s). */
4734		SLIST_INIT(&free);
4735		if ((oldpde & PG_PS) != 0) {
4736			/*
4737			 * The reference to the PD page that was acquired by
4738			 * pmap_allocpde() ensures that it won't be freed.
4739			 * However, if the PDE resulted from a promotion, then
4740			 * a reserved PT page could be freed.
4741			 */
4742			(void)pmap_remove_pde(pmap, pde, va, &free, lockp);
4743			if ((oldpde & PG_G) == 0)
4744				pmap_invalidate_pde_page(pmap, va, oldpde);
4745		} else {
4746			pmap_delayed_invl_started();
4747			if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
4748			    lockp))
4749				pmap_invalidate_all(pmap);
4750			pmap_delayed_invl_finished();
4751		}
4752		pmap_free_zero_pages(&free);
4753		if (va >= VM_MAXUSER_ADDRESS) {
4754			mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4755			if (pmap_insert_pt_page(pmap, mt)) {
4756				/*
4757				 * XXX Currently, this can't happen because
4758				 * we do not perform pmap_enter(psind == 1)
4759				 * on the kernel pmap.
4760				 */
4761				panic("pmap_enter_pde: trie insert failed");
4762			}
4763		} else
4764			KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
4765			    pde));
4766	}
4767	if ((newpde & PG_MANAGED) != 0) {
4768		/*
4769		 * Abort this mapping if its PV entry could not be created.
4770		 */
4771		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
4772			SLIST_INIT(&free);
4773			if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
4774				/*
4775				 * Although "va" is not mapped, paging-
4776				 * structure caches could nonetheless have
4777				 * entries that refer to the freed page table
4778				 * pages.  Invalidate those entries.
4779				 */
4780				pmap_invalidate_page(pmap, va);
4781				pmap_free_zero_pages(&free);
4782			}
4783			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4784			    " in pmap %p", va, pmap);
4785			return (KERN_RESOURCE_SHORTAGE);
4786		}
4787		if ((newpde & PG_RW) != 0) {
4788			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4789				vm_page_aflag_set(mt, PGA_WRITEABLE);
4790		}
4791	}
4792
4793	/*
4794	 * Increment counters.
4795	 */
4796	if ((newpde & PG_W) != 0)
4797		pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
4798	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4799
4800	/*
4801	 * Map the superpage.  (This is not a promoted mapping; there will not
4802	 * be any lingering 4KB page mappings in the TLB.)
4803	 */
4804	pde_store(pde, newpde);
4805
4806	atomic_add_long(&pmap_pde_mappings, 1);
4807	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4808	    " in pmap %p", va, pmap);
4809	return (KERN_SUCCESS);
4810}
4811
4812/*
4813 * Maps a sequence of resident pages belonging to the same object.
4814 * The sequence begins with the given page m_start.  This page is
4815 * mapped at the given virtual address start.  Each subsequent page is
4816 * mapped at a virtual address that is offset from start by the same
4817 * amount as the page is offset from m_start within the object.  The
4818 * last page in the sequence is the page with the largest offset from
4819 * m_start that can be mapped at a virtual address less than the given
4820 * virtual address end.  Not every virtual page between start and end
4821 * is mapped; only those for which a resident page exists with the
4822 * corresponding offset from m_start are mapped.
4823 */
4824void
4825pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4826    vm_page_t m_start, vm_prot_t prot)
4827{
4828	struct rwlock *lock;
4829	vm_offset_t va;
4830	vm_page_t m, mpte;
4831	vm_pindex_t diff, psize;
4832
4833	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4834
4835	psize = atop(end - start);
4836	mpte = NULL;
4837	m = m_start;
4838	lock = NULL;
4839	PMAP_LOCK(pmap);
4840	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4841		va = start + ptoa(diff);
4842		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4843		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4844		    pmap_enter_2mpage(pmap, va, m, prot, &lock))
4845			m = &m[NBPDR / PAGE_SIZE - 1];
4846		else
4847			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4848			    mpte, &lock);
4849		m = TAILQ_NEXT(m, listq);
4850	}
4851	if (lock != NULL)
4852		rw_wunlock(lock);
4853	PMAP_UNLOCK(pmap);
4854}
4855
4856/*
4857 * This code makes some *MAJOR* assumptions:
4858 * 1. The current pmap and the given pmap exist.
4859 * 2. The mapping is not wired.
4860 * 3. Only read access is required.
4861 * 4. No page table pages.
4862 * It is, however, *MUCH* faster than pmap_enter...
4863 */
4864
4865void
4866pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4867{
4868	struct rwlock *lock;
4869
4870	lock = NULL;
4871	PMAP_LOCK(pmap);
4872	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4873	if (lock != NULL)
4874		rw_wunlock(lock);
4875	PMAP_UNLOCK(pmap);
4876}
4877
4878static vm_page_t
4879pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4880    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4881{
4882	struct spglist free;
4883	pt_entry_t *pte, PG_V;
4884	vm_paddr_t pa;
4885
4886	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4887	    (m->oflags & VPO_UNMANAGED) != 0,
4888	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4889	PG_V = pmap_valid_bit(pmap);
4890	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4891
4892	/*
4893	 * In the case that a page table page is not
4894	 * resident, we are creating it here.
4895	 */
4896	if (va < VM_MAXUSER_ADDRESS) {
4897		vm_pindex_t ptepindex;
4898		pd_entry_t *ptepa;
4899
4900		/*
4901		 * Calculate the page table page index.
4902		 */
4903		ptepindex = pmap_pde_pindex(va);
4904		if (mpte && (mpte->pindex == ptepindex)) {
4905			mpte->wire_count++;
4906		} else {
4907			/*
4908			 * Get the page directory entry
4909			 */
4910			ptepa = pmap_pde(pmap, va);
4911
4912			/*
4913			 * If the page table page is mapped, we just increment
4914			 * the wire count, and activate it.  Otherwise, we
4915			 * attempt to allocate a page table page.  If this
4916			 * attempt fails, we don't retry.  Instead, we give up.
4917			 */
4918			if (ptepa && (*ptepa & PG_V) != 0) {
4919				if (*ptepa & PG_PS)
4920					return (NULL);
4921				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4922				mpte->wire_count++;
4923			} else {
4924				/*
4925				 * Pass NULL instead of the PV list lock
4926				 * pointer, because we don't intend to sleep.
4927				 */
4928				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4929				if (mpte == NULL)
4930					return (mpte);
4931			}
4932		}
4933		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4934		pte = &pte[pmap_pte_index(va)];
4935	} else {
4936		mpte = NULL;
4937		pte = vtopte(va);
4938	}
4939	if (*pte) {
4940		if (mpte != NULL) {
4941			mpte->wire_count--;
4942			mpte = NULL;
4943		}
4944		return (mpte);
4945	}
4946
4947	/*
4948	 * Enter on the PV list if part of our managed memory.
4949	 */
4950	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4951	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4952		if (mpte != NULL) {
4953			SLIST_INIT(&free);
4954			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4955				/*
4956				 * Although "va" is not mapped, paging-
4957				 * structure caches could nonetheless have
4958				 * entries that refer to the freed page table
4959				 * pages.  Invalidate those entries.
4960				 */
4961				pmap_invalidate_page(pmap, va);
4962				pmap_free_zero_pages(&free);
4963			}
4964			mpte = NULL;
4965		}
4966		return (mpte);
4967	}
4968
4969	/*
4970	 * Increment counters
4971	 */
4972	pmap_resident_count_inc(pmap, 1);
4973
4974	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4975	if ((prot & VM_PROT_EXECUTE) == 0)
4976		pa |= pg_nx;
4977
4978	/*
4979	 * Now validate mapping with RO protection
4980	 */
4981	if ((m->oflags & VPO_UNMANAGED) != 0)
4982		pte_store(pte, pa | PG_V | PG_U);
4983	else
4984		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4985	return (mpte);
4986}
4987
4988/*
4989 * Make a temporary mapping for a physical address.  This is only intended
4990 * to be used for panic dumps.
4991 */
4992void *
4993pmap_kenter_temporary(vm_paddr_t pa, int i)
4994{
4995	vm_offset_t va;
4996
4997	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4998	pmap_kenter(va, pa);
4999	invlpg(va);
5000	return ((void *)crashdumpmap);
5001}
5002
5003/*
5004 * This code maps large physical mmap regions of device memory into
5005 * the processor address space using 2MB page mappings.  Some
5006 * shortcuts are taken; unsuitable regions are simply left unmapped.
5007 */
5008void
5009pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
5010    vm_pindex_t pindex, vm_size_t size)
5011{
5012	pd_entry_t *pde;
5013	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5014	vm_paddr_t pa, ptepa;
5015	vm_page_t p, pdpg;
5016	int pat_mode;
5017
5018	PG_A = pmap_accessed_bit(pmap);
5019	PG_M = pmap_modified_bit(pmap);
5020	PG_V = pmap_valid_bit(pmap);
5021	PG_RW = pmap_rw_bit(pmap);
5022
5023	VM_OBJECT_ASSERT_WLOCKED(object);
5024	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
5025	    ("pmap_object_init_pt: non-device object"));
5026	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
5027		if (!pmap_ps_enabled(pmap))
5028			return;
5029		if (!vm_object_populate(object, pindex, pindex + atop(size)))
5030			return;
5031		p = vm_page_lookup(object, pindex);
5032		KASSERT(p->valid == VM_PAGE_BITS_ALL,
5033		    ("pmap_object_init_pt: invalid page %p", p));
5034		pat_mode = p->md.pat_mode;
5035
5036		/*
5037		 * Abort the mapping if the first page is not physically
5038		 * aligned to a 2MB page boundary.
5039		 */
5040		ptepa = VM_PAGE_TO_PHYS(p);
5041		if (ptepa & (NBPDR - 1))
5042			return;
5043
5044		/*
5045		 * Skip the first page.  Abort the mapping if the rest of
5046		 * the pages are not physically contiguous or have differing
5047		 * memory attributes.
5048		 */
5049		p = TAILQ_NEXT(p, listq);
5050		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
5051		    pa += PAGE_SIZE) {
5052			KASSERT(p->valid == VM_PAGE_BITS_ALL,
5053			    ("pmap_object_init_pt: invalid page %p", p));
5054			if (pa != VM_PAGE_TO_PHYS(p) ||
5055			    pat_mode != p->md.pat_mode)
5056				return;
5057			p = TAILQ_NEXT(p, listq);
5058		}
5059
5060		/*
5061		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
5062		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
5063		 * will not affect the termination of this loop.
5064		 */
5065		PMAP_LOCK(pmap);
5066		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
5067		    pa < ptepa + size; pa += NBPDR) {
5068			pdpg = pmap_allocpde(pmap, addr, NULL);
5069			if (pdpg == NULL) {
5070				/*
5071				 * The creation of mappings below is only an
5072				 * optimization.  If a page directory page
5073				 * cannot be allocated without blocking,
5074				 * continue on to the next mapping rather than
5075				 * blocking.
5076				 */
5077				addr += NBPDR;
5078				continue;
5079			}
5080			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
5081			pde = &pde[pmap_pde_index(addr)];
5082			if ((*pde & PG_V) == 0) {
5083				pde_store(pde, pa | PG_PS | PG_M | PG_A |
5084				    PG_U | PG_RW | PG_V);
5085				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
5086				atomic_add_long(&pmap_pde_mappings, 1);
5087			} else {
5088				/* Continue on if the PDE is already valid. */
5089				pdpg->wire_count--;
5090				KASSERT(pdpg->wire_count > 0,
5091				    ("pmap_object_init_pt: missing reference "
5092				    "to page directory page, va: 0x%lx", addr));
5093			}
5094			addr += NBPDR;
5095		}
5096		PMAP_UNLOCK(pmap);
5097	}
5098}
5099
5100/*
5101 *	Clear the wired attribute from the mappings for the specified range of
5102 *	addresses in the given pmap.  Every valid mapping within that range
5103 *	must have the wired attribute set.  In contrast, invalid mappings
5104 *	cannot have the wired attribute set, so they are ignored.
5105 *
5106 *	The wired attribute of the page table entry is not a hardware
5107 *	feature, so there is no need to invalidate any TLB entries.
5108 *	Since pmap_demote_pde() for the wired entry must never fail,
5109 *	pmap_delayed_invl_started()/finished() calls around the
5110 *	function are not needed.
5111 */
5112void
5113pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5114{
5115	vm_offset_t va_next;
5116	pml4_entry_t *pml4e;
5117	pdp_entry_t *pdpe;
5118	pd_entry_t *pde;
5119	pt_entry_t *pte, PG_V;
5120
5121	PG_V = pmap_valid_bit(pmap);
5122	PMAP_LOCK(pmap);
5123	for (; sva < eva; sva = va_next) {
5124		pml4e = pmap_pml4e(pmap, sva);
5125		if ((*pml4e & PG_V) == 0) {
5126			va_next = (sva + NBPML4) & ~PML4MASK;
5127			if (va_next < sva)
5128				va_next = eva;
5129			continue;
5130		}
5131		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5132		if ((*pdpe & PG_V) == 0) {
5133			va_next = (sva + NBPDP) & ~PDPMASK;
5134			if (va_next < sva)
5135				va_next = eva;
5136			continue;
5137		}
5138		va_next = (sva + NBPDR) & ~PDRMASK;
5139		if (va_next < sva)
5140			va_next = eva;
5141		pde = pmap_pdpe_to_pde(pdpe, sva);
5142		if ((*pde & PG_V) == 0)
5143			continue;
5144		if ((*pde & PG_PS) != 0) {
5145			if ((*pde & PG_W) == 0)
5146				panic("pmap_unwire: pde %#jx is missing PG_W",
5147				    (uintmax_t)*pde);
5148
5149			/*
5150			 * Are we unwiring the entire large page?  If not,
5151			 * demote the mapping and fall through.
5152			 */
5153			if (sva + NBPDR == va_next && eva >= va_next) {
5154				atomic_clear_long(pde, PG_W);
5155				pmap->pm_stats.wired_count -= NBPDR /
5156				    PAGE_SIZE;
5157				continue;
5158			} else if (!pmap_demote_pde(pmap, pde, sva))
5159				panic("pmap_unwire: demotion failed");
5160		}
5161		if (va_next > eva)
5162			va_next = eva;
5163		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5164		    sva += PAGE_SIZE) {
5165			if ((*pte & PG_V) == 0)
5166				continue;
5167			if ((*pte & PG_W) == 0)
5168				panic("pmap_unwire: pte %#jx is missing PG_W",
5169				    (uintmax_t)*pte);
5170
5171			/*
5172			 * PG_W must be cleared atomically.  Although the pmap
5173			 * lock synchronizes access to PG_W, another processor
5174			 * could be setting PG_M and/or PG_A concurrently.
5175			 */
5176			atomic_clear_long(pte, PG_W);
5177			pmap->pm_stats.wired_count--;
5178		}
5179	}
5180	PMAP_UNLOCK(pmap);
5181}
5182
5183/*
5184 *	Copy the range specified by src_addr/len
5185 *	from the source map to the range dst_addr/len
5186 *	in the destination map.
5187 *
5188 *	This routine is only advisory and need not do anything.
5189 */
5190
5191void
5192pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
5193    vm_offset_t src_addr)
5194{
5195	struct rwlock *lock;
5196	struct spglist free;
5197	vm_offset_t addr;
5198	vm_offset_t end_addr = src_addr + len;
5199	vm_offset_t va_next;
5200	vm_page_t dst_pdpg, dstmpte, srcmpte;
5201	pt_entry_t PG_A, PG_M, PG_V;
5202
5203	if (dst_addr != src_addr)
5204		return;
5205
5206	if (dst_pmap->pm_type != src_pmap->pm_type)
5207		return;
5208
5209	/*
5210	 * EPT page table entries that require emulation of A/D bits are
5211	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
5212	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
5213	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
5214	 * implementations flag an EPT misconfiguration for exec-only
5215	 * mappings we skip this function entirely for emulated pmaps.
5216	 */
5217	if (pmap_emulate_ad_bits(dst_pmap))
5218		return;
5219
5220	lock = NULL;
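	/*
	 * Lock the two pmaps in a consistent order (by address) so that
	 * concurrent calls copying in opposite directions cannot deadlock
	 * against each other.
	 */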
5221	if (dst_pmap < src_pmap) {
5222		PMAP_LOCK(dst_pmap);
5223		PMAP_LOCK(src_pmap);
5224	} else {
5225		PMAP_LOCK(src_pmap);
5226		PMAP_LOCK(dst_pmap);
5227	}
5228
5229	PG_A = pmap_accessed_bit(dst_pmap);
5230	PG_M = pmap_modified_bit(dst_pmap);
5231	PG_V = pmap_valid_bit(dst_pmap);
5232
5233	for (addr = src_addr; addr < end_addr; addr = va_next) {
5234		pt_entry_t *src_pte, *dst_pte;
5235		pml4_entry_t *pml4e;
5236		pdp_entry_t *pdpe;
5237		pd_entry_t srcptepaddr, *pde;
5238
5239		KASSERT(addr < UPT_MIN_ADDRESS,
5240		    ("pmap_copy: invalid to pmap_copy page tables"));
5241
5242		pml4e = pmap_pml4e(src_pmap, addr);
5243		if ((*pml4e & PG_V) == 0) {
5244			va_next = (addr + NBPML4) & ~PML4MASK;
5245			if (va_next < addr)
5246				va_next = end_addr;
5247			continue;
5248		}
5249
5250		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
5251		if ((*pdpe & PG_V) == 0) {
5252			va_next = (addr + NBPDP) & ~PDPMASK;
5253			if (va_next < addr)
5254				va_next = end_addr;
5255			continue;
5256		}
5257
5258		va_next = (addr + NBPDR) & ~PDRMASK;
5259		if (va_next < addr)
5260			va_next = end_addr;
5261
5262		pde = pmap_pdpe_to_pde(pdpe, addr);
5263		srcptepaddr = *pde;
5264		if (srcptepaddr == 0)
5265			continue;
5266
5267		if (srcptepaddr & PG_PS) {
5268			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
5269				continue;
5270			dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL);
5271			if (dst_pdpg == NULL)
5272				break;
5273			pde = (pd_entry_t *)
5274			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
5275			pde = &pde[pmap_pde_index(addr)];
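			/*
			 * Copy the 2MB mapping only if the destination PDE
			 * is free and, for a managed mapping, a PV entry can
			 * be allocated without reclaiming another mapping.
			 */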
5276			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
5277			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
5278			    PMAP_ENTER_NORECLAIM, &lock))) {
5279				*pde = srcptepaddr & ~PG_W;
5280				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
5281				atomic_add_long(&pmap_pde_mappings, 1);
5282			} else
5283				dst_pdpg->wire_count--;
5284			continue;
5285		}
5286
5287		srcptepaddr &= PG_FRAME;
5288		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
5289		KASSERT(srcmpte->wire_count > 0,
5290		    ("pmap_copy: source page table page is unused"));
5291
5292		if (va_next > end_addr)
5293			va_next = end_addr;
5294
5295		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
5296		src_pte = &src_pte[pmap_pte_index(addr)];
5297		dstmpte = NULL;
5298		while (addr < va_next) {
5299			pt_entry_t ptetemp;
5300			ptetemp = *src_pte;
5301			/*
5302			 * We only virtual-copy managed pages.
5303			 */
5304			if ((ptetemp & PG_MANAGED) != 0) {
5305				if (dstmpte != NULL &&
5306				    dstmpte->pindex == pmap_pde_pindex(addr))
5307					dstmpte->wire_count++;
5308				else if ((dstmpte = pmap_allocpte(dst_pmap,
5309				    addr, NULL)) == NULL)
5310					goto out;
5311				dst_pte = (pt_entry_t *)
5312				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
5313				dst_pte = &dst_pte[pmap_pte_index(addr)];
5314				if (*dst_pte == 0 &&
5315				    pmap_try_insert_pv_entry(dst_pmap, addr,
5316				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
5317				    &lock)) {
5318					/*
5319					 * Clear the wired, modified, and
5320					 * accessed (referenced) bits
5321					 * during the copy.
5322					 */
5323					*dst_pte = ptetemp & ~(PG_W | PG_M |
5324					    PG_A);
5325					pmap_resident_count_inc(dst_pmap, 1);
5326				} else {
5327					SLIST_INIT(&free);
5328					if (pmap_unwire_ptp(dst_pmap, addr,
5329					    dstmpte, &free)) {
5330						/*
5331						 * Although "addr" is not
5332						 * mapped, paging-structure
5333						 * caches could nonetheless
5334						 * have entries that refer to
5335						 * the freed page table pages.
5336						 * Invalidate those entries.
5337						 */
5338						pmap_invalidate_page(dst_pmap,
5339						    addr);
5340						pmap_free_zero_pages(&free);
5341					}
5342					goto out;
5343				}
5344				if (dstmpte->wire_count >= srcmpte->wire_count)
5345					break;
5346			}
5347			addr += PAGE_SIZE;
5348			src_pte++;
5349		}
5350	}
5351out:
5352	if (lock != NULL)
5353		rw_wunlock(lock);
5354	PMAP_UNLOCK(src_pmap);
5355	PMAP_UNLOCK(dst_pmap);
5356}
5357
5358/*
5359 *	pmap_zero_page zeros the specified hardware page.  The page is
5360 *	addressed through the direct map, so no temporary mapping is needed.
5361 */
5362void
5363pmap_zero_page(vm_page_t m)
5364{
5365	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5366
5367	pagezero((void *)va);
5368}
5369
5370/*
5371 *	pmap_zero_page_area zeros the specified region of a hardware page.
5372 *	The page is addressed through the direct map.
5373 *
5374 *	off and size may not cover an area beyond a single hardware page.
5375 */
5376void
5377pmap_zero_page_area(vm_page_t m, int off, int size)
5378{
5379	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5380
5381	if (off == 0 && size == PAGE_SIZE)
5382		pagezero((void *)va);
5383	else
5384		bzero((char *)va + off, size);
5385}
5386
5387/*
5388 *	pmap_zero_page_idle zeros the specified hardware page through
5389 *	the direct map, like pmap_zero_page().  This
5390 *	is intended to be called from the vm_pagezero process only and
5391 *	outside of Giant.
5392 */
5393void
5394pmap_zero_page_idle(vm_page_t m)
5395{
5396	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5397
5398	pagezero((void *)va);
5399}
5400
5401/*
5402 *	pmap_copy_page copies the specified (machine independent)
5403 *	page.  Both the source and the destination are addressed
5404 *	through the direct map, so no temporary kernel mappings are
5405 *	required.
5406 */
5407void
5408pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5409{
5410	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5411	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5412
5413	pagecopy((void *)src, (void *)dst);
5414}
5415
5416int unmapped_buf_allowed = 1;
5417
5418void
5419pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5420    vm_offset_t b_offset, int xfersize)
5421{
5422	void *a_cp, *b_cp;
5423	vm_page_t pages[2];
5424	vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
5425	int cnt;
5426	boolean_t mapped;
5427
5428	while (xfersize > 0) {
5429		a_pg_offset = a_offset & PAGE_MASK;
5430		pages[0] = ma[a_offset >> PAGE_SHIFT];
5431		b_pg_offset = b_offset & PAGE_MASK;
5432		pages[1] = mb[b_offset >> PAGE_SHIFT];
5433		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5434		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
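		/*
		 * Map both pages for kernel access.  "mapped" is only true
		 * when a transient mapping had to be created (e.g., for a
		 * page that lies outside the direct map); in the common case
		 * the direct map addresses are used and no teardown is
		 * needed.
		 */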
5435		mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
5436		a_cp = (char *)vaddr[0] + a_pg_offset;
5437		b_cp = (char *)vaddr[1] + b_pg_offset;
5438		bcopy(a_cp, b_cp, cnt);
5439		if (__predict_false(mapped))
5440			pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
5441		a_offset += cnt;
5442		b_offset += cnt;
5443		xfersize -= cnt;
5444	}
5445}
5446
5447/*
5448 * Returns true if the pmap's pv is one of the first
5449 * 16 pvs linked to from this page.  This count may
5450 * be changed upwards or downwards in the future; it
5451 * is only necessary that true be returned for a small
5452 * subset of pmaps for proper page aging.
5453 */
5454boolean_t
5455pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5456{
5457	struct md_page *pvh;
5458	struct rwlock *lock;
5459	pv_entry_t pv;
5460	int loops = 0;
5461	boolean_t rv;
5462
5463	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5464	    ("pmap_page_exists_quick: page %p is not managed", m));
5465	rv = FALSE;
5466	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5467	rw_rlock(lock);
5468	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5469		if (PV_PMAP(pv) == pmap) {
5470			rv = TRUE;
5471			break;
5472		}
5473		loops++;
5474		if (loops >= 16)
5475			break;
5476	}
5477	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5478		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5479		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5480			if (PV_PMAP(pv) == pmap) {
5481				rv = TRUE;
5482				break;
5483			}
5484			loops++;
5485			if (loops >= 16)
5486				break;
5487		}
5488	}
5489	rw_runlock(lock);
5490	return (rv);
5491}
5492
5493/*
5494 *	pmap_page_wired_mappings:
5495 *
5496 *	Return the number of managed mappings to the given physical page
5497 *	that are wired.
5498 */
5499int
5500pmap_page_wired_mappings(vm_page_t m)
5501{
5502	struct rwlock *lock;
5503	struct md_page *pvh;
5504	pmap_t pmap;
5505	pt_entry_t *pte;
5506	pv_entry_t pv;
5507	int count, md_gen, pvh_gen;
5508
5509	if ((m->oflags & VPO_UNMANAGED) != 0)
5510		return (0);
5511	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5512	rw_rlock(lock);
5513restart:
5514	count = 0;
5515	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5516		pmap = PV_PMAP(pv);
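		/*
		 * The pmap lock must be taken while the PV list lock is
		 * already held, which inverts the usual lock order, so only
		 * a trylock is safe here.  If it fails, drop the PV list
		 * lock, take both locks in order, and use the generation
		 * count to detect whether the PV list changed meanwhile.
		 */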
5517		if (!PMAP_TRYLOCK(pmap)) {
5518			md_gen = m->md.pv_gen;
5519			rw_runlock(lock);
5520			PMAP_LOCK(pmap);
5521			rw_rlock(lock);
5522			if (md_gen != m->md.pv_gen) {
5523				PMAP_UNLOCK(pmap);
5524				goto restart;
5525			}
5526		}
5527		pte = pmap_pte(pmap, pv->pv_va);
5528		if ((*pte & PG_W) != 0)
5529			count++;
5530		PMAP_UNLOCK(pmap);
5531	}
5532	if ((m->flags & PG_FICTITIOUS) == 0) {
5533		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5534		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5535			pmap = PV_PMAP(pv);
5536			if (!PMAP_TRYLOCK(pmap)) {
5537				md_gen = m->md.pv_gen;
5538				pvh_gen = pvh->pv_gen;
5539				rw_runlock(lock);
5540				PMAP_LOCK(pmap);
5541				rw_rlock(lock);
5542				if (md_gen != m->md.pv_gen ||
5543				    pvh_gen != pvh->pv_gen) {
5544					PMAP_UNLOCK(pmap);
5545					goto restart;
5546				}
5547			}
5548			pte = pmap_pde(pmap, pv->pv_va);
5549			if ((*pte & PG_W) != 0)
5550				count++;
5551			PMAP_UNLOCK(pmap);
5552		}
5553	}
5554	rw_runlock(lock);
5555	return (count);
5556}
5557
5558/*
5559 * Returns TRUE if the given page is mapped individually or as part of
5560 * a 2mpage.  Otherwise, returns FALSE.
5561 */
5562boolean_t
5563pmap_page_is_mapped(vm_page_t m)
5564{
5565	struct rwlock *lock;
5566	boolean_t rv;
5567
5568	if ((m->oflags & VPO_UNMANAGED) != 0)
5569		return (FALSE);
5570	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5571	rw_rlock(lock);
5572	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5573	    ((m->flags & PG_FICTITIOUS) == 0 &&
5574	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5575	rw_runlock(lock);
5576	return (rv);
5577}
5578
5579/*
5580 * Destroy all managed, non-wired mappings in the given user-space
5581 * pmap.  This pmap cannot be active on any processor besides the
5582 * caller.
5583 *
5584 * This function cannot be applied to the kernel pmap.  Moreover, it
5585 * is not intended for general use.  It is only to be used during
5586 * process termination.  Consequently, it can be implemented in ways
5587 * that make it faster than pmap_remove().  First, it can more quickly
5588 * destroy mappings by iterating over the pmap's collection of PV
5589 * entries, rather than searching the page table.  Second, it doesn't
5590 * have to test and clear the page table entries atomically, because
5591 * no processor is currently accessing the user address space.  In
5592 * particular, a page table entry's dirty bit won't change state once
5593 * this function starts.
5594 *
5595 * Although this function destroys all of the pmap's managed,
5596 * non-wired mappings, it can delay and batch the invalidation of TLB
5597 * entries without calling pmap_delayed_invl_started() and
5598 * pmap_delayed_invl_finished().  Because the pmap is not active on
5599 * any other processor, none of these TLB entries will ever be used
5600 * before their eventual invalidation.  Consequently, there is no need
5601 * for either pmap_remove_all() or pmap_remove_write() to wait for
5602 * that eventual TLB invalidation.
5603 */
5604void
5605pmap_remove_pages(pmap_t pmap)
5606{
5607	pd_entry_t ptepde;
5608	pt_entry_t *pte, tpte;
5609	pt_entry_t PG_M, PG_RW, PG_V;
5610	struct spglist free;
5611	vm_page_t m, mpte, mt;
5612	pv_entry_t pv;
5613	struct md_page *pvh;
5614	struct pv_chunk *pc, *npc;
5615	struct rwlock *lock;
5616	int64_t bit;
5617	uint64_t inuse, bitmask;
5618	int allfree, field, freed, idx;
5619	boolean_t superpage;
5620	vm_paddr_t pa;
5621
5622	/*
5623	 * Assert that the given pmap is only active on the current
5624	 * CPU.  Unfortunately, we cannot block another CPU from
5625	 * activating the pmap while this function is executing.
5626	 */
5627	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5628#ifdef INVARIANTS
5629	{
5630		cpuset_t other_cpus;
5631
5632		other_cpus = all_cpus;
5633		critical_enter();
5634		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5635		CPU_AND(&other_cpus, &pmap->pm_active);
5636		critical_exit();
5637		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5638	}
5639#endif
5640
5641	lock = NULL;
5642	PG_M = pmap_modified_bit(pmap);
5643	PG_V = pmap_valid_bit(pmap);
5644	PG_RW = pmap_rw_bit(pmap);
5645
5646	SLIST_INIT(&free);
5647	PMAP_LOCK(pmap);
5648	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5649		allfree = 1;
5650		freed = 0;
5651		for (field = 0; field < _NPCM; field++) {
5652			inuse = ~pc->pc_map[field] & pc_freemask[field];
5653			while (inuse != 0) {
5654				bit = bsfq(inuse);
5655				bitmask = 1UL << bit;
5656				idx = field * 64 + bit;
5657				pv = &pc->pc_pventry[idx];
5658				inuse &= ~bitmask;
5659
5660				pte = pmap_pdpe(pmap, pv->pv_va);
5661				ptepde = *pte;
5662				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5663				tpte = *pte;
5664				if ((tpte & (PG_PS | PG_V)) == PG_V) {
5665					superpage = FALSE;
5666					ptepde = tpte;
5667					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5668					    PG_FRAME);
5669					pte = &pte[pmap_pte_index(pv->pv_va)];
5670					tpte = *pte;
5671				} else {
5672					/*
5673					 * Keep track of whether 'tpte' is a
5674					 * superpage explicitly instead of
5675					 * relying on PG_PS being set.
5676					 *
5677					 * This is because PG_PS is numerically
5678					 * identical to PG_PTE_PAT and thus a
5679					 * regular page could be mistaken for
5680					 * a superpage.
5681					 */
5682					superpage = TRUE;
5683				}
5684
5685				if ((tpte & PG_V) == 0) {
5686					panic("bad pte va %lx pte %lx",
5687					    pv->pv_va, tpte);
5688				}
5689
5690				/*
5691				 * We cannot remove wired pages from a
5692				 * process' mapping at this time.
				 */
5693				if (tpte & PG_W) {
5694					allfree = 0;
5695					continue;
5696				}
5697
5698				if (superpage)
5699					pa = tpte & PG_PS_FRAME;
5700				else
5701					pa = tpte & PG_FRAME;
5702
5703				m = PHYS_TO_VM_PAGE(pa);
5704				KASSERT(m->phys_addr == pa,
5705				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5706				    m, (uintmax_t)m->phys_addr,
5707				    (uintmax_t)tpte));
5708
5709				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5710				    m < &vm_page_array[vm_page_array_size],
5711				    ("pmap_remove_pages: bad tpte %#jx",
5712				    (uintmax_t)tpte));
5713
5714				pte_clear(pte);
5715
5716				/*
5717				 * Update the vm_page_t clean/reference bits.
5718				 */
5719				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5720					if (superpage) {
5721						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5722							vm_page_dirty(mt);
5723					} else
5724						vm_page_dirty(m);
5725				}
5726
5727				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5728
5729				/* Mark free */
5730				pc->pc_map[field] |= bitmask;
5731				if (superpage) {
5732					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5733					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5734					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5735					pvh->pv_gen++;
5736					if (TAILQ_EMPTY(&pvh->pv_list)) {
5737						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5738							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5739							    TAILQ_EMPTY(&mt->md.pv_list))
5740								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5741					}
5742					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
5743					if (mpte != NULL) {
5744						pmap_resident_count_dec(pmap, 1);
5745						KASSERT(mpte->wire_count == NPTEPG,
5746						    ("pmap_remove_pages: pte page wire count error"));
5747						mpte->wire_count = 0;
5748						pmap_add_delayed_free_list(mpte, &free, FALSE);
5749					}
5750				} else {
5751					pmap_resident_count_dec(pmap, 1);
5752					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5753					m->md.pv_gen++;
5754					if ((m->aflags & PGA_WRITEABLE) != 0 &&
5755					    TAILQ_EMPTY(&m->md.pv_list) &&
5756					    (m->flags & PG_FICTITIOUS) == 0) {
5757						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5758						if (TAILQ_EMPTY(&pvh->pv_list))
5759							vm_page_aflag_clear(m, PGA_WRITEABLE);
5760					}
5761				}
5762				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5763				freed++;
5764			}
5765		}
5766		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5767		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5768		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5769		if (allfree) {
5770			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5771			free_pv_chunk(pc);
5772		}
5773	}
5774	if (lock != NULL)
5775		rw_wunlock(lock);
5776	pmap_invalidate_all(pmap);
5777	PMAP_UNLOCK(pmap);
5778	pmap_free_zero_pages(&free);
5779}
5780
5781static boolean_t
5782pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5783{
5784	struct rwlock *lock;
5785	pv_entry_t pv;
5786	struct md_page *pvh;
5787	pt_entry_t *pte, mask;
5788	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5789	pmap_t pmap;
5790	int md_gen, pvh_gen;
5791	boolean_t rv;
5792
5793	rv = FALSE;
5794	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5795	rw_rlock(lock);
5796restart:
5797	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5798		pmap = PV_PMAP(pv);
5799		if (!PMAP_TRYLOCK(pmap)) {
5800			md_gen = m->md.pv_gen;
5801			rw_runlock(lock);
5802			PMAP_LOCK(pmap);
5803			rw_rlock(lock);
5804			if (md_gen != m->md.pv_gen) {
5805				PMAP_UNLOCK(pmap);
5806				goto restart;
5807			}
5808		}
5809		pte = pmap_pte(pmap, pv->pv_va);
5810		mask = 0;
5811		if (modified) {
5812			PG_M = pmap_modified_bit(pmap);
5813			PG_RW = pmap_rw_bit(pmap);
5814			mask |= PG_RW | PG_M;
5815		}
5816		if (accessed) {
5817			PG_A = pmap_accessed_bit(pmap);
5818			PG_V = pmap_valid_bit(pmap);
5819			mask |= PG_V | PG_A;
5820		}
5821		rv = (*pte & mask) == mask;
5822		PMAP_UNLOCK(pmap);
5823		if (rv)
5824			goto out;
5825	}
5826	if ((m->flags & PG_FICTITIOUS) == 0) {
5827		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5828		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5829			pmap = PV_PMAP(pv);
5830			if (!PMAP_TRYLOCK(pmap)) {
5831				md_gen = m->md.pv_gen;
5832				pvh_gen = pvh->pv_gen;
5833				rw_runlock(lock);
5834				PMAP_LOCK(pmap);
5835				rw_rlock(lock);
5836				if (md_gen != m->md.pv_gen ||
5837				    pvh_gen != pvh->pv_gen) {
5838					PMAP_UNLOCK(pmap);
5839					goto restart;
5840				}
5841			}
5842			pte = pmap_pde(pmap, pv->pv_va);
5843			mask = 0;
5844			if (modified) {
5845				PG_M = pmap_modified_bit(pmap);
5846				PG_RW = pmap_rw_bit(pmap);
5847				mask |= PG_RW | PG_M;
5848			}
5849			if (accessed) {
5850				PG_A = pmap_accessed_bit(pmap);
5851				PG_V = pmap_valid_bit(pmap);
5852				mask |= PG_V | PG_A;
5853			}
5854			rv = (*pte & mask) == mask;
5855			PMAP_UNLOCK(pmap);
5856			if (rv)
5857				goto out;
5858		}
5859	}
5860out:
5861	rw_runlock(lock);
5862	return (rv);
5863}
5864
5865/*
5866 *	pmap_is_modified:
5867 *
5868 *	Return whether or not the specified physical page was modified
5869 *	in any physical maps.
5870 */
5871boolean_t
5872pmap_is_modified(vm_page_t m)
5873{
5874
5875	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5876	    ("pmap_is_modified: page %p is not managed", m));
5877
5878	/*
5879	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5880	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
5881	 * is clear, no PTEs can have PG_M set.
5882	 */
5883	VM_OBJECT_ASSERT_WLOCKED(m->object);
5884	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5885		return (FALSE);
5886	return (pmap_page_test_mappings(m, FALSE, TRUE));
5887}
5888
5889/*
5890 *	pmap_is_prefaultable:
5891 *
5892 *	Return whether or not the specified virtual address is eligible
5893 *	for prefault.
5894 */
5895boolean_t
5896pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5897{
5898	pd_entry_t *pde;
5899	pt_entry_t *pte, PG_V;
5900	boolean_t rv;
5901
5902	PG_V = pmap_valid_bit(pmap);
5903	rv = FALSE;
5904	PMAP_LOCK(pmap);
5905	pde = pmap_pde(pmap, addr);
5906	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5907		pte = pmap_pde_to_pte(pde, addr);
5908		rv = (*pte & PG_V) == 0;
5909	}
5910	PMAP_UNLOCK(pmap);
5911	return (rv);
5912}
5913
5914/*
5915 *	pmap_is_referenced:
5916 *
5917 *	Return whether or not the specified physical page was referenced
5918 *	in any physical maps.
5919 */
5920boolean_t
5921pmap_is_referenced(vm_page_t m)
5922{
5923
5924	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5925	    ("pmap_is_referenced: page %p is not managed", m));
5926	return (pmap_page_test_mappings(m, TRUE, FALSE));
5927}
5928
5929/*
5930 * Clear the write and modified bits in each of the given page's mappings.
5931 */
5932void
5933pmap_remove_write(vm_page_t m)
5934{
5935	struct md_page *pvh;
5936	pmap_t pmap;
5937	struct rwlock *lock;
5938	pv_entry_t next_pv, pv;
5939	pd_entry_t *pde;
5940	pt_entry_t oldpte, *pte, PG_M, PG_RW;
5941	vm_offset_t va;
5942	int pvh_gen, md_gen;
5943
5944	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5945	    ("pmap_remove_write: page %p is not managed", m));
5946
5947	/*
5948	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5949	 * set by another thread while the object is locked.  Thus,
5950	 * if PGA_WRITEABLE is clear, no page table entries need updating.
5951	 */
5952	VM_OBJECT_ASSERT_WLOCKED(m->object);
5953	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5954		return;
5955	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5956	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5957	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
5958retry_pv_loop:
5959	rw_wlock(lock);
5960	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5961		pmap = PV_PMAP(pv);
5962		if (!PMAP_TRYLOCK(pmap)) {
5963			pvh_gen = pvh->pv_gen;
5964			rw_wunlock(lock);
5965			PMAP_LOCK(pmap);
5966			rw_wlock(lock);
5967			if (pvh_gen != pvh->pv_gen) {
5968				PMAP_UNLOCK(pmap);
5969				rw_wunlock(lock);
5970				goto retry_pv_loop;
5971			}
5972		}
5973		PG_RW = pmap_rw_bit(pmap);
5974		va = pv->pv_va;
5975		pde = pmap_pde(pmap, va);
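		/*
		 * A writable 2MB mapping must first be demoted so that write
		 * access can be removed from the individual 4KB page; the
		 * resulting 4KB mappings are processed by the loop below.
		 */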
5976		if ((*pde & PG_RW) != 0)
5977			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5978		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5979		    ("inconsistent pv lock %p %p for page %p",
5980		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5981		PMAP_UNLOCK(pmap);
5982	}
5983	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5984		pmap = PV_PMAP(pv);
5985		if (!PMAP_TRYLOCK(pmap)) {
5986			pvh_gen = pvh->pv_gen;
5987			md_gen = m->md.pv_gen;
5988			rw_wunlock(lock);
5989			PMAP_LOCK(pmap);
5990			rw_wlock(lock);
5991			if (pvh_gen != pvh->pv_gen ||
5992			    md_gen != m->md.pv_gen) {
5993				PMAP_UNLOCK(pmap);
5994				rw_wunlock(lock);
5995				goto retry_pv_loop;
5996			}
5997		}
5998		PG_M = pmap_modified_bit(pmap);
5999		PG_RW = pmap_rw_bit(pmap);
6000		pde = pmap_pde(pmap, pv->pv_va);
6001		KASSERT((*pde & PG_PS) == 0,
6002		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
6003		    m));
6004		pte = pmap_pde_to_pte(pde, pv->pv_va);
6005retry:
6006		oldpte = *pte;
6007		if (oldpte & PG_RW) {
6008			if (!atomic_cmpset_long(pte, oldpte, oldpte &
6009			    ~(PG_RW | PG_M)))
6010				goto retry;
6011			if ((oldpte & PG_M) != 0)
6012				vm_page_dirty(m);
6013			pmap_invalidate_page(pmap, pv->pv_va);
6014		}
6015		PMAP_UNLOCK(pmap);
6016	}
6017	rw_wunlock(lock);
6018	vm_page_aflag_clear(m, PGA_WRITEABLE);
6019	pmap_delayed_invl_wait(m);
6020}
6021
6022static __inline boolean_t
6023safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
6024{
6025
6026	if (!pmap_emulate_ad_bits(pmap))
6027		return (TRUE);
6028
6029	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
6030
6031	/*
6032	 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration,
6033	 * so we don't let the referenced (aka EPT_PG_READ) bit be cleared
6034	 * if the EPT_PG_WRITE bit is set.
6035	 */
6036	if ((pte & EPT_PG_WRITE) != 0)
6037		return (FALSE);
6038
6039	/*
6040	 * XWR = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
6041	 */
6042	if ((pte & EPT_PG_EXECUTE) == 0 ||
6043	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
6044		return (TRUE);
6045	else
6046		return (FALSE);
6047}
6048
6049/*
6050 *	pmap_ts_referenced:
6051 *
6052 *	Return a count of reference bits for a page, clearing those bits.
6053 *	It is not necessary for every reference bit to be cleared, but it
6054 *	is necessary that 0 only be returned when there are truly no
6055 *	reference bits set.
6056 *
6057 *	As an optimization, update the page's dirty field if a modified bit is
6058 *	found while counting reference bits.  This opportunistic update can be
6059 *	performed at low cost and can eliminate the need for some future calls
6060 *	to pmap_is_modified().  However, since this function stops after
6061 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
6062 *	dirty pages.  Those dirty pages will only be detected by a future call
6063 *	to pmap_is_modified().
6064 *
6065 *	A DI block is not needed within this function, because
6066 *	invalidations are performed before the PV list lock is
6067 *	released.
6068 */
6069int
6070pmap_ts_referenced(vm_page_t m)
6071{
6072	struct md_page *pvh;
6073	pv_entry_t pv, pvf;
6074	pmap_t pmap;
6075	struct rwlock *lock;
6076	pd_entry_t oldpde, *pde;
6077	pt_entry_t *pte, PG_A, PG_M, PG_RW;
6078	vm_offset_t va;
6079	vm_paddr_t pa;
6080	int cleared, md_gen, not_cleared, pvh_gen;
6081	struct spglist free;
6082	boolean_t demoted;
6083
6084	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6085	    ("pmap_ts_referenced: page %p is not managed", m));
6086	SLIST_INIT(&free);
6087	cleared = 0;
6088	pa = VM_PAGE_TO_PHYS(m);
6089	lock = PHYS_TO_PV_LIST_LOCK(pa);
6090	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
6091	rw_wlock(lock);
6092retry:
6093	not_cleared = 0;
6094	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
6095		goto small_mappings;
6096	pv = pvf;
6097	do {
6098		if (pvf == NULL)
6099			pvf = pv;
6100		pmap = PV_PMAP(pv);
6101		if (!PMAP_TRYLOCK(pmap)) {
6102			pvh_gen = pvh->pv_gen;
6103			rw_wunlock(lock);
6104			PMAP_LOCK(pmap);
6105			rw_wlock(lock);
6106			if (pvh_gen != pvh->pv_gen) {
6107				PMAP_UNLOCK(pmap);
6108				goto retry;
6109			}
6110		}
6111		PG_A = pmap_accessed_bit(pmap);
6112		PG_M = pmap_modified_bit(pmap);
6113		PG_RW = pmap_rw_bit(pmap);
6114		va = pv->pv_va;
6115		pde = pmap_pde(pmap, pv->pv_va);
6116		oldpde = *pde;
6117		if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6118			/*
6119			 * Although "oldpde" is mapping a 2MB page, because
6120			 * this function is called at a 4KB page granularity,
6121			 * we only update the 4KB page under test.
6122			 */
6123			vm_page_dirty(m);
6124		}
6125		if ((oldpde & PG_A) != 0) {
6126			/*
6127			 * Since this reference bit is shared by 512 4KB
6128			 * pages, it should not be cleared every time it is
6129			 * tested.  Apply a simple "hash" function on the
6130			 * physical page number, the virtual superpage number,
6131			 * and the pmap address to select one 4KB page out of
6132			 * the 512 on which testing the reference bit will
6133			 * result in clearing that reference bit.  This
6134			 * function is designed to avoid the selection of the
6135			 * same 4KB page for every 2MB page mapping.
6136			 *
6137			 * On demotion, a mapping that hasn't been referenced
6138			 * is simply destroyed.  To avoid the possibility of a
6139			 * subsequent page fault on a demoted wired mapping,
6140			 * always leave its reference bit set.  Moreover,
6141			 * since the superpage is wired, the current state of
6142			 * its reference bit won't affect page replacement.
6143			 */
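			/*
			 * In other words, the low 9 bits of the XOR of the
			 * page frame number, the virtual superpage number,
			 * and the pmap address choose which one of the 512
			 * constituent 4KB pages has its shared PG_A bit
			 * cleared when that page is tested.
			 */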
6144			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
6145			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
6146			    (oldpde & PG_W) == 0) {
6147				if (safe_to_clear_referenced(pmap, oldpde)) {
6148					atomic_clear_long(pde, PG_A);
6149					pmap_invalidate_page(pmap, pv->pv_va);
6150					demoted = FALSE;
6151				} else if (pmap_demote_pde_locked(pmap, pde,
6152				    pv->pv_va, &lock)) {
6153					/*
6154					 * Remove the mapping to a single page
6155					 * so that a subsequent access may
6156					 * repromote.  Since the underlying
6157					 * page table page is fully populated,
6158					 * this removal never frees a page
6159					 * table page.
6160					 */
6161					demoted = TRUE;
6162					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6163					    PG_PS_FRAME);
6164					pte = pmap_pde_to_pte(pde, va);
6165					pmap_remove_pte(pmap, pte, va, *pde,
6166					    NULL, &lock);
6167					pmap_invalidate_page(pmap, va);
6168				} else
6169					demoted = TRUE;
6170
6171				if (demoted) {
6172					/*
6173					 * The superpage mapping was removed
6174					 * entirely and therefore 'pv' is no
6175					 * longer valid.
6176					 */
6177					if (pvf == pv)
6178						pvf = NULL;
6179					pv = NULL;
6180				}
6181				cleared++;
6182				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6183				    ("inconsistent pv lock %p %p for page %p",
6184				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6185			} else
6186				not_cleared++;
6187		}
6188		PMAP_UNLOCK(pmap);
6189		/* Rotate the PV list if it has more than one entry. */
6190		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6191			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6192			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
6193			pvh->pv_gen++;
6194		}
6195		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
6196			goto out;
6197	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
6198small_mappings:
6199	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
6200		goto out;
6201	pv = pvf;
6202	do {
6203		if (pvf == NULL)
6204			pvf = pv;
6205		pmap = PV_PMAP(pv);
6206		if (!PMAP_TRYLOCK(pmap)) {
6207			pvh_gen = pvh->pv_gen;
6208			md_gen = m->md.pv_gen;
6209			rw_wunlock(lock);
6210			PMAP_LOCK(pmap);
6211			rw_wlock(lock);
6212			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6213				PMAP_UNLOCK(pmap);
6214				goto retry;
6215			}
6216		}
6217		PG_A = pmap_accessed_bit(pmap);
6218		PG_M = pmap_modified_bit(pmap);
6219		PG_RW = pmap_rw_bit(pmap);
6220		pde = pmap_pde(pmap, pv->pv_va);
6221		KASSERT((*pde & PG_PS) == 0,
6222		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
6223		    m));
6224		pte = pmap_pde_to_pte(pde, pv->pv_va);
6225		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6226			vm_page_dirty(m);
6227		if ((*pte & PG_A) != 0) {
6228			if (safe_to_clear_referenced(pmap, *pte)) {
6229				atomic_clear_long(pte, PG_A);
6230				pmap_invalidate_page(pmap, pv->pv_va);
6231				cleared++;
6232			} else if ((*pte & PG_W) == 0) {
6233				/*
6234				 * Wired pages cannot be paged out so
6235				 * doing accessed bit emulation for
6236				 * them is wasted effort. We do the
6237				 * hard work for unwired pages only.
6238				 */
6239				pmap_remove_pte(pmap, pte, pv->pv_va,
6240				    *pde, &free, &lock);
6241				pmap_invalidate_page(pmap, pv->pv_va);
6242				cleared++;
6243				if (pvf == pv)
6244					pvf = NULL;
6245				pv = NULL;
6246				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6247				    ("inconsistent pv lock %p %p for page %p",
6248				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6249			} else
6250				not_cleared++;
6251		}
6252		PMAP_UNLOCK(pmap);
6253		/* Rotate the PV list if it has more than one entry. */
6254		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6255			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6256			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
6257			m->md.pv_gen++;
6258		}
6259	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
6260	    not_cleared < PMAP_TS_REFERENCED_MAX);
6261out:
6262	rw_wunlock(lock);
6263	pmap_free_zero_pages(&free);
6264	return (cleared + not_cleared);
6265}
6266
6267/*
6268 *	Apply the given advice to the specified range of addresses within the
6269 *	given pmap.  Depending on the advice, clear the referenced and/or
6270 *	modified flags in each mapping and set the mapped page's dirty field.
6271 */
6272void
6273pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
6274{
6275	struct rwlock *lock;
6276	pml4_entry_t *pml4e;
6277	pdp_entry_t *pdpe;
6278	pd_entry_t oldpde, *pde;
6279	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
6280	vm_offset_t va, va_next;
6281	vm_page_t m;
6282	boolean_t anychanged;
6283
6284	if (advice != MADV_DONTNEED && advice != MADV_FREE)
6285		return;
6286
6287	/*
6288	 * A/D bit emulation requires an alternate code path when clearing
6289	 * the modified and accessed bits below. Since this function is
6290	 * advisory in nature we skip it entirely for pmaps that require
6291	 * A/D bit emulation.
6292	 */
6293	if (pmap_emulate_ad_bits(pmap))
6294		return;
6295
6296	PG_A = pmap_accessed_bit(pmap);
6297	PG_G = pmap_global_bit(pmap);
6298	PG_M = pmap_modified_bit(pmap);
6299	PG_V = pmap_valid_bit(pmap);
6300	PG_RW = pmap_rw_bit(pmap);
6301	anychanged = FALSE;
6302	pmap_delayed_invl_started();
6303	PMAP_LOCK(pmap);
6304	for (; sva < eva; sva = va_next) {
6305		pml4e = pmap_pml4e(pmap, sva);
6306		if ((*pml4e & PG_V) == 0) {
6307			va_next = (sva + NBPML4) & ~PML4MASK;
6308			if (va_next < sva)
6309				va_next = eva;
6310			continue;
6311		}
6312		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6313		if ((*pdpe & PG_V) == 0) {
6314			va_next = (sva + NBPDP) & ~PDPMASK;
6315			if (va_next < sva)
6316				va_next = eva;
6317			continue;
6318		}
6319		va_next = (sva + NBPDR) & ~PDRMASK;
6320		if (va_next < sva)
6321			va_next = eva;
6322		pde = pmap_pdpe_to_pde(pdpe, sva);
6323		oldpde = *pde;
6324		if ((oldpde & PG_V) == 0)
6325			continue;
6326		else if ((oldpde & PG_PS) != 0) {
6327			if ((oldpde & PG_MANAGED) == 0)
6328				continue;
6329			lock = NULL;
6330			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
6331				if (lock != NULL)
6332					rw_wunlock(lock);
6333
6334				/*
6335				 * The large page mapping was destroyed.
6336				 */
6337				continue;
6338			}
6339
6340			/*
6341			 * Unless the page mappings are wired, remove the
6342			 * mapping to a single page so that a subsequent
6343			 * access may repromote.  Since the underlying page
6344			 * table page is fully populated, this removal never
6345			 * frees a page table page.
6346			 */
6347			if ((oldpde & PG_W) == 0) {
6348				pte = pmap_pde_to_pte(pde, sva);
6349				KASSERT((*pte & PG_V) != 0,
6350				    ("pmap_advise: invalid PTE"));
6351				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
6352				    &lock);
6353				anychanged = TRUE;
6354			}
6355			if (lock != NULL)
6356				rw_wunlock(lock);
6357		}
6358		if (va_next > eva)
6359			va_next = eva;
6360		va = va_next;
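		/*
		 * "va" marks the start of a pending range invalidation for
		 * updated global mappings; the range is flushed at
		 * "maybe_invlrng" or at the end of this 2MB region.  Updates
		 * to non-global mappings are instead covered by the single
		 * pmap_invalidate_all() at the end of the function.
		 */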
6361		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6362		    sva += PAGE_SIZE) {
6363			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
6364				goto maybe_invlrng;
6365			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6366				if (advice == MADV_DONTNEED) {
6367					/*
6368					 * Future calls to pmap_is_modified()
6369					 * can be avoided by making the page
6370					 * dirty now.
6371					 */
6372					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6373					vm_page_dirty(m);
6374				}
6375				atomic_clear_long(pte, PG_M | PG_A);
6376			} else if ((*pte & PG_A) != 0)
6377				atomic_clear_long(pte, PG_A);
6378			else
6379				goto maybe_invlrng;
6380
6381			if ((*pte & PG_G) != 0) {
6382				if (va == va_next)
6383					va = sva;
6384			} else
6385				anychanged = TRUE;
6386			continue;
6387maybe_invlrng:
6388			if (va != va_next) {
6389				pmap_invalidate_range(pmap, va, sva);
6390				va = va_next;
6391			}
6392		}
6393		if (va != va_next)
6394			pmap_invalidate_range(pmap, va, sva);
6395	}
6396	if (anychanged)
6397		pmap_invalidate_all(pmap);
6398	PMAP_UNLOCK(pmap);
6399	pmap_delayed_invl_finished();
6400}
6401
6402/*
6403 *	Clear the modify bits on the specified physical page.
6404 */
6405void
6406pmap_clear_modify(vm_page_t m)
6407{
6408	struct md_page *pvh;
6409	pmap_t pmap;
6410	pv_entry_t next_pv, pv;
6411	pd_entry_t oldpde, *pde;
6412	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
6413	struct rwlock *lock;
6414	vm_offset_t va;
6415	int md_gen, pvh_gen;
6416
6417	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6418	    ("pmap_clear_modify: page %p is not managed", m));
6419	VM_OBJECT_ASSERT_WLOCKED(m->object);
6420	KASSERT(!vm_page_xbusied(m),
6421	    ("pmap_clear_modify: page %p is exclusive busied", m));
6422
6423	/*
6424	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6425	 * If the object containing the page is locked and the page is not
6426	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6427	 */
6428	if ((m->aflags & PGA_WRITEABLE) == 0)
6429		return;
6430	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6431	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
6432	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6433	rw_wlock(lock);
6434restart:
6435	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6436		pmap = PV_PMAP(pv);
6437		if (!PMAP_TRYLOCK(pmap)) {
6438			pvh_gen = pvh->pv_gen;
6439			rw_wunlock(lock);
6440			PMAP_LOCK(pmap);
6441			rw_wlock(lock);
6442			if (pvh_gen != pvh->pv_gen) {
6443				PMAP_UNLOCK(pmap);
6444				goto restart;
6445			}
6446		}
6447		PG_M = pmap_modified_bit(pmap);
6448		PG_V = pmap_valid_bit(pmap);
6449		PG_RW = pmap_rw_bit(pmap);
6450		va = pv->pv_va;
6451		pde = pmap_pde(pmap, va);
6452		oldpde = *pde;
6453		if ((oldpde & PG_RW) != 0) {
6454			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6455				if ((oldpde & PG_W) == 0) {
6456					/*
6457					 * Write protect the mapping to a
6458					 * single page so that a subsequent
6459					 * write access may repromote.
6460					 */
6461					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6462					    PG_PS_FRAME);
6463					pte = pmap_pde_to_pte(pde, va);
6464					oldpte = *pte;
6465					if ((oldpte & PG_V) != 0) {
6466						while (!atomic_cmpset_long(pte,
6467						    oldpte,
6468						    oldpte & ~(PG_M | PG_RW)))
6469							oldpte = *pte;
6470						vm_page_dirty(m);
6471						pmap_invalidate_page(pmap, va);
6472					}
6473				}
6474			}
6475		}
6476		PMAP_UNLOCK(pmap);
6477	}
6478	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6479		pmap = PV_PMAP(pv);
6480		if (!PMAP_TRYLOCK(pmap)) {
6481			md_gen = m->md.pv_gen;
6482			pvh_gen = pvh->pv_gen;
6483			rw_wunlock(lock);
6484			PMAP_LOCK(pmap);
6485			rw_wlock(lock);
6486			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6487				PMAP_UNLOCK(pmap);
6488				goto restart;
6489			}
6490		}
6491		PG_M = pmap_modified_bit(pmap);
6492		PG_RW = pmap_rw_bit(pmap);
6493		pde = pmap_pde(pmap, pv->pv_va);
6494		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6495		    " a 2mpage in page %p's pv list", m));
6496		pte = pmap_pde_to_pte(pde, pv->pv_va);
6497		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6498			atomic_clear_long(pte, PG_M);
6499			pmap_invalidate_page(pmap, pv->pv_va);
6500		}
6501		PMAP_UNLOCK(pmap);
6502	}
6503	rw_wunlock(lock);
6504}
6505
6506/*
6507 * Miscellaneous support routines follow
6508 */
6509
6510/* Adjust the cache mode for a 4KB page mapped via a PTE. */
6511static __inline void
6512pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6513{
6514	u_int opte, npte;
6515
6516	/*
6517	 * The cache mode bits are all in the low 32-bits of the
6518	 * PTE, so we can just spin on updating the low 32-bits.
6519	 */
6520	do {
6521		opte = *(u_int *)pte;
6522		npte = opte & ~mask;
6523		npte |= cache_bits;
6524	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6525}
6526
6527/* Adjust the cache mode for a 2MB page mapped via a PDE. */
6528static __inline void
6529pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6530{
6531	u_int opde, npde;
6532
6533	/*
6534	 * The cache mode bits are all in the low 32-bits of the
6535	 * PDE, so we can just spin on updating the low 32-bits.
6536	 */
6537	do {
6538		opde = *(u_int *)pde;
6539		npde = opde & ~mask;
6540		npde |= cache_bits;
6541	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6542}
6543
6544/*
6545 * Map a set of physical memory pages into the kernel virtual
6546 * address space. Return a pointer to where it is mapped. This
6547 * routine is intended to be used for mapping device memory,
6548 * NOT real memory.
6549 */
6550void *
6551pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6552{
6553	struct pmap_preinit_mapping *ppim;
6554	vm_offset_t va, offset;
6555	vm_size_t tmpsize;
6556	int i;
6557
6558	offset = pa & PAGE_MASK;
6559	size = round_page(offset + size);
6560	pa = trunc_page(pa);
6561
6562	if (!pmap_initialized) {
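		/*
		 * Before pmap_init() has run, take KVA directly from
		 * virtual_avail instead of kva_alloc(), and record the
		 * mapping in the preinit table so that it can be found and
		 * re-used later.
		 */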
6563		va = 0;
6564		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6565			ppim = pmap_preinit_mapping + i;
6566			if (ppim->va == 0) {
6567				ppim->pa = pa;
6568				ppim->sz = size;
6569				ppim->mode = mode;
6570				ppim->va = virtual_avail;
6571				virtual_avail += size;
6572				va = ppim->va;
6573				break;
6574			}
6575		}
6576		if (va == 0)
6577			panic("%s: too many preinit mappings", __func__);
6578	} else {
6579		/*
6580		 * If we have a preinit mapping, re-use it.
6581		 */
6582		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6583			ppim = pmap_preinit_mapping + i;
6584			if (ppim->pa == pa && ppim->sz == size &&
6585			    ppim->mode == mode)
6586				return ((void *)(ppim->va + offset));
6587		}
6588		/*
6589		 * If the specified range of physical addresses fits within
6590		 * the direct map window, use the direct map.
6591		 */
6592		if (pa < dmaplimit && pa + size < dmaplimit) {
6593			va = PHYS_TO_DMAP(pa);
6594			if (!pmap_change_attr(va, size, mode))
6595				return ((void *)(va + offset));
6596		}
6597		va = kva_alloc(size);
6598		if (va == 0)
6599			panic("%s: Couldn't allocate KVA", __func__);
6600	}
6601	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6602		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6603	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6604	pmap_invalidate_cache_range(va, va + tmpsize, FALSE);
6605	return ((void *)(va + offset));
6606}
6607
6608void *
6609pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6610{
6611
6612	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6613}
6614
6615void *
6616pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6617{
6618
6619	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6620}
6621
6622void
6623pmap_unmapdev(vm_offset_t va, vm_size_t size)
6624{
6625	struct pmap_preinit_mapping *ppim;
6626	vm_offset_t offset;
6627	int i;
6628
6629	/* If pmap_mapdev() returned a direct map address, do nothing. */
6630	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6631		return;
6632	offset = va & PAGE_MASK;
6633	size = round_page(offset + size);
6634	va = trunc_page(va);
6635	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6636		ppim = pmap_preinit_mapping + i;
6637		if (ppim->va == va && ppim->sz == size) {
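			/*
			 * Preinit mappings are never released once the pmap
			 * has been fully initialized; before that, the slot
			 * is cleared and the KVA is returned to
			 * virtual_avail when possible.
			 */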
6638			if (pmap_initialized)
6639				return;
6640			ppim->pa = 0;
6641			ppim->va = 0;
6642			ppim->sz = 0;
6643			ppim->mode = 0;
6644			if (va + size == virtual_avail)
6645				virtual_avail = va;
6646			return;
6647		}
6648	}
6649	if (pmap_initialized)
6650		kva_free(va, size);
6651}
6652
6653/*
6654 * Tries to demote a 1GB page mapping.
6655 */
6656static boolean_t
6657pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6658{
6659	pdp_entry_t newpdpe, oldpdpe;
6660	pd_entry_t *firstpde, newpde, *pde;
6661	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6662	vm_paddr_t pdpgpa;
6663	vm_page_t pdpg;
6664
6665	PG_A = pmap_accessed_bit(pmap);
6666	PG_M = pmap_modified_bit(pmap);
6667	PG_V = pmap_valid_bit(pmap);
6668	PG_RW = pmap_rw_bit(pmap);
6669
6670	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6671	oldpdpe = *pdpe;
6672	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6673	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6674	if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6675	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6676		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6677		    " in pmap %p", va, pmap);
6678		return (FALSE);
6679	}
6680	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
6681	firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
6682	newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6683	KASSERT((oldpdpe & PG_A) != 0,
6684	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6685	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6686	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6687	newpde = oldpdpe;
6688
6689	/*
6690	 * Initialize the page directory page.
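	 * Each of the 512 new PDEs inherits the attributes of the original
	 * 1GB mapping and maps a successive 2MB portion of it.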
6691	 */
6692	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6693		*pde = newpde;
6694		newpde += NBPDR;
6695	}
6696
6697	/*
6698	 * Demote the mapping.
6699	 */
6700	*pdpe = newpdpe;
6701
6702	/*
6703	 * Invalidate a stale recursive mapping of the page directory page.
6704	 */
6705	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6706
6707	pmap_pdpe_demotions++;
6708	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6709	    " in pmap %p", va, pmap);
6710	return (TRUE);
6711}
6712
6713/*
6714 * Sets the memory attribute for the specified page.
6715 */
6716void
6717pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6718{
6719
6720	m->md.pat_mode = ma;
6721
6722	/*
6723	 * If "m" is a normal page, update its direct mapping.  This update
6724	 * can be relied upon to perform any cache operations that are
6725	 * required for data coherence.
6726	 */
6727	if ((m->flags & PG_FICTITIOUS) == 0 &&
6728	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6729	    m->md.pat_mode))
6730		panic("memory attribute change on the direct map failed");
6731}
6732
6733/*
6734 * Changes the specified virtual address range's memory type to that given by
6735 * the parameter "mode".  The specified virtual address range must be
6736 * completely contained within either the direct map or the kernel map.  If
6737 * the virtual address range is contained within the kernel map, then the
6738 * memory type for each of the corresponding ranges of the direct map is also
6739 * changed.  (The corresponding ranges of the direct map are those ranges that
6740 * map the same physical pages as the specified virtual address range.)  These
6741 * changes to the direct map are necessary because Intel describes the
6742 * behavior of their processors as "undefined" if two or more mappings to the
6743 * same physical page have different memory types.
6744 *
6745 * Returns zero if the change completed successfully, and either EINVAL or
6746 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6747 * of the virtual address range was not mapped, and ENOMEM is returned if
6748 * there was insufficient memory available to complete the change.  In the
6749 * latter case, the memory type may have been changed on some part of the
6750 * virtual address range or the direct map.
6751 */
6752int
6753pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6754{
6755	int error;
6756
6757	PMAP_LOCK(kernel_pmap);
6758	error = pmap_change_attr_locked(va, size, mode);
6759	PMAP_UNLOCK(kernel_pmap);
6760	return (error);
6761}
6762
6763static int
6764pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6765{
6766	vm_offset_t base, offset, tmpva;
6767	vm_paddr_t pa_start, pa_end, pa_end1;
6768	pdp_entry_t *pdpe;
6769	pd_entry_t *pde;
6770	pt_entry_t *pte;
6771	int cache_bits_pte, cache_bits_pde, error;
6772	boolean_t changed;
6773
6774	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6775	base = trunc_page(va);
6776	offset = va & PAGE_MASK;
6777	size = round_page(offset + size);
6778
6779	/*
6780	 * Only supported on kernel virtual addresses, including the direct
6781	 * map but excluding the recursive map.
6782	 */
6783	if (base < DMAP_MIN_ADDRESS)
6784		return (EINVAL);
6785
6786	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
6787	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6788	changed = FALSE;
6789
6790	/*
6791	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6792	 * into 4KB pages if required.
6793	 */
6794	for (tmpva = base; tmpva < base + size; ) {
6795		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6796		if (pdpe == NULL || *pdpe == 0)
6797			return (EINVAL);
6798		if (*pdpe & PG_PS) {
6799			/*
6800			 * If the current 1GB page already has the required
6801			 * memory type, then we need not demote this page. Just
6802			 * increment tmpva to the next 1GB page frame.
6803			 */
6804			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6805				tmpva = trunc_1gpage(tmpva) + NBPDP;
6806				continue;
6807			}
6808
6809			/*
6810			 * If the current offset aligns with a 1GB page frame
6811			 * and there is at least 1GB left within the range, then
6812			 * we need not break down this page into 2MB pages.
6813			 */
6814			if ((tmpva & PDPMASK) == 0 &&
6815			    tmpva + PDPMASK < base + size) {
6816				tmpva += NBPDP;
6817				continue;
6818			}
6819			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6820				return (ENOMEM);
6821		}
6822		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6823		if (*pde == 0)
6824			return (EINVAL);
6825		if (*pde & PG_PS) {
6826			/*
6827			 * If the current 2MB page already has the required
6828			 * memory type, then we need not demote this page. Just
6829			 * increment tmpva to the next 2MB page frame.
6830			 */
6831			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6832				tmpva = trunc_2mpage(tmpva) + NBPDR;
6833				continue;
6834			}
6835
6836			/*
6837			 * If the current offset aligns with a 2MB page frame
6838			 * and there is at least 2MB left within the range, then
6839			 * we need not break down this page into 4KB pages.
6840			 */
6841			if ((tmpva & PDRMASK) == 0 &&
6842			    tmpva + PDRMASK < base + size) {
6843				tmpva += NBPDR;
6844				continue;
6845			}
6846			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6847				return (ENOMEM);
6848		}
6849		pte = pmap_pde_to_pte(pde, tmpva);
6850		if (*pte == 0)
6851			return (EINVAL);
6852		tmpva += PAGE_SIZE;
6853	}
6854	error = 0;
6855
6856	/*
6857	 * Ok, all the pages exist, so run through them updating their
6858	 * cache mode if required.
6859	 */
6860	pa_start = pa_end = 0;
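	/*
	 * For mappings in the kernel map proper, accumulate runs of
	 * physically contiguous pages and recursively apply the same
	 * attribute change to the corresponding direct map range each time
	 * a run ends.
	 */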
6861	for (tmpva = base; tmpva < base + size; ) {
6862		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6863		if (*pdpe & PG_PS) {
6864			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6865				pmap_pde_attr(pdpe, cache_bits_pde,
6866				    X86_PG_PDE_CACHE);
6867				changed = TRUE;
6868			}
6869			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6870			    (*pdpe & PG_PS_FRAME) < dmaplimit) {
6871				if (pa_start == pa_end) {
6872					/* Start physical address run. */
6873					pa_start = *pdpe & PG_PS_FRAME;
6874					pa_end = pa_start + NBPDP;
6875				} else if (pa_end == (*pdpe & PG_PS_FRAME))
6876					pa_end += NBPDP;
6877				else {
6878					/* Run ended, update direct map. */
6879					error = pmap_change_attr_locked(
6880					    PHYS_TO_DMAP(pa_start),
6881					    pa_end - pa_start, mode);
6882					if (error != 0)
6883						break;
6884					/* Start physical address run. */
6885					pa_start = *pdpe & PG_PS_FRAME;
6886					pa_end = pa_start + NBPDP;
6887				}
6888			}
6889			tmpva = trunc_1gpage(tmpva) + NBPDP;
6890			continue;
6891		}
6892		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6893		if (*pde & PG_PS) {
6894			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6895				pmap_pde_attr(pde, cache_bits_pde,
6896				    X86_PG_PDE_CACHE);
6897				changed = TRUE;
6898			}
6899			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6900			    (*pde & PG_PS_FRAME) < dmaplimit) {
6901				if (pa_start == pa_end) {
6902					/* Start physical address run. */
6903					pa_start = *pde & PG_PS_FRAME;
6904					pa_end = pa_start + NBPDR;
6905				} else if (pa_end == (*pde & PG_PS_FRAME))
6906					pa_end += NBPDR;
6907				else {
6908					/* Run ended, update direct map. */
6909					error = pmap_change_attr_locked(
6910					    PHYS_TO_DMAP(pa_start),
6911					    pa_end - pa_start, mode);
6912					if (error != 0)
6913						break;
6914					/* Start physical address run. */
6915					pa_start = *pde & PG_PS_FRAME;
6916					pa_end = pa_start + NBPDR;
6917				}
6918			}
6919			tmpva = trunc_2mpage(tmpva) + NBPDR;
6920		} else {
6921			pte = pmap_pde_to_pte(pde, tmpva);
6922			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6923				pmap_pte_attr(pte, cache_bits_pte,
6924				    X86_PG_PTE_CACHE);
6925				changed = TRUE;
6926			}
6927			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6928			    (*pte & PG_FRAME) < dmaplimit) {
6929				if (pa_start == pa_end) {
6930					/* Start physical address run. */
6931					pa_start = *pte & PG_FRAME;
6932					pa_end = pa_start + PAGE_SIZE;
6933				} else if (pa_end == (*pte & PG_FRAME))
6934					pa_end += PAGE_SIZE;
6935				else {
6936					/* Run ended, update direct map. */
6937					error = pmap_change_attr_locked(
6938					    PHYS_TO_DMAP(pa_start),
6939					    pa_end - pa_start, mode);
6940					if (error != 0)
6941						break;
6942					/* Start physical address run. */
6943					pa_start = *pte & PG_FRAME;
6944					pa_end = pa_start + PAGE_SIZE;
6945				}
6946			}
6947			tmpva += PAGE_SIZE;
6948		}
6949	}
6950	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
6951		pa_end1 = MIN(pa_end, dmaplimit);
6952		if (pa_start != pa_end1)
6953			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6954			    pa_end1 - pa_start, mode);
6955	}
6956
6957	/*
6958	 * Flush CPU caches if required so that no data remains cached with
6959	 * attributes that conflict with the new memory type.
6960	 */
6961	if (changed) {
6962		pmap_invalidate_range(kernel_pmap, base, tmpva);
6963		pmap_invalidate_cache_range(base, tmpva, FALSE);
6964	}
6965	return (error);
6966}
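
/*
 * Example (illustrative sketch, not compiled): external consumers reach the
 * code above through pmap_change_attr().  A hypothetical driver fragment
 * that marks an already-mapped framebuffer write-combining might look like
 * this ("sc" and "dev" are hypothetical).
 */
#if 0
	error = pmap_change_attr((vm_offset_t)sc->fb_vaddr, sc->fb_size,
	    PAT_WRITE_COMBINING);
	if (error != 0)
		device_printf(dev, "cannot set write-combining: %d\n", error);
#endif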
6967
6968/*
6969 * Demotes any mapping within the direct map region that covers more than the
6970 * specified range of physical addresses.  This range's size must be a power
6971 * of two and its starting address must be a multiple of its size.  Since the
6972 * demotion does not change any attributes of the mapping, a TLB invalidation
6973 * is not mandatory.  The caller may, however, request a TLB invalidation.
6974 */
6975void
6976pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6977{
6978	pdp_entry_t *pdpe;
6979	pd_entry_t *pde;
6980	vm_offset_t va;
6981	boolean_t changed;
6982
6983	if (len == 0)
6984		return;
6985	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6986	KASSERT((base & (len - 1)) == 0,
6987	    ("pmap_demote_DMAP: base is not a multiple of len"));
6988	if (len < NBPDP && base < dmaplimit) {
6989		va = PHYS_TO_DMAP(base);
6990		changed = FALSE;
6991		PMAP_LOCK(kernel_pmap);
6992		pdpe = pmap_pdpe(kernel_pmap, va);
6993		if ((*pdpe & X86_PG_V) == 0)
6994			panic("pmap_demote_DMAP: invalid PDPE");
6995		if ((*pdpe & PG_PS) != 0) {
6996			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6997				panic("pmap_demote_DMAP: PDPE failed");
6998			changed = TRUE;
6999		}
7000		if (len < NBPDR) {
7001			pde = pmap_pdpe_to_pde(pdpe, va);
7002			if ((*pde & X86_PG_V) == 0)
7003				panic("pmap_demote_DMAP: invalid PDE");
7004			if ((*pde & PG_PS) != 0) {
7005				if (!pmap_demote_pde(kernel_pmap, pde, va))
7006					panic("pmap_demote_DMAP: PDE failed");
7007				changed = TRUE;
7008			}
7009		}
7010		if (changed && invalidate)
7011			pmap_invalidate_page(kernel_pmap, va);
7012		PMAP_UNLOCK(kernel_pmap);
7013	}
7014}
7015
/*
 * Perform the pmap work for mincore(2).
 */
7019int
7020pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
7021{
7022	pd_entry_t *pdep;
7023	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
7024	vm_paddr_t pa;
7025	int val;
7026
7027	PG_A = pmap_accessed_bit(pmap);
7028	PG_M = pmap_modified_bit(pmap);
7029	PG_V = pmap_valid_bit(pmap);
7030	PG_RW = pmap_rw_bit(pmap);
7031
7032	PMAP_LOCK(pmap);
7033retry:
7034	pdep = pmap_pde(pmap, addr);
7035	if (pdep != NULL && (*pdep & PG_V)) {
7036		if (*pdep & PG_PS) {
7037			pte = *pdep;
7038			/* Compute the physical address of the 4KB page. */
7039			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
7040			    PG_FRAME;
7041			val = MINCORE_SUPER;
7042		} else {
7043			pte = *pmap_pde_to_pte(pdep, addr);
7044			pa = pte & PG_FRAME;
7045			val = 0;
7046		}
7047	} else {
7048		pte = 0;
7049		pa = 0;
7050		val = 0;
7051	}
7052	if ((pte & PG_V) != 0) {
7053		val |= MINCORE_INCORE;
7054		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
7055			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
7056		if ((pte & PG_A) != 0)
7057			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
7058	}
7059	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
7060	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
7061	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
7062		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
7063		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
7064			goto retry;
7065	} else
7066		PA_UNLOCK_COND(*locked_pa);
7067	PMAP_UNLOCK(pmap);
7068	return (val);
7069}
7070
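/*
 * Allocate a PCID for the pmap on the current CPU, which must be in a
 * critical section.  If the pmap's PCID on this CPU is still valid (kernel
 * pmap, or the per-CPU generation count matches), return CR3_PCID_SAVE so
 * that TLB entries tagged with that PCID are preserved across the %cr3
 * load.  Otherwise, hand out the next PCID; when the PCID space is
 * exhausted, bump the per-CPU generation (skipping zero) to invalidate all
 * previously handed-out PCIDs and restart allocation just after
 * PMAP_PCID_KERN.  Returning zero forces a TLB flush for the new PCID.
 */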
7071static uint64_t
7072pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
7073{
7074	uint32_t gen, new_gen, pcid_next;
7075
7076	CRITICAL_ASSERT(curthread);
7077	gen = PCPU_GET(pcid_gen);
7078	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
7079	    pmap->pm_pcids[cpuid].pm_gen == gen)
7080		return (CR3_PCID_SAVE);
7081	pcid_next = PCPU_GET(pcid_next);
7082	KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x",
7083	    cpuid, pcid_next));
7084	if (pcid_next == PMAP_PCID_OVERMAX) {
7085		new_gen = gen + 1;
7086		if (new_gen == 0)
7087			new_gen = 1;
7088		PCPU_SET(pcid_gen, new_gen);
7089		pcid_next = PMAP_PCID_KERN + 1;
7090	} else {
7091		new_gen = gen;
7092	}
7093	pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
7094	pmap->pm_pcids[cpuid].pm_gen = new_gen;
7095	PCPU_SET(pcid_next, pcid_next + 1);
7096	return (0);
7097}
7098
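/*
 * Switch the current CPU to the address space of the given thread's
 * process: mark the CPU active in the new pmap, reload %cr3 (reusing the
 * cached PCID translations when possible), update curpmap, and remove the
 * CPU from the old pmap's active set.  Does nothing if the pmap is already
 * current.  The caller must be in a critical section; pmap_activate() is
 * the wrapper that provides one.
 */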
7099void
7100pmap_activate_sw(struct thread *td)
7101{
7102	pmap_t oldpmap, pmap;
7103	uint64_t cached, cr3;
7104	register_t rflags;
7105	u_int cpuid;
7106
7107	oldpmap = PCPU_GET(curpmap);
7108	pmap = vmspace_pmap(td->td_proc->p_vmspace);
7109	if (oldpmap == pmap)
7110		return;
7111	cpuid = PCPU_GET(cpuid);
7112#ifdef SMP
7113	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7114#else
7115	CPU_SET(cpuid, &pmap->pm_active);
7116#endif
7117	cr3 = rcr3();
7118	if (pmap_pcid_enabled) {
7119		cached = pmap_pcid_alloc(pmap, cpuid);
7120		KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 &&
7121		    pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
7122		    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
7123		    pmap->pm_pcids[cpuid].pm_pcid));
7124		KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
7125		    pmap == kernel_pmap,
7126		    ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x",
7127		    td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
7128
		/*
		 * If the INVPCID instruction is not available,
		 * invltlb_pcid_handler() is used to handle the
		 * invalidate_all IPI, and it checks for curpmap ==
		 * smp_tlb_pmap.  The sequence of operations below has
		 * a window where %CR3 is loaded with the new pmap's
		 * PML4 address, but curpmap is not yet updated.  This
		 * causes the invltlb IPI handler, if delivered between
		 * the two updates, to execute as a NOP, which leaves
		 * stale TLB entries.
		 *
		 * Note that the most typical use of
		 * pmap_activate_sw(), from the context switch, is
		 * immune to this race, because interrupts are
		 * disabled (while the thread lock is owned), and the
		 * IPI happens after curpmap is updated.  Protect other
		 * callers in a similar way, by disabling interrupts
		 * around the %cr3 reload and the curpmap assignment.
		 */
7149		if (!invpcid_works)
7150			rflags = intr_disable();
7151
7152		if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) {
7153			load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
7154			    cached);
7155			if (cached)
7156				PCPU_INC(pm_save_cnt);
7157		}
7158		PCPU_SET(curpmap, pmap);
7159		if (!invpcid_works)
7160			intr_restore(rflags);
7161	} else if (cr3 != pmap->pm_cr3) {
7162		load_cr3(pmap->pm_cr3);
7163		PCPU_SET(curpmap, pmap);
7164	}
7165#ifdef SMP
7166	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
7167#else
7168	CPU_CLR(cpuid, &oldpmap->pm_active);
7169#endif
7170}
7171
7172void
7173pmap_activate(struct thread *td)
7174{
7175
7176	critical_enter();
7177	pmap_activate_sw(td);
7178	critical_exit();
7179}
7180
7181void
7182pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
7183{
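	/*
	 * Nothing to do: x86 hardware keeps the instruction cache coherent
	 * with respect to stores, so no explicit synchronization is needed.
	 */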
7184}
7185
7186/*
7187 *	Increase the starting virtual address of the given mapping if a
7188 *	different alignment might result in more superpage mappings.
7189 */
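/*
 *	For example (illustrative numbers), with 2MB superpages: if
 *	superpage_offset computed below is 0x155000 and *addr is the
 *	2MB-aligned address 0x800200000, then *addr is advanced to
 *	0x800355000, so that the virtual address and the backing object
 *	offset share the same alignment within a superpage and later
 *	promotions become possible.
 */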
7190void
7191pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7192    vm_offset_t *addr, vm_size_t size)
7193{
7194	vm_offset_t superpage_offset;
7195
7196	if (size < NBPDR)
7197		return;
7198	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7199		offset += ptoa(object->pg_color);
7200	superpage_offset = offset & PDRMASK;
7201	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
7202	    (*addr & PDRMASK) == superpage_offset)
7203		return;
7204	if ((*addr & PDRMASK) < superpage_offset)
7205		*addr = (*addr & ~PDRMASK) + superpage_offset;
7206	else
7207		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
7208}
7209
7210#ifdef INVARIANTS
7211static unsigned long num_dirty_emulations;
7212SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
7213	     &num_dirty_emulations, 0, NULL);
7214
7215static unsigned long num_accessed_emulations;
7216SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
7217	     &num_accessed_emulations, 0, NULL);
7218
7219static unsigned long num_superpage_accessed_emulations;
7220SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
7221	     &num_superpage_accessed_emulations, 0, NULL);
7222
7223static unsigned long ad_emulation_superpage_promotions;
7224SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
7225	     &ad_emulation_superpage_promotions, 0, NULL);
7226#endif	/* INVARIANTS */
7227
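/*
 * Emulate the referenced (PG_A) and, for write faults, modified (PG_M)
 * bits in software for pmaps that require it (e.g. Intel EPT without
 * hardware A/D bit support).  On success the relevant PDE or PTE is
 * updated, a superpage promotion is attempted when possible, and 0 is
 * returned.  A return value of -1 means the fault could not be handled
 * here and must be resolved by the regular fault handler.
 */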
7228int
7229pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
7230{
7231	int rv;
7232	struct rwlock *lock;
7233#if VM_NRESERVLEVEL > 0
7234	vm_page_t m, mpte;
7235#endif
7236	pd_entry_t *pde;
7237	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
7238
7239	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
7240	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
7241
7242	if (!pmap_emulate_ad_bits(pmap))
7243		return (-1);
7244
7245	PG_A = pmap_accessed_bit(pmap);
7246	PG_M = pmap_modified_bit(pmap);
7247	PG_V = pmap_valid_bit(pmap);
7248	PG_RW = pmap_rw_bit(pmap);
7249
7250	rv = -1;
7251	lock = NULL;
7252	PMAP_LOCK(pmap);
7253
7254	pde = pmap_pde(pmap, va);
7255	if (pde == NULL || (*pde & PG_V) == 0)
7256		goto done;
7257
7258	if ((*pde & PG_PS) != 0) {
7259		if (ftype == VM_PROT_READ) {
7260#ifdef INVARIANTS
7261			atomic_add_long(&num_superpage_accessed_emulations, 1);
7262#endif
7263			*pde |= PG_A;
7264			rv = 0;
7265		}
7266		goto done;
7267	}
7268
7269	pte = pmap_pde_to_pte(pde, va);
7270	if ((*pte & PG_V) == 0)
7271		goto done;
7272
7273	if (ftype == VM_PROT_WRITE) {
7274		if ((*pte & PG_RW) == 0)
7275			goto done;
7276		/*
7277		 * Set the modified and accessed bits simultaneously.
7278		 *
7279		 * Intel EPT PTEs that do software emulation of A/D bits map
7280		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
7281		 * An EPT misconfiguration is triggered if the PTE is writable
7282		 * but not readable (WR=10). This is avoided by setting PG_A
7283		 * and PG_M simultaneously.
7284		 */
7285		*pte |= PG_M | PG_A;
7286	} else {
7287		*pte |= PG_A;
7288	}
7289
7290#if VM_NRESERVLEVEL > 0
7291	/* try to promote the mapping */
7292	if (va < VM_MAXUSER_ADDRESS)
7293		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7294	else
7295		mpte = NULL;
7296
7297	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
7298
7299	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
7300	    pmap_ps_enabled(pmap) &&
7301	    (m->flags & PG_FICTITIOUS) == 0 &&
7302	    vm_reserv_level_iffullpop(m) == 0) {
7303		pmap_promote_pde(pmap, pde, va, &lock);
7304#ifdef INVARIANTS
7305		atomic_add_long(&ad_emulation_superpage_promotions, 1);
7306#endif
7307	}
7308#endif
7309
7310#ifdef INVARIANTS
7311	if (ftype == VM_PROT_WRITE)
7312		atomic_add_long(&num_dirty_emulations, 1);
7313	else
7314		atomic_add_long(&num_accessed_emulations, 1);
7315#endif
7316	rv = 0;		/* success */
7317done:
7318	if (lock != NULL)
7319		rw_wunlock(lock);
7320	PMAP_UNLOCK(pmap);
7321	return (rv);
7322}
7323
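/*
 * Copy the page-table entries that translate "va", from the PML4 entry
 * down toward the PTE, into "ptr", stopping after an entry that is invalid
 * or maps a large page.  "*num" is set to the number of entries copied.
 */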
7324void
7325pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
7326{
7327	pml4_entry_t *pml4;
7328	pdp_entry_t *pdp;
7329	pd_entry_t *pde;
7330	pt_entry_t *pte, PG_V;
7331	int idx;
7332
7333	idx = 0;
7334	PG_V = pmap_valid_bit(pmap);
7335	PMAP_LOCK(pmap);
7336
7337	pml4 = pmap_pml4e(pmap, va);
7338	ptr[idx++] = *pml4;
7339	if ((*pml4 & PG_V) == 0)
7340		goto done;
7341
7342	pdp = pmap_pml4e_to_pdpe(pml4, va);
7343	ptr[idx++] = *pdp;
7344	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
7345		goto done;
7346
7347	pde = pmap_pdpe_to_pde(pdp, va);
7348	ptr[idx++] = *pde;
7349	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
7350		goto done;
7351
7352	pte = pmap_pde_to_pte(pde, va);
7353	ptr[idx++] = *pte;
7354
7355done:
7356	PMAP_UNLOCK(pmap);
7357	*num = idx;
7358}
7359
/**
 * Get the kernel virtual addresses of a set of physical pages.  For any
 * physical address not covered by the DMAP, create a transient mapping
 * that is removed by a later call to pmap_unmap_io_transient().
 *
 * \param page        The pages for which the caller wishes to obtain
 *                    kernel virtual addresses.
 * \param vaddr       On return, contains the kernel virtual addresses of
 *                    the pages passed in the page parameter.
 * \param count       Number of pages passed in.
 * \param can_fault   TRUE if the thread using the mapped pages can take
 *                    page faults, FALSE otherwise.
 *
 * \returns TRUE if the caller must call pmap_unmap_io_transient() when
 *          finished, FALSE otherwise.
 */
7377boolean_t
7378pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7379    boolean_t can_fault)
7380{
7381	vm_paddr_t paddr;
7382	boolean_t needs_mapping;
7383	pt_entry_t *pte;
7384	int cache_bits, error, i;
7385
	/*
	 * Allocate any KVA space that we need.  This is done in a separate
	 * loop so that vmem_alloc() is not called while the thread is pinned.
	 */
7390	needs_mapping = FALSE;
7391	for (i = 0; i < count; i++) {
7392		paddr = VM_PAGE_TO_PHYS(page[i]);
7393		if (__predict_false(paddr >= dmaplimit)) {
7394			error = vmem_alloc(kernel_arena, PAGE_SIZE,
7395			    M_BESTFIT | M_WAITOK, &vaddr[i]);
7396			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
7397			needs_mapping = TRUE;
7398		} else {
7399			vaddr[i] = PHYS_TO_DMAP(paddr);
7400		}
7401	}
7402
7403	/* Exit early if everything is covered by the DMAP */
7404	if (!needs_mapping)
7405		return (FALSE);
7406
7407	/*
7408	 * NB:  The sequence of updating a page table followed by accesses
7409	 * to the corresponding pages used in the !DMAP case is subject to
7410	 * the situation described in the "AMD64 Architecture Programmer's
7411	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
7412	 * Coherency Considerations".  Therefore, issuing the INVLPG right
7413	 * after modifying the PTE bits is crucial.
7414	 */
7415	if (!can_fault)
7416		sched_pin();
7417	for (i = 0; i < count; i++) {
7418		paddr = VM_PAGE_TO_PHYS(page[i]);
7419		if (paddr >= dmaplimit) {
7420			if (can_fault) {
				/*
				 * Slow path: since page faults can occur
				 * while the mappings are active, do not pin
				 * the thread to the CPU; instead, add a
				 * global mapping visible to all CPUs.
				 */
7427				pmap_qenter(vaddr[i], &page[i], 1);
7428			} else {
7429				pte = vtopte(vaddr[i]);
7430				cache_bits = pmap_cache_bits(kernel_pmap,
7431				    page[i]->md.pat_mode, 0);
7432				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
7433				    cache_bits);
7434				invlpg(vaddr[i]);
7435			}
7436		}
7437	}
7438
7439	return (needs_mapping);
7440}
7441
7442void
7443pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7444    boolean_t can_fault)
7445{
7446	vm_paddr_t paddr;
7447	int i;
7448
7449	if (!can_fault)
7450		sched_unpin();
7451	for (i = 0; i < count; i++) {
7452		paddr = VM_PAGE_TO_PHYS(page[i]);
7453		if (paddr >= dmaplimit) {
7454			if (can_fault)
7455				pmap_qremove(vaddr[i], 1);
7456			vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
7457		}
7458	}
7459}
7460
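/*
 * Example (illustrative sketch, not compiled): the intended calling pattern
 * for pmap_map_io_transient() and pmap_unmap_io_transient().  The helper
 * name "xdrv_copy_pages" is hypothetical; "dst" is assumed to be wired
 * kernel memory, since can_fault is FALSE and the thread is pinned while
 * the mappings are in use.
 */
#if 0
static void
xdrv_copy_pages(vm_page_t pages[], vm_offset_t vaddr[], int npages,
    char *dst)
{
	boolean_t mapped;
	int i;

	mapped = pmap_map_io_transient(pages, vaddr, npages, FALSE);
	for (i = 0; i < npages; i++)
		memcpy(dst + ptoa(i), (void *)vaddr[i], PAGE_SIZE);
	if (mapped)
		pmap_unmap_io_transient(pages, vaddr, npages, FALSE);
}
#endif
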
7461vm_offset_t
7462pmap_quick_enter_page(vm_page_t m)
7463{
7464	vm_paddr_t paddr;
7465
7466	paddr = VM_PAGE_TO_PHYS(m);
7467	if (paddr < dmaplimit)
7468		return (PHYS_TO_DMAP(paddr));
7469	mtx_lock_spin(&qframe_mtx);
7470	KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
7471	pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
7472	    X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
7473	return (qframe);
7474}
7475
7476void
7477pmap_quick_remove_page(vm_offset_t addr)
7478{
7479
7480	if (addr != qframe)
7481		return;
7482	pte_store(vtopte(qframe), 0);
7483	invlpg(qframe);
7484	mtx_unlock_spin(&qframe_mtx);
7485}
7486
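/*
 * Example (illustrative sketch, not compiled): zeroing an arbitrary page
 * through the quick-map interface above.  The helper name is hypothetical.
 * Only one non-DMAP quick mapping can be live at a time, and the spin
 * mutex forbids sleeping while it is held.
 */
#if 0
static void
xdrv_zero_page(vm_page_t m)
{
	vm_offset_t va;

	va = pmap_quick_enter_page(m);
	pagezero((void *)va);
	pmap_quick_remove_page(va);
}
#endif
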
7487#include "opt_ddb.h"
7488#ifdef DDB
7489#include <ddb/ddb.h>
7490
7491DB_SHOW_COMMAND(pte, pmap_print_pte)
7492{
7493	pmap_t pmap;
7494	pml4_entry_t *pml4;
7495	pdp_entry_t *pdp;
7496	pd_entry_t *pde;
7497	pt_entry_t *pte, PG_V;
7498	vm_offset_t va;
7499
7500	if (have_addr) {
7501		va = (vm_offset_t)addr;
7502		pmap = PCPU_GET(curpmap); /* XXX */
7503	} else {
7504		db_printf("show pte addr\n");
7505		return;
7506	}
7507	PG_V = pmap_valid_bit(pmap);
7508	pml4 = pmap_pml4e(pmap, va);
7509	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
7510	if ((*pml4 & PG_V) == 0) {
7511		db_printf("\n");
7512		return;
7513	}
7514	pdp = pmap_pml4e_to_pdpe(pml4, va);
7515	db_printf(" pdpe %#016lx", *pdp);
7516	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
7517		db_printf("\n");
7518		return;
7519	}
7520	pde = pmap_pdpe_to_pde(pdp, va);
7521	db_printf(" pde %#016lx", *pde);
7522	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
7523		db_printf("\n");
7524		return;
7525	}
7526	pte = pmap_pde_to_pte(pde, va);
7527	db_printf(" pte %#016lx\n", *pte);
7528}
7529
7530DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
7531{
7532	vm_paddr_t a;
7533
7534	if (have_addr) {
7535		a = (vm_paddr_t)addr;
7536		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
7537	} else {
7538		db_printf("show phys2dmap addr\n");
7539	}
7540}
7541#endif
7542