/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/pmap.c 328386 2018-01-25 02:45:21Z pkelsey $");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{

	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
}

static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}
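/*
 * Editor's note (illustrative sketch, not part of the original source):
 * callers generally do not use the X86_PG_* or EPT_PG_* constants directly.
 * They fetch the per-pmap-type masks once and then test or set entries
 * through them, e.g.:
 *
 *	pt_entry_t PG_V, PG_RW;
 *
 *	PG_V = pmap_valid_bit(pmap);
 *	PG_RW = pmap_rw_bit(pmap);
 *	if ((*pte & (PG_V | PG_RW)) == (PG_V | PG_RW))
 *		... the entry is valid and writeable for this pmap type ...
 *
 * This keeps the bulk of the pmap code independent of whether it operates
 * on regular x86 page tables, AMD RVI/NPT tables, or Intel EPT tables with
 * or without emulated accessed/dirty bits.
 */
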
static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_G;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

extern struct pcpu __pcpu[];

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
	PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pg_ps_enabled, 0, "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */

static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
static int		ndmpdpphys;	/* number of DMPDPphys pages */

/*
 * pmap_mapdev support pre initialization (i.e. console)
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv_list_locks[]
 * elements, but reads are not.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = 0;
static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */

int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");

static int
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
{
	int i;
	uint64_t res;

	res = 0;
	CPU_FOREACH(i) {
		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
	}
	return (sysctl_handle_64(oidp, &res, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
    "Count of saved TLB context on switch");

static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
static u_long pmap_invl_gen = 0;
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
	.lo_name = "invlts",
};

static bool
pmap_not_in_di(void)
{

	return (curthread->td_md.md_invl_gen.gen == 0);
}

#define	PMAP_ASSERT_NOT_IN_DI() \
    KASSERT(pmap_not_in_di(), ("DI already started"))

/*
 * Start a new Delayed Invalidation (DI) block of code, executed by
 * the current thread.  Within a DI block, the current thread may
 * destroy both the page table and PV list entries for a mapping and
 * then release the corresponding PV list lock before ensuring that
 * the mapping is flushed from the TLBs of any processors with the
 * pmap active.
 */
static void
pmap_delayed_invl_started(void)
{
	struct pmap_invl_gen *invl_gen;
	u_long currgen;

	invl_gen = &curthread->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	mtx_lock(&invl_gen_mtx);
	if (LIST_EMPTY(&pmap_invl_gen_tracker))
		currgen = pmap_invl_gen;
	else
		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
	invl_gen->gen = currgen + 1;
	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
}

/*
 * Finish the DI block, previously started by the current thread.  All
 * required TLB flushes for the pages marked by
 * pmap_delayed_invl_page() must be finished before this function is
 * called.
 *
 * This function works by bumping the global DI generation number to
 * the generation number of the current thread's DI, unless there is a
 * pending DI that started earlier.  In the latter case, bumping the
 * global DI generation number would incorrectly signal that the
 * earlier DI had finished.  Instead, this function bumps the earlier
 * DI's generation number to match the generation number of the
 * current thread's DI.
 */
static void
pmap_delayed_invl_finished(void)
{
	struct pmap_invl_gen *invl_gen, *next;
	struct turnstile *ts;

	invl_gen = &curthread->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
	mtx_lock(&invl_gen_mtx);
	next = LIST_NEXT(invl_gen, link);
	if (next == NULL) {
		turnstile_chain_lock(&invl_gen_ts);
		ts = turnstile_lookup(&invl_gen_ts);
		pmap_invl_gen = invl_gen->gen;
		if (ts != NULL) {
			turnstile_broadcast(ts, TS_SHARED_QUEUE);
			turnstile_unpend(ts, TS_SHARED_LOCK);
		}
		turnstile_chain_unlock(&invl_gen_ts);
	} else {
		next->gen = invl_gen->gen;
	}
	LIST_REMOVE(invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
	invl_gen->gen = 0;
}

#ifdef PV_STATS
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
    "Number of times DI invalidation blocked pmap_remove_all/write");
#endif

static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{

	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
}

/*
 * Ensure that all currently executing DI blocks, that need to flush
 * TLB for the given page m, actually flushed the TLB at the time the
 * function returned.  If the page m has an empty PV list and we call
 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
 * valid mapping for the page m in either its page table or TLB.
 *
 * This function works by blocking until the global DI generation
 * number catches up with the generation number associated with the
 * given page m and its PV list.  Since this function's callers
 * typically own an object lock and sometimes own a page lock, it
 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 * processor.
 */
static void
pmap_delayed_invl_wait(vm_page_t m)
{
	struct turnstile *ts;
	u_long *m_gen;
#ifdef PV_STATS
	bool accounted = false;
#endif

	m_gen = pmap_delayed_invl_genp(m);
	while (*m_gen > pmap_invl_gen) {
#ifdef PV_STATS
		if (!accounted) {
			atomic_add_long(&invl_wait, 1);
			accounted = true;
		}
#endif
		ts = turnstile_trywait(&invl_gen_ts);
		if (*m_gen > pmap_invl_gen)
			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
		else
			turnstile_cancel(ts);
	}
}

/*
 * Mark the page m's PV list as participating in the current thread's
 * DI block.  Any threads concurrently using m's PV list to remove or
 * restrict all mappings to m will wait for the current thread's DI
 * block to complete before proceeding.
 *
 * The function works by setting the DI generation number for m's PV
 * list to at least the DI generation number of the current thread.
 * This forces a caller of pmap_delayed_invl_wait() to block until
 * current thread calls pmap_delayed_invl_finished().
 */
static void
pmap_delayed_invl_page(vm_page_t m)
{
	u_long gen, *m_gen;

	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
	gen = curthread->td_md.md_invl_gen.gen;
	if (gen == 0)
		return;
	m_gen = pmap_delayed_invl_genp(m);
	if (*m_gen < gen)
		*m_gen = gen;
}

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_pq(uint64_t *map);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
		    u_int flags, struct rwlock **lockp);
#if VM_NRESERVLEVEL > 0
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
#endif
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    vm_prot_t prot, struct rwlock **lockp);
static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
		    pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
#if VM_NRESERVLEVEL > 0
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pd_entry_t *pde, struct spglist *free,
    struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = roundup2(addr, NBPDR);
	return (newaddr);
}

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}


/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	u_int64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

CTASSERT(powerof2(NDMPML4E));

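/*
 * Editor's note (illustrative sketch, not part of the original source):
 * vtopte() and vtopde() above rely on the recursive PML4 slot (PML4PML4I)
 * that create_pagetables() below installs.  Because the page-table pages
 * are reachable through that self-mapping at PTmap/PDmap, the PTE for a
 * kernel virtual address can be located by pure index arithmetic, e.g.:
 *
 *	pt_entry_t *pte;
 *
 *	pte = vtopte(va);	       (va must be >= VM_MAXUSER_ADDRESS)
 *	if ((*pte & X86_PG_V) != 0)
 *		... *pte & PG_FRAME is the physical page backing va ...
 *
 * User and guest-physical addresses must instead go through pmap_pte(),
 * which walks the tables via the direct map and honors the pmap type.
 */
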
/* number of kernel PDP slots */
#define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)

static void
nkpt_init(vm_paddr_t addr)
{
	int pt_pages;

#ifdef NKPT
	pt_pages = NKPT;
#else
	pt_pages = howmany(addr, 1 << PDRSHIFT);
	pt_pages += NKPDPE(pt_pages);

	/*
	 * Add some slop beyond the bare minimum required for bootstrapping
	 * the kernel.
	 *
	 * This is quite important when allocating KVA for kernel modules.
	 * The modules are required to be linked in the negative 2GB of
	 * the address space.  If we run out of KVA in this region then
	 * pmap_growkernel() will need to allocate page table pages to map
	 * the entire 512GB of KVA space which is an unnecessary tax on
	 * physical memory.
	 *
	 * Secondly, device memory mapped as part of setting up the low-
	 * level console(s) is taken from KVA, starting at virtual_avail.
	 * This is because cninit() is called after pmap_bootstrap() but
	 * before vm_init() and pmap_init(). 20MB for a frame buffer is
	 * not uncommon.
	 */
	pt_pages += 32;		/* 64MB additional slop. */
#endif
	nkpt = pt_pages;
}

static void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i, j, ndm1g, nkpdpe;
	pt_entry_t *pt_p;
	pd_entry_t *pd_p;
	pdp_entry_t *pdp_p;
	pml4_entry_t *p4_p;

	/* Allocate page table pages for the direct map */
	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
	if (ndmpdpphys > NDMPML4E) {
		/*
		 * Each NDMPML4E allows 512 GB, so limit to that,
		 * and then readjust ndmpdp and ndmpdpphys.
		 */
		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
		Maxmem = atop(NDMPML4E * NBPML4);
		ndmpdpphys = NDMPML4E;
		ndmpdp = NDMPML4E * NPDEPG;
	}
	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
	ndm1g = 0;
	if ((amd_feature & AMDID_PAGE1GB) != 0)
		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
	if (ndm1g < ndmpdp)
		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Allocate pages */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);

	/*
	 * Allocate the initial number of kernel page table pages required to
	 * bootstrap.  We defer this until after all memory-size dependent
	 * allocations are done (e.g. direct map), so that we don't have to
	 * build in too much slop in our estimate.
	 *
	 * Note that when NKPML4E > 1, we have an empty page underneath
	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
	 */
	nkpt_init(*firstaddr);
	nkpdpe = NKPDPE(nkpt);

	KPTphys = allocpages(firstaddr, nkpt);
	KPDphys = allocpages(firstaddr, nkpdpe);

	/* Fill in the underlying page table pages */
	/* Nominally read-only (but really R/W) from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	pt_p = (pt_entry_t *)KPTphys;
	for (i = 0; ptoa(i) < *firstaddr; i++)
		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;

	/* Now map the page tables at their location within PTmap */
	pd_p = (pd_entry_t *)KPDphys;
	for (i = 0; i < nkpt; i++)
		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
		    X86_PG_G;

	/* And connect up the PD to the PDP (leaving room for L4 pages) */
	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
	for (i = 0; i < nkpdpe; i++)
		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
		    PG_U;

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	pd_p = (pd_entry_t *)DMPDphys;
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	pdp_p = (pdp_entry_t *)DMPDPphys;
	for (i = 0; i < ndm1g; i++) {
		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	for (j = 0; i < ndmpdp; i++, j++) {
		pdp_p[i] = DMPDphys + ptoa(j);
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	p4_p = (pml4_entry_t *)KPML4phys;
	p4_p[PML4PML4I] = KPML4phys;
	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;

	/* Connect the Direct Map slot(s) up to the PML4. */
	for (i = 0; i < ndmpdpphys; i++) {
		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* Connect the KVA slots up to the PML4 */
	for (i = 0; i < NKPML4E; i++) {
		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On amd64 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	int i;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
	virtual_avail = pmap_kmem_choose(virtual_avail);

	virtual_end = VM_MAX_KERNEL_ADDRESS;


	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE);
	load_cr3(KPML4phys);
	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
		load_cr4(rcr4() | CR4_SMEP);

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
	kernel_pmap->pm_cr3 = KPML4phys;
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	kernel_pmap->pm_flags = pmap_flags;

	/*
	 * Initialize the TLB invalidations generation number lock.
	 */
	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * Crashdump maps.  The first page is reused as CMAP1 for the
	 * memory test.
	 */
	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
	CADDR1 = crashdumpmap;

	virtual_avail = va;

	/*
	 * Initialize the PAT MSR.
	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
	 * side-effect, invalidates stale PG_G TLB entries that might
	 * have been created in our pre-boot environment.
	 */
	pmap_init_pat();

	/* Initialize TLB Context Id. */
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		/* Check for INVPCID support */
		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
		    != 0;
		for (i = 0; i < MAXCPU; i++) {
			kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
			kernel_pmap->pm_pcids[i].pm_gen = 1;
		}
		__pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1;
		__pcpu[0].pc_pcid_gen = 1;
		/*
		 * pcpu area for APs is zeroed during AP startup.
		 * pc_pcid_next and pc_pcid_gen are initialized by AP
		 * during pcpu setup.
		 */
		load_cr4(rcr4() | CR4_PCIDE);
	} else {
		pmap_pcid_enabled = 0;
	}
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0)
		panic("no PAT??");

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t mpte;
	vm_size_t s;
	int error, i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < nkpt; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	pmap_initialized = 1;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		/* Make the direct map consistent */
		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
			    ppim->sz, ppim->mode);
		}
		if (!bootverbose)
			continue;
		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
	}

	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
	    (vmem_addr_t *)&qframe);
	if (error != 0)
		panic("qframe allocation failed");
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
    "1GB page mapping counters");

static u_long pmap_pdpe_demotions;
SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, 0, "1GB page demotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

static pt_entry_t
pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
{
	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* Verify that both PAT bits are not set at the same time */
		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
		    ("Invalid PAT bits in entry %#lx", entry));

		/* Swap the PAT bits if one of them is set */
		if ((entry & x86_pat_bits) != 0)
			entry ^= x86_pat_bits;
		break;
	case PT_EPT:
		/*
		 * Nothing to do - the memory attributes are represented
		 * the same way for regular pages and superpages.
		 */
		break;
	default:
		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
	}

	return (entry);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

		/* Map the caching mode to a PAT index. */
		pat_idx = pat_index[mode];

		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
		cache_bits = 0;
		if (pat_idx & 0x4)
			cache_bits |= pat_flag;
		if (pat_idx & 0x2)
			cache_bits |= PG_NC_PCD;
		if (pat_idx & 0x1)
			cache_bits |= PG_NC_PWT;
		break;

	case PT_EPT:
		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
		break;

	default:
		panic("unsupported pmap type %d", pmap->pm_type);
	}

	return (cache_bits);
}

static int
pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
{
	int mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
		break;
	case PT_EPT:
		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
		break;
	default:
		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

bool
pmap_ps_enabled(pmap_t pmap)
{

	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
}

static void
pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
{

	switch (pmap->pm_type) {
	case PT_X86:
		break;
	case PT_RVI:
	case PT_EPT:
		/*
		 * XXX
		 * This is a little bogus since the generation number is
		 * supposed to be bumped up when a region of the address
		 * space is invalidated in the page tables.
		 *
		 * In this case the old PDE entry is valid but yet we want
		 * to make sure that any mappings using the old entry are
		 * invalidated in the TLB.
		 *
		 * The reason this works as expected is because we rendezvous
		 * "all" host cpus and force any vcpu context to exit as a
		 * side-effect.
		 */
		atomic_add_acq_long(&pmap->pm_eptgen, 1);
		break;
	default:
		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
	}
	pde_store(pde, newpde);
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
	pt_entry_t PG_G;

	if (pmap_type_guest(pmap))
		return;

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));

	PG_G = pmap_global_bit(pmap);

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		invltlb_glob();
	}
}
#ifdef SMP

/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */

/*
 * Interrupt the cpus that are executing in the guest context.
 * This will force the vcpu to exit and the cached EPT mappings
 * will be invalidated by the host before the next vmresume.
 */
static __inline void
pmap_invalidate_ept(pmap_t pmap)
{
	int ipinum;

	sched_pin();
	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("pmap_invalidate_ept: absurd pm_active"));

	/*
	 * The TLB mappings associated with a vcpu context are not
	 * flushed each time a different vcpu is chosen to execute.
	 *
	 * This is in contrast with a process's vtop mappings that
	 * are flushed from the TLB on each context switch.
	 *
	 * Therefore we need to do more than just a TLB shootdown on
	 * the active cpus in 'pmap->pm_active'. To do this we keep
	 * track of the number of invalidations performed on this pmap.
	 *
	 * Each vcpu keeps a cache of this counter and compares it
	 * just before a vmresume. If the counter is out-of-date an
	 * invept will be done to flush stale mappings from the TLB.
	 */
	atomic_add_acq_long(&pmap->pm_eptgen, 1);

	/*
	 * Force the vcpu to exit and trap back into the hypervisor.
	 */
	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
	ipi_selected(pmap->pm_active, ipinum);
	sched_unpin();
}

void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t *mask;
	u_int cpuid, i;

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));

	sched_pin();
	if (pmap == kernel_pmap) {
		invlpg(va);
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		if (pmap == PCPU_GET(curpmap))
			invlpg(va);
		else if (pmap_pcid_enabled)
			pmap->pm_pcids[cpuid].pm_gen = 0;
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invlpg(*mask, va);
	sched_unpin();
}

/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t *mask;
	vm_offset_t addr;
	u_int cpuid, i;

	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
		pmap_invalidate_all(pmap);
		return;
	}

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	if (pmap == kernel_pmap) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		mask = &all_cpus;
	} else {
		if (pmap == PCPU_GET(curpmap)) {
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		} else if (pmap_pcid_enabled) {
			pmap->pm_pcids[cpuid].pm_gen = 0;
		}
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invlpg_range(*mask, sva, eva);
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t *mask;
	struct invpcid_descr d;
	u_int cpuid, i;

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));

	sched_pin();
	if (pmap == kernel_pmap) {
		if (pmap_pcid_enabled && invpcid_works) {
			bzero(&d, sizeof(d));
			invpcid(&d, INVPCID_CTXGLOB);
		} else {
			invltlb_glob();
		}
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		if (pmap == PCPU_GET(curpmap)) {
			if (pmap_pcid_enabled) {
				if (invpcid_works) {
					d.pcid = pmap->pm_pcids[cpuid].pm_pcid;
					d.pad = 0;
					d.addr = 0;
					invpcid(&d, INVPCID_CTX);
				} else {
					load_cr3(pmap->pm_cr3 | pmap->pm_pcids
					    [PCPU_GET(cpuid)].pm_pcid);
				}
			} else {
				invltlb();
			}
		} else if (pmap_pcid_enabled) {
			pmap->pm_pcids[cpuid].pm_gen = 0;
		}
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invltlb(*mask, pmap);
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

struct pde_action {
1697 cpuset_t invalidate; /* processors that invalidate their TLB */ 1698 pmap_t pmap; 1699 vm_offset_t va; 1700 pd_entry_t *pde; 1701 pd_entry_t newpde; 1702 u_int store; /* processor that updates the PDE */ 1703}; 1704 1705static void 1706pmap_update_pde_action(void *arg) 1707{ 1708 struct pde_action *act = arg; 1709 1710 if (act->store == PCPU_GET(cpuid)) 1711 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 1712} 1713 1714static void 1715pmap_update_pde_teardown(void *arg) 1716{ 1717 struct pde_action *act = arg; 1718 1719 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1720 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 1721} 1722 1723/* 1724 * Change the page size for the specified virtual address in a way that 1725 * prevents any possibility of the TLB ever having two entries that map the 1726 * same virtual address using different page sizes. This is the recommended 1727 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1728 * machine check exception for a TLB state that is improperly diagnosed as a 1729 * hardware error. 1730 */ 1731static void 1732pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1733{ 1734 struct pde_action act; 1735 cpuset_t active, other_cpus; 1736 u_int cpuid; 1737 1738 sched_pin(); 1739 cpuid = PCPU_GET(cpuid); 1740 other_cpus = all_cpus; 1741 CPU_CLR(cpuid, &other_cpus); 1742 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 1743 active = all_cpus; 1744 else { 1745 active = pmap->pm_active; 1746 } 1747 if (CPU_OVERLAP(&active, &other_cpus)) { 1748 act.store = cpuid; 1749 act.invalidate = active; 1750 act.va = va; 1751 act.pmap = pmap; 1752 act.pde = pde; 1753 act.newpde = newpde; 1754 CPU_SET(cpuid, &active); 1755 smp_rendezvous_cpus(active, 1756 smp_no_rendezvous_barrier, pmap_update_pde_action, 1757 pmap_update_pde_teardown, &act); 1758 } else { 1759 pmap_update_pde_store(pmap, pde, newpde); 1760 if (CPU_ISSET(cpuid, &active)) 1761 pmap_update_pde_invalidate(pmap, va, newpde); 1762 } 1763 sched_unpin(); 1764} 1765#else /* !SMP */ 1766/* 1767 * Normal, non-SMP, invalidation functions. 
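 *
 * These reduce to a direct INVLPG/INVLTLB/INVPCID (or, for guest page
 * tables, a pm_eptgen bump) on the single processor, with the PCID
 * generation bookkeeping limited to pm_pcids[0].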
1768 */ 1769void 1770pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1771{ 1772 1773 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 1774 pmap->pm_eptgen++; 1775 return; 1776 } 1777 KASSERT(pmap->pm_type == PT_X86, 1778 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 1779 1780 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 1781 invlpg(va); 1782 else if (pmap_pcid_enabled) 1783 pmap->pm_pcids[0].pm_gen = 0; 1784} 1785 1786void 1787pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1788{ 1789 vm_offset_t addr; 1790 1791 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 1792 pmap->pm_eptgen++; 1793 return; 1794 } 1795 KASSERT(pmap->pm_type == PT_X86, 1796 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 1797 1798 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 1799 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1800 invlpg(addr); 1801 } else if (pmap_pcid_enabled) { 1802 pmap->pm_pcids[0].pm_gen = 0; 1803 } 1804} 1805 1806void 1807pmap_invalidate_all(pmap_t pmap) 1808{ 1809 struct invpcid_descr d; 1810 1811 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 1812 pmap->pm_eptgen++; 1813 return; 1814 } 1815 KASSERT(pmap->pm_type == PT_X86, 1816 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 1817 1818 if (pmap == kernel_pmap) { 1819 if (pmap_pcid_enabled && invpcid_works) { 1820 bzero(&d, sizeof(d)); 1821 invpcid(&d, INVPCID_CTXGLOB); 1822 } else { 1823 invltlb_glob(); 1824 } 1825 } else if (pmap == PCPU_GET(curpmap)) { 1826 if (pmap_pcid_enabled) { 1827 if (invpcid_works) { 1828 d.pcid = pmap->pm_pcids[0].pm_pcid; 1829 d.pad = 0; 1830 d.addr = 0; 1831 invpcid(&d, INVPCID_CTX); 1832 } else { 1833 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0]. 1834 pm_pcid); 1835 } 1836 } else { 1837 invltlb(); 1838 } 1839 } else if (pmap_pcid_enabled) { 1840 pmap->pm_pcids[0].pm_gen = 0; 1841 } 1842} 1843 1844PMAP_INLINE void 1845pmap_invalidate_cache(void) 1846{ 1847 1848 wbinvd(); 1849} 1850 1851static void 1852pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1853{ 1854 1855 pmap_update_pde_store(pmap, pde, newpde); 1856 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 1857 pmap_update_pde_invalidate(pmap, va, newpde); 1858 else 1859 pmap->pm_pcids[0].pm_gen = 0; 1860} 1861#endif /* !SMP */ 1862 1863static void 1864pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 1865{ 1866 1867 /* 1868 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 1869 * by a promotion that did not invalidate the 512 4KB page mappings 1870 * that might exist in the TLB. Consequently, at this point, the TLB 1871 * may hold both 4KB and 2MB page mappings for the address range [va, 1872 * va + NBPDR). Therefore, the entire range must be invalidated here. 1873 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 1874 * 4KB page mappings for the address range [va, va + NBPDR), and so a 1875 * single INVLPG suffices to invalidate the 2MB page mapping from the 1876 * TLB. 
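 *
 * Cost note, derived from pmap_invalidate_range() above: a 2MB range is
 * well below PMAP_INVLPG_THRESHOLD (16MB), so the PG_PROMOTED case
 * issues NBPDR / PAGE_SIZE = 512 individual page invalidations, whereas
 * the non-promoted case needs only a single INVLPG.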
1877 */ 1878 if ((pde & PG_PROMOTED) != 0) 1879 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 1880 else 1881 pmap_invalidate_page(pmap, va); 1882} 1883 1884#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1885 1886void 1887pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) 1888{ 1889 1890 if (force) { 1891 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 1892 } else { 1893 KASSERT((sva & PAGE_MASK) == 0, 1894 ("pmap_invalidate_cache_range: sva not page-aligned")); 1895 KASSERT((eva & PAGE_MASK) == 0, 1896 ("pmap_invalidate_cache_range: eva not page-aligned")); 1897 } 1898 1899 if ((cpu_feature & CPUID_SS) != 0 && !force) 1900 ; /* If "Self Snoop" is supported and allowed, do nothing. */ 1901 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && 1902 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1903 /* 1904 * XXX: Some CPUs fault, hang, or trash the local APIC 1905 * registers if we use CLFLUSH on the local APIC 1906 * range. The local APIC is always uncached, so we 1907 * don't need to flush for that range anyway. 1908 */ 1909 if (pmap_kextract(sva) == lapic_paddr) 1910 return; 1911 1912 /* 1913 * Otherwise, do per-cache line flush. Use the sfence 1914 * instruction to insure that previous stores are 1915 * included in the write-back. The processor 1916 * propagates flush to other processors in the cache 1917 * coherence domain. 1918 */ 1919 sfence(); 1920 for (; sva < eva; sva += cpu_clflush_line_size) 1921 clflushopt(sva); 1922 sfence(); 1923 } else if ((cpu_feature & CPUID_CLFSH) != 0 && 1924 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1925 if (pmap_kextract(sva) == lapic_paddr) 1926 return; 1927 /* 1928 * Writes are ordered by CLFLUSH on Intel CPUs. 1929 */ 1930 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1931 mfence(); 1932 for (; sva < eva; sva += cpu_clflush_line_size) 1933 clflush(sva); 1934 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1935 mfence(); 1936 } else { 1937 1938 /* 1939 * No targeted cache flush methods are supported by CPU, 1940 * or the supplied range is bigger than 2MB. 1941 * Globally invalidate cache. 1942 */ 1943 pmap_invalidate_cache(); 1944 } 1945} 1946 1947/* 1948 * Remove the specified set of pages from the data and instruction caches. 1949 * 1950 * In contrast to pmap_invalidate_cache_range(), this function does not 1951 * rely on the CPU's self-snoop feature, because it is intended for use 1952 * when moving pages into a different cache domain. 1953 */ 1954void 1955pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1956{ 1957 vm_offset_t daddr, eva; 1958 int i; 1959 bool useclflushopt; 1960 1961 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 1962 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1963 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 1964 pmap_invalidate_cache(); 1965 else { 1966 if (useclflushopt) 1967 sfence(); 1968 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 1969 mfence(); 1970 for (i = 0; i < count; i++) { 1971 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1972 eva = daddr + PAGE_SIZE; 1973 for (; daddr < eva; daddr += cpu_clflush_line_size) { 1974 if (useclflushopt) 1975 clflushopt(daddr); 1976 else 1977 clflush(daddr); 1978 } 1979 } 1980 if (useclflushopt) 1981 sfence(); 1982 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 1983 mfence(); 1984 } 1985} 1986 1987/* 1988 * Routine: pmap_extract 1989 * Function: 1990 * Extract the physical page address associated 1991 * with the given map/virtual_address pair. 
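 *
 * A minimal usage sketch (illustrative, not a verbatim excerpt from a
 * caller):
 *
 *	pa = pmap_extract(kernel_pmap, va);
 *	if (pa == 0)
 *		(there is no valid mapping for "va")
 *
 * A return value of zero means "no mapping"; callers that must also
 * prevent the page from being reused should use pmap_extract_and_hold()
 * below instead.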
1992 */ 1993vm_paddr_t 1994pmap_extract(pmap_t pmap, vm_offset_t va) 1995{ 1996 pdp_entry_t *pdpe; 1997 pd_entry_t *pde; 1998 pt_entry_t *pte, PG_V; 1999 vm_paddr_t pa; 2000 2001 pa = 0; 2002 PG_V = pmap_valid_bit(pmap); 2003 PMAP_LOCK(pmap); 2004 pdpe = pmap_pdpe(pmap, va); 2005 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2006 if ((*pdpe & PG_PS) != 0) 2007 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 2008 else { 2009 pde = pmap_pdpe_to_pde(pdpe, va); 2010 if ((*pde & PG_V) != 0) { 2011 if ((*pde & PG_PS) != 0) { 2012 pa = (*pde & PG_PS_FRAME) | 2013 (va & PDRMASK); 2014 } else { 2015 pte = pmap_pde_to_pte(pde, va); 2016 pa = (*pte & PG_FRAME) | 2017 (va & PAGE_MASK); 2018 } 2019 } 2020 } 2021 } 2022 PMAP_UNLOCK(pmap); 2023 return (pa); 2024} 2025 2026/* 2027 * Routine: pmap_extract_and_hold 2028 * Function: 2029 * Atomically extract and hold the physical page 2030 * with the given pmap and virtual address pair 2031 * if that mapping permits the given protection. 2032 */ 2033vm_page_t 2034pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 2035{ 2036 pd_entry_t pde, *pdep; 2037 pt_entry_t pte, PG_RW, PG_V; 2038 vm_paddr_t pa; 2039 vm_page_t m; 2040 2041 pa = 0; 2042 m = NULL; 2043 PG_RW = pmap_rw_bit(pmap); 2044 PG_V = pmap_valid_bit(pmap); 2045 PMAP_LOCK(pmap); 2046retry: 2047 pdep = pmap_pde(pmap, va); 2048 if (pdep != NULL && (pde = *pdep)) { 2049 if (pde & PG_PS) { 2050 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 2051 if (vm_page_pa_tryrelock(pmap, (pde & 2052 PG_PS_FRAME) | (va & PDRMASK), &pa)) 2053 goto retry; 2054 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 2055 (va & PDRMASK)); 2056 vm_page_hold(m); 2057 } 2058 } else { 2059 pte = *pmap_pde_to_pte(pdep, va); 2060 if ((pte & PG_V) && 2061 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 2062 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 2063 &pa)) 2064 goto retry; 2065 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 2066 vm_page_hold(m); 2067 } 2068 } 2069 } 2070 PA_UNLOCK_COND(pa); 2071 PMAP_UNLOCK(pmap); 2072 return (m); 2073} 2074 2075vm_paddr_t 2076pmap_kextract(vm_offset_t va) 2077{ 2078 pd_entry_t pde; 2079 vm_paddr_t pa; 2080 2081 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 2082 pa = DMAP_TO_PHYS(va); 2083 } else { 2084 pde = *vtopde(va); 2085 if (pde & PG_PS) { 2086 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 2087 } else { 2088 /* 2089 * Beware of a concurrent promotion that changes the 2090 * PDE at this point! For example, vtopte() must not 2091 * be used to access the PTE because it would use the 2092 * new PDE. It is, however, safe to use the old PDE 2093 * because the page table page is preserved by the 2094 * promotion. 2095 */ 2096 pa = *pmap_pde_to_pte(&pde, va); 2097 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 2098 } 2099 } 2100 return (pa); 2101} 2102 2103/*************************************************** 2104 * Low level mapping routines..... 2105 ***************************************************/ 2106 2107/* 2108 * Add a wired page to the kva. 2109 * Note: not SMP coherent. 
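 *
 * A minimal usage sketch (illustrative only; "scratch_va" stands for a
 * caller-provided piece of KVA and is not defined in this file):
 *
 *	pmap_kenter(scratch_va, pa);
 *	(access the page through scratch_va)
 *	pmap_kremove(scratch_va);
 *	pmap_invalidate_page(kernel_pmap, scratch_va);
 *
 * Because neither routine is SMP coherent, the caller is responsible
 * for TLB invalidation, as in the final step above.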
2110 */ 2111PMAP_INLINE void 2112pmap_kenter(vm_offset_t va, vm_paddr_t pa) 2113{ 2114 pt_entry_t *pte; 2115 2116 pte = vtopte(va); 2117 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); 2118} 2119 2120static __inline void 2121pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 2122{ 2123 pt_entry_t *pte; 2124 int cache_bits; 2125 2126 pte = vtopte(va); 2127 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 2128 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); 2129} 2130 2131/* 2132 * Remove a page from the kernel pagetables. 2133 * Note: not SMP coherent. 2134 */ 2135PMAP_INLINE void 2136pmap_kremove(vm_offset_t va) 2137{ 2138 pt_entry_t *pte; 2139 2140 pte = vtopte(va); 2141 pte_clear(pte); 2142} 2143 2144/* 2145 * Used to map a range of physical addresses into kernel 2146 * virtual address space. 2147 * 2148 * The value passed in '*virt' is a suggested virtual address for 2149 * the mapping. Architectures which can support a direct-mapped 2150 * physical to virtual region can return the appropriate address 2151 * within that region, leaving '*virt' unchanged. Other 2152 * architectures should map the pages starting at '*virt' and 2153 * update '*virt' with the first usable address after the mapped 2154 * region. 2155 */ 2156vm_offset_t 2157pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2158{ 2159 return PHYS_TO_DMAP(start); 2160} 2161 2162 2163/* 2164 * Add a list of wired pages to the kva 2165 * this routine is only used for temporary 2166 * kernel mappings that do not need to have 2167 * page modification or references recorded. 2168 * Note that old mappings are simply written 2169 * over. The page *must* be wired. 2170 * Note: SMP coherent. Uses a ranged shootdown IPI. 2171 */ 2172void 2173pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2174{ 2175 pt_entry_t *endpte, oldpte, pa, *pte; 2176 vm_page_t m; 2177 int cache_bits; 2178 2179 oldpte = 0; 2180 pte = vtopte(sva); 2181 endpte = pte + count; 2182 while (pte < endpte) { 2183 m = *ma++; 2184 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 2185 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 2186 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 2187 oldpte |= *pte; 2188 pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); 2189 } 2190 pte++; 2191 } 2192 if (__predict_false((oldpte & X86_PG_V) != 0)) 2193 pmap_invalidate_range(kernel_pmap, sva, sva + count * 2194 PAGE_SIZE); 2195} 2196 2197/* 2198 * This routine tears out page mappings from the 2199 * kernel -- it is meant only for temporary mappings. 2200 * Note: SMP coherent. Uses a ranged shootdown IPI. 2201 */ 2202void 2203pmap_qremove(vm_offset_t sva, int count) 2204{ 2205 vm_offset_t va; 2206 2207 va = sva; 2208 while (count-- > 0) { 2209 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 2210 pmap_kremove(va); 2211 va += PAGE_SIZE; 2212 } 2213 pmap_invalidate_range(kernel_pmap, sva, va); 2214} 2215 2216/*************************************************** 2217 * Page table page management routines..... 2218 ***************************************************/ 2219static __inline void 2220pmap_free_zero_pages(struct spglist *free) 2221{ 2222 vm_page_t m; 2223 int count; 2224 2225 for (count = 0; (m = SLIST_FIRST(free)) != NULL; count++) { 2226 SLIST_REMOVE_HEAD(free, plinks.s.ss); 2227 /* Preserve the page's PG_ZERO setting. */ 2228 vm_page_free_toq(m); 2229 } 2230 atomic_subtract_int(&vm_cnt.v_wire_count, count); 2231} 2232 2233/* 2234 * Schedule the specified unused page table page to be freed. 
Specifically, 2235 * add the page to the specified list of pages that will be released to the 2236 * physical memory manager after the TLB has been updated. 2237 */ 2238static __inline void 2239pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 2240 boolean_t set_PG_ZERO) 2241{ 2242 2243 if (set_PG_ZERO) 2244 m->flags |= PG_ZERO; 2245 else 2246 m->flags &= ~PG_ZERO; 2247 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2248} 2249 2250/* 2251 * Inserts the specified page table page into the specified pmap's collection 2252 * of idle page table pages. Each of a pmap's page table pages is responsible 2253 * for mapping a distinct range of virtual addresses. The pmap's collection is 2254 * ordered by this virtual address range. 2255 */ 2256static __inline int 2257pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 2258{ 2259 2260 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2261 return (vm_radix_insert(&pmap->pm_root, mpte)); 2262} 2263 2264/* 2265 * Removes the page table page mapping the specified virtual address from the 2266 * specified pmap's collection of idle page table pages, and returns it. 2267 * Otherwise, returns NULL if there is no page table page corresponding to the 2268 * specified virtual address. 2269 */ 2270static __inline vm_page_t 2271pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 2272{ 2273 2274 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2275 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 2276} 2277 2278/* 2279 * Decrements a page table page's wire count, which is used to record the 2280 * number of valid page table entries within the page. If the wire count 2281 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2282 * page table page was unmapped and FALSE otherwise. 2283 */ 2284static inline boolean_t 2285pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2286{ 2287 2288 --m->wire_count; 2289 if (m->wire_count == 0) { 2290 _pmap_unwire_ptp(pmap, va, m, free); 2291 return (TRUE); 2292 } else 2293 return (FALSE); 2294} 2295 2296static void 2297_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2298{ 2299 2300 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2301 /* 2302 * unmap the page table page 2303 */ 2304 if (m->pindex >= (NUPDE + NUPDPE)) { 2305 /* PDP page */ 2306 pml4_entry_t *pml4; 2307 pml4 = pmap_pml4e(pmap, va); 2308 *pml4 = 0; 2309 } else if (m->pindex >= NUPDE) { 2310 /* PD page */ 2311 pdp_entry_t *pdp; 2312 pdp = pmap_pdpe(pmap, va); 2313 *pdp = 0; 2314 } else { 2315 /* PTE page */ 2316 pd_entry_t *pd; 2317 pd = pmap_pde(pmap, va); 2318 *pd = 0; 2319 } 2320 pmap_resident_count_dec(pmap, 1); 2321 if (m->pindex < NUPDE) { 2322 /* We just released a PT, unhold the matching PD */ 2323 vm_page_t pdpg; 2324 2325 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 2326 pmap_unwire_ptp(pmap, va, pdpg, free); 2327 } 2328 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 2329 /* We just released a PD, unhold the matching PDP */ 2330 vm_page_t pdppg; 2331 2332 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 2333 pmap_unwire_ptp(pmap, va, pdppg, free); 2334 } 2335 2336 /* 2337 * Put page on a list so that it is released after 2338 * *ALL* TLB shootdown is done 2339 */ 2340 pmap_add_delayed_free_list(m, free, TRUE); 2341} 2342 2343/* 2344 * After removing a page table entry, this routine is used to 2345 * conditionally free the page, and manage the hold/wire counts. 
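 *
 * Typical caller pattern, sketched from the removal paths elsewhere in
 * this file (not a verbatim excerpt):
 *
 *	SLIST_INIT(&free);
 *	tpte = pte_load_clear(pte);
 *	pmap_unuse_pt(pmap, va, ptepde, &free);
 *	(perform the TLB shootdown)
 *	pmap_free_zero_pages(&free);
 *
 * Dropping the last reference on a page table page cascades upward:
 * _pmap_unwire_ptp() unmaps it and then releases a reference on the
 * parent PD page, and possibly on the PDP page above that.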
2346 */ 2347static int 2348pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2349 struct spglist *free) 2350{ 2351 vm_page_t mpte; 2352 2353 if (va >= VM_MAXUSER_ADDRESS) 2354 return (0); 2355 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2356 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 2357 return (pmap_unwire_ptp(pmap, va, mpte, free)); 2358} 2359 2360void 2361pmap_pinit0(pmap_t pmap) 2362{ 2363 int i; 2364 2365 PMAP_LOCK_INIT(pmap); 2366 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 2367 pmap->pm_cr3 = KPML4phys; 2368 pmap->pm_root.rt_root = 0; 2369 CPU_ZERO(&pmap->pm_active); 2370 TAILQ_INIT(&pmap->pm_pvchunk); 2371 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2372 pmap->pm_flags = pmap_flags; 2373 CPU_FOREACH(i) { 2374 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 2375 pmap->pm_pcids[i].pm_gen = 0; 2376 } 2377 PCPU_SET(curpmap, kernel_pmap); 2378 pmap_activate(curthread); 2379 CPU_FILL(&kernel_pmap->pm_active); 2380} 2381 2382void 2383pmap_pinit_pml4(vm_page_t pml4pg) 2384{ 2385 pml4_entry_t *pm_pml4; 2386 int i; 2387 2388 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 2389 2390 /* Wire in kernel global address entries. */ 2391 for (i = 0; i < NKPML4E; i++) { 2392 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 2393 X86_PG_V | PG_U; 2394 } 2395 for (i = 0; i < ndmpdpphys; i++) { 2396 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 2397 X86_PG_V | PG_U; 2398 } 2399 2400 /* install self-referential address mapping entry(s) */ 2401 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 2402 X86_PG_A | X86_PG_M; 2403} 2404 2405/* 2406 * Initialize a preallocated and zeroed pmap structure, 2407 * such as one in a vmspace structure. 2408 */ 2409int 2410pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 2411{ 2412 vm_page_t pml4pg; 2413 vm_paddr_t pml4phys; 2414 int i; 2415 2416 /* 2417 * allocate the page directory page 2418 */ 2419 pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2420 VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); 2421 2422 pml4phys = VM_PAGE_TO_PHYS(pml4pg); 2423 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); 2424 CPU_FOREACH(i) { 2425 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 2426 pmap->pm_pcids[i].pm_gen = 0; 2427 } 2428 pmap->pm_cr3 = ~0; /* initialize to an invalid value */ 2429 2430 if ((pml4pg->flags & PG_ZERO) == 0) 2431 pagezero(pmap->pm_pml4); 2432 2433 /* 2434 * Do not install the host kernel mappings in the nested page 2435 * tables. These mappings are meaningless in the guest physical 2436 * address space. 2437 */ 2438 if ((pmap->pm_type = pm_type) == PT_X86) { 2439 pmap->pm_cr3 = pml4phys; 2440 pmap_pinit_pml4(pml4pg); 2441 } 2442 2443 pmap->pm_root.rt_root = 0; 2444 CPU_ZERO(&pmap->pm_active); 2445 TAILQ_INIT(&pmap->pm_pvchunk); 2446 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2447 pmap->pm_flags = flags; 2448 pmap->pm_eptgen = 0; 2449 2450 return (1); 2451} 2452 2453int 2454pmap_pinit(pmap_t pmap) 2455{ 2456 2457 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 2458} 2459 2460/* 2461 * This routine is called if the desired page table page does not exist. 2462 * 2463 * If page table page allocation fails, this routine may sleep before 2464 * returning NULL. It sleeps only if a lock pointer was given. 2465 * 2466 * Note: If a page allocation fails at page table level two or three, 2467 * one or two pages may be held during the wait, only to be released 2468 * afterwards. 
This conservative approach is easily argued to avoid 2469 * race conditions. 2470 */ 2471static vm_page_t 2472_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2473{ 2474 vm_page_t m, pdppg, pdpg; 2475 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 2476 2477 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2478 2479 PG_A = pmap_accessed_bit(pmap); 2480 PG_M = pmap_modified_bit(pmap); 2481 PG_V = pmap_valid_bit(pmap); 2482 PG_RW = pmap_rw_bit(pmap); 2483 2484 /* 2485 * Allocate a page table page. 2486 */ 2487 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 2488 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2489 if (lockp != NULL) { 2490 RELEASE_PV_LIST_LOCK(lockp); 2491 PMAP_UNLOCK(pmap); 2492 PMAP_ASSERT_NOT_IN_DI(); 2493 VM_WAIT; 2494 PMAP_LOCK(pmap); 2495 } 2496 2497 /* 2498 * Indicate the need to retry. While waiting, the page table 2499 * page may have been allocated. 2500 */ 2501 return (NULL); 2502 } 2503 if ((m->flags & PG_ZERO) == 0) 2504 pmap_zero_page(m); 2505 2506 /* 2507 * Map the pagetable page into the process address space, if 2508 * it isn't already there. 2509 */ 2510 2511 if (ptepindex >= (NUPDE + NUPDPE)) { 2512 pml4_entry_t *pml4; 2513 vm_pindex_t pml4index; 2514 2515 /* Wire up a new PDPE page */ 2516 pml4index = ptepindex - (NUPDE + NUPDPE); 2517 pml4 = &pmap->pm_pml4[pml4index]; 2518 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2519 2520 } else if (ptepindex >= NUPDE) { 2521 vm_pindex_t pml4index; 2522 vm_pindex_t pdpindex; 2523 pml4_entry_t *pml4; 2524 pdp_entry_t *pdp; 2525 2526 /* Wire up a new PDE page */ 2527 pdpindex = ptepindex - NUPDE; 2528 pml4index = pdpindex >> NPML4EPGSHIFT; 2529 2530 pml4 = &pmap->pm_pml4[pml4index]; 2531 if ((*pml4 & PG_V) == 0) { 2532 /* Have to allocate a new pdp, recurse */ 2533 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 2534 lockp) == NULL) { 2535 --m->wire_count; 2536 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2537 vm_page_free_zero(m); 2538 return (NULL); 2539 } 2540 } else { 2541 /* Add reference to pdp page */ 2542 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 2543 pdppg->wire_count++; 2544 } 2545 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2546 2547 /* Now find the pdp page */ 2548 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2549 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2550 2551 } else { 2552 vm_pindex_t pml4index; 2553 vm_pindex_t pdpindex; 2554 pml4_entry_t *pml4; 2555 pdp_entry_t *pdp; 2556 pd_entry_t *pd; 2557 2558 /* Wire up a new PTE page */ 2559 pdpindex = ptepindex >> NPDPEPGSHIFT; 2560 pml4index = pdpindex >> NPML4EPGSHIFT; 2561 2562 /* First, find the pdp and check that its valid. 
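 * If the PML4 entry (or, below, the PDP entry) turns out to be invalid,
 * _pmap_allocpte() recurses with the index of the missing upper-level
 * page; the recursion is at most two levels deep, since a PTE page has
 * only a PD page and a PDP page above it beneath the PML4.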
*/ 2563 pml4 = &pmap->pm_pml4[pml4index]; 2564 if ((*pml4 & PG_V) == 0) { 2565 /* Have to allocate a new pd, recurse */ 2566 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2567 lockp) == NULL) { 2568 --m->wire_count; 2569 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2570 vm_page_free_zero(m); 2571 return (NULL); 2572 } 2573 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2574 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2575 } else { 2576 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2577 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2578 if ((*pdp & PG_V) == 0) { 2579 /* Have to allocate a new pd, recurse */ 2580 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2581 lockp) == NULL) { 2582 --m->wire_count; 2583 atomic_subtract_int(&vm_cnt.v_wire_count, 2584 1); 2585 vm_page_free_zero(m); 2586 return (NULL); 2587 } 2588 } else { 2589 /* Add reference to the pd page */ 2590 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2591 pdpg->wire_count++; 2592 } 2593 } 2594 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 2595 2596 /* Now we know where the page directory page is */ 2597 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 2598 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2599 } 2600 2601 pmap_resident_count_inc(pmap, 1); 2602 2603 return (m); 2604} 2605 2606static vm_page_t 2607pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2608{ 2609 vm_pindex_t pdpindex, ptepindex; 2610 pdp_entry_t *pdpe, PG_V; 2611 vm_page_t pdpg; 2612 2613 PG_V = pmap_valid_bit(pmap); 2614 2615retry: 2616 pdpe = pmap_pdpe(pmap, va); 2617 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2618 /* Add a reference to the pd page. */ 2619 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 2620 pdpg->wire_count++; 2621 } else { 2622 /* Allocate a pd page. */ 2623 ptepindex = pmap_pde_pindex(va); 2624 pdpindex = ptepindex >> NPDPEPGSHIFT; 2625 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 2626 if (pdpg == NULL && lockp != NULL) 2627 goto retry; 2628 } 2629 return (pdpg); 2630} 2631 2632static vm_page_t 2633pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2634{ 2635 vm_pindex_t ptepindex; 2636 pd_entry_t *pd, PG_V; 2637 vm_page_t m; 2638 2639 PG_V = pmap_valid_bit(pmap); 2640 2641 /* 2642 * Calculate pagetable page index 2643 */ 2644 ptepindex = pmap_pde_pindex(va); 2645retry: 2646 /* 2647 * Get the page directory entry 2648 */ 2649 pd = pmap_pde(pmap, va); 2650 2651 /* 2652 * This supports switching from a 2MB page to a 2653 * normal 4K page. 2654 */ 2655 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 2656 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 2657 /* 2658 * Invalidation of the 2MB page mapping may have caused 2659 * the deallocation of the underlying PD page. 2660 */ 2661 pd = NULL; 2662 } 2663 } 2664 2665 /* 2666 * If the page table page is mapped, we just increment the 2667 * hold count, and activate it. 2668 */ 2669 if (pd != NULL && (*pd & PG_V) != 0) { 2670 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2671 m->wire_count++; 2672 } else { 2673 /* 2674 * Here if the pte page isn't mapped, or if it has been 2675 * deallocated. 2676 */ 2677 m = _pmap_allocpte(pmap, ptepindex, lockp); 2678 if (m == NULL && lockp != NULL) 2679 goto retry; 2680 } 2681 return (m); 2682} 2683 2684 2685/*************************************************** 2686 * Pmap allocation/deallocation routines. 2687 ***************************************************/ 2688 2689/* 2690 * Release any resources held by the given physical map. 
2691 * Called when a pmap initialized by pmap_pinit is being released. 2692 * Should only be called if the map contains no valid mappings. 2693 */ 2694void 2695pmap_release(pmap_t pmap) 2696{ 2697 vm_page_t m; 2698 int i; 2699 2700 KASSERT(pmap->pm_stats.resident_count == 0, 2701 ("pmap_release: pmap resident count %ld != 0", 2702 pmap->pm_stats.resident_count)); 2703 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2704 ("pmap_release: pmap has reserved page table page(s)")); 2705 KASSERT(CPU_EMPTY(&pmap->pm_active), 2706 ("releasing active pmap %p", pmap)); 2707 2708 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); 2709 2710 for (i = 0; i < NKPML4E; i++) /* KVA */ 2711 pmap->pm_pml4[KPML4BASE + i] = 0; 2712 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 2713 pmap->pm_pml4[DMPML4I + i] = 0; 2714 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 2715 2716 m->wire_count--; 2717 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2718 vm_page_free_zero(m); 2719} 2720 2721static int 2722kvm_size(SYSCTL_HANDLER_ARGS) 2723{ 2724 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 2725 2726 return sysctl_handle_long(oidp, &ksize, 0, req); 2727} 2728SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2729 0, 0, kvm_size, "LU", "Size of KVM"); 2730 2731static int 2732kvm_free(SYSCTL_HANDLER_ARGS) 2733{ 2734 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2735 2736 return sysctl_handle_long(oidp, &kfree, 0, req); 2737} 2738SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2739 0, 0, kvm_free, "LU", "Amount of KVM free"); 2740 2741/* 2742 * grow the number of kernel page table entries, if needed 2743 */ 2744void 2745pmap_growkernel(vm_offset_t addr) 2746{ 2747 vm_paddr_t paddr; 2748 vm_page_t nkpg; 2749 pd_entry_t *pde, newpdir; 2750 pdp_entry_t *pdpe; 2751 2752 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2753 2754 /* 2755 * Return if "addr" is within the range of kernel page table pages 2756 * that were preallocated during pmap bootstrap. Moreover, leave 2757 * "kernel_vm_end" and the kernel page table as they were. 2758 * 2759 * The correctness of this action is based on the following 2760 * argument: vm_map_insert() allocates contiguous ranges of the 2761 * kernel virtual address space. It calls this function if a range 2762 * ends after "kernel_vm_end". If the kernel is mapped between 2763 * "kernel_vm_end" and "addr", then the range cannot begin at 2764 * "kernel_vm_end". In fact, its beginning address cannot be less 2765 * than the kernel. Thus, there is no immediate need to allocate 2766 * any new kernel page table pages between "kernel_vm_end" and 2767 * "KERNBASE". 
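 *
 * Worked example: with 2MB page directory entries (NBPDR), a request
 * that ends 5MB past "kernel_vm_end" is rounded up by
 * roundup2(addr, NBPDR) to the next 2MB boundary, 6MB past
 * "kernel_vm_end", and is satisfied by installing up to three new PDEs
 * (plus a new PDP entry if the loop crosses a 1GB boundary).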
 */
	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
		return;

	addr = roundup2(addr, NBPDR);
	if (addr - 1 >= kernel_map->max_offset)
		addr = kernel_map->max_offset;
	while (kernel_vm_end < addr) {
		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
		if ((*pdpe & X86_PG_V) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			if ((nkpg->flags & PG_ZERO) == 0)
				pmap_zero_page(nkpg);
			paddr = VM_PAGE_TO_PHYS(nkpg);
			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
			    X86_PG_A | X86_PG_M);
			continue; /* try again */
		}
		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
		if ((*pde & X86_PG_V) != 0) {
			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
				kernel_vm_end = kernel_map->max_offset;
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		if ((nkpg->flags & PG_ZERO) == 0)
			pmap_zero_page(nkpg);
		paddr = VM_PAGE_TO_PHYS(nkpg);
		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
		pde_store(pde, newpdir);

		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
			kernel_vm_end = kernel_map->max_offset;
			break;
		}
	}
}


/***************************************************
 * page management routines.
2823 ***************************************************/ 2824 2825CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2826CTASSERT(_NPCM == 3); 2827CTASSERT(_NPCPV == 168); 2828 2829static __inline struct pv_chunk * 2830pv_to_chunk(pv_entry_t pv) 2831{ 2832 2833 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2834} 2835 2836#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2837 2838#define PC_FREE0 0xfffffffffffffffful 2839#define PC_FREE1 0xfffffffffffffffful 2840#define PC_FREE2 0x000000fffffffffful 2841 2842static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 2843 2844#ifdef PV_STATS 2845static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2846 2847SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2848 "Current number of pv entry chunks"); 2849SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2850 "Current number of pv entry chunks allocated"); 2851SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2852 "Current number of pv entry chunks frees"); 2853SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2854 "Number of times tried to get a chunk page but failed."); 2855 2856static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2857static int pv_entry_spare; 2858 2859SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2860 "Current number of pv entry frees"); 2861SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2862 "Current number of pv entry allocs"); 2863SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2864 "Current number of pv entries"); 2865SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2866 "Current number of spare pv entries"); 2867#endif 2868 2869static void 2870reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 2871{ 2872 2873 if (pmap == NULL) 2874 return; 2875 pmap_invalidate_all(pmap); 2876 if (pmap != locked_pmap) 2877 PMAP_UNLOCK(pmap); 2878 if (start_di) 2879 pmap_delayed_invl_finished(); 2880} 2881 2882/* 2883 * We are in a serious low memory condition. Resort to 2884 * drastic measures to free some pages so we can allocate 2885 * another pv entry chunk. 2886 * 2887 * Returns NULL if PV entries were reclaimed from the specified pmap. 2888 * 2889 * We do not, however, unmap 2mpages because subsequent accesses will 2890 * allocate per-page pv entries until repromotion occurs, thereby 2891 * exacerbating the shortage of free pv entries. 
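 *
 * Sizing note, from the CTASSERTs above: each pv_chunk occupies exactly
 * one page and holds _NPCPV (168) pv entries, so freeing one entire
 * chunk returns a single page that can back another 168 pv entries.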
2892 */ 2893static vm_page_t 2894reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2895{ 2896 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 2897 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 2898 struct md_page *pvh; 2899 pd_entry_t *pde; 2900 pmap_t next_pmap, pmap; 2901 pt_entry_t *pte, tpte; 2902 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 2903 pv_entry_t pv; 2904 vm_offset_t va; 2905 vm_page_t m, m_pc; 2906 struct spglist free; 2907 uint64_t inuse; 2908 int bit, field, freed; 2909 bool start_di; 2910 static int active_reclaims = 0; 2911 2912 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2913 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2914 pmap = NULL; 2915 m_pc = NULL; 2916 PG_G = PG_A = PG_M = PG_RW = 0; 2917 SLIST_INIT(&free); 2918 bzero(&pc_marker_b, sizeof(pc_marker_b)); 2919 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 2920 pc_marker = (struct pv_chunk *)&pc_marker_b; 2921 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 2922 2923 /* 2924 * A delayed invalidation block should already be active if 2925 * pmap_advise() or pmap_remove() called this function by way 2926 * of pmap_demote_pde_locked(). 2927 */ 2928 start_di = pmap_not_in_di(); 2929 2930 mtx_lock(&pv_chunks_mutex); 2931 active_reclaims++; 2932 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 2933 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 2934 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 2935 SLIST_EMPTY(&free)) { 2936 next_pmap = pc->pc_pmap; 2937 if (next_pmap == NULL) { 2938 /* 2939 * The next chunk is a marker. However, it is 2940 * not our marker, so active_reclaims must be 2941 * > 1. Consequently, the next_chunk code 2942 * will not rotate the pv_chunks list. 2943 */ 2944 goto next_chunk; 2945 } 2946 mtx_unlock(&pv_chunks_mutex); 2947 2948 /* 2949 * A pv_chunk can only be removed from the pc_lru list 2950 * when both pc_chunks_mutex is owned and the 2951 * corresponding pmap is locked. 2952 */ 2953 if (pmap != next_pmap) { 2954 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 2955 start_di); 2956 pmap = next_pmap; 2957 /* Avoid deadlock and lock recursion. */ 2958 if (pmap > locked_pmap) { 2959 RELEASE_PV_LIST_LOCK(lockp); 2960 PMAP_LOCK(pmap); 2961 if (start_di) 2962 pmap_delayed_invl_started(); 2963 mtx_lock(&pv_chunks_mutex); 2964 continue; 2965 } else if (pmap != locked_pmap) { 2966 if (PMAP_TRYLOCK(pmap)) { 2967 if (start_di) 2968 pmap_delayed_invl_started(); 2969 mtx_lock(&pv_chunks_mutex); 2970 continue; 2971 } else { 2972 pmap = NULL; /* pmap is not locked */ 2973 mtx_lock(&pv_chunks_mutex); 2974 pc = TAILQ_NEXT(pc_marker, pc_lru); 2975 if (pc == NULL || 2976 pc->pc_pmap != next_pmap) 2977 continue; 2978 goto next_chunk; 2979 } 2980 } else if (start_di) 2981 pmap_delayed_invl_started(); 2982 PG_G = pmap_global_bit(pmap); 2983 PG_A = pmap_accessed_bit(pmap); 2984 PG_M = pmap_modified_bit(pmap); 2985 PG_RW = pmap_rw_bit(pmap); 2986 } 2987 2988 /* 2989 * Destroy every non-wired, 4 KB page mapping in the chunk. 
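 *
 * The scan below works on the inverted allocation bitmap:
 * "~pc->pc_map[field] & pc_freemask[field]" has a bit set for every
 * allocated entry, bsfq() picks the lowest such bit, and the matching
 * pv entry is pc_pventry[field * 64 + bit].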
2990 */ 2991 freed = 0; 2992 for (field = 0; field < _NPCM; field++) { 2993 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2994 inuse != 0; inuse &= ~(1UL << bit)) { 2995 bit = bsfq(inuse); 2996 pv = &pc->pc_pventry[field * 64 + bit]; 2997 va = pv->pv_va; 2998 pde = pmap_pde(pmap, va); 2999 if ((*pde & PG_PS) != 0) 3000 continue; 3001 pte = pmap_pde_to_pte(pde, va); 3002 if ((*pte & PG_W) != 0) 3003 continue; 3004 tpte = pte_load_clear(pte); 3005 if ((tpte & PG_G) != 0) 3006 pmap_invalidate_page(pmap, va); 3007 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3008 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3009 vm_page_dirty(m); 3010 if ((tpte & PG_A) != 0) 3011 vm_page_aflag_set(m, PGA_REFERENCED); 3012 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3013 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3014 m->md.pv_gen++; 3015 if (TAILQ_EMPTY(&m->md.pv_list) && 3016 (m->flags & PG_FICTITIOUS) == 0) { 3017 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3018 if (TAILQ_EMPTY(&pvh->pv_list)) { 3019 vm_page_aflag_clear(m, 3020 PGA_WRITEABLE); 3021 } 3022 } 3023 pmap_delayed_invl_page(m); 3024 pc->pc_map[field] |= 1UL << bit; 3025 pmap_unuse_pt(pmap, va, *pde, &free); 3026 freed++; 3027 } 3028 } 3029 if (freed == 0) { 3030 mtx_lock(&pv_chunks_mutex); 3031 goto next_chunk; 3032 } 3033 /* Every freed mapping is for a 4 KB page. */ 3034 pmap_resident_count_dec(pmap, freed); 3035 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3036 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3037 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3038 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3039 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 3040 pc->pc_map[2] == PC_FREE2) { 3041 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3042 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3043 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3044 /* Entire chunk is free; return it. */ 3045 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3046 dump_drop_page(m_pc->phys_addr); 3047 mtx_lock(&pv_chunks_mutex); 3048 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3049 break; 3050 } 3051 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3052 mtx_lock(&pv_chunks_mutex); 3053 /* One freed pv entry in locked_pmap is sufficient. */ 3054 if (pmap == locked_pmap) 3055 break; 3056next_chunk: 3057 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 3058 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 3059 if (active_reclaims == 1 && pmap != NULL) { 3060 /* 3061 * Rotate the pv chunks list so that we do not 3062 * scan the same pv chunks that could not be 3063 * freed (because they contained a wired 3064 * and/or superpage mapping) on every 3065 * invocation of reclaim_pv_chunk(). 3066 */ 3067 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 3068 MPASS(pc->pc_pmap != NULL); 3069 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3070 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3071 } 3072 } 3073 } 3074 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 3075 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 3076 active_reclaims--; 3077 mtx_unlock(&pv_chunks_mutex); 3078 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 3079 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 3080 m_pc = SLIST_FIRST(&free); 3081 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3082 /* Recycle a freed page table page. 
*/ 3083 m_pc->wire_count = 1; 3084 } 3085 pmap_free_zero_pages(&free); 3086 return (m_pc); 3087} 3088 3089/* 3090 * free the pv_entry back to the free list 3091 */ 3092static void 3093free_pv_entry(pmap_t pmap, pv_entry_t pv) 3094{ 3095 struct pv_chunk *pc; 3096 int idx, field, bit; 3097 3098 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3099 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3100 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3101 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3102 pc = pv_to_chunk(pv); 3103 idx = pv - &pc->pc_pventry[0]; 3104 field = idx / 64; 3105 bit = idx % 64; 3106 pc->pc_map[field] |= 1ul << bit; 3107 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 3108 pc->pc_map[2] != PC_FREE2) { 3109 /* 98% of the time, pc is already at the head of the list. */ 3110 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3111 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3112 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3113 } 3114 return; 3115 } 3116 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3117 free_pv_chunk(pc); 3118} 3119 3120static void 3121free_pv_chunk(struct pv_chunk *pc) 3122{ 3123 vm_page_t m; 3124 3125 mtx_lock(&pv_chunks_mutex); 3126 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3127 mtx_unlock(&pv_chunks_mutex); 3128 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3129 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3130 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3131 /* entire chunk is free, return it */ 3132 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3133 dump_drop_page(m->phys_addr); 3134 vm_page_unwire(m, PQ_NONE); 3135 vm_page_free(m); 3136} 3137 3138/* 3139 * Returns a new PV entry, allocating a new PV chunk from the system when 3140 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3141 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3142 * returned. 3143 * 3144 * The given PV list lock may be released. 
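 *
 * Usage sketch (illustrative; both forms appear elsewhere in this file):
 *
 *	pv = get_pv_entry(pmap, &lock);	(may reclaim and may drop "lock")
 *	pv = get_pv_entry(pmap, NULL);	(never reclaims; may return NULL)
 *
 * An entry obtained here is eventually returned with free_pv_entry().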
3145 */ 3146static pv_entry_t 3147get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3148{ 3149 int bit, field; 3150 pv_entry_t pv; 3151 struct pv_chunk *pc; 3152 vm_page_t m; 3153 3154 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3155 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3156retry: 3157 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3158 if (pc != NULL) { 3159 for (field = 0; field < _NPCM; field++) { 3160 if (pc->pc_map[field]) { 3161 bit = bsfq(pc->pc_map[field]); 3162 break; 3163 } 3164 } 3165 if (field < _NPCM) { 3166 pv = &pc->pc_pventry[field * 64 + bit]; 3167 pc->pc_map[field] &= ~(1ul << bit); 3168 /* If this was the last item, move it to tail */ 3169 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 3170 pc->pc_map[2] == 0) { 3171 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3172 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3173 pc_list); 3174 } 3175 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3176 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3177 return (pv); 3178 } 3179 } 3180 /* No free items, allocate another chunk */ 3181 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3182 VM_ALLOC_WIRED); 3183 if (m == NULL) { 3184 if (lockp == NULL) { 3185 PV_STAT(pc_chunk_tryfail++); 3186 return (NULL); 3187 } 3188 m = reclaim_pv_chunk(pmap, lockp); 3189 if (m == NULL) 3190 goto retry; 3191 } 3192 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3193 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3194 dump_add_page(m->phys_addr); 3195 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3196 pc->pc_pmap = pmap; 3197 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 3198 pc->pc_map[1] = PC_FREE1; 3199 pc->pc_map[2] = PC_FREE2; 3200 mtx_lock(&pv_chunks_mutex); 3201 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3202 mtx_unlock(&pv_chunks_mutex); 3203 pv = &pc->pc_pventry[0]; 3204 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3205 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3206 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3207 return (pv); 3208} 3209 3210/* 3211 * Returns the number of one bits within the given PV chunk map. 3212 * 3213 * The erratas for Intel processors state that "POPCNT Instruction May 3214 * Take Longer to Execute Than Expected". It is believed that the 3215 * issue is the spurious dependency on the destination register. 3216 * Provide a hint to the register rename logic that the destination 3217 * value is overwritten, by clearing it, as suggested in the 3218 * optimization manual. It should be cheap for unaffected processors 3219 * as well. 3220 * 3221 * Reference numbers for erratas are 3222 * 4th Gen Core: HSD146 3223 * 5th Gen Core: BDM85 3224 * 6th Gen Core: SKL029 3225 */ 3226static int 3227popcnt_pc_map_pq(uint64_t *map) 3228{ 3229 u_long result, tmp; 3230 3231 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 3232 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 3233 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 3234 : "=&r" (result), "=&r" (tmp) 3235 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 3236 return (result); 3237} 3238 3239/* 3240 * Ensure that the number of spare PV entries in the specified pmap meets or 3241 * exceeds the given count, "needed". 3242 * 3243 * The given PV list lock may be released. 
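 *
 * Accounting example: a freshly allocated chunk contributes _NPCPV (168)
 * spare entries, counted as 64 + 64 + 40 set bits across pc_map[0..2]
 * (PC_FREE0, PC_FREE1 and PC_FREE2); popcnt_pc_map_pq() above computes
 * the same sum for partially used chunks.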
3244 */ 3245static void 3246reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3247{ 3248 struct pch new_tail; 3249 struct pv_chunk *pc; 3250 int avail, free; 3251 vm_page_t m; 3252 3253 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3254 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3255 3256 /* 3257 * Newly allocated PV chunks must be stored in a private list until 3258 * the required number of PV chunks have been allocated. Otherwise, 3259 * reclaim_pv_chunk() could recycle one of these chunks. In 3260 * contrast, these chunks must be added to the pmap upon allocation. 3261 */ 3262 TAILQ_INIT(&new_tail); 3263retry: 3264 avail = 0; 3265 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3266#ifndef __POPCNT__ 3267 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 3268 bit_count((bitstr_t *)pc->pc_map, 0, 3269 sizeof(pc->pc_map) * NBBY, &free); 3270 else 3271#endif 3272 free = popcnt_pc_map_pq(pc->pc_map); 3273 if (free == 0) 3274 break; 3275 avail += free; 3276 if (avail >= needed) 3277 break; 3278 } 3279 for (; avail < needed; avail += _NPCPV) { 3280 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3281 VM_ALLOC_WIRED); 3282 if (m == NULL) { 3283 m = reclaim_pv_chunk(pmap, lockp); 3284 if (m == NULL) 3285 goto retry; 3286 } 3287 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3288 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3289 dump_add_page(m->phys_addr); 3290 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3291 pc->pc_pmap = pmap; 3292 pc->pc_map[0] = PC_FREE0; 3293 pc->pc_map[1] = PC_FREE1; 3294 pc->pc_map[2] = PC_FREE2; 3295 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3296 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 3297 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3298 } 3299 if (!TAILQ_EMPTY(&new_tail)) { 3300 mtx_lock(&pv_chunks_mutex); 3301 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 3302 mtx_unlock(&pv_chunks_mutex); 3303 } 3304} 3305 3306/* 3307 * First find and then remove the pv entry for the specified pmap and virtual 3308 * address from the specified pv list. Returns the pv entry if found and NULL 3309 * otherwise. This operation can be performed on pv lists for either 4KB or 3310 * 2MB page mappings. 3311 */ 3312static __inline pv_entry_t 3313pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3314{ 3315 pv_entry_t pv; 3316 3317 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3318 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3319 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3320 pvh->pv_gen++; 3321 break; 3322 } 3323 } 3324 return (pv); 3325} 3326 3327/* 3328 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3329 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3330 * entries for each of the 4KB page mappings. 3331 */ 3332static void 3333pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3334 struct rwlock **lockp) 3335{ 3336 struct md_page *pvh; 3337 struct pv_chunk *pc; 3338 pv_entry_t pv; 3339 vm_offset_t va_last; 3340 vm_page_t m; 3341 int bit, field; 3342 3343 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3344 KASSERT((pa & PDRMASK) == 0, 3345 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 3346 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3347 3348 /* 3349 * Transfer the 2mpage's pv entry for this mapping to the first 3350 * page's pv list. Once this transfer begins, the pv list lock 3351 * must not be released until the last pv entry is reinstantiated. 
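 *
 * A single demotion instantiates NPTEPG - 1 (511) additional pv entries,
 * one for each remaining 4KB page of the former 2MB mapping; they are
 * carved out of the pmap's existing pv chunks, and the KASSERT in the
 * loop below insists that a spare entry is always available.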
3352 */ 3353 pvh = pa_to_pvh(pa); 3354 va = trunc_2mpage(va); 3355 pv = pmap_pvh_remove(pvh, pmap, va); 3356 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 3357 m = PHYS_TO_VM_PAGE(pa); 3358 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3359 m->md.pv_gen++; 3360 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 3361 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 3362 va_last = va + NBPDR - PAGE_SIZE; 3363 for (;;) { 3364 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3365 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 3366 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 3367 for (field = 0; field < _NPCM; field++) { 3368 while (pc->pc_map[field]) { 3369 bit = bsfq(pc->pc_map[field]); 3370 pc->pc_map[field] &= ~(1ul << bit); 3371 pv = &pc->pc_pventry[field * 64 + bit]; 3372 va += PAGE_SIZE; 3373 pv->pv_va = va; 3374 m++; 3375 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3376 ("pmap_pv_demote_pde: page %p is not managed", m)); 3377 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3378 m->md.pv_gen++; 3379 if (va == va_last) 3380 goto out; 3381 } 3382 } 3383 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3384 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3385 } 3386out: 3387 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 3388 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3389 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3390 } 3391 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 3392 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 3393} 3394 3395#if VM_NRESERVLEVEL > 0 3396/* 3397 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3398 * replace the many pv entries for the 4KB page mappings by a single pv entry 3399 * for the 2MB page mapping. 3400 */ 3401static void 3402pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3403 struct rwlock **lockp) 3404{ 3405 struct md_page *pvh; 3406 pv_entry_t pv; 3407 vm_offset_t va_last; 3408 vm_page_t m; 3409 3410 KASSERT((pa & PDRMASK) == 0, 3411 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 3412 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3413 3414 /* 3415 * Transfer the first page's pv entry for this mapping to the 2mpage's 3416 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3417 * a transfer avoids the possibility that get_pv_entry() calls 3418 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3419 * mappings that is being promoted. 3420 */ 3421 m = PHYS_TO_VM_PAGE(pa); 3422 va = trunc_2mpage(va); 3423 pv = pmap_pvh_remove(&m->md, pmap, va); 3424 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 3425 pvh = pa_to_pvh(pa); 3426 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3427 pvh->pv_gen++; 3428 /* Free the remaining NPTEPG - 1 pv entries. */ 3429 va_last = va + NBPDR - PAGE_SIZE; 3430 do { 3431 m++; 3432 va += PAGE_SIZE; 3433 pmap_pvh_free(&m->md, pmap, va); 3434 } while (va < va_last); 3435} 3436#endif /* VM_NRESERVLEVEL > 0 */ 3437 3438/* 3439 * First find and then destroy the pv entry for the specified pmap and virtual 3440 * address. This operation can be performed on pv lists for either 4KB or 2MB 3441 * page mappings. 
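 *
 * Callers pass &m->md for a 4KB page mapping or pa_to_pvh(pa) for a 2MB
 * page mapping; pmap_pv_promote_pde() above uses the former when it
 * frees the per-4KB-page entries being replaced.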
3442 */
3443static void
3444pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3445{
3446 pv_entry_t pv;
3447
3448 pv = pmap_pvh_remove(pvh, pmap, va);
3449 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3450 free_pv_entry(pmap, pv);
3451}
3452
3453/*
3454 * Conditionally create the PV entry for a 4KB page mapping if the required
3455 * memory can be allocated without resorting to reclamation.
3456 */
3457static boolean_t
3458pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3459 struct rwlock **lockp)
3460{
3461 pv_entry_t pv;
3462
3463 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3464 /* Pass NULL instead of the lock pointer to disable reclamation. */
3465 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3466 pv->pv_va = va;
3467 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3468 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3469 m->md.pv_gen++;
3470 return (TRUE);
3471 } else
3472 return (FALSE);
3473}
3474
3475/*
3476 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3477 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3478 * false if the PV entry cannot be allocated without resorting to reclamation.
3479 */
3480static bool
3481pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
3482 struct rwlock **lockp)
3483{
3484 struct md_page *pvh;
3485 pv_entry_t pv;
3486 vm_paddr_t pa;
3487
3488 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3489 /* Pass NULL instead of the lock pointer to disable reclamation. */
3490 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3491 NULL : lockp)) == NULL)
3492 return (false);
3493 pv->pv_va = va;
3494 pa = pde & PG_PS_FRAME;
3495 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3496 pvh = pa_to_pvh(pa);
3497 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3498 pvh->pv_gen++;
3499 return (true);
3500}
3501
3502/*
3503 * Fills a page table page with mappings to consecutive physical pages.
3504 */
3505static void
3506pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3507{
3508 pt_entry_t *pte;
3509
3510 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3511 *pte = newpte;
3512 newpte += PAGE_SIZE;
3513 }
3514}
3515
3516/*
3517 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
3518 * mapping is invalidated.
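 * On success, the single PDE is replaced by a reference to a page table
 * whose NPTEPG entries reproduce the original 2MB mapping's protections
 * and attributes one 4KB page at a time.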
3519 */ 3520static boolean_t 3521pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3522{ 3523 struct rwlock *lock; 3524 boolean_t rv; 3525 3526 lock = NULL; 3527 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 3528 if (lock != NULL) 3529 rw_wunlock(lock); 3530 return (rv); 3531} 3532 3533static boolean_t 3534pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 3535 struct rwlock **lockp) 3536{ 3537 pd_entry_t newpde, oldpde; 3538 pt_entry_t *firstpte, newpte; 3539 pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; 3540 vm_paddr_t mptepa; 3541 vm_page_t mpte; 3542 struct spglist free; 3543 vm_offset_t sva; 3544 int PG_PTE_CACHE; 3545 3546 PG_G = pmap_global_bit(pmap); 3547 PG_A = pmap_accessed_bit(pmap); 3548 PG_M = pmap_modified_bit(pmap); 3549 PG_RW = pmap_rw_bit(pmap); 3550 PG_V = pmap_valid_bit(pmap); 3551 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 3552 3553 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3554 oldpde = *pde; 3555 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 3556 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 3557 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 3558 NULL) { 3559 KASSERT((oldpde & PG_W) == 0, 3560 ("pmap_demote_pde: page table page for a wired mapping" 3561 " is missing")); 3562 3563 /* 3564 * Invalidate the 2MB page mapping and return "failure" if the 3565 * mapping was never accessed or the allocation of the new 3566 * page table page fails. If the 2MB page mapping belongs to 3567 * the direct map region of the kernel's address space, then 3568 * the page allocation request specifies the highest possible 3569 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 3570 * normal. Page table pages are preallocated for every other 3571 * part of the kernel address space, so the direct map region 3572 * is the only part of the kernel address space that must be 3573 * handled here. 3574 */ 3575 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 3576 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 3577 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 3578 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3579 SLIST_INIT(&free); 3580 sva = trunc_2mpage(va); 3581 pmap_remove_pde(pmap, pde, sva, &free, lockp); 3582 if ((oldpde & PG_G) == 0) 3583 pmap_invalidate_pde_page(pmap, sva, oldpde); 3584 pmap_free_zero_pages(&free); 3585 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 3586 " in pmap %p", va, pmap); 3587 return (FALSE); 3588 } 3589 if (va < VM_MAXUSER_ADDRESS) 3590 pmap_resident_count_inc(pmap, 1); 3591 } 3592 mptepa = VM_PAGE_TO_PHYS(mpte); 3593 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 3594 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 3595 KASSERT((oldpde & PG_A) != 0, 3596 ("pmap_demote_pde: oldpde is missing PG_A")); 3597 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 3598 ("pmap_demote_pde: oldpde is missing PG_M")); 3599 newpte = oldpde & ~PG_PS; 3600 newpte = pmap_swap_pat(pmap, newpte); 3601 3602 /* 3603 * If the page table page is new, initialize it. 3604 */ 3605 if (mpte->wire_count == 1) { 3606 mpte->wire_count = NPTEPG; 3607 pmap_fill_ptp(firstpte, newpte); 3608 } 3609 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 3610 ("pmap_demote_pde: firstpte and newpte map different physical" 3611 " addresses")); 3612 3613 /* 3614 * If the mapping has changed attributes, update the page table 3615 * entries. 
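 * (A page table page recovered by pmap_remove_pt_page() still holds the
 * attributes that were current when the mapping was promoted, so its
 * entries may be stale.)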
3616 */ 3617 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 3618 pmap_fill_ptp(firstpte, newpte); 3619 3620 /* 3621 * The spare PV entries must be reserved prior to demoting the 3622 * mapping, that is, prior to changing the PDE. Otherwise, the state 3623 * of the PDE and the PV lists will be inconsistent, which can result 3624 * in reclaim_pv_chunk() attempting to remove a PV entry from the 3625 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 3626 * PV entry for the 2MB page mapping that is being demoted. 3627 */ 3628 if ((oldpde & PG_MANAGED) != 0) 3629 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 3630 3631 /* 3632 * Demote the mapping. This pmap is locked. The old PDE has 3633 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 3634 * set. Thus, there is no danger of a race with another 3635 * processor changing the setting of PG_A and/or PG_M between 3636 * the read above and the store below. 3637 */ 3638 if (workaround_erratum383) 3639 pmap_update_pde(pmap, va, pde, newpde); 3640 else 3641 pde_store(pde, newpde); 3642 3643 /* 3644 * Invalidate a stale recursive mapping of the page table page. 3645 */ 3646 if (va >= VM_MAXUSER_ADDRESS) 3647 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3648 3649 /* 3650 * Demote the PV entry. 3651 */ 3652 if ((oldpde & PG_MANAGED) != 0) 3653 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 3654 3655 atomic_add_long(&pmap_pde_demotions, 1); 3656 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 3657 " in pmap %p", va, pmap); 3658 return (TRUE); 3659} 3660 3661/* 3662 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 3663 */ 3664static void 3665pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3666{ 3667 pd_entry_t newpde; 3668 vm_paddr_t mptepa; 3669 vm_page_t mpte; 3670 3671 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3672 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3673 mpte = pmap_remove_pt_page(pmap, va); 3674 if (mpte == NULL) 3675 panic("pmap_remove_kernel_pde: Missing pt page."); 3676 3677 mptepa = VM_PAGE_TO_PHYS(mpte); 3678 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 3679 3680 /* 3681 * Initialize the page table page. 3682 */ 3683 pagezero((void *)PHYS_TO_DMAP(mptepa)); 3684 3685 /* 3686 * Demote the mapping. 3687 */ 3688 if (workaround_erratum383) 3689 pmap_update_pde(pmap, va, pde, newpde); 3690 else 3691 pde_store(pde, newpde); 3692 3693 /* 3694 * Invalidate a stale recursive mapping of the page table page. 
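 * (vtopte(va) reaches this page table page through the recursive page
 * table mapping, whose TLB entry may still refer to an older page table
 * page.)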
3695 */ 3696 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3697} 3698 3699/* 3700 * pmap_remove_pde: do the things to unmap a superpage in a process 3701 */ 3702static int 3703pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 3704 struct spglist *free, struct rwlock **lockp) 3705{ 3706 struct md_page *pvh; 3707 pd_entry_t oldpde; 3708 vm_offset_t eva, va; 3709 vm_page_t m, mpte; 3710 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 3711 3712 PG_G = pmap_global_bit(pmap); 3713 PG_A = pmap_accessed_bit(pmap); 3714 PG_M = pmap_modified_bit(pmap); 3715 PG_RW = pmap_rw_bit(pmap); 3716 3717 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3718 KASSERT((sva & PDRMASK) == 0, 3719 ("pmap_remove_pde: sva is not 2mpage aligned")); 3720 oldpde = pte_load_clear(pdq); 3721 if (oldpde & PG_W) 3722 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 3723 if ((oldpde & PG_G) != 0) 3724 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 3725 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 3726 if (oldpde & PG_MANAGED) { 3727 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 3728 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 3729 pmap_pvh_free(pvh, pmap, sva); 3730 eva = sva + NBPDR; 3731 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3732 va < eva; va += PAGE_SIZE, m++) { 3733 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3734 vm_page_dirty(m); 3735 if (oldpde & PG_A) 3736 vm_page_aflag_set(m, PGA_REFERENCED); 3737 if (TAILQ_EMPTY(&m->md.pv_list) && 3738 TAILQ_EMPTY(&pvh->pv_list)) 3739 vm_page_aflag_clear(m, PGA_WRITEABLE); 3740 pmap_delayed_invl_page(m); 3741 } 3742 } 3743 if (pmap == kernel_pmap) { 3744 pmap_remove_kernel_pde(pmap, pdq, sva); 3745 } else { 3746 mpte = pmap_remove_pt_page(pmap, sva); 3747 if (mpte != NULL) { 3748 pmap_resident_count_dec(pmap, 1); 3749 KASSERT(mpte->wire_count == NPTEPG, 3750 ("pmap_remove_pde: pte page wire count error")); 3751 mpte->wire_count = 0; 3752 pmap_add_delayed_free_list(mpte, free, FALSE); 3753 } 3754 } 3755 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 3756} 3757 3758/* 3759 * pmap_remove_pte: do the things to unmap a page in a process 3760 */ 3761static int 3762pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 3763 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 3764{ 3765 struct md_page *pvh; 3766 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 3767 vm_page_t m; 3768 3769 PG_A = pmap_accessed_bit(pmap); 3770 PG_M = pmap_modified_bit(pmap); 3771 PG_RW = pmap_rw_bit(pmap); 3772 3773 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3774 oldpte = pte_load_clear(ptq); 3775 if (oldpte & PG_W) 3776 pmap->pm_stats.wired_count -= 1; 3777 pmap_resident_count_dec(pmap, 1); 3778 if (oldpte & PG_MANAGED) { 3779 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 3780 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3781 vm_page_dirty(m); 3782 if (oldpte & PG_A) 3783 vm_page_aflag_set(m, PGA_REFERENCED); 3784 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3785 pmap_pvh_free(&m->md, pmap, va); 3786 if (TAILQ_EMPTY(&m->md.pv_list) && 3787 (m->flags & PG_FICTITIOUS) == 0) { 3788 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3789 if (TAILQ_EMPTY(&pvh->pv_list)) 3790 vm_page_aflag_clear(m, PGA_WRITEABLE); 3791 } 3792 pmap_delayed_invl_page(m); 3793 } 3794 return (pmap_unuse_pt(pmap, va, ptepde, free)); 3795} 3796 3797/* 3798 * Remove a single page from a process address space 3799 */ 3800static void 3801pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 3802 struct spglist *free) 3803{ 3804 struct rwlock *lock; 3805 pt_entry_t *pte, PG_V; 3806 3807 PG_V = 
pmap_valid_bit(pmap); 3808 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3809 if ((*pde & PG_V) == 0) 3810 return; 3811 pte = pmap_pde_to_pte(pde, va); 3812 if ((*pte & PG_V) == 0) 3813 return; 3814 lock = NULL; 3815 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 3816 if (lock != NULL) 3817 rw_wunlock(lock); 3818 pmap_invalidate_page(pmap, va); 3819} 3820 3821/* 3822 * Removes the specified range of addresses from the page table page. 3823 */ 3824static bool 3825pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3826 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 3827{ 3828 pt_entry_t PG_G, *pte; 3829 vm_offset_t va; 3830 bool anyvalid; 3831 3832 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3833 PG_G = pmap_global_bit(pmap); 3834 anyvalid = false; 3835 va = eva; 3836 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 3837 sva += PAGE_SIZE) { 3838 if (*pte == 0) { 3839 if (va != eva) { 3840 pmap_invalidate_range(pmap, va, sva); 3841 va = eva; 3842 } 3843 continue; 3844 } 3845 if ((*pte & PG_G) == 0) 3846 anyvalid = true; 3847 else if (va == eva) 3848 va = sva; 3849 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 3850 sva += PAGE_SIZE; 3851 break; 3852 } 3853 } 3854 if (va != eva) 3855 pmap_invalidate_range(pmap, va, sva); 3856 return (anyvalid); 3857} 3858 3859/* 3860 * Remove the given range of addresses from the specified map. 3861 * 3862 * It is assumed that the start and end are properly 3863 * rounded to the page size. 3864 */ 3865void 3866pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3867{ 3868 struct rwlock *lock; 3869 vm_offset_t va_next; 3870 pml4_entry_t *pml4e; 3871 pdp_entry_t *pdpe; 3872 pd_entry_t ptpaddr, *pde; 3873 pt_entry_t PG_G, PG_V; 3874 struct spglist free; 3875 int anyvalid; 3876 3877 PG_G = pmap_global_bit(pmap); 3878 PG_V = pmap_valid_bit(pmap); 3879 3880 /* 3881 * Perform an unsynchronized read. This is, however, safe. 3882 */ 3883 if (pmap->pm_stats.resident_count == 0) 3884 return; 3885 3886 anyvalid = 0; 3887 SLIST_INIT(&free); 3888 3889 pmap_delayed_invl_started(); 3890 PMAP_LOCK(pmap); 3891 3892 /* 3893 * special handling of removing one page. a very 3894 * common operation and easy to short circuit some 3895 * code. 3896 */ 3897 if (sva + PAGE_SIZE == eva) { 3898 pde = pmap_pde(pmap, sva); 3899 if (pde && (*pde & PG_PS) == 0) { 3900 pmap_remove_page(pmap, sva, pde, &free); 3901 goto out; 3902 } 3903 } 3904 3905 lock = NULL; 3906 for (; sva < eva; sva = va_next) { 3907 3908 if (pmap->pm_stats.resident_count == 0) 3909 break; 3910 3911 pml4e = pmap_pml4e(pmap, sva); 3912 if ((*pml4e & PG_V) == 0) { 3913 va_next = (sva + NBPML4) & ~PML4MASK; 3914 if (va_next < sva) 3915 va_next = eva; 3916 continue; 3917 } 3918 3919 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3920 if ((*pdpe & PG_V) == 0) { 3921 va_next = (sva + NBPDP) & ~PDPMASK; 3922 if (va_next < sva) 3923 va_next = eva; 3924 continue; 3925 } 3926 3927 /* 3928 * Calculate index for next page table. 3929 */ 3930 va_next = (sva + NBPDR) & ~PDRMASK; 3931 if (va_next < sva) 3932 va_next = eva; 3933 3934 pde = pmap_pdpe_to_pde(pdpe, sva); 3935 ptpaddr = *pde; 3936 3937 /* 3938 * Weed out invalid mappings. 3939 */ 3940 if (ptpaddr == 0) 3941 continue; 3942 3943 /* 3944 * Check for large page. 3945 */ 3946 if ((ptpaddr & PG_PS) != 0) { 3947 /* 3948 * Are we removing the entire large page? If not, 3949 * demote the mapping and fall through. 
3950 */ 3951 if (sva + NBPDR == va_next && eva >= va_next) { 3952 /* 3953 * The TLB entry for a PG_G mapping is 3954 * invalidated by pmap_remove_pde(). 3955 */ 3956 if ((ptpaddr & PG_G) == 0) 3957 anyvalid = 1; 3958 pmap_remove_pde(pmap, pde, sva, &free, &lock); 3959 continue; 3960 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 3961 &lock)) { 3962 /* The large page mapping was destroyed. */ 3963 continue; 3964 } else 3965 ptpaddr = *pde; 3966 } 3967 3968 /* 3969 * Limit our scan to either the end of the va represented 3970 * by the current page table page, or to the end of the 3971 * range being removed. 3972 */ 3973 if (va_next > eva) 3974 va_next = eva; 3975 3976 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 3977 anyvalid = 1; 3978 } 3979 if (lock != NULL) 3980 rw_wunlock(lock); 3981out: 3982 if (anyvalid) 3983 pmap_invalidate_all(pmap); 3984 PMAP_UNLOCK(pmap); 3985 pmap_delayed_invl_finished(); 3986 pmap_free_zero_pages(&free); 3987} 3988 3989/* 3990 * Routine: pmap_remove_all 3991 * Function: 3992 * Removes this physical page from 3993 * all physical maps in which it resides. 3994 * Reflects back modify bits to the pager. 3995 * 3996 * Notes: 3997 * Original versions of this routine were very 3998 * inefficient because they iteratively called 3999 * pmap_remove (slow...) 4000 */ 4001 4002void 4003pmap_remove_all(vm_page_t m) 4004{ 4005 struct md_page *pvh; 4006 pv_entry_t pv; 4007 pmap_t pmap; 4008 struct rwlock *lock; 4009 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 4010 pd_entry_t *pde; 4011 vm_offset_t va; 4012 struct spglist free; 4013 int pvh_gen, md_gen; 4014 4015 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4016 ("pmap_remove_all: page %p is not managed", m)); 4017 SLIST_INIT(&free); 4018 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4019 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4020 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4021retry: 4022 rw_wlock(lock); 4023 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4024 pmap = PV_PMAP(pv); 4025 if (!PMAP_TRYLOCK(pmap)) { 4026 pvh_gen = pvh->pv_gen; 4027 rw_wunlock(lock); 4028 PMAP_LOCK(pmap); 4029 rw_wlock(lock); 4030 if (pvh_gen != pvh->pv_gen) { 4031 rw_wunlock(lock); 4032 PMAP_UNLOCK(pmap); 4033 goto retry; 4034 } 4035 } 4036 va = pv->pv_va; 4037 pde = pmap_pde(pmap, va); 4038 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 4039 PMAP_UNLOCK(pmap); 4040 } 4041 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4042 pmap = PV_PMAP(pv); 4043 if (!PMAP_TRYLOCK(pmap)) { 4044 pvh_gen = pvh->pv_gen; 4045 md_gen = m->md.pv_gen; 4046 rw_wunlock(lock); 4047 PMAP_LOCK(pmap); 4048 rw_wlock(lock); 4049 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4050 rw_wunlock(lock); 4051 PMAP_UNLOCK(pmap); 4052 goto retry; 4053 } 4054 } 4055 PG_A = pmap_accessed_bit(pmap); 4056 PG_M = pmap_modified_bit(pmap); 4057 PG_RW = pmap_rw_bit(pmap); 4058 pmap_resident_count_dec(pmap, 1); 4059 pde = pmap_pde(pmap, pv->pv_va); 4060 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 4061 " a 2mpage in page %p's pv list", m)); 4062 pte = pmap_pde_to_pte(pde, pv->pv_va); 4063 tpte = pte_load_clear(pte); 4064 if (tpte & PG_W) 4065 pmap->pm_stats.wired_count--; 4066 if (tpte & PG_A) 4067 vm_page_aflag_set(m, PGA_REFERENCED); 4068 4069 /* 4070 * Update the vm_page_t clean and reference bits. 
4071 */ 4072 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4073 vm_page_dirty(m); 4074 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 4075 pmap_invalidate_page(pmap, pv->pv_va); 4076 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4077 m->md.pv_gen++; 4078 free_pv_entry(pmap, pv); 4079 PMAP_UNLOCK(pmap); 4080 } 4081 vm_page_aflag_clear(m, PGA_WRITEABLE); 4082 rw_wunlock(lock); 4083 pmap_delayed_invl_wait(m); 4084 pmap_free_zero_pages(&free); 4085} 4086 4087/* 4088 * pmap_protect_pde: do the things to protect a 2mpage in a process 4089 */ 4090static boolean_t 4091pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 4092{ 4093 pd_entry_t newpde, oldpde; 4094 vm_offset_t eva, va; 4095 vm_page_t m; 4096 boolean_t anychanged; 4097 pt_entry_t PG_G, PG_M, PG_RW; 4098 4099 PG_G = pmap_global_bit(pmap); 4100 PG_M = pmap_modified_bit(pmap); 4101 PG_RW = pmap_rw_bit(pmap); 4102 4103 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4104 KASSERT((sva & PDRMASK) == 0, 4105 ("pmap_protect_pde: sva is not 2mpage aligned")); 4106 anychanged = FALSE; 4107retry: 4108 oldpde = newpde = *pde; 4109 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 4110 (PG_MANAGED | PG_M | PG_RW)) { 4111 eva = sva + NBPDR; 4112 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4113 va < eva; va += PAGE_SIZE, m++) 4114 vm_page_dirty(m); 4115 } 4116 if ((prot & VM_PROT_WRITE) == 0) 4117 newpde &= ~(PG_RW | PG_M); 4118 if ((prot & VM_PROT_EXECUTE) == 0) 4119 newpde |= pg_nx; 4120 if (newpde != oldpde) { 4121 /* 4122 * As an optimization to future operations on this PDE, clear 4123 * PG_PROMOTED. The impending invalidation will remove any 4124 * lingering 4KB page mappings from the TLB. 4125 */ 4126 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 4127 goto retry; 4128 if ((oldpde & PG_G) != 0) 4129 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 4130 else 4131 anychanged = TRUE; 4132 } 4133 return (anychanged); 4134} 4135 4136/* 4137 * Set the physical protection on the 4138 * specified range of this map as requested. 4139 */ 4140void 4141pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4142{ 4143 vm_offset_t va_next; 4144 pml4_entry_t *pml4e; 4145 pdp_entry_t *pdpe; 4146 pd_entry_t ptpaddr, *pde; 4147 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 4148 boolean_t anychanged; 4149 4150 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4151 if (prot == VM_PROT_NONE) { 4152 pmap_remove(pmap, sva, eva); 4153 return; 4154 } 4155 4156 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4157 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4158 return; 4159 4160 PG_G = pmap_global_bit(pmap); 4161 PG_M = pmap_modified_bit(pmap); 4162 PG_V = pmap_valid_bit(pmap); 4163 PG_RW = pmap_rw_bit(pmap); 4164 anychanged = FALSE; 4165 4166 /* 4167 * Although this function delays and batches the invalidation 4168 * of stale TLB entries, it does not need to call 4169 * pmap_delayed_invl_started() and 4170 * pmap_delayed_invl_finished(), because it does not 4171 * ordinarily destroy mappings. Stale TLB entries from 4172 * protection-only changes need only be invalidated before the 4173 * pmap lock is released, because protection-only changes do 4174 * not destroy PV entries. Even operations that iterate over 4175 * a physical page's PV list of mappings, like 4176 * pmap_remove_write(), acquire the pmap lock for each 4177 * mapping. Consequently, for protection-only changes, the 4178 * pmap lock suffices to synchronize both page table and TLB 4179 * updates. 
4180 * 4181 * This function only destroys a mapping if pmap_demote_pde() 4182 * fails. In that case, stale TLB entries are immediately 4183 * invalidated. 4184 */ 4185 4186 PMAP_LOCK(pmap); 4187 for (; sva < eva; sva = va_next) { 4188 4189 pml4e = pmap_pml4e(pmap, sva); 4190 if ((*pml4e & PG_V) == 0) { 4191 va_next = (sva + NBPML4) & ~PML4MASK; 4192 if (va_next < sva) 4193 va_next = eva; 4194 continue; 4195 } 4196 4197 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4198 if ((*pdpe & PG_V) == 0) { 4199 va_next = (sva + NBPDP) & ~PDPMASK; 4200 if (va_next < sva) 4201 va_next = eva; 4202 continue; 4203 } 4204 4205 va_next = (sva + NBPDR) & ~PDRMASK; 4206 if (va_next < sva) 4207 va_next = eva; 4208 4209 pde = pmap_pdpe_to_pde(pdpe, sva); 4210 ptpaddr = *pde; 4211 4212 /* 4213 * Weed out invalid mappings. 4214 */ 4215 if (ptpaddr == 0) 4216 continue; 4217 4218 /* 4219 * Check for large page. 4220 */ 4221 if ((ptpaddr & PG_PS) != 0) { 4222 /* 4223 * Are we protecting the entire large page? If not, 4224 * demote the mapping and fall through. 4225 */ 4226 if (sva + NBPDR == va_next && eva >= va_next) { 4227 /* 4228 * The TLB entry for a PG_G mapping is 4229 * invalidated by pmap_protect_pde(). 4230 */ 4231 if (pmap_protect_pde(pmap, pde, sva, prot)) 4232 anychanged = TRUE; 4233 continue; 4234 } else if (!pmap_demote_pde(pmap, pde, sva)) { 4235 /* 4236 * The large page mapping was destroyed. 4237 */ 4238 continue; 4239 } 4240 } 4241 4242 if (va_next > eva) 4243 va_next = eva; 4244 4245 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4246 sva += PAGE_SIZE) { 4247 pt_entry_t obits, pbits; 4248 vm_page_t m; 4249 4250retry: 4251 obits = pbits = *pte; 4252 if ((pbits & PG_V) == 0) 4253 continue; 4254 4255 if ((prot & VM_PROT_WRITE) == 0) { 4256 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4257 (PG_MANAGED | PG_M | PG_RW)) { 4258 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4259 vm_page_dirty(m); 4260 } 4261 pbits &= ~(PG_RW | PG_M); 4262 } 4263 if ((prot & VM_PROT_EXECUTE) == 0) 4264 pbits |= pg_nx; 4265 4266 if (pbits != obits) { 4267 if (!atomic_cmpset_long(pte, obits, pbits)) 4268 goto retry; 4269 if (obits & PG_G) 4270 pmap_invalidate_page(pmap, sva); 4271 else 4272 anychanged = TRUE; 4273 } 4274 } 4275 } 4276 if (anychanged) 4277 pmap_invalidate_all(pmap); 4278 PMAP_UNLOCK(pmap); 4279} 4280 4281#if VM_NRESERVLEVEL > 0 4282/* 4283 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4284 * single page table page (PTP) to a single 2MB page mapping. For promotion 4285 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4286 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4287 * identical characteristics. 4288 */ 4289static void 4290pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 4291 struct rwlock **lockp) 4292{ 4293 pd_entry_t newpde; 4294 pt_entry_t *firstpte, oldpte, pa, *pte; 4295 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; 4296 vm_page_t mpte; 4297 int PG_PTE_CACHE; 4298 4299 PG_A = pmap_accessed_bit(pmap); 4300 PG_G = pmap_global_bit(pmap); 4301 PG_M = pmap_modified_bit(pmap); 4302 PG_V = pmap_valid_bit(pmap); 4303 PG_RW = pmap_rw_bit(pmap); 4304 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4305 4306 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4307 4308 /* 4309 * Examine the first PTE in the specified PTP. Abort if this PTE is 4310 * either invalid, unused, or does not map the first 4KB physical page 4311 * within a 2MB page. 
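 * (In the test below, a clear PG_V means invalid, a clear PG_A means
 * unused, and a nonzero (PG_FRAME & PDRMASK) component means the frame is
 * not the first 4KB page of a 2MB-aligned run.)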
4312 */ 4313 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 4314setpde: 4315 newpde = *firstpte; 4316 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 4317 atomic_add_long(&pmap_pde_p_failures, 1); 4318 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4319 " in pmap %p", va, pmap); 4320 return; 4321 } 4322 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 4323 /* 4324 * When PG_M is already clear, PG_RW can be cleared without 4325 * a TLB invalidation. 4326 */ 4327 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 4328 goto setpde; 4329 newpde &= ~PG_RW; 4330 } 4331 4332 /* 4333 * Examine each of the other PTEs in the specified PTP. Abort if this 4334 * PTE maps an unexpected 4KB physical page or does not have identical 4335 * characteristics to the first PTE. 4336 */ 4337 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 4338 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 4339setpte: 4340 oldpte = *pte; 4341 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 4342 atomic_add_long(&pmap_pde_p_failures, 1); 4343 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4344 " in pmap %p", va, pmap); 4345 return; 4346 } 4347 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 4348 /* 4349 * When PG_M is already clear, PG_RW can be cleared 4350 * without a TLB invalidation. 4351 */ 4352 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 4353 goto setpte; 4354 oldpte &= ~PG_RW; 4355 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 4356 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 4357 (va & ~PDRMASK), pmap); 4358 } 4359 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 4360 atomic_add_long(&pmap_pde_p_failures, 1); 4361 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4362 " in pmap %p", va, pmap); 4363 return; 4364 } 4365 pa -= PAGE_SIZE; 4366 } 4367 4368 /* 4369 * Save the page table page in its current state until the PDE 4370 * mapping the superpage is demoted by pmap_demote_pde() or 4371 * destroyed by pmap_remove_pde(). 4372 */ 4373 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4374 KASSERT(mpte >= vm_page_array && 4375 mpte < &vm_page_array[vm_page_array_size], 4376 ("pmap_promote_pde: page table page is out of range")); 4377 KASSERT(mpte->pindex == pmap_pde_pindex(va), 4378 ("pmap_promote_pde: page table page's pindex is wrong")); 4379 if (pmap_insert_pt_page(pmap, mpte)) { 4380 atomic_add_long(&pmap_pde_p_failures, 1); 4381 CTR2(KTR_PMAP, 4382 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 4383 pmap); 4384 return; 4385 } 4386 4387 /* 4388 * Promote the pv entries. 4389 */ 4390 if ((newpde & PG_MANAGED) != 0) 4391 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 4392 4393 /* 4394 * Propagate the PAT index to its proper position. 4395 */ 4396 newpde = pmap_swap_pat(pmap, newpde); 4397 4398 /* 4399 * Map the superpage. 4400 */ 4401 if (workaround_erratum383) 4402 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 4403 else 4404 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 4405 4406 atomic_add_long(&pmap_pde_promotions, 1); 4407 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 4408 " in pmap %p", va, pmap); 4409} 4410#endif /* VM_NRESERVLEVEL > 0 */ 4411 4412/* 4413 * Insert the given physical page (p) at 4414 * the specified virtual address (v) in the 4415 * target physical map with the protection requested. 4416 * 4417 * If specified, the page will be wired down, meaning 4418 * that the related pte can not be reclaimed. 
4419 * 4420 * NB: This is the only routine which MAY NOT lazy-evaluate 4421 * or lose information. That is, this routine must actually 4422 * insert this page into the given map NOW. 4423 * 4424 * When destroying both a page table and PV entry, this function 4425 * performs the TLB invalidation before releasing the PV list 4426 * lock, so we do not need pmap_delayed_invl_page() calls here. 4427 */ 4428int 4429pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4430 u_int flags, int8_t psind) 4431{ 4432 struct rwlock *lock; 4433 pd_entry_t *pde; 4434 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 4435 pt_entry_t newpte, origpte; 4436 pv_entry_t pv; 4437 vm_paddr_t opa, pa; 4438 vm_page_t mpte, om; 4439 int rv; 4440 boolean_t nosleep; 4441 4442 PG_A = pmap_accessed_bit(pmap); 4443 PG_G = pmap_global_bit(pmap); 4444 PG_M = pmap_modified_bit(pmap); 4445 PG_V = pmap_valid_bit(pmap); 4446 PG_RW = pmap_rw_bit(pmap); 4447 4448 va = trunc_page(va); 4449 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 4450 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 4451 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 4452 va)); 4453 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 4454 va >= kmi.clean_eva, 4455 ("pmap_enter: managed mapping within the clean submap")); 4456 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 4457 VM_OBJECT_ASSERT_LOCKED(m->object); 4458 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 4459 ("pmap_enter: flags %u has reserved bits set", flags)); 4460 pa = VM_PAGE_TO_PHYS(m); 4461 newpte = (pt_entry_t)(pa | PG_A | PG_V); 4462 if ((flags & VM_PROT_WRITE) != 0) 4463 newpte |= PG_M; 4464 if ((prot & VM_PROT_WRITE) != 0) 4465 newpte |= PG_RW; 4466 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 4467 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 4468 if ((prot & VM_PROT_EXECUTE) == 0) 4469 newpte |= pg_nx; 4470 if ((flags & PMAP_ENTER_WIRED) != 0) 4471 newpte |= PG_W; 4472 if (va < VM_MAXUSER_ADDRESS) 4473 newpte |= PG_U; 4474 if (pmap == kernel_pmap) 4475 newpte |= PG_G; 4476 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 4477 4478 /* 4479 * Set modified bit gratuitously for writeable mappings if 4480 * the page is unmanaged. We do not want to take a fault 4481 * to do the dirty bit accounting for these mappings. 4482 */ 4483 if ((m->oflags & VPO_UNMANAGED) != 0) { 4484 if ((newpte & PG_RW) != 0) 4485 newpte |= PG_M; 4486 } else 4487 newpte |= PG_MANAGED; 4488 4489 lock = NULL; 4490 PMAP_LOCK(pmap); 4491 if (psind == 1) { 4492 /* Assert the required virtual and physical alignment. */ 4493 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 4494 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 4495 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 4496 goto out; 4497 } 4498 mpte = NULL; 4499 4500 /* 4501 * In the case that a page table page is not 4502 * resident, we are creating it here. 4503 */ 4504retry: 4505 pde = pmap_pde(pmap, va); 4506 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 4507 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 4508 pte = pmap_pde_to_pte(pde, va); 4509 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 4510 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4511 mpte->wire_count++; 4512 } 4513 } else if (va < VM_MAXUSER_ADDRESS) { 4514 /* 4515 * Here if the pte page isn't mapped, or if it has been 4516 * deallocated. 
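 * Unless PMAP_ENTER_NOSLEEP was specified, the allocation below may sleep
 * and release the pmap lock, so the page table walk is restarted at
 * "retry" rather than assuming the PDE is unchanged.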
4517 */ 4518 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4519 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), 4520 nosleep ? NULL : &lock); 4521 if (mpte == NULL && nosleep) { 4522 rv = KERN_RESOURCE_SHORTAGE; 4523 goto out; 4524 } 4525 goto retry; 4526 } else 4527 panic("pmap_enter: invalid page directory va=%#lx", va); 4528 4529 origpte = *pte; 4530 4531 /* 4532 * Is the specified virtual address already mapped? 4533 */ 4534 if ((origpte & PG_V) != 0) { 4535 /* 4536 * Wiring change, just update stats. We don't worry about 4537 * wiring PT pages as they remain resident as long as there 4538 * are valid mappings in them. Hence, if a user page is wired, 4539 * the PT page will be also. 4540 */ 4541 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 4542 pmap->pm_stats.wired_count++; 4543 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 4544 pmap->pm_stats.wired_count--; 4545 4546 /* 4547 * Remove the extra PT page reference. 4548 */ 4549 if (mpte != NULL) { 4550 mpte->wire_count--; 4551 KASSERT(mpte->wire_count > 0, 4552 ("pmap_enter: missing reference to page table page," 4553 " va: 0x%lx", va)); 4554 } 4555 4556 /* 4557 * Has the physical page changed? 4558 */ 4559 opa = origpte & PG_FRAME; 4560 if (opa == pa) { 4561 /* 4562 * No, might be a protection or wiring change. 4563 */ 4564 if ((origpte & PG_MANAGED) != 0 && 4565 (newpte & PG_RW) != 0) 4566 vm_page_aflag_set(m, PGA_WRITEABLE); 4567 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 4568 goto unchanged; 4569 goto validate; 4570 } 4571 } else { 4572 /* 4573 * Increment the counters. 4574 */ 4575 if ((newpte & PG_W) != 0) 4576 pmap->pm_stats.wired_count++; 4577 pmap_resident_count_inc(pmap, 1); 4578 } 4579 4580 /* 4581 * Enter on the PV list if part of our managed memory. 4582 */ 4583 if ((newpte & PG_MANAGED) != 0) { 4584 pv = get_pv_entry(pmap, &lock); 4585 pv->pv_va = va; 4586 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4587 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4588 m->md.pv_gen++; 4589 if ((newpte & PG_RW) != 0) 4590 vm_page_aflag_set(m, PGA_WRITEABLE); 4591 } 4592 4593 /* 4594 * Update the PTE. 4595 */ 4596 if ((origpte & PG_V) != 0) { 4597validate: 4598 origpte = pte_load_store(pte, newpte); 4599 opa = origpte & PG_FRAME; 4600 if (opa != pa) { 4601 if ((origpte & PG_MANAGED) != 0) { 4602 om = PHYS_TO_VM_PAGE(opa); 4603 if ((origpte & (PG_M | PG_RW)) == (PG_M | 4604 PG_RW)) 4605 vm_page_dirty(om); 4606 if ((origpte & PG_A) != 0) 4607 vm_page_aflag_set(om, PGA_REFERENCED); 4608 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 4609 pmap_pvh_free(&om->md, pmap, va); 4610 if ((om->aflags & PGA_WRITEABLE) != 0 && 4611 TAILQ_EMPTY(&om->md.pv_list) && 4612 ((om->flags & PG_FICTITIOUS) != 0 || 4613 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4614 vm_page_aflag_clear(om, PGA_WRITEABLE); 4615 } 4616 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | 4617 PG_RW)) == (PG_M | PG_RW)) { 4618 if ((origpte & PG_MANAGED) != 0) 4619 vm_page_dirty(m); 4620 4621 /* 4622 * Although the PTE may still have PG_RW set, TLB 4623 * invalidation may nonetheless be required because 4624 * the PTE no longer has PG_M set. 4625 */ 4626 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 4627 /* 4628 * This PTE change does not require TLB invalidation. 
4629 */ 4630 goto unchanged; 4631 } 4632 if ((origpte & PG_A) != 0) 4633 pmap_invalidate_page(pmap, va); 4634 } else 4635 pte_store(pte, newpte); 4636 4637unchanged: 4638 4639#if VM_NRESERVLEVEL > 0 4640 /* 4641 * If both the page table page and the reservation are fully 4642 * populated, then attempt promotion. 4643 */ 4644 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 4645 pmap_ps_enabled(pmap) && 4646 (m->flags & PG_FICTITIOUS) == 0 && 4647 vm_reserv_level_iffullpop(m) == 0) 4648 pmap_promote_pde(pmap, pde, va, &lock); 4649#endif 4650 4651 rv = KERN_SUCCESS; 4652out: 4653 if (lock != NULL) 4654 rw_wunlock(lock); 4655 PMAP_UNLOCK(pmap); 4656 return (rv); 4657} 4658 4659/* 4660 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 4661 * if successful. Returns false if (1) a page table page cannot be allocated 4662 * without sleeping, (2) a mapping already exists at the specified virtual 4663 * address, or (3) a PV entry cannot be allocated without reclaiming another 4664 * PV entry. 4665 */ 4666static bool 4667pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4668 struct rwlock **lockp) 4669{ 4670 pd_entry_t newpde; 4671 pt_entry_t PG_V; 4672 4673 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4674 PG_V = pmap_valid_bit(pmap); 4675 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 4676 PG_PS | PG_V; 4677 if ((m->oflags & VPO_UNMANAGED) == 0) 4678 newpde |= PG_MANAGED; 4679 if ((prot & VM_PROT_EXECUTE) == 0) 4680 newpde |= pg_nx; 4681 if (va < VM_MAXUSER_ADDRESS) 4682 newpde |= PG_U; 4683 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 4684 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 4685 KERN_SUCCESS); 4686} 4687 4688/* 4689 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 4690 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 4691 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 4692 * a mapping already exists at the specified virtual address. Returns 4693 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 4694 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 4695 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 4696 * 4697 * The parameter "m" is only used when creating a managed, writeable mapping. 4698 */ 4699static int 4700pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 4701 vm_page_t m, struct rwlock **lockp) 4702{ 4703 struct spglist free; 4704 pd_entry_t oldpde, *pde; 4705 pt_entry_t PG_G, PG_RW, PG_V; 4706 vm_page_t mt, pdpg; 4707 4708 PG_G = pmap_global_bit(pmap); 4709 PG_RW = pmap_rw_bit(pmap); 4710 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 4711 ("pmap_enter_pde: newpde is missing PG_M")); 4712 PG_V = pmap_valid_bit(pmap); 4713 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4714 4715 if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 
4716 NULL : lockp)) == NULL) { 4717 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4718 " in pmap %p", va, pmap); 4719 return (KERN_RESOURCE_SHORTAGE); 4720 } 4721 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4722 pde = &pde[pmap_pde_index(va)]; 4723 oldpde = *pde; 4724 if ((oldpde & PG_V) != 0) { 4725 KASSERT(pdpg->wire_count > 1, 4726 ("pmap_enter_pde: pdpg's wire count is too low")); 4727 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 4728 pdpg->wire_count--; 4729 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4730 " in pmap %p", va, pmap); 4731 return (KERN_FAILURE); 4732 } 4733 /* Break the existing mapping(s). */ 4734 SLIST_INIT(&free); 4735 if ((oldpde & PG_PS) != 0) { 4736 /* 4737 * The reference to the PD page that was acquired by 4738 * pmap_allocpde() ensures that it won't be freed. 4739 * However, if the PDE resulted from a promotion, then 4740 * a reserved PT page could be freed. 4741 */ 4742 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 4743 if ((oldpde & PG_G) == 0) 4744 pmap_invalidate_pde_page(pmap, va, oldpde); 4745 } else { 4746 pmap_delayed_invl_started(); 4747 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 4748 lockp)) 4749 pmap_invalidate_all(pmap); 4750 pmap_delayed_invl_finished(); 4751 } 4752 pmap_free_zero_pages(&free); 4753 if (va >= VM_MAXUSER_ADDRESS) { 4754 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4755 if (pmap_insert_pt_page(pmap, mt)) { 4756 /* 4757 * XXX Currently, this can't happen because 4758 * we do not perform pmap_enter(psind == 1) 4759 * on the kernel pmap. 4760 */ 4761 panic("pmap_enter_pde: trie insert failed"); 4762 } 4763 } else 4764 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 4765 pde)); 4766 } 4767 if ((newpde & PG_MANAGED) != 0) { 4768 /* 4769 * Abort this mapping if its PV entry could not be created. 4770 */ 4771 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 4772 SLIST_INIT(&free); 4773 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 4774 /* 4775 * Although "va" is not mapped, paging- 4776 * structure caches could nonetheless have 4777 * entries that refer to the freed page table 4778 * pages. Invalidate those entries. 4779 */ 4780 pmap_invalidate_page(pmap, va); 4781 pmap_free_zero_pages(&free); 4782 } 4783 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4784 " in pmap %p", va, pmap); 4785 return (KERN_RESOURCE_SHORTAGE); 4786 } 4787 if ((newpde & PG_RW) != 0) { 4788 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4789 vm_page_aflag_set(mt, PGA_WRITEABLE); 4790 } 4791 } 4792 4793 /* 4794 * Increment counters. 4795 */ 4796 if ((newpde & PG_W) != 0) 4797 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 4798 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4799 4800 /* 4801 * Map the superpage. (This is not a promoted mapping; there will not 4802 * be any lingering 4KB page mappings in the TLB.) 4803 */ 4804 pde_store(pde, newpde); 4805 4806 atomic_add_long(&pmap_pde_mappings, 1); 4807 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 4808 " in pmap %p", va, pmap); 4809 return (KERN_SUCCESS); 4810} 4811 4812/* 4813 * Maps a sequence of resident pages belonging to the same object. 4814 * The sequence begins with the given page m_start. This page is 4815 * mapped at the given virtual address start. Each subsequent page is 4816 * mapped at a virtual address that is offset from start by the same 4817 * amount as the page is offset from m_start within the object. 
The 4818 * last page in the sequence is the page with the largest offset from 4819 * m_start that can be mapped at a virtual address less than the given 4820 * virtual address end. Not every virtual page between start and end 4821 * is mapped; only those for which a resident page exists with the 4822 * corresponding offset from m_start are mapped. 4823 */ 4824void 4825pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4826 vm_page_t m_start, vm_prot_t prot) 4827{ 4828 struct rwlock *lock; 4829 vm_offset_t va; 4830 vm_page_t m, mpte; 4831 vm_pindex_t diff, psize; 4832 4833 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4834 4835 psize = atop(end - start); 4836 mpte = NULL; 4837 m = m_start; 4838 lock = NULL; 4839 PMAP_LOCK(pmap); 4840 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4841 va = start + ptoa(diff); 4842 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 4843 m->psind == 1 && pmap_ps_enabled(pmap) && 4844 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 4845 m = &m[NBPDR / PAGE_SIZE - 1]; 4846 else 4847 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 4848 mpte, &lock); 4849 m = TAILQ_NEXT(m, listq); 4850 } 4851 if (lock != NULL) 4852 rw_wunlock(lock); 4853 PMAP_UNLOCK(pmap); 4854} 4855 4856/* 4857 * this code makes some *MAJOR* assumptions: 4858 * 1. Current pmap & pmap exists. 4859 * 2. Not wired. 4860 * 3. Read access. 4861 * 4. No page table pages. 4862 * but is *MUCH* faster than pmap_enter... 4863 */ 4864 4865void 4866pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4867{ 4868 struct rwlock *lock; 4869 4870 lock = NULL; 4871 PMAP_LOCK(pmap); 4872 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 4873 if (lock != NULL) 4874 rw_wunlock(lock); 4875 PMAP_UNLOCK(pmap); 4876} 4877 4878static vm_page_t 4879pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4880 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 4881{ 4882 struct spglist free; 4883 pt_entry_t *pte, PG_V; 4884 vm_paddr_t pa; 4885 4886 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4887 (m->oflags & VPO_UNMANAGED) != 0, 4888 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4889 PG_V = pmap_valid_bit(pmap); 4890 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4891 4892 /* 4893 * In the case that a page table page is not 4894 * resident, we are creating it here. 4895 */ 4896 if (va < VM_MAXUSER_ADDRESS) { 4897 vm_pindex_t ptepindex; 4898 pd_entry_t *ptepa; 4899 4900 /* 4901 * Calculate pagetable page index 4902 */ 4903 ptepindex = pmap_pde_pindex(va); 4904 if (mpte && (mpte->pindex == ptepindex)) { 4905 mpte->wire_count++; 4906 } else { 4907 /* 4908 * Get the page directory entry 4909 */ 4910 ptepa = pmap_pde(pmap, va); 4911 4912 /* 4913 * If the page table page is mapped, we just increment 4914 * the hold count, and activate it. Otherwise, we 4915 * attempt to allocate a page table page. If this 4916 * attempt fails, we don't retry. Instead, we give up. 4917 */ 4918 if (ptepa && (*ptepa & PG_V) != 0) { 4919 if (*ptepa & PG_PS) 4920 return (NULL); 4921 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 4922 mpte->wire_count++; 4923 } else { 4924 /* 4925 * Pass NULL instead of the PV list lock 4926 * pointer, because we don't intend to sleep. 
4927 */ 4928 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 4929 if (mpte == NULL) 4930 return (mpte); 4931 } 4932 } 4933 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4934 pte = &pte[pmap_pte_index(va)]; 4935 } else { 4936 mpte = NULL; 4937 pte = vtopte(va); 4938 } 4939 if (*pte) { 4940 if (mpte != NULL) { 4941 mpte->wire_count--; 4942 mpte = NULL; 4943 } 4944 return (mpte); 4945 } 4946 4947 /* 4948 * Enter on the PV list if part of our managed memory. 4949 */ 4950 if ((m->oflags & VPO_UNMANAGED) == 0 && 4951 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4952 if (mpte != NULL) { 4953 SLIST_INIT(&free); 4954 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4955 /* 4956 * Although "va" is not mapped, paging- 4957 * structure caches could nonetheless have 4958 * entries that refer to the freed page table 4959 * pages. Invalidate those entries. 4960 */ 4961 pmap_invalidate_page(pmap, va); 4962 pmap_free_zero_pages(&free); 4963 } 4964 mpte = NULL; 4965 } 4966 return (mpte); 4967 } 4968 4969 /* 4970 * Increment counters 4971 */ 4972 pmap_resident_count_inc(pmap, 1); 4973 4974 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); 4975 if ((prot & VM_PROT_EXECUTE) == 0) 4976 pa |= pg_nx; 4977 4978 /* 4979 * Now validate mapping with RO protection 4980 */ 4981 if ((m->oflags & VPO_UNMANAGED) != 0) 4982 pte_store(pte, pa | PG_V | PG_U); 4983 else 4984 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 4985 return (mpte); 4986} 4987 4988/* 4989 * Make a temporary mapping for a physical address. This is only intended 4990 * to be used for panic dumps. 4991 */ 4992void * 4993pmap_kenter_temporary(vm_paddr_t pa, int i) 4994{ 4995 vm_offset_t va; 4996 4997 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 4998 pmap_kenter(va, pa); 4999 invlpg(va); 5000 return ((void *)crashdumpmap); 5001} 5002 5003/* 5004 * This code maps large physical mmap regions into the 5005 * processor address space. Note that some shortcuts 5006 * are taken, but the code works. 5007 */ 5008void 5009pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 5010 vm_pindex_t pindex, vm_size_t size) 5011{ 5012 pd_entry_t *pde; 5013 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 5014 vm_paddr_t pa, ptepa; 5015 vm_page_t p, pdpg; 5016 int pat_mode; 5017 5018 PG_A = pmap_accessed_bit(pmap); 5019 PG_M = pmap_modified_bit(pmap); 5020 PG_V = pmap_valid_bit(pmap); 5021 PG_RW = pmap_rw_bit(pmap); 5022 5023 VM_OBJECT_ASSERT_WLOCKED(object); 5024 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 5025 ("pmap_object_init_pt: non-device object")); 5026 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 5027 if (!pmap_ps_enabled(pmap)) 5028 return; 5029 if (!vm_object_populate(object, pindex, pindex + atop(size))) 5030 return; 5031 p = vm_page_lookup(object, pindex); 5032 KASSERT(p->valid == VM_PAGE_BITS_ALL, 5033 ("pmap_object_init_pt: invalid page %p", p)); 5034 pat_mode = p->md.pat_mode; 5035 5036 /* 5037 * Abort the mapping if the first page is not physically 5038 * aligned to a 2MB page boundary. 5039 */ 5040 ptepa = VM_PAGE_TO_PHYS(p); 5041 if (ptepa & (NBPDR - 1)) 5042 return; 5043 5044 /* 5045 * Skip the first page. Abort the mapping if the rest of 5046 * the pages are not physically contiguous or have differing 5047 * memory attributes. 
5048 */ 5049 p = TAILQ_NEXT(p, listq); 5050 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 5051 pa += PAGE_SIZE) { 5052 KASSERT(p->valid == VM_PAGE_BITS_ALL, 5053 ("pmap_object_init_pt: invalid page %p", p)); 5054 if (pa != VM_PAGE_TO_PHYS(p) || 5055 pat_mode != p->md.pat_mode) 5056 return; 5057 p = TAILQ_NEXT(p, listq); 5058 } 5059 5060 /* 5061 * Map using 2MB pages. Since "ptepa" is 2M aligned and 5062 * "size" is a multiple of 2M, adding the PAT setting to "pa" 5063 * will not affect the termination of this loop. 5064 */ 5065 PMAP_LOCK(pmap); 5066 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 5067 pa < ptepa + size; pa += NBPDR) { 5068 pdpg = pmap_allocpde(pmap, addr, NULL); 5069 if (pdpg == NULL) { 5070 /* 5071 * The creation of mappings below is only an 5072 * optimization. If a page directory page 5073 * cannot be allocated without blocking, 5074 * continue on to the next mapping rather than 5075 * blocking. 5076 */ 5077 addr += NBPDR; 5078 continue; 5079 } 5080 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 5081 pde = &pde[pmap_pde_index(addr)]; 5082 if ((*pde & PG_V) == 0) { 5083 pde_store(pde, pa | PG_PS | PG_M | PG_A | 5084 PG_U | PG_RW | PG_V); 5085 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 5086 atomic_add_long(&pmap_pde_mappings, 1); 5087 } else { 5088 /* Continue on if the PDE is already valid. */ 5089 pdpg->wire_count--; 5090 KASSERT(pdpg->wire_count > 0, 5091 ("pmap_object_init_pt: missing reference " 5092 "to page directory page, va: 0x%lx", addr)); 5093 } 5094 addr += NBPDR; 5095 } 5096 PMAP_UNLOCK(pmap); 5097 } 5098} 5099 5100/* 5101 * Clear the wired attribute from the mappings for the specified range of 5102 * addresses in the given pmap. Every valid mapping within that range 5103 * must have the wired attribute set. In contrast, invalid mappings 5104 * cannot have the wired attribute set, so they are ignored. 5105 * 5106 * The wired attribute of the page table entry is not a hardware 5107 * feature, so there is no need to invalidate any TLB entries. 5108 * Since pmap_demote_pde() for the wired entry must never fail, 5109 * pmap_delayed_invl_started()/finished() calls around the 5110 * function are not needed. 5111 */ 5112void 5113pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5114{ 5115 vm_offset_t va_next; 5116 pml4_entry_t *pml4e; 5117 pdp_entry_t *pdpe; 5118 pd_entry_t *pde; 5119 pt_entry_t *pte, PG_V; 5120 5121 PG_V = pmap_valid_bit(pmap); 5122 PMAP_LOCK(pmap); 5123 for (; sva < eva; sva = va_next) { 5124 pml4e = pmap_pml4e(pmap, sva); 5125 if ((*pml4e & PG_V) == 0) { 5126 va_next = (sva + NBPML4) & ~PML4MASK; 5127 if (va_next < sva) 5128 va_next = eva; 5129 continue; 5130 } 5131 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 5132 if ((*pdpe & PG_V) == 0) { 5133 va_next = (sva + NBPDP) & ~PDPMASK; 5134 if (va_next < sva) 5135 va_next = eva; 5136 continue; 5137 } 5138 va_next = (sva + NBPDR) & ~PDRMASK; 5139 if (va_next < sva) 5140 va_next = eva; 5141 pde = pmap_pdpe_to_pde(pdpe, sva); 5142 if ((*pde & PG_V) == 0) 5143 continue; 5144 if ((*pde & PG_PS) != 0) { 5145 if ((*pde & PG_W) == 0) 5146 panic("pmap_unwire: pde %#jx is missing PG_W", 5147 (uintmax_t)*pde); 5148 5149 /* 5150 * Are we unwiring the entire large page? If not, 5151 * demote the mapping and fall through. 
5152 */ 5153 if (sva + NBPDR == va_next && eva >= va_next) { 5154 atomic_clear_long(pde, PG_W); 5155 pmap->pm_stats.wired_count -= NBPDR / 5156 PAGE_SIZE; 5157 continue; 5158 } else if (!pmap_demote_pde(pmap, pde, sva)) 5159 panic("pmap_unwire: demotion failed"); 5160 } 5161 if (va_next > eva) 5162 va_next = eva; 5163 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 5164 sva += PAGE_SIZE) { 5165 if ((*pte & PG_V) == 0) 5166 continue; 5167 if ((*pte & PG_W) == 0) 5168 panic("pmap_unwire: pte %#jx is missing PG_W", 5169 (uintmax_t)*pte); 5170 5171 /* 5172 * PG_W must be cleared atomically. Although the pmap 5173 * lock synchronizes access to PG_W, another processor 5174 * could be setting PG_M and/or PG_A concurrently. 5175 */ 5176 atomic_clear_long(pte, PG_W); 5177 pmap->pm_stats.wired_count--; 5178 } 5179 } 5180 PMAP_UNLOCK(pmap); 5181} 5182 5183/* 5184 * Copy the range specified by src_addr/len 5185 * from the source map to the range dst_addr/len 5186 * in the destination map. 5187 * 5188 * This routine is only advisory and need not do anything. 5189 */ 5190 5191void 5192pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 5193 vm_offset_t src_addr) 5194{ 5195 struct rwlock *lock; 5196 struct spglist free; 5197 vm_offset_t addr; 5198 vm_offset_t end_addr = src_addr + len; 5199 vm_offset_t va_next; 5200 vm_page_t dst_pdpg, dstmpte, srcmpte; 5201 pt_entry_t PG_A, PG_M, PG_V; 5202 5203 if (dst_addr != src_addr) 5204 return; 5205 5206 if (dst_pmap->pm_type != src_pmap->pm_type) 5207 return; 5208 5209 /* 5210 * EPT page table entries that require emulation of A/D bits are 5211 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 5212 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 5213 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT 5214 * implementations flag an EPT misconfiguration for exec-only 5215 * mappings we skip this function entirely for emulated pmaps. 
5216 */ 5217 if (pmap_emulate_ad_bits(dst_pmap)) 5218 return; 5219 5220 lock = NULL; 5221 if (dst_pmap < src_pmap) { 5222 PMAP_LOCK(dst_pmap); 5223 PMAP_LOCK(src_pmap); 5224 } else { 5225 PMAP_LOCK(src_pmap); 5226 PMAP_LOCK(dst_pmap); 5227 } 5228 5229 PG_A = pmap_accessed_bit(dst_pmap); 5230 PG_M = pmap_modified_bit(dst_pmap); 5231 PG_V = pmap_valid_bit(dst_pmap); 5232 5233 for (addr = src_addr; addr < end_addr; addr = va_next) { 5234 pt_entry_t *src_pte, *dst_pte; 5235 pml4_entry_t *pml4e; 5236 pdp_entry_t *pdpe; 5237 pd_entry_t srcptepaddr, *pde; 5238 5239 KASSERT(addr < UPT_MIN_ADDRESS, 5240 ("pmap_copy: invalid to pmap_copy page tables")); 5241 5242 pml4e = pmap_pml4e(src_pmap, addr); 5243 if ((*pml4e & PG_V) == 0) { 5244 va_next = (addr + NBPML4) & ~PML4MASK; 5245 if (va_next < addr) 5246 va_next = end_addr; 5247 continue; 5248 } 5249 5250 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 5251 if ((*pdpe & PG_V) == 0) { 5252 va_next = (addr + NBPDP) & ~PDPMASK; 5253 if (va_next < addr) 5254 va_next = end_addr; 5255 continue; 5256 } 5257 5258 va_next = (addr + NBPDR) & ~PDRMASK; 5259 if (va_next < addr) 5260 va_next = end_addr; 5261 5262 pde = pmap_pdpe_to_pde(pdpe, addr); 5263 srcptepaddr = *pde; 5264 if (srcptepaddr == 0) 5265 continue; 5266 5267 if (srcptepaddr & PG_PS) { 5268 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 5269 continue; 5270 dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); 5271 if (dst_pdpg == NULL) 5272 break; 5273 pde = (pd_entry_t *) 5274 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); 5275 pde = &pde[pmap_pde_index(addr)]; 5276 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 5277 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 5278 PMAP_ENTER_NORECLAIM, &lock))) { 5279 *pde = srcptepaddr & ~PG_W; 5280 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 5281 atomic_add_long(&pmap_pde_mappings, 1); 5282 } else 5283 dst_pdpg->wire_count--; 5284 continue; 5285 } 5286 5287 srcptepaddr &= PG_FRAME; 5288 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 5289 KASSERT(srcmpte->wire_count > 0, 5290 ("pmap_copy: source page table page is unused")); 5291 5292 if (va_next > end_addr) 5293 va_next = end_addr; 5294 5295 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 5296 src_pte = &src_pte[pmap_pte_index(addr)]; 5297 dstmpte = NULL; 5298 while (addr < va_next) { 5299 pt_entry_t ptetemp; 5300 ptetemp = *src_pte; 5301 /* 5302 * we only virtual copy managed pages 5303 */ 5304 if ((ptetemp & PG_MANAGED) != 0) { 5305 if (dstmpte != NULL && 5306 dstmpte->pindex == pmap_pde_pindex(addr)) 5307 dstmpte->wire_count++; 5308 else if ((dstmpte = pmap_allocpte(dst_pmap, 5309 addr, NULL)) == NULL) 5310 goto out; 5311 dst_pte = (pt_entry_t *) 5312 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 5313 dst_pte = &dst_pte[pmap_pte_index(addr)]; 5314 if (*dst_pte == 0 && 5315 pmap_try_insert_pv_entry(dst_pmap, addr, 5316 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 5317 &lock)) { 5318 /* 5319 * Clear the wired, modified, and 5320 * accessed (referenced) bits 5321 * during the copy. 5322 */ 5323 *dst_pte = ptetemp & ~(PG_W | PG_M | 5324 PG_A); 5325 pmap_resident_count_inc(dst_pmap, 1); 5326 } else { 5327 SLIST_INIT(&free); 5328 if (pmap_unwire_ptp(dst_pmap, addr, 5329 dstmpte, &free)) { 5330 /* 5331 * Although "addr" is not 5332 * mapped, paging-structure 5333 * caches could nonetheless 5334 * have entries that refer to 5335 * the freed page table pages. 5336 * Invalidate those entries. 
5337 */ 5338 pmap_invalidate_page(dst_pmap, 5339 addr); 5340 pmap_free_zero_pages(&free); 5341 } 5342 goto out; 5343 } 5344 if (dstmpte->wire_count >= srcmpte->wire_count) 5345 break; 5346 } 5347 addr += PAGE_SIZE; 5348 src_pte++; 5349 } 5350 } 5351out: 5352 if (lock != NULL) 5353 rw_wunlock(lock); 5354 PMAP_UNLOCK(src_pmap); 5355 PMAP_UNLOCK(dst_pmap); 5356} 5357 5358/* 5359 * pmap_zero_page zeros the specified hardware page by mapping 5360 * the page into KVM and using bzero to clear its contents. 5361 */ 5362void 5363pmap_zero_page(vm_page_t m) 5364{ 5365 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5366 5367 pagezero((void *)va); 5368} 5369 5370/* 5371 * pmap_zero_page_area zeros the specified hardware page by mapping 5372 * the page into KVM and using bzero to clear its contents. 5373 * 5374 * off and size may not cover an area beyond a single hardware page. 5375 */ 5376void 5377pmap_zero_page_area(vm_page_t m, int off, int size) 5378{ 5379 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5380 5381 if (off == 0 && size == PAGE_SIZE) 5382 pagezero((void *)va); 5383 else 5384 bzero((char *)va + off, size); 5385} 5386 5387/* 5388 * pmap_zero_page_idle zeros the specified hardware page by mapping 5389 * the page into KVM and using bzero to clear its contents. This 5390 * is intended to be called from the vm_pagezero process only and 5391 * outside of Giant. 5392 */ 5393void 5394pmap_zero_page_idle(vm_page_t m) 5395{ 5396 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5397 5398 pagezero((void *)va); 5399} 5400 5401/* 5402 * pmap_copy_page copies the specified (machine independent) 5403 * page by mapping the page into virtual memory and using 5404 * bcopy to copy the page, one machine dependent page at a 5405 * time. 5406 */ 5407void 5408pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 5409{ 5410 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 5411 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 5412 5413 pagecopy((void *)src, (void *)dst); 5414} 5415 5416int unmapped_buf_allowed = 1; 5417 5418void 5419pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5420 vm_offset_t b_offset, int xfersize) 5421{ 5422 void *a_cp, *b_cp; 5423 vm_page_t pages[2]; 5424 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 5425 int cnt; 5426 boolean_t mapped; 5427 5428 while (xfersize > 0) { 5429 a_pg_offset = a_offset & PAGE_MASK; 5430 pages[0] = ma[a_offset >> PAGE_SHIFT]; 5431 b_pg_offset = b_offset & PAGE_MASK; 5432 pages[1] = mb[b_offset >> PAGE_SHIFT]; 5433 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5434 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5435 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 5436 a_cp = (char *)vaddr[0] + a_pg_offset; 5437 b_cp = (char *)vaddr[1] + b_pg_offset; 5438 bcopy(a_cp, b_cp, cnt); 5439 if (__predict_false(mapped)) 5440 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 5441 a_offset += cnt; 5442 b_offset += cnt; 5443 xfersize -= cnt; 5444 } 5445} 5446 5447/* 5448 * Returns true if the pmap's pv is one of the first 5449 * 16 pvs linked to from this page. This count may 5450 * be changed upwards or downwards in the future; it 5451 * is only necessary that true be returned for a small 5452 * subset of pmaps for proper page aging. 
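 * Consequently, FALSE may be returned for a page that is in fact mapped
 * by the given pmap if that mapping's pv entry lies beyond the first 16
 * entries examined.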
5453 */ 5454boolean_t 5455pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5456{ 5457 struct md_page *pvh; 5458 struct rwlock *lock; 5459 pv_entry_t pv; 5460 int loops = 0; 5461 boolean_t rv; 5462 5463 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5464 ("pmap_page_exists_quick: page %p is not managed", m)); 5465 rv = FALSE; 5466 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5467 rw_rlock(lock); 5468 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5469 if (PV_PMAP(pv) == pmap) { 5470 rv = TRUE; 5471 break; 5472 } 5473 loops++; 5474 if (loops >= 16) 5475 break; 5476 } 5477 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5478 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5479 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5480 if (PV_PMAP(pv) == pmap) { 5481 rv = TRUE; 5482 break; 5483 } 5484 loops++; 5485 if (loops >= 16) 5486 break; 5487 } 5488 } 5489 rw_runlock(lock); 5490 return (rv); 5491} 5492 5493/* 5494 * pmap_page_wired_mappings: 5495 * 5496 * Return the number of managed mappings to the given physical page 5497 * that are wired. 5498 */ 5499int 5500pmap_page_wired_mappings(vm_page_t m) 5501{ 5502 struct rwlock *lock; 5503 struct md_page *pvh; 5504 pmap_t pmap; 5505 pt_entry_t *pte; 5506 pv_entry_t pv; 5507 int count, md_gen, pvh_gen; 5508 5509 if ((m->oflags & VPO_UNMANAGED) != 0) 5510 return (0); 5511 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5512 rw_rlock(lock); 5513restart: 5514 count = 0; 5515 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5516 pmap = PV_PMAP(pv); 5517 if (!PMAP_TRYLOCK(pmap)) { 5518 md_gen = m->md.pv_gen; 5519 rw_runlock(lock); 5520 PMAP_LOCK(pmap); 5521 rw_rlock(lock); 5522 if (md_gen != m->md.pv_gen) { 5523 PMAP_UNLOCK(pmap); 5524 goto restart; 5525 } 5526 } 5527 pte = pmap_pte(pmap, pv->pv_va); 5528 if ((*pte & PG_W) != 0) 5529 count++; 5530 PMAP_UNLOCK(pmap); 5531 } 5532 if ((m->flags & PG_FICTITIOUS) == 0) { 5533 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5534 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5535 pmap = PV_PMAP(pv); 5536 if (!PMAP_TRYLOCK(pmap)) { 5537 md_gen = m->md.pv_gen; 5538 pvh_gen = pvh->pv_gen; 5539 rw_runlock(lock); 5540 PMAP_LOCK(pmap); 5541 rw_rlock(lock); 5542 if (md_gen != m->md.pv_gen || 5543 pvh_gen != pvh->pv_gen) { 5544 PMAP_UNLOCK(pmap); 5545 goto restart; 5546 } 5547 } 5548 pte = pmap_pde(pmap, pv->pv_va); 5549 if ((*pte & PG_W) != 0) 5550 count++; 5551 PMAP_UNLOCK(pmap); 5552 } 5553 } 5554 rw_runlock(lock); 5555 return (count); 5556} 5557 5558/* 5559 * Returns TRUE if the given page is mapped individually or as part of 5560 * a 2mpage. Otherwise, returns FALSE. 5561 */ 5562boolean_t 5563pmap_page_is_mapped(vm_page_t m) 5564{ 5565 struct rwlock *lock; 5566 boolean_t rv; 5567 5568 if ((m->oflags & VPO_UNMANAGED) != 0) 5569 return (FALSE); 5570 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5571 rw_rlock(lock); 5572 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5573 ((m->flags & PG_FICTITIOUS) == 0 && 5574 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5575 rw_runlock(lock); 5576 return (rv); 5577} 5578 5579/* 5580 * Destroy all managed, non-wired mappings in the given user-space 5581 * pmap. This pmap cannot be active on any processor besides the 5582 * caller. 5583 * 5584 * This function cannot be applied to the kernel pmap. Moreover, it 5585 * is not intended for general use. It is only to be used during 5586 * process termination. Consequently, it can be implemented in ways 5587 * that make it faster than pmap_remove(). First, it can more quickly 5588 * destroy mappings by iterating over the pmap's collection of PV 5589 * entries, rather than searching the page table. 
Second, it doesn't 5590 * have to test and clear the page table entries atomically, because 5591 * no processor is currently accessing the user address space. In 5592 * particular, a page table entry's dirty bit won't change state once 5593 * this function starts. 5594 * 5595 * Although this function destroys all of the pmap's managed, 5596 * non-wired mappings, it can delay and batch the invalidation of TLB 5597 * entries without calling pmap_delayed_invl_started() and 5598 * pmap_delayed_invl_finished(). Because the pmap is not active on 5599 * any other processor, none of these TLB entries will ever be used 5600 * before their eventual invalidation. Consequently, there is no need 5601 * for either pmap_remove_all() or pmap_remove_write() to wait for 5602 * that eventual TLB invalidation. 5603 */ 5604void 5605pmap_remove_pages(pmap_t pmap) 5606{ 5607 pd_entry_t ptepde; 5608 pt_entry_t *pte, tpte; 5609 pt_entry_t PG_M, PG_RW, PG_V; 5610 struct spglist free; 5611 vm_page_t m, mpte, mt; 5612 pv_entry_t pv; 5613 struct md_page *pvh; 5614 struct pv_chunk *pc, *npc; 5615 struct rwlock *lock; 5616 int64_t bit; 5617 uint64_t inuse, bitmask; 5618 int allfree, field, freed, idx; 5619 boolean_t superpage; 5620 vm_paddr_t pa; 5621 5622 /* 5623 * Assert that the given pmap is only active on the current 5624 * CPU. Unfortunately, we cannot block another CPU from 5625 * activating the pmap while this function is executing. 5626 */ 5627 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 5628#ifdef INVARIANTS 5629 { 5630 cpuset_t other_cpus; 5631 5632 other_cpus = all_cpus; 5633 critical_enter(); 5634 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 5635 CPU_AND(&other_cpus, &pmap->pm_active); 5636 critical_exit(); 5637 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 5638 } 5639#endif 5640 5641 lock = NULL; 5642 PG_M = pmap_modified_bit(pmap); 5643 PG_V = pmap_valid_bit(pmap); 5644 PG_RW = pmap_rw_bit(pmap); 5645 5646 SLIST_INIT(&free); 5647 PMAP_LOCK(pmap); 5648 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5649 allfree = 1; 5650 freed = 0; 5651 for (field = 0; field < _NPCM; field++) { 5652 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5653 while (inuse != 0) { 5654 bit = bsfq(inuse); 5655 bitmask = 1UL << bit; 5656 idx = field * 64 + bit; 5657 pv = &pc->pc_pventry[idx]; 5658 inuse &= ~bitmask; 5659 5660 pte = pmap_pdpe(pmap, pv->pv_va); 5661 ptepde = *pte; 5662 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 5663 tpte = *pte; 5664 if ((tpte & (PG_PS | PG_V)) == PG_V) { 5665 superpage = FALSE; 5666 ptepde = tpte; 5667 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 5668 PG_FRAME); 5669 pte = &pte[pmap_pte_index(pv->pv_va)]; 5670 tpte = *pte; 5671 } else { 5672 /* 5673 * Keep track whether 'tpte' is a 5674 * superpage explicitly instead of 5675 * relying on PG_PS being set. 5676 * 5677 * This is because PG_PS is numerically 5678 * identical to PG_PTE_PAT and thus a 5679 * regular page could be mistaken for 5680 * a superpage. 
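					 * (Both PG_PS and PG_PTE_PAT are bit 7
					 * of the entry, so a 4KB PTE with its
					 * PAT bit set would otherwise appear
					 * to have PG_PS set.)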
5681 */ 5682 superpage = TRUE; 5683 } 5684 5685 if ((tpte & PG_V) == 0) { 5686 panic("bad pte va %lx pte %lx", 5687 pv->pv_va, tpte); 5688 } 5689 5690/* 5691 * We cannot remove wired pages from a process' mapping at this time 5692 */ 5693 if (tpte & PG_W) { 5694 allfree = 0; 5695 continue; 5696 } 5697 5698 if (superpage) 5699 pa = tpte & PG_PS_FRAME; 5700 else 5701 pa = tpte & PG_FRAME; 5702 5703 m = PHYS_TO_VM_PAGE(pa); 5704 KASSERT(m->phys_addr == pa, 5705 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5706 m, (uintmax_t)m->phys_addr, 5707 (uintmax_t)tpte)); 5708 5709 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5710 m < &vm_page_array[vm_page_array_size], 5711 ("pmap_remove_pages: bad tpte %#jx", 5712 (uintmax_t)tpte)); 5713 5714 pte_clear(pte); 5715 5716 /* 5717 * Update the vm_page_t clean/reference bits. 5718 */ 5719 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5720 if (superpage) { 5721 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5722 vm_page_dirty(mt); 5723 } else 5724 vm_page_dirty(m); 5725 } 5726 5727 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5728 5729 /* Mark free */ 5730 pc->pc_map[field] |= bitmask; 5731 if (superpage) { 5732 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 5733 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5734 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5735 pvh->pv_gen++; 5736 if (TAILQ_EMPTY(&pvh->pv_list)) { 5737 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5738 if ((mt->aflags & PGA_WRITEABLE) != 0 && 5739 TAILQ_EMPTY(&mt->md.pv_list)) 5740 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5741 } 5742 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 5743 if (mpte != NULL) { 5744 pmap_resident_count_dec(pmap, 1); 5745 KASSERT(mpte->wire_count == NPTEPG, 5746 ("pmap_remove_pages: pte page wire count error")); 5747 mpte->wire_count = 0; 5748 pmap_add_delayed_free_list(mpte, &free, FALSE); 5749 } 5750 } else { 5751 pmap_resident_count_dec(pmap, 1); 5752 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5753 m->md.pv_gen++; 5754 if ((m->aflags & PGA_WRITEABLE) != 0 && 5755 TAILQ_EMPTY(&m->md.pv_list) && 5756 (m->flags & PG_FICTITIOUS) == 0) { 5757 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5758 if (TAILQ_EMPTY(&pvh->pv_list)) 5759 vm_page_aflag_clear(m, PGA_WRITEABLE); 5760 } 5761 } 5762 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 5763 freed++; 5764 } 5765 } 5766 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5767 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5768 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5769 if (allfree) { 5770 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5771 free_pv_chunk(pc); 5772 } 5773 } 5774 if (lock != NULL) 5775 rw_wunlock(lock); 5776 pmap_invalidate_all(pmap); 5777 PMAP_UNLOCK(pmap); 5778 pmap_free_zero_pages(&free); 5779} 5780 5781static boolean_t 5782pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5783{ 5784 struct rwlock *lock; 5785 pv_entry_t pv; 5786 struct md_page *pvh; 5787 pt_entry_t *pte, mask; 5788 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 5789 pmap_t pmap; 5790 int md_gen, pvh_gen; 5791 boolean_t rv; 5792 5793 rv = FALSE; 5794 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5795 rw_rlock(lock); 5796restart: 5797 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5798 pmap = PV_PMAP(pv); 5799 if (!PMAP_TRYLOCK(pmap)) { 5800 md_gen = m->md.pv_gen; 5801 rw_runlock(lock); 5802 PMAP_LOCK(pmap); 5803 rw_rlock(lock); 5804 if (md_gen != m->md.pv_gen) { 5805 PMAP_UNLOCK(pmap); 5806 goto restart; 5807 } 5808 } 5809 pte = pmap_pte(pmap, pv->pv_va); 5810 mask = 0; 5811 if (modified) { 5812 PG_M = pmap_modified_bit(pmap); 5813 PG_RW 
= pmap_rw_bit(pmap); 5814 mask |= PG_RW | PG_M; 5815 } 5816 if (accessed) { 5817 PG_A = pmap_accessed_bit(pmap); 5818 PG_V = pmap_valid_bit(pmap); 5819 mask |= PG_V | PG_A; 5820 } 5821 rv = (*pte & mask) == mask; 5822 PMAP_UNLOCK(pmap); 5823 if (rv) 5824 goto out; 5825 } 5826 if ((m->flags & PG_FICTITIOUS) == 0) { 5827 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5828 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5829 pmap = PV_PMAP(pv); 5830 if (!PMAP_TRYLOCK(pmap)) { 5831 md_gen = m->md.pv_gen; 5832 pvh_gen = pvh->pv_gen; 5833 rw_runlock(lock); 5834 PMAP_LOCK(pmap); 5835 rw_rlock(lock); 5836 if (md_gen != m->md.pv_gen || 5837 pvh_gen != pvh->pv_gen) { 5838 PMAP_UNLOCK(pmap); 5839 goto restart; 5840 } 5841 } 5842 pte = pmap_pde(pmap, pv->pv_va); 5843 mask = 0; 5844 if (modified) { 5845 PG_M = pmap_modified_bit(pmap); 5846 PG_RW = pmap_rw_bit(pmap); 5847 mask |= PG_RW | PG_M; 5848 } 5849 if (accessed) { 5850 PG_A = pmap_accessed_bit(pmap); 5851 PG_V = pmap_valid_bit(pmap); 5852 mask |= PG_V | PG_A; 5853 } 5854 rv = (*pte & mask) == mask; 5855 PMAP_UNLOCK(pmap); 5856 if (rv) 5857 goto out; 5858 } 5859 } 5860out: 5861 rw_runlock(lock); 5862 return (rv); 5863} 5864 5865/* 5866 * pmap_is_modified: 5867 * 5868 * Return whether or not the specified physical page was modified 5869 * in any physical maps. 5870 */ 5871boolean_t 5872pmap_is_modified(vm_page_t m) 5873{ 5874 5875 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5876 ("pmap_is_modified: page %p is not managed", m)); 5877 5878 /* 5879 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5880 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 5881 * is clear, no PTEs can have PG_M set. 5882 */ 5883 VM_OBJECT_ASSERT_WLOCKED(m->object); 5884 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5885 return (FALSE); 5886 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5887} 5888 5889/* 5890 * pmap_is_prefaultable: 5891 * 5892 * Return whether or not the specified virtual address is eligible 5893 * for prefault. 5894 */ 5895boolean_t 5896pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5897{ 5898 pd_entry_t *pde; 5899 pt_entry_t *pte, PG_V; 5900 boolean_t rv; 5901 5902 PG_V = pmap_valid_bit(pmap); 5903 rv = FALSE; 5904 PMAP_LOCK(pmap); 5905 pde = pmap_pde(pmap, addr); 5906 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 5907 pte = pmap_pde_to_pte(pde, addr); 5908 rv = (*pte & PG_V) == 0; 5909 } 5910 PMAP_UNLOCK(pmap); 5911 return (rv); 5912} 5913 5914/* 5915 * pmap_is_referenced: 5916 * 5917 * Return whether or not the specified physical page was referenced 5918 * in any physical maps. 5919 */ 5920boolean_t 5921pmap_is_referenced(vm_page_t m) 5922{ 5923 5924 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5925 ("pmap_is_referenced: page %p is not managed", m)); 5926 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5927} 5928 5929/* 5930 * Clear the write and modified bits in each of the given page's mappings. 5931 */ 5932void 5933pmap_remove_write(vm_page_t m) 5934{ 5935 struct md_page *pvh; 5936 pmap_t pmap; 5937 struct rwlock *lock; 5938 pv_entry_t next_pv, pv; 5939 pd_entry_t *pde; 5940 pt_entry_t oldpte, *pte, PG_M, PG_RW; 5941 vm_offset_t va; 5942 int pvh_gen, md_gen; 5943 5944 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5945 ("pmap_remove_write: page %p is not managed", m)); 5946 5947 /* 5948 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5949 * set by another thread while the object is locked. Thus, 5950 * if PGA_WRITEABLE is clear, no page table entries need updating. 
5951 */ 5952 VM_OBJECT_ASSERT_WLOCKED(m->object); 5953 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5954 return; 5955 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5956 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 5957 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5958retry_pv_loop: 5959 rw_wlock(lock); 5960 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5961 pmap = PV_PMAP(pv); 5962 if (!PMAP_TRYLOCK(pmap)) { 5963 pvh_gen = pvh->pv_gen; 5964 rw_wunlock(lock); 5965 PMAP_LOCK(pmap); 5966 rw_wlock(lock); 5967 if (pvh_gen != pvh->pv_gen) { 5968 PMAP_UNLOCK(pmap); 5969 rw_wunlock(lock); 5970 goto retry_pv_loop; 5971 } 5972 } 5973 PG_RW = pmap_rw_bit(pmap); 5974 va = pv->pv_va; 5975 pde = pmap_pde(pmap, va); 5976 if ((*pde & PG_RW) != 0) 5977 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 5978 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5979 ("inconsistent pv lock %p %p for page %p", 5980 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5981 PMAP_UNLOCK(pmap); 5982 } 5983 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5984 pmap = PV_PMAP(pv); 5985 if (!PMAP_TRYLOCK(pmap)) { 5986 pvh_gen = pvh->pv_gen; 5987 md_gen = m->md.pv_gen; 5988 rw_wunlock(lock); 5989 PMAP_LOCK(pmap); 5990 rw_wlock(lock); 5991 if (pvh_gen != pvh->pv_gen || 5992 md_gen != m->md.pv_gen) { 5993 PMAP_UNLOCK(pmap); 5994 rw_wunlock(lock); 5995 goto retry_pv_loop; 5996 } 5997 } 5998 PG_M = pmap_modified_bit(pmap); 5999 PG_RW = pmap_rw_bit(pmap); 6000 pde = pmap_pde(pmap, pv->pv_va); 6001 KASSERT((*pde & PG_PS) == 0, 6002 ("pmap_remove_write: found a 2mpage in page %p's pv list", 6003 m)); 6004 pte = pmap_pde_to_pte(pde, pv->pv_va); 6005retry: 6006 oldpte = *pte; 6007 if (oldpte & PG_RW) { 6008 if (!atomic_cmpset_long(pte, oldpte, oldpte & 6009 ~(PG_RW | PG_M))) 6010 goto retry; 6011 if ((oldpte & PG_M) != 0) 6012 vm_page_dirty(m); 6013 pmap_invalidate_page(pmap, pv->pv_va); 6014 } 6015 PMAP_UNLOCK(pmap); 6016 } 6017 rw_wunlock(lock); 6018 vm_page_aflag_clear(m, PGA_WRITEABLE); 6019 pmap_delayed_invl_wait(m); 6020} 6021 6022static __inline boolean_t 6023safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 6024{ 6025 6026 if (!pmap_emulate_ad_bits(pmap)) 6027 return (TRUE); 6028 6029 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 6030 6031 /* 6032 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration 6033 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 6034 * if the EPT_PG_WRITE bit is set. 6035 */ 6036 if ((pte & EPT_PG_WRITE) != 0) 6037 return (FALSE); 6038 6039 /* 6040 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 6041 */ 6042 if ((pte & EPT_PG_EXECUTE) == 0 || 6043 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 6044 return (TRUE); 6045 else 6046 return (FALSE); 6047} 6048 6049/* 6050 * pmap_ts_referenced: 6051 * 6052 * Return a count of reference bits for a page, clearing those bits. 6053 * It is not necessary for every reference bit to be cleared, but it 6054 * is necessary that 0 only be returned when there are truly no 6055 * reference bits set. 6056 * 6057 * As an optimization, update the page's dirty field if a modified bit is 6058 * found while counting reference bits. This opportunistic update can be 6059 * performed at low cost and can eliminate the need for some future calls 6060 * to pmap_is_modified(). However, since this function stops after 6061 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 6062 * dirty pages. Those dirty pages will only be detected by a future call 6063 * to pmap_is_modified(). 
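 * (Stopping early is permissible because a nonzero count is still returned
 * whenever any reference bit was found; an exact count is not required.)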
6064 * 6065 * A DI block is not needed within this function, because 6066 * invalidations are performed before the PV list lock is 6067 * released. 6068 */ 6069int 6070pmap_ts_referenced(vm_page_t m) 6071{ 6072 struct md_page *pvh; 6073 pv_entry_t pv, pvf; 6074 pmap_t pmap; 6075 struct rwlock *lock; 6076 pd_entry_t oldpde, *pde; 6077 pt_entry_t *pte, PG_A, PG_M, PG_RW; 6078 vm_offset_t va; 6079 vm_paddr_t pa; 6080 int cleared, md_gen, not_cleared, pvh_gen; 6081 struct spglist free; 6082 boolean_t demoted; 6083 6084 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6085 ("pmap_ts_referenced: page %p is not managed", m)); 6086 SLIST_INIT(&free); 6087 cleared = 0; 6088 pa = VM_PAGE_TO_PHYS(m); 6089 lock = PHYS_TO_PV_LIST_LOCK(pa); 6090 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 6091 rw_wlock(lock); 6092retry: 6093 not_cleared = 0; 6094 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 6095 goto small_mappings; 6096 pv = pvf; 6097 do { 6098 if (pvf == NULL) 6099 pvf = pv; 6100 pmap = PV_PMAP(pv); 6101 if (!PMAP_TRYLOCK(pmap)) { 6102 pvh_gen = pvh->pv_gen; 6103 rw_wunlock(lock); 6104 PMAP_LOCK(pmap); 6105 rw_wlock(lock); 6106 if (pvh_gen != pvh->pv_gen) { 6107 PMAP_UNLOCK(pmap); 6108 goto retry; 6109 } 6110 } 6111 PG_A = pmap_accessed_bit(pmap); 6112 PG_M = pmap_modified_bit(pmap); 6113 PG_RW = pmap_rw_bit(pmap); 6114 va = pv->pv_va; 6115 pde = pmap_pde(pmap, pv->pv_va); 6116 oldpde = *pde; 6117 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6118 /* 6119 * Although "oldpde" is mapping a 2MB page, because 6120 * this function is called at a 4KB page granularity, 6121 * we only update the 4KB page under test. 6122 */ 6123 vm_page_dirty(m); 6124 } 6125 if ((oldpde & PG_A) != 0) { 6126 /* 6127 * Since this reference bit is shared by 512 4KB 6128 * pages, it should not be cleared every time it is 6129 * tested. Apply a simple "hash" function on the 6130 * physical page number, the virtual superpage number, 6131 * and the pmap address to select one 4KB page out of 6132 * the 512 on which testing the reference bit will 6133 * result in clearing that reference bit. This 6134 * function is designed to avoid the selection of the 6135 * same 4KB page for every 2MB page mapping. 6136 * 6137 * On demotion, a mapping that hasn't been referenced 6138 * is simply destroyed. To avoid the possibility of a 6139 * subsequent page fault on a demoted wired mapping, 6140 * always leave its reference bit set. Moreover, 6141 * since the superpage is wired, the current state of 6142 * its reference bit won't affect page replacement. 6143 */ 6144 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 6145 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 6146 (oldpde & PG_W) == 0) { 6147 if (safe_to_clear_referenced(pmap, oldpde)) { 6148 atomic_clear_long(pde, PG_A); 6149 pmap_invalidate_page(pmap, pv->pv_va); 6150 demoted = FALSE; 6151 } else if (pmap_demote_pde_locked(pmap, pde, 6152 pv->pv_va, &lock)) { 6153 /* 6154 * Remove the mapping to a single page 6155 * so that a subsequent access may 6156 * repromote. Since the underlying 6157 * page table page is fully populated, 6158 * this removal never frees a page 6159 * table page. 
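					 * (The page table page's wire count
					 * stays above zero because its other
					 * NPTEPG - 1 entries remain valid.)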
6160 */ 6161 demoted = TRUE; 6162 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6163 PG_PS_FRAME); 6164 pte = pmap_pde_to_pte(pde, va); 6165 pmap_remove_pte(pmap, pte, va, *pde, 6166 NULL, &lock); 6167 pmap_invalidate_page(pmap, va); 6168 } else 6169 demoted = TRUE; 6170 6171 if (demoted) { 6172 /* 6173 * The superpage mapping was removed 6174 * entirely and therefore 'pv' is no 6175 * longer valid. 6176 */ 6177 if (pvf == pv) 6178 pvf = NULL; 6179 pv = NULL; 6180 } 6181 cleared++; 6182 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6183 ("inconsistent pv lock %p %p for page %p", 6184 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6185 } else 6186 not_cleared++; 6187 } 6188 PMAP_UNLOCK(pmap); 6189 /* Rotate the PV list if it has more than one entry. */ 6190 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 6191 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 6192 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 6193 pvh->pv_gen++; 6194 } 6195 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 6196 goto out; 6197 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 6198small_mappings: 6199 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 6200 goto out; 6201 pv = pvf; 6202 do { 6203 if (pvf == NULL) 6204 pvf = pv; 6205 pmap = PV_PMAP(pv); 6206 if (!PMAP_TRYLOCK(pmap)) { 6207 pvh_gen = pvh->pv_gen; 6208 md_gen = m->md.pv_gen; 6209 rw_wunlock(lock); 6210 PMAP_LOCK(pmap); 6211 rw_wlock(lock); 6212 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6213 PMAP_UNLOCK(pmap); 6214 goto retry; 6215 } 6216 } 6217 PG_A = pmap_accessed_bit(pmap); 6218 PG_M = pmap_modified_bit(pmap); 6219 PG_RW = pmap_rw_bit(pmap); 6220 pde = pmap_pde(pmap, pv->pv_va); 6221 KASSERT((*pde & PG_PS) == 0, 6222 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 6223 m)); 6224 pte = pmap_pde_to_pte(pde, pv->pv_va); 6225 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6226 vm_page_dirty(m); 6227 if ((*pte & PG_A) != 0) { 6228 if (safe_to_clear_referenced(pmap, *pte)) { 6229 atomic_clear_long(pte, PG_A); 6230 pmap_invalidate_page(pmap, pv->pv_va); 6231 cleared++; 6232 } else if ((*pte & PG_W) == 0) { 6233 /* 6234 * Wired pages cannot be paged out so 6235 * doing accessed bit emulation for 6236 * them is wasted effort. We do the 6237 * hard work for unwired pages only. 6238 */ 6239 pmap_remove_pte(pmap, pte, pv->pv_va, 6240 *pde, &free, &lock); 6241 pmap_invalidate_page(pmap, pv->pv_va); 6242 cleared++; 6243 if (pvf == pv) 6244 pvf = NULL; 6245 pv = NULL; 6246 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6247 ("inconsistent pv lock %p %p for page %p", 6248 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6249 } else 6250 not_cleared++; 6251 } 6252 PMAP_UNLOCK(pmap); 6253 /* Rotate the PV list if it has more than one entry. */ 6254 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 6255 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6256 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 6257 m->md.pv_gen++; 6258 } 6259 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 6260 not_cleared < PMAP_TS_REFERENCED_MAX); 6261out: 6262 rw_wunlock(lock); 6263 pmap_free_zero_pages(&free); 6264 return (cleared + not_cleared); 6265} 6266 6267/* 6268 * Apply the given advice to the specified range of addresses within the 6269 * given pmap. Depending on the advice, clear the referenced and/or 6270 * modified flags in each mapping and set the mapped page's dirty field. 
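 * For MADV_DONTNEED, a mapping's modified state is transferred to the
 * page's dirty field before the flags are cleared; for MADV_FREE, the
 * modified state is simply discarded.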
6271 */ 6272void 6273pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 6274{ 6275 struct rwlock *lock; 6276 pml4_entry_t *pml4e; 6277 pdp_entry_t *pdpe; 6278 pd_entry_t oldpde, *pde; 6279 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 6280 vm_offset_t va, va_next; 6281 vm_page_t m; 6282 boolean_t anychanged; 6283 6284 if (advice != MADV_DONTNEED && advice != MADV_FREE) 6285 return; 6286 6287 /* 6288 * A/D bit emulation requires an alternate code path when clearing 6289 * the modified and accessed bits below. Since this function is 6290 * advisory in nature we skip it entirely for pmaps that require 6291 * A/D bit emulation. 6292 */ 6293 if (pmap_emulate_ad_bits(pmap)) 6294 return; 6295 6296 PG_A = pmap_accessed_bit(pmap); 6297 PG_G = pmap_global_bit(pmap); 6298 PG_M = pmap_modified_bit(pmap); 6299 PG_V = pmap_valid_bit(pmap); 6300 PG_RW = pmap_rw_bit(pmap); 6301 anychanged = FALSE; 6302 pmap_delayed_invl_started(); 6303 PMAP_LOCK(pmap); 6304 for (; sva < eva; sva = va_next) { 6305 pml4e = pmap_pml4e(pmap, sva); 6306 if ((*pml4e & PG_V) == 0) { 6307 va_next = (sva + NBPML4) & ~PML4MASK; 6308 if (va_next < sva) 6309 va_next = eva; 6310 continue; 6311 } 6312 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6313 if ((*pdpe & PG_V) == 0) { 6314 va_next = (sva + NBPDP) & ~PDPMASK; 6315 if (va_next < sva) 6316 va_next = eva; 6317 continue; 6318 } 6319 va_next = (sva + NBPDR) & ~PDRMASK; 6320 if (va_next < sva) 6321 va_next = eva; 6322 pde = pmap_pdpe_to_pde(pdpe, sva); 6323 oldpde = *pde; 6324 if ((oldpde & PG_V) == 0) 6325 continue; 6326 else if ((oldpde & PG_PS) != 0) { 6327 if ((oldpde & PG_MANAGED) == 0) 6328 continue; 6329 lock = NULL; 6330 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 6331 if (lock != NULL) 6332 rw_wunlock(lock); 6333 6334 /* 6335 * The large page mapping was destroyed. 6336 */ 6337 continue; 6338 } 6339 6340 /* 6341 * Unless the page mappings are wired, remove the 6342 * mapping to a single page so that a subsequent 6343 * access may repromote. Since the underlying page 6344 * table page is fully populated, this removal never 6345 * frees a page table page. 6346 */ 6347 if ((oldpde & PG_W) == 0) { 6348 pte = pmap_pde_to_pte(pde, sva); 6349 KASSERT((*pte & PG_V) != 0, 6350 ("pmap_advise: invalid PTE")); 6351 pmap_remove_pte(pmap, pte, sva, *pde, NULL, 6352 &lock); 6353 anychanged = TRUE; 6354 } 6355 if (lock != NULL) 6356 rw_wunlock(lock); 6357 } 6358 if (va_next > eva) 6359 va_next = eva; 6360 va = va_next; 6361 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6362 sva += PAGE_SIZE) { 6363 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 6364 goto maybe_invlrng; 6365 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6366 if (advice == MADV_DONTNEED) { 6367 /* 6368 * Future calls to pmap_is_modified() 6369 * can be avoided by making the page 6370 * dirty now. 
6371 */ 6372 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6373 vm_page_dirty(m); 6374 } 6375 atomic_clear_long(pte, PG_M | PG_A); 6376 } else if ((*pte & PG_A) != 0) 6377 atomic_clear_long(pte, PG_A); 6378 else 6379 goto maybe_invlrng; 6380 6381 if ((*pte & PG_G) != 0) { 6382 if (va == va_next) 6383 va = sva; 6384 } else 6385 anychanged = TRUE; 6386 continue; 6387maybe_invlrng: 6388 if (va != va_next) { 6389 pmap_invalidate_range(pmap, va, sva); 6390 va = va_next; 6391 } 6392 } 6393 if (va != va_next) 6394 pmap_invalidate_range(pmap, va, sva); 6395 } 6396 if (anychanged) 6397 pmap_invalidate_all(pmap); 6398 PMAP_UNLOCK(pmap); 6399 pmap_delayed_invl_finished(); 6400} 6401 6402/* 6403 * Clear the modify bits on the specified physical page. 6404 */ 6405void 6406pmap_clear_modify(vm_page_t m) 6407{ 6408 struct md_page *pvh; 6409 pmap_t pmap; 6410 pv_entry_t next_pv, pv; 6411 pd_entry_t oldpde, *pde; 6412 pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; 6413 struct rwlock *lock; 6414 vm_offset_t va; 6415 int md_gen, pvh_gen; 6416 6417 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6418 ("pmap_clear_modify: page %p is not managed", m)); 6419 VM_OBJECT_ASSERT_WLOCKED(m->object); 6420 KASSERT(!vm_page_xbusied(m), 6421 ("pmap_clear_modify: page %p is exclusive busied", m)); 6422 6423 /* 6424 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 6425 * If the object containing the page is locked and the page is not 6426 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 6427 */ 6428 if ((m->aflags & PGA_WRITEABLE) == 0) 6429 return; 6430 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6431 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6432 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6433 rw_wlock(lock); 6434restart: 6435 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6436 pmap = PV_PMAP(pv); 6437 if (!PMAP_TRYLOCK(pmap)) { 6438 pvh_gen = pvh->pv_gen; 6439 rw_wunlock(lock); 6440 PMAP_LOCK(pmap); 6441 rw_wlock(lock); 6442 if (pvh_gen != pvh->pv_gen) { 6443 PMAP_UNLOCK(pmap); 6444 goto restart; 6445 } 6446 } 6447 PG_M = pmap_modified_bit(pmap); 6448 PG_V = pmap_valid_bit(pmap); 6449 PG_RW = pmap_rw_bit(pmap); 6450 va = pv->pv_va; 6451 pde = pmap_pde(pmap, va); 6452 oldpde = *pde; 6453 if ((oldpde & PG_RW) != 0) { 6454 if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { 6455 if ((oldpde & PG_W) == 0) { 6456 /* 6457 * Write protect the mapping to a 6458 * single page so that a subsequent 6459 * write access may repromote. 
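					 * (Unlike pmap_advise() and
					 * pmap_ts_referenced(), only write
					 * access is revoked here; the 4KB
					 * mapping itself is kept.)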
6460 */ 6461 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6462 PG_PS_FRAME); 6463 pte = pmap_pde_to_pte(pde, va); 6464 oldpte = *pte; 6465 if ((oldpte & PG_V) != 0) { 6466 while (!atomic_cmpset_long(pte, 6467 oldpte, 6468 oldpte & ~(PG_M | PG_RW))) 6469 oldpte = *pte; 6470 vm_page_dirty(m); 6471 pmap_invalidate_page(pmap, va); 6472 } 6473 } 6474 } 6475 } 6476 PMAP_UNLOCK(pmap); 6477 } 6478 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6479 pmap = PV_PMAP(pv); 6480 if (!PMAP_TRYLOCK(pmap)) { 6481 md_gen = m->md.pv_gen; 6482 pvh_gen = pvh->pv_gen; 6483 rw_wunlock(lock); 6484 PMAP_LOCK(pmap); 6485 rw_wlock(lock); 6486 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6487 PMAP_UNLOCK(pmap); 6488 goto restart; 6489 } 6490 } 6491 PG_M = pmap_modified_bit(pmap); 6492 PG_RW = pmap_rw_bit(pmap); 6493 pde = pmap_pde(pmap, pv->pv_va); 6494 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 6495 " a 2mpage in page %p's pv list", m)); 6496 pte = pmap_pde_to_pte(pde, pv->pv_va); 6497 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6498 atomic_clear_long(pte, PG_M); 6499 pmap_invalidate_page(pmap, pv->pv_va); 6500 } 6501 PMAP_UNLOCK(pmap); 6502 } 6503 rw_wunlock(lock); 6504} 6505 6506/* 6507 * Miscellaneous support routines follow 6508 */ 6509 6510/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 6511static __inline void 6512pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) 6513{ 6514 u_int opte, npte; 6515 6516 /* 6517 * The cache mode bits are all in the low 32-bits of the 6518 * PTE, so we can just spin on updating the low 32-bits. 6519 */ 6520 do { 6521 opte = *(u_int *)pte; 6522 npte = opte & ~mask; 6523 npte |= cache_bits; 6524 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 6525} 6526 6527/* Adjust the cache mode for a 2MB page mapped via a PDE. */ 6528static __inline void 6529pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) 6530{ 6531 u_int opde, npde; 6532 6533 /* 6534 * The cache mode bits are all in the low 32-bits of the 6535 * PDE, so we can just spin on updating the low 32-bits. 6536 */ 6537 do { 6538 opde = *(u_int *)pde; 6539 npde = opde & ~mask; 6540 npde |= cache_bits; 6541 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 6542} 6543 6544/* 6545 * Map a set of physical memory pages into the kernel virtual 6546 * address space. Return a pointer to where it is mapped. This 6547 * routine is intended to be used for mapping device memory, 6548 * NOT real memory. 6549 */ 6550void * 6551pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 6552{ 6553 struct pmap_preinit_mapping *ppim; 6554 vm_offset_t va, offset; 6555 vm_size_t tmpsize; 6556 int i; 6557 6558 offset = pa & PAGE_MASK; 6559 size = round_page(offset + size); 6560 pa = trunc_page(pa); 6561 6562 if (!pmap_initialized) { 6563 va = 0; 6564 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6565 ppim = pmap_preinit_mapping + i; 6566 if (ppim->va == 0) { 6567 ppim->pa = pa; 6568 ppim->sz = size; 6569 ppim->mode = mode; 6570 ppim->va = virtual_avail; 6571 virtual_avail += size; 6572 va = ppim->va; 6573 break; 6574 } 6575 } 6576 if (va == 0) 6577 panic("%s: too many preinit mappings", __func__); 6578 } else { 6579 /* 6580 * If we have a preinit mapping, re-use it. 
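		 * (A preinit mapping is one that was recorded above, while
		 * pmap_initialized was still false.)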
6581 */ 6582 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6583 ppim = pmap_preinit_mapping + i; 6584 if (ppim->pa == pa && ppim->sz == size && 6585 ppim->mode == mode) 6586 return ((void *)(ppim->va + offset)); 6587 } 6588 /* 6589 * If the specified range of physical addresses fits within 6590 * the direct map window, use the direct map. 6591 */ 6592 if (pa < dmaplimit && pa + size < dmaplimit) { 6593 va = PHYS_TO_DMAP(pa); 6594 if (!pmap_change_attr(va, size, mode)) 6595 return ((void *)(va + offset)); 6596 } 6597 va = kva_alloc(size); 6598 if (va == 0) 6599 panic("%s: Couldn't allocate KVA", __func__); 6600 } 6601 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 6602 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 6603 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 6604 pmap_invalidate_cache_range(va, va + tmpsize, FALSE); 6605 return ((void *)(va + offset)); 6606} 6607 6608void * 6609pmap_mapdev(vm_paddr_t pa, vm_size_t size) 6610{ 6611 6612 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 6613} 6614 6615void * 6616pmap_mapbios(vm_paddr_t pa, vm_size_t size) 6617{ 6618 6619 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 6620} 6621 6622void 6623pmap_unmapdev(vm_offset_t va, vm_size_t size) 6624{ 6625 struct pmap_preinit_mapping *ppim; 6626 vm_offset_t offset; 6627 int i; 6628 6629 /* If we gave a direct map region in pmap_mapdev, do nothing */ 6630 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 6631 return; 6632 offset = va & PAGE_MASK; 6633 size = round_page(offset + size); 6634 va = trunc_page(va); 6635 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6636 ppim = pmap_preinit_mapping + i; 6637 if (ppim->va == va && ppim->sz == size) { 6638 if (pmap_initialized) 6639 return; 6640 ppim->pa = 0; 6641 ppim->va = 0; 6642 ppim->sz = 0; 6643 ppim->mode = 0; 6644 if (va + size == virtual_avail) 6645 virtual_avail = va; 6646 return; 6647 } 6648 } 6649 if (pmap_initialized) 6650 kva_free(va, size); 6651} 6652 6653/* 6654 * Tries to demote a 1GB page mapping. 6655 */ 6656static boolean_t 6657pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 6658{ 6659 pdp_entry_t newpdpe, oldpdpe; 6660 pd_entry_t *firstpde, newpde, *pde; 6661 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 6662 vm_paddr_t pdpgpa; 6663 vm_page_t pdpg; 6664 6665 PG_A = pmap_accessed_bit(pmap); 6666 PG_M = pmap_modified_bit(pmap); 6667 PG_V = pmap_valid_bit(pmap); 6668 PG_RW = pmap_rw_bit(pmap); 6669 6670 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6671 oldpdpe = *pdpe; 6672 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 6673 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 6674 if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 6675 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 6676 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 6677 " in pmap %p", va, pmap); 6678 return (FALSE); 6679 } 6680 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 6681 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 6682 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 6683 KASSERT((oldpdpe & PG_A) != 0, 6684 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 6685 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 6686 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 6687 newpde = oldpdpe; 6688 6689 /* 6690 * Initialize the page directory page. 6691 */ 6692 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 6693 *pde = newpde; 6694 newpde += NBPDR; 6695 } 6696 6697 /* 6698 * Demote the mapping. 
6699 */ 6700 *pdpe = newpdpe; 6701 6702 /* 6703 * Invalidate a stale recursive mapping of the page directory page. 6704 */ 6705 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 6706 6707 pmap_pdpe_demotions++; 6708 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 6709 " in pmap %p", va, pmap); 6710 return (TRUE); 6711} 6712 6713/* 6714 * Sets the memory attribute for the specified page. 6715 */ 6716void 6717pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6718{ 6719 6720 m->md.pat_mode = ma; 6721 6722 /* 6723 * If "m" is a normal page, update its direct mapping. This update 6724 * can be relied upon to perform any cache operations that are 6725 * required for data coherence. 6726 */ 6727 if ((m->flags & PG_FICTITIOUS) == 0 && 6728 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 6729 m->md.pat_mode)) 6730 panic("memory attribute change on the direct map failed"); 6731} 6732 6733/* 6734 * Changes the specified virtual address range's memory type to that given by 6735 * the parameter "mode". The specified virtual address range must be 6736 * completely contained within either the direct map or the kernel map. If 6737 * the virtual address range is contained within the kernel map, then the 6738 * memory type for each of the corresponding ranges of the direct map is also 6739 * changed. (The corresponding ranges of the direct map are those ranges that 6740 * map the same physical pages as the specified virtual address range.) These 6741 * changes to the direct map are necessary because Intel describes the 6742 * behavior of their processors as "undefined" if two or more mappings to the 6743 * same physical page have different memory types. 6744 * 6745 * Returns zero if the change completed successfully, and either EINVAL or 6746 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 6747 * of the virtual address range was not mapped, and ENOMEM is returned if 6748 * there was insufficient memory available to complete the change. In the 6749 * latter case, the memory type may have been changed on some part of the 6750 * virtual address range or the direct map. 6751 */ 6752int 6753pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 6754{ 6755 int error; 6756 6757 PMAP_LOCK(kernel_pmap); 6758 error = pmap_change_attr_locked(va, size, mode); 6759 PMAP_UNLOCK(kernel_pmap); 6760 return (error); 6761} 6762 6763static int 6764pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 6765{ 6766 vm_offset_t base, offset, tmpva; 6767 vm_paddr_t pa_start, pa_end, pa_end1; 6768 pdp_entry_t *pdpe; 6769 pd_entry_t *pde; 6770 pt_entry_t *pte; 6771 int cache_bits_pte, cache_bits_pde, error; 6772 boolean_t changed; 6773 6774 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6775 base = trunc_page(va); 6776 offset = va & PAGE_MASK; 6777 size = round_page(offset + size); 6778 6779 /* 6780 * Only supported on kernel virtual addresses, including the direct 6781 * map but excluding the recursive map. 6782 */ 6783 if (base < DMAP_MIN_ADDRESS) 6784 return (EINVAL); 6785 6786 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 6787 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 6788 changed = FALSE; 6789 6790 /* 6791 * Pages that aren't mapped aren't supported. Also break down 2MB pages 6792 * into 4KB pages if required. 
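	 * Likewise, 1GB pages are broken down into 2MB pages when the range
	 * does not cover an entire, properly aligned 1GB frame.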
6793 */ 6794 for (tmpva = base; tmpva < base + size; ) { 6795 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6796 if (pdpe == NULL || *pdpe == 0) 6797 return (EINVAL); 6798 if (*pdpe & PG_PS) { 6799 /* 6800 * If the current 1GB page already has the required 6801 * memory type, then we need not demote this page. Just 6802 * increment tmpva to the next 1GB page frame. 6803 */ 6804 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { 6805 tmpva = trunc_1gpage(tmpva) + NBPDP; 6806 continue; 6807 } 6808 6809 /* 6810 * If the current offset aligns with a 1GB page frame 6811 * and there is at least 1GB left within the range, then 6812 * we need not break down this page into 2MB pages. 6813 */ 6814 if ((tmpva & PDPMASK) == 0 && 6815 tmpva + PDPMASK < base + size) { 6816 tmpva += NBPDP; 6817 continue; 6818 } 6819 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 6820 return (ENOMEM); 6821 } 6822 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6823 if (*pde == 0) 6824 return (EINVAL); 6825 if (*pde & PG_PS) { 6826 /* 6827 * If the current 2MB page already has the required 6828 * memory type, then we need not demote this page. Just 6829 * increment tmpva to the next 2MB page frame. 6830 */ 6831 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { 6832 tmpva = trunc_2mpage(tmpva) + NBPDR; 6833 continue; 6834 } 6835 6836 /* 6837 * If the current offset aligns with a 2MB page frame 6838 * and there is at least 2MB left within the range, then 6839 * we need not break down this page into 4KB pages. 6840 */ 6841 if ((tmpva & PDRMASK) == 0 && 6842 tmpva + PDRMASK < base + size) { 6843 tmpva += NBPDR; 6844 continue; 6845 } 6846 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 6847 return (ENOMEM); 6848 } 6849 pte = pmap_pde_to_pte(pde, tmpva); 6850 if (*pte == 0) 6851 return (EINVAL); 6852 tmpva += PAGE_SIZE; 6853 } 6854 error = 0; 6855 6856 /* 6857 * Ok, all the pages exist, so run through them updating their 6858 * cache mode if required. 6859 */ 6860 pa_start = pa_end = 0; 6861 for (tmpva = base; tmpva < base + size; ) { 6862 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6863 if (*pdpe & PG_PS) { 6864 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { 6865 pmap_pde_attr(pdpe, cache_bits_pde, 6866 X86_PG_PDE_CACHE); 6867 changed = TRUE; 6868 } 6869 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6870 (*pdpe & PG_PS_FRAME) < dmaplimit) { 6871 if (pa_start == pa_end) { 6872 /* Start physical address run. */ 6873 pa_start = *pdpe & PG_PS_FRAME; 6874 pa_end = pa_start + NBPDP; 6875 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 6876 pa_end += NBPDP; 6877 else { 6878 /* Run ended, update direct map. */ 6879 error = pmap_change_attr_locked( 6880 PHYS_TO_DMAP(pa_start), 6881 pa_end - pa_start, mode); 6882 if (error != 0) 6883 break; 6884 /* Start physical address run. */ 6885 pa_start = *pdpe & PG_PS_FRAME; 6886 pa_end = pa_start + NBPDP; 6887 } 6888 } 6889 tmpva = trunc_1gpage(tmpva) + NBPDP; 6890 continue; 6891 } 6892 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6893 if (*pde & PG_PS) { 6894 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { 6895 pmap_pde_attr(pde, cache_bits_pde, 6896 X86_PG_PDE_CACHE); 6897 changed = TRUE; 6898 } 6899 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6900 (*pde & PG_PS_FRAME) < dmaplimit) { 6901 if (pa_start == pa_end) { 6902 /* Start physical address run. */ 6903 pa_start = *pde & PG_PS_FRAME; 6904 pa_end = pa_start + NBPDR; 6905 } else if (pa_end == (*pde & PG_PS_FRAME)) 6906 pa_end += NBPDR; 6907 else { 6908 /* Run ended, update direct map. 
*/ 6909 error = pmap_change_attr_locked( 6910 PHYS_TO_DMAP(pa_start), 6911 pa_end - pa_start, mode); 6912 if (error != 0) 6913 break; 6914 /* Start physical address run. */ 6915 pa_start = *pde & PG_PS_FRAME; 6916 pa_end = pa_start + NBPDR; 6917 } 6918 } 6919 tmpva = trunc_2mpage(tmpva) + NBPDR; 6920 } else { 6921 pte = pmap_pde_to_pte(pde, tmpva); 6922 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { 6923 pmap_pte_attr(pte, cache_bits_pte, 6924 X86_PG_PTE_CACHE); 6925 changed = TRUE; 6926 } 6927 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6928 (*pte & PG_FRAME) < dmaplimit) { 6929 if (pa_start == pa_end) { 6930 /* Start physical address run. */ 6931 pa_start = *pte & PG_FRAME; 6932 pa_end = pa_start + PAGE_SIZE; 6933 } else if (pa_end == (*pte & PG_FRAME)) 6934 pa_end += PAGE_SIZE; 6935 else { 6936 /* Run ended, update direct map. */ 6937 error = pmap_change_attr_locked( 6938 PHYS_TO_DMAP(pa_start), 6939 pa_end - pa_start, mode); 6940 if (error != 0) 6941 break; 6942 /* Start physical address run. */ 6943 pa_start = *pte & PG_FRAME; 6944 pa_end = pa_start + PAGE_SIZE; 6945 } 6946 } 6947 tmpva += PAGE_SIZE; 6948 } 6949 } 6950 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 6951 pa_end1 = MIN(pa_end, dmaplimit); 6952 if (pa_start != pa_end1) 6953 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 6954 pa_end1 - pa_start, mode); 6955 } 6956 6957 /* 6958 * Flush CPU caches if required to make sure any data isn't cached that 6959 * shouldn't be, etc. 6960 */ 6961 if (changed) { 6962 pmap_invalidate_range(kernel_pmap, base, tmpva); 6963 pmap_invalidate_cache_range(base, tmpva, FALSE); 6964 } 6965 return (error); 6966} 6967 6968/* 6969 * Demotes any mapping within the direct map region that covers more than the 6970 * specified range of physical addresses. This range's size must be a power 6971 * of two and its starting address must be a multiple of its size. Since the 6972 * demotion does not change any attributes of the mapping, a TLB invalidation 6973 * is not mandatory. The caller may, however, request a TLB invalidation. 
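 * For example, a 4KB-sized, 4KB-aligned range causes a covering 1GB
 * mapping to be demoted to 2MB mappings and the covering 2MB mapping to
 * be demoted to 4KB mappings.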
6974 */ 6975void 6976pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 6977{ 6978 pdp_entry_t *pdpe; 6979 pd_entry_t *pde; 6980 vm_offset_t va; 6981 boolean_t changed; 6982 6983 if (len == 0) 6984 return; 6985 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 6986 KASSERT((base & (len - 1)) == 0, 6987 ("pmap_demote_DMAP: base is not a multiple of len")); 6988 if (len < NBPDP && base < dmaplimit) { 6989 va = PHYS_TO_DMAP(base); 6990 changed = FALSE; 6991 PMAP_LOCK(kernel_pmap); 6992 pdpe = pmap_pdpe(kernel_pmap, va); 6993 if ((*pdpe & X86_PG_V) == 0) 6994 panic("pmap_demote_DMAP: invalid PDPE"); 6995 if ((*pdpe & PG_PS) != 0) { 6996 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 6997 panic("pmap_demote_DMAP: PDPE failed"); 6998 changed = TRUE; 6999 } 7000 if (len < NBPDR) { 7001 pde = pmap_pdpe_to_pde(pdpe, va); 7002 if ((*pde & X86_PG_V) == 0) 7003 panic("pmap_demote_DMAP: invalid PDE"); 7004 if ((*pde & PG_PS) != 0) { 7005 if (!pmap_demote_pde(kernel_pmap, pde, va)) 7006 panic("pmap_demote_DMAP: PDE failed"); 7007 changed = TRUE; 7008 } 7009 } 7010 if (changed && invalidate) 7011 pmap_invalidate_page(kernel_pmap, va); 7012 PMAP_UNLOCK(kernel_pmap); 7013 } 7014} 7015 7016/* 7017 * perform the pmap work for mincore 7018 */ 7019int 7020pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 7021{ 7022 pd_entry_t *pdep; 7023 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 7024 vm_paddr_t pa; 7025 int val; 7026 7027 PG_A = pmap_accessed_bit(pmap); 7028 PG_M = pmap_modified_bit(pmap); 7029 PG_V = pmap_valid_bit(pmap); 7030 PG_RW = pmap_rw_bit(pmap); 7031 7032 PMAP_LOCK(pmap); 7033retry: 7034 pdep = pmap_pde(pmap, addr); 7035 if (pdep != NULL && (*pdep & PG_V)) { 7036 if (*pdep & PG_PS) { 7037 pte = *pdep; 7038 /* Compute the physical address of the 4KB page. */ 7039 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 7040 PG_FRAME; 7041 val = MINCORE_SUPER; 7042 } else { 7043 pte = *pmap_pde_to_pte(pdep, addr); 7044 pa = pte & PG_FRAME; 7045 val = 0; 7046 } 7047 } else { 7048 pte = 0; 7049 pa = 0; 7050 val = 0; 7051 } 7052 if ((pte & PG_V) != 0) { 7053 val |= MINCORE_INCORE; 7054 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7055 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 7056 if ((pte & PG_A) != 0) 7057 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 7058 } 7059 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 7060 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 7061 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 7062 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
		 */
		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
			goto retry;
	} else
		PA_UNLOCK_COND(*locked_pa);
	PMAP_UNLOCK(pmap);
	return (val);
}

static uint64_t
pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
{
	uint32_t gen, new_gen, pcid_next;

	CRITICAL_ASSERT(curthread);
	gen = PCPU_GET(pcid_gen);
	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
	    pmap->pm_pcids[cpuid].pm_gen == gen)
		return (CR3_PCID_SAVE);
	pcid_next = PCPU_GET(pcid_next);
	KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x",
	    cpuid, pcid_next));
	if (pcid_next == PMAP_PCID_OVERMAX) {
		new_gen = gen + 1;
		if (new_gen == 0)
			new_gen = 1;
		PCPU_SET(pcid_gen, new_gen);
		pcid_next = PMAP_PCID_KERN + 1;
	} else {
		new_gen = gen;
	}
	pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
	pmap->pm_pcids[cpuid].pm_gen = new_gen;
	PCPU_SET(pcid_next, pcid_next + 1);
	return (0);
}

void
pmap_activate_sw(struct thread *td)
{
	pmap_t oldpmap, pmap;
	uint64_t cached, cr3;
	register_t rflags;
	u_int cpuid;

	oldpmap = PCPU_GET(curpmap);
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	if (oldpmap == pmap)
		return;
	cpuid = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_SET(cpuid, &pmap->pm_active);
#endif
	cr3 = rcr3();
	if (pmap_pcid_enabled) {
		cached = pmap_pcid_alloc(pmap, cpuid);
		KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 &&
		    pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
		    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
		    pmap->pm_pcids[cpuid].pm_pcid));
		KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
		    pmap == kernel_pmap,
		    ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x",
		    td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));

		/*
		 * If the INVPCID instruction is not available,
		 * invltlb_pcid_handler() is used to handle the
		 * invalidate_all IPI; that handler checks whether curpmap ==
		 * smp_tlb_pmap.  The sequence of operations below has a
		 * window where %CR3 is already loaded with the new pmap's
		 * PML4 address, but curpmap is not yet updated.  This causes
		 * the invltlb IPI handler, if it runs between the two
		 * updates, to execute as a NOP, which leaves stale TLB
		 * entries.
		 *
		 * Note that the most typical use of pmap_activate_sw(), from
		 * the context switch, is immune to this race, because
		 * interrupts are disabled (while the thread lock is owned),
		 * and the IPI happens after curpmap is updated.  Protect
		 * other callers in a similar way, by disabling interrupts
		 * around the %cr3 register reload and the curpmap
		 * assignment.
7148 */ 7149 if (!invpcid_works) 7150 rflags = intr_disable(); 7151 7152 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) { 7153 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | 7154 cached); 7155 if (cached) 7156 PCPU_INC(pm_save_cnt); 7157 } 7158 PCPU_SET(curpmap, pmap); 7159 if (!invpcid_works) 7160 intr_restore(rflags); 7161 } else if (cr3 != pmap->pm_cr3) { 7162 load_cr3(pmap->pm_cr3); 7163 PCPU_SET(curpmap, pmap); 7164 } 7165#ifdef SMP 7166 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 7167#else 7168 CPU_CLR(cpuid, &oldpmap->pm_active); 7169#endif 7170} 7171 7172void 7173pmap_activate(struct thread *td) 7174{ 7175 7176 critical_enter(); 7177 pmap_activate_sw(td); 7178 critical_exit(); 7179} 7180 7181void 7182pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 7183{ 7184} 7185 7186/* 7187 * Increase the starting virtual address of the given mapping if a 7188 * different alignment might result in more superpage mappings. 7189 */ 7190void 7191pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 7192 vm_offset_t *addr, vm_size_t size) 7193{ 7194 vm_offset_t superpage_offset; 7195 7196 if (size < NBPDR) 7197 return; 7198 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 7199 offset += ptoa(object->pg_color); 7200 superpage_offset = offset & PDRMASK; 7201 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 7202 (*addr & PDRMASK) == superpage_offset) 7203 return; 7204 if ((*addr & PDRMASK) < superpage_offset) 7205 *addr = (*addr & ~PDRMASK) + superpage_offset; 7206 else 7207 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 7208} 7209 7210#ifdef INVARIANTS 7211static unsigned long num_dirty_emulations; 7212SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 7213 &num_dirty_emulations, 0, NULL); 7214 7215static unsigned long num_accessed_emulations; 7216SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 7217 &num_accessed_emulations, 0, NULL); 7218 7219static unsigned long num_superpage_accessed_emulations; 7220SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 7221 &num_superpage_accessed_emulations, 0, NULL); 7222 7223static unsigned long ad_emulation_superpage_promotions; 7224SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 7225 &ad_emulation_superpage_promotions, 0, NULL); 7226#endif /* INVARIANTS */ 7227 7228int 7229pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 7230{ 7231 int rv; 7232 struct rwlock *lock; 7233#if VM_NRESERVLEVEL > 0 7234 vm_page_t m, mpte; 7235#endif 7236 pd_entry_t *pde; 7237 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 7238 7239 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 7240 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 7241 7242 if (!pmap_emulate_ad_bits(pmap)) 7243 return (-1); 7244 7245 PG_A = pmap_accessed_bit(pmap); 7246 PG_M = pmap_modified_bit(pmap); 7247 PG_V = pmap_valid_bit(pmap); 7248 PG_RW = pmap_rw_bit(pmap); 7249 7250 rv = -1; 7251 lock = NULL; 7252 PMAP_LOCK(pmap); 7253 7254 pde = pmap_pde(pmap, va); 7255 if (pde == NULL || (*pde & PG_V) == 0) 7256 goto done; 7257 7258 if ((*pde & PG_PS) != 0) { 7259 if (ftype == VM_PROT_READ) { 7260#ifdef INVARIANTS 7261 atomic_add_long(&num_superpage_accessed_emulations, 1); 7262#endif 7263 *pde |= PG_A; 7264 rv = 0; 7265 } 7266 goto done; 7267 } 7268 7269 pte = pmap_pde_to_pte(pde, va); 7270 if ((*pte & PG_V) == 0) 7271 goto done; 7272 7273 if (ftype == VM_PROT_WRITE) { 7274 if ((*pte & PG_RW) == 0) 7275 goto done; 7276 /* 
7277 * Set the modified and accessed bits simultaneously. 7278 * 7279 * Intel EPT PTEs that do software emulation of A/D bits map 7280 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 7281 * An EPT misconfiguration is triggered if the PTE is writable 7282 * but not readable (WR=10). This is avoided by setting PG_A 7283 * and PG_M simultaneously. 7284 */ 7285 *pte |= PG_M | PG_A; 7286 } else { 7287 *pte |= PG_A; 7288 } 7289 7290#if VM_NRESERVLEVEL > 0 7291 /* try to promote the mapping */ 7292 if (va < VM_MAXUSER_ADDRESS) 7293 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7294 else 7295 mpte = NULL; 7296 7297 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 7298 7299 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 7300 pmap_ps_enabled(pmap) && 7301 (m->flags & PG_FICTITIOUS) == 0 && 7302 vm_reserv_level_iffullpop(m) == 0) { 7303 pmap_promote_pde(pmap, pde, va, &lock); 7304#ifdef INVARIANTS 7305 atomic_add_long(&ad_emulation_superpage_promotions, 1); 7306#endif 7307 } 7308#endif 7309 7310#ifdef INVARIANTS 7311 if (ftype == VM_PROT_WRITE) 7312 atomic_add_long(&num_dirty_emulations, 1); 7313 else 7314 atomic_add_long(&num_accessed_emulations, 1); 7315#endif 7316 rv = 0; /* success */ 7317done: 7318 if (lock != NULL) 7319 rw_wunlock(lock); 7320 PMAP_UNLOCK(pmap); 7321 return (rv); 7322} 7323 7324void 7325pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 7326{ 7327 pml4_entry_t *pml4; 7328 pdp_entry_t *pdp; 7329 pd_entry_t *pde; 7330 pt_entry_t *pte, PG_V; 7331 int idx; 7332 7333 idx = 0; 7334 PG_V = pmap_valid_bit(pmap); 7335 PMAP_LOCK(pmap); 7336 7337 pml4 = pmap_pml4e(pmap, va); 7338 ptr[idx++] = *pml4; 7339 if ((*pml4 & PG_V) == 0) 7340 goto done; 7341 7342 pdp = pmap_pml4e_to_pdpe(pml4, va); 7343 ptr[idx++] = *pdp; 7344 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 7345 goto done; 7346 7347 pde = pmap_pdpe_to_pde(pdp, va); 7348 ptr[idx++] = *pde; 7349 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 7350 goto done; 7351 7352 pte = pmap_pde_to_pte(pde, va); 7353 ptr[idx++] = *pte; 7354 7355done: 7356 PMAP_UNLOCK(pmap); 7357 *num = idx; 7358} 7359 7360/** 7361 * Get the kernel virtual address of a set of physical pages. If there are 7362 * physical addresses not covered by the DMAP perform a transient mapping 7363 * that will be removed when calling pmap_unmap_io_transient. 7364 * 7365 * \param page The pages the caller wishes to obtain the virtual 7366 * address on the kernel memory map. 7367 * \param vaddr On return contains the kernel virtual memory address 7368 * of the pages passed in the page parameter. 7369 * \param count Number of pages passed in. 7370 * \param can_fault TRUE if the thread using the mapped pages can take 7371 * page faults, FALSE otherwise. 7372 * 7373 * \returns TRUE if the caller must call pmap_unmap_io_transient when 7374 * finished or FALSE otherwise. 7375 * 7376 */ 7377boolean_t 7378pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7379 boolean_t can_fault) 7380{ 7381 vm_paddr_t paddr; 7382 boolean_t needs_mapping; 7383 pt_entry_t *pte; 7384 int cache_bits, error, i; 7385 7386 /* 7387 * Allocate any KVA space that we need, this is done in a separate 7388 * loop to prevent calling vmem_alloc while pinned. 
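	 * (vmem_alloc() with M_WAITOK may sleep, which we want to avoid once
	 * sched_pin() has been called below.)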

/**
 * Get the kernel virtual address of a set of physical pages.  If there are
 * physical addresses not covered by the DMAP, perform a transient mapping
 * that will be removed when calling pmap_unmap_io_transient().
 *
 * \param page       The pages whose kernel virtual addresses the caller
 *                   wishes to obtain.
 * \param vaddr      On return contains the kernel virtual memory addresses
 *                   of the pages passed in the page parameter.
 * \param count      Number of pages passed in.
 * \param can_fault  TRUE if the thread using the mapped pages can take
 *                   page faults, FALSE otherwise.
 *
 * \returns TRUE if the caller must call pmap_unmap_io_transient() when
 *          finished or FALSE otherwise.
 */
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	boolean_t needs_mapping;
	pt_entry_t *pte;
	int cache_bits, error, i;

	/*
	 * Allocate any KVA space that we need; this is done in a separate
	 * loop to avoid calling vmem_alloc() while pinned.
	 */
	needs_mapping = FALSE;
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (__predict_false(paddr >= dmaplimit)) {
			error = vmem_alloc(kernel_arena, PAGE_SIZE,
			    M_BESTFIT | M_WAITOK, &vaddr[i]);
			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
			needs_mapping = TRUE;
		} else {
			vaddr[i] = PHYS_TO_DMAP(paddr);
		}
	}

	/* Exit early if everything is covered by the DMAP. */
	if (!needs_mapping)
		return (FALSE);

	/*
	 * NB: The sequence of updating a page table followed by accesses
	 * to the corresponding pages used in the !DMAP case is subject to
	 * the situation described in the "AMD64 Architecture Programmer's
	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
	 * Coherency Considerations".  Therefore, issuing the INVLPG right
	 * after modifying the PTE bits is crucial.
	 */
	if (!can_fault)
		sched_pin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= dmaplimit) {
			if (can_fault) {
				/*
				 * Slow path: since we can take page faults
				 * while the mappings are active, don't pin
				 * the thread to the CPU; instead, add a
				 * global mapping visible to all CPUs.
				 */
				pmap_qenter(vaddr[i], &page[i], 1);
			} else {
				pte = vtopte(vaddr[i]);
				cache_bits = pmap_cache_bits(kernel_pmap,
				    page[i]->md.pat_mode, 0);
				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
				    cache_bits);
				invlpg(vaddr[i]);
			}
		}
	}

	return (needs_mapping);
}

void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	int i;

	if (!can_fault)
		sched_unpin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= dmaplimit) {
			if (can_fault)
				pmap_qremove(vaddr[i], 1);
			vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
		}
	}
}
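
/*
 * Example (illustrative only, not compiled): a minimal sketch of the calling
 * convention documented above.  The page and vaddr arrays are assumed to be
 * provided by the caller; the function name is hypothetical.  Passing TRUE
 * for can_fault indicates that the accessing thread may take page faults
 * while the mappings are in use.
 */
#if 0
static void
pmap_io_transient_example(vm_page_t pages[], vm_offset_t vaddrs[], int npages)
{
	boolean_t mapped;

	/* Obtain kernel virtual addresses, mapping non-DMAP pages if needed. */
	mapped = pmap_map_io_transient(pages, vaddrs, npages, TRUE);

	/* ... access the pages through vaddrs[0 .. npages - 1] ... */

	/* Tear down only the transient mappings that were created. */
	if (mapped)
		pmap_unmap_io_transient(pages, vaddrs, npages, TRUE);
}
#endif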

vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
	vm_paddr_t paddr;

	paddr = VM_PAGE_TO_PHYS(m);
	if (paddr < dmaplimit)
		return (PHYS_TO_DMAP(paddr));
	mtx_lock_spin(&qframe_mtx);
	KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
	pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
	    X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
	return (qframe);
}

void
pmap_quick_remove_page(vm_offset_t addr)
{

	if (addr != qframe)
		return;
	pte_store(vtopte(qframe), 0);
	invlpg(qframe);
	mtx_unlock_spin(&qframe_mtx);
}

#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	pmap_t pmap;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	vm_offset_t va;

	if (have_addr) {
		va = (vm_offset_t)addr;
		pmap = PCPU_GET(curpmap);	/* XXX */
	} else {
		db_printf("show pte addr\n");
		return;
	}
	PG_V = pmap_valid_bit(pmap);
	pml4 = pmap_pml4e(pmap, va);
	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
	if ((*pml4 & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	pdp = pmap_pml4e_to_pdpe(pml4, va);
	db_printf(" pdpe %#016lx", *pdp);
	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pde = pmap_pdpe_to_pde(pdp, va);
	db_printf(" pde %#016lx", *pde);
	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_pde_to_pte(pde, va);
	db_printf(" pte %#016lx\n", *pte);
}

DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
{
	vm_paddr_t a;

	if (have_addr) {
		a = (vm_paddr_t)addr;
		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
	} else {
		db_printf("show phys2dmap addr\n");
	}
}
#endif
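
/*
 * Example (illustrative only): the DDB commands above are invoked from the
 * in-kernel debugger prompt with an address argument; the addresses shown
 * here are hypothetical placeholders.
 *
 *	db> show pte 0xfffff80004321000
 *	db> show phys2dmap 0x4321000
 */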