pmap.c revision 298653
/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 298653 2016-04-26 17:39:54Z pfg $");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{

	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
}

static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

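/*
 * The pmap_*_bit() helpers above and below return the page table entry
 * bit that plays a given role for this pmap's page table format.
 * Ordinary x86 page tables (PT_X86) and nested page tables (PT_RVI,
 * PT_EPT) use different layouts, and when EPT accessed/dirty bits are
 * emulated in software the read and write permission bits stand in for
 * the accessed and modified bits, which is why the PT_EPT cases depend
 * on pmap_emulate_ad_bits().
 */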
static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_G;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

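/*
 * pg_ps_enabled corresponds to the vm.pmap.pg_ps_enabled tunable and
 * sysctl.  Together with the per-pmap PMAP_PDE_SUPERPAGE flag (see
 * pmap_ps_enabled()), it controls whether 2MB page mappings may be
 * created and promoted.
 */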
static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */

static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
static int		ndmpdpphys;	/* number of DMPDPphys pages */

/*
 * pmap_mapdev support pre initialization (i.e. console)
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

static struct rwlock_padalign pvh_global_lock;

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;

static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */

static struct unrhdr pcid_unr;
static struct mtx pcid_mtx;
int pmap_pcid_enabled = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
    0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");

static int
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
{
	int i;
	uint64_t res;

	res = 0;
	CPU_FOREACH(i) {
		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
	}
	return (sysctl_handle_64(oidp, &res, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
    "Count of saved TLB context on switch");

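/*
 * When TLB context IDs (PCIDs) are enabled, ids for pmaps are allocated
 * from pcid_unr (the kernel pmap uses id 0, set in pmap_bootstrap()),
 * and pm_save tracks the CPUs that may still hold translations tagged
 * with a pmap's id.  The invalidation routines below use the INVPCID
 * instruction when it is available and otherwise fall back to
 * reloading %cr3 with CR3_PCID_SAVE set.
 */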
/* pmap_copy_pages() over non-DMAP */
static struct mtx cpage_lock;
static vm_offset_t cpage_a;
static vm_offset_t cpage_b;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_elem(uint64_t elem);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
		struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

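/*
 * For reference, the inline helpers further below decode a canonical
 * amd64 virtual address as follows: bits 47-39 index the PML4, bits
 * 38-30 the page directory pointer (PDP) table, bits 29-21 the page
 * directory (PD), bits 20-12 the page table (PT), and bits 11-0 are
 * the offset within a 4KB page.  A PD entry with PG_PS set maps a 2MB
 * page directly, and a PDP entry with PG_PS set maps a 1GB page.
 */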
/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return (newaddr);
}

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}


/* Return various clipped indexes for a given VA */
static __inline vm_pindex_t
pmap_pte_index(vm_offset_t va)
{

	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pde_index(vm_offset_t va)
{

	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{

	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{

	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int
count) 630{ 631 632 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 633 pmap->pm_stats.resident_count += count; 634} 635 636static __inline void 637pmap_resident_count_dec(pmap_t pmap, int count) 638{ 639 640 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 641 KASSERT(pmap->pm_stats.resident_count >= count, 642 ("pmap %p resident count underflow %ld %d", pmap, 643 pmap->pm_stats.resident_count, count)); 644 pmap->pm_stats.resident_count -= count; 645} 646 647PMAP_INLINE pt_entry_t * 648vtopte(vm_offset_t va) 649{ 650 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 651 652 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 653 654 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 655} 656 657static __inline pd_entry_t * 658vtopde(vm_offset_t va) 659{ 660 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 661 662 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 663 664 return (PDmap + ((va >> PDRSHIFT) & mask)); 665} 666 667static u_int64_t 668allocpages(vm_paddr_t *firstaddr, int n) 669{ 670 u_int64_t ret; 671 672 ret = *firstaddr; 673 bzero((void *)ret, n * PAGE_SIZE); 674 *firstaddr += n * PAGE_SIZE; 675 return (ret); 676} 677 678CTASSERT(powerof2(NDMPML4E)); 679 680/* number of kernel PDP slots */ 681#define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 682 683static void 684nkpt_init(vm_paddr_t addr) 685{ 686 int pt_pages; 687 688#ifdef NKPT 689 pt_pages = NKPT; 690#else 691 pt_pages = howmany(addr, 1 << PDRSHIFT); 692 pt_pages += NKPDPE(pt_pages); 693 694 /* 695 * Add some slop beyond the bare minimum required for bootstrapping 696 * the kernel. 697 * 698 * This is quite important when allocating KVA for kernel modules. 699 * The modules are required to be linked in the negative 2GB of 700 * the address space. If we run out of KVA in this region then 701 * pmap_growkernel() will need to allocate page table pages to map 702 * the entire 512GB of KVA space which is an unnecessary tax on 703 * physical memory. 704 * 705 * Secondly, device memory mapped as part of setting up the low- 706 * level console(s) is taken from KVA, starting at virtual_avail. 707 * This is because cninit() is called after pmap_bootstrap() but 708 * before vm_init() and pmap_init(). 20MB for a frame buffer is 709 * not uncommon. 710 */ 711 pt_pages += 32; /* 64MB additional slop. */ 712#endif 713 nkpt = pt_pages; 714} 715 716static void 717create_pagetables(vm_paddr_t *firstaddr) 718{ 719 int i, j, ndm1g, nkpdpe; 720 pt_entry_t *pt_p; 721 pd_entry_t *pd_p; 722 pdp_entry_t *pdp_p; 723 pml4_entry_t *p4_p; 724 725 /* Allocate page table pages for the direct map */ 726 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 727 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 728 ndmpdp = 4; 729 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 730 if (ndmpdpphys > NDMPML4E) { 731 /* 732 * Each NDMPML4E allows 512 GB, so limit to that, 733 * and then readjust ndmpdp and ndmpdpphys. 
734 */ 735 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 736 Maxmem = atop(NDMPML4E * NBPML4); 737 ndmpdpphys = NDMPML4E; 738 ndmpdp = NDMPML4E * NPDEPG; 739 } 740 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 741 ndm1g = 0; 742 if ((amd_feature & AMDID_PAGE1GB) != 0) 743 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 744 if (ndm1g < ndmpdp) 745 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 746 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 747 748 /* Allocate pages */ 749 KPML4phys = allocpages(firstaddr, 1); 750 KPDPphys = allocpages(firstaddr, NKPML4E); 751 752 /* 753 * Allocate the initial number of kernel page table pages required to 754 * bootstrap. We defer this until after all memory-size dependent 755 * allocations are done (e.g. direct map), so that we don't have to 756 * build in too much slop in our estimate. 757 * 758 * Note that when NKPML4E > 1, we have an empty page underneath 759 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 760 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 761 */ 762 nkpt_init(*firstaddr); 763 nkpdpe = NKPDPE(nkpt); 764 765 KPTphys = allocpages(firstaddr, nkpt); 766 KPDphys = allocpages(firstaddr, nkpdpe); 767 768 /* Fill in the underlying page table pages */ 769 /* Nominally read-only (but really R/W) from zero to physfree */ 770 /* XXX not fully used, underneath 2M pages */ 771 pt_p = (pt_entry_t *)KPTphys; 772 for (i = 0; ptoa(i) < *firstaddr; i++) 773 pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; 774 775 /* Now map the page tables at their location within PTmap */ 776 pd_p = (pd_entry_t *)KPDphys; 777 for (i = 0; i < nkpt; i++) 778 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 779 780 /* Map from zero to end of allocations under 2M pages */ 781 /* This replaces some of the KPTphys entries above */ 782 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) 783 pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | 784 X86_PG_G; 785 786 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 787 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 788 for (i = 0; i < nkpdpe; i++) 789 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V | 790 PG_U; 791 792 /* 793 * Now, set up the direct map region using 2MB and/or 1GB pages. If 794 * the end of physical memory is not aligned to a 1GB page boundary, 795 * then the residual physical memory is mapped with 2MB pages. Later, 796 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 797 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 798 * that are partially used. 799 */ 800 pd_p = (pd_entry_t *)DMPDphys; 801 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 802 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 803 /* Preset PG_M and PG_A because demotion expects it. */ 804 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | 805 X86_PG_M | X86_PG_A; 806 } 807 pdp_p = (pdp_entry_t *)DMPDPphys; 808 for (i = 0; i < ndm1g; i++) { 809 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 810 /* Preset PG_M and PG_A because demotion expects it. */ 811 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | 812 X86_PG_M | X86_PG_A; 813 } 814 for (j = 0; i < ndmpdp; i++, j++) { 815 pdp_p[i] = DMPDphys + ptoa(j); 816 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; 817 } 818 819 /* And recursively map PML4 to itself in order to get PTmap */ 820 p4_p = (pml4_entry_t *)KPML4phys; 821 p4_p[PML4PML4I] = KPML4phys; 822 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; 823 824 /* Connect the Direct Map slot(s) up to the PML4. 
*/ 825 for (i = 0; i < ndmpdpphys; i++) { 826 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 827 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U; 828 } 829 830 /* Connect the KVA slots up to the PML4 */ 831 for (i = 0; i < NKPML4E; i++) { 832 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 833 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U; 834 } 835} 836 837/* 838 * Bootstrap the system enough to run with virtual memory. 839 * 840 * On amd64 this is called after mapping has already been enabled 841 * and just syncs the pmap module with what has already been done. 842 * [We can't call it easily with mapping off since the kernel is not 843 * mapped with PA == VA, hence we would have to relocate every address 844 * from the linked base (virtual) address "KERNBASE" to the actual 845 * (physical) address starting relative to 0] 846 */ 847void 848pmap_bootstrap(vm_paddr_t *firstaddr) 849{ 850 vm_offset_t va; 851 pt_entry_t *pte; 852 853 /* 854 * Create an initial set of page tables to run the kernel in. 855 */ 856 create_pagetables(firstaddr); 857 858 /* 859 * Add a physical memory segment (vm_phys_seg) corresponding to the 860 * preallocated kernel page table pages so that vm_page structures 861 * representing these pages will be created. The vm_page structures 862 * are required for promotion of the corresponding kernel virtual 863 * addresses to superpage mappings. 864 */ 865 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 866 867 virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; 868 virtual_avail = pmap_kmem_choose(virtual_avail); 869 870 virtual_end = VM_MAX_KERNEL_ADDRESS; 871 872 873 /* XXX do %cr0 as well */ 874 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 875 load_cr3(KPML4phys); 876 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 877 load_cr4(rcr4() | CR4_SMEP); 878 879 /* 880 * Initialize the kernel pmap (which is statically allocated). 881 */ 882 PMAP_LOCK_INIT(kernel_pmap); 883 kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); 884 kernel_pmap->pm_cr3 = KPML4phys; 885 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 886 CPU_FILL(&kernel_pmap->pm_save); /* always superset of pm_active */ 887 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 888 kernel_pmap->pm_flags = pmap_flags; 889 890 /* 891 * Initialize the global pv list lock. 892 */ 893 rw_init(&pvh_global_lock, "pmap pv global"); 894 895 /* 896 * Reserve some special page table entries/VA space for temporary 897 * mapping of pages. 898 */ 899#define SYSMAP(c, p, v, n) \ 900 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 901 902 va = virtual_avail; 903 pte = vtopte(va); 904 905 /* 906 * Crashdump maps. The first page is reused as CMAP1 for the 907 * memory test. 908 */ 909 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 910 CADDR1 = crashdumpmap; 911 912 virtual_avail = va; 913 914 /* Initialize the PAT MSR. */ 915 pmap_init_pat(); 916 917 /* Initialize TLB Context Id. */ 918 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); 919 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { 920 load_cr4(rcr4() | CR4_PCIDE); 921 mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF); 922 init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx); 923 /* Check for INVPCID support */ 924 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) 925 != 0; 926 kernel_pmap->pm_pcid = 0; 927#ifndef SMP 928 pmap_pcid_enabled = 0; 929#endif 930 } else 931 pmap_pcid_enabled = 0; 932} 933 934/* 935 * Setup the PAT MSR. 
936 */ 937void 938pmap_init_pat(void) 939{ 940 int pat_table[PAT_INDEX_SIZE]; 941 uint64_t pat_msr; 942 u_long cr0, cr4; 943 int i; 944 945 /* Bail if this CPU doesn't implement PAT. */ 946 if ((cpu_feature & CPUID_PAT) == 0) 947 panic("no PAT??"); 948 949 /* Set default PAT index table. */ 950 for (i = 0; i < PAT_INDEX_SIZE; i++) 951 pat_table[i] = -1; 952 pat_table[PAT_WRITE_BACK] = 0; 953 pat_table[PAT_WRITE_THROUGH] = 1; 954 pat_table[PAT_UNCACHEABLE] = 3; 955 pat_table[PAT_WRITE_COMBINING] = 3; 956 pat_table[PAT_WRITE_PROTECTED] = 3; 957 pat_table[PAT_UNCACHED] = 3; 958 959 /* Initialize default PAT entries. */ 960 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 961 PAT_VALUE(1, PAT_WRITE_THROUGH) | 962 PAT_VALUE(2, PAT_UNCACHED) | 963 PAT_VALUE(3, PAT_UNCACHEABLE) | 964 PAT_VALUE(4, PAT_WRITE_BACK) | 965 PAT_VALUE(5, PAT_WRITE_THROUGH) | 966 PAT_VALUE(6, PAT_UNCACHED) | 967 PAT_VALUE(7, PAT_UNCACHEABLE); 968 969 if (pat_works) { 970 /* 971 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 972 * Program 5 and 6 as WP and WC. 973 * Leave 4 and 7 as WB and UC. 974 */ 975 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 976 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 977 PAT_VALUE(6, PAT_WRITE_COMBINING); 978 pat_table[PAT_UNCACHED] = 2; 979 pat_table[PAT_WRITE_PROTECTED] = 5; 980 pat_table[PAT_WRITE_COMBINING] = 6; 981 } else { 982 /* 983 * Just replace PAT Index 2 with WC instead of UC-. 984 */ 985 pat_msr &= ~PAT_MASK(2); 986 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 987 pat_table[PAT_WRITE_COMBINING] = 2; 988 } 989 990 /* Disable PGE. */ 991 cr4 = rcr4(); 992 load_cr4(cr4 & ~CR4_PGE); 993 994 /* Disable caches (CD = 1, NW = 0). */ 995 cr0 = rcr0(); 996 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 997 998 /* Flushes caches and TLBs. */ 999 wbinvd(); 1000 invltlb(); 1001 1002 /* Update PAT and index table. */ 1003 wrmsr(MSR_PAT, pat_msr); 1004 for (i = 0; i < PAT_INDEX_SIZE; i++) 1005 pat_index[i] = pat_table[i]; 1006 1007 /* Flush caches and TLBs again. */ 1008 wbinvd(); 1009 invltlb(); 1010 1011 /* Restore caches and PGE. */ 1012 load_cr0(cr0); 1013 load_cr4(cr4); 1014} 1015 1016/* 1017 * Initialize a vm_page's machine-dependent fields. 1018 */ 1019void 1020pmap_page_init(vm_page_t m) 1021{ 1022 1023 TAILQ_INIT(&m->md.pv_list); 1024 m->md.pat_mode = PAT_WRITE_BACK; 1025} 1026 1027/* 1028 * Initialize the pmap module. 1029 * Called by vm_init, to initialize any structures that the pmap 1030 * system needs to map virtual memory. 1031 */ 1032void 1033pmap_init(void) 1034{ 1035 struct pmap_preinit_mapping *ppim; 1036 vm_page_t mpte; 1037 vm_size_t s; 1038 int i, pv_npg; 1039 1040 /* 1041 * Initialize the vm page array entries for the kernel pmap's 1042 * page table pages. 1043 */ 1044 for (i = 0; i < nkpt; i++) { 1045 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 1046 KASSERT(mpte >= vm_page_array && 1047 mpte < &vm_page_array[vm_page_array_size], 1048 ("pmap_init: page table page is out of range")); 1049 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 1050 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 1051 } 1052 1053 /* 1054 * If the kernel is running on a virtual machine, then it must assume 1055 * that MCA is enabled by the hypervisor. Moreover, the kernel must 1056 * be prepared for the hypervisor changing the vendor and family that 1057 * are reported by CPUID. 
Consequently, the workaround for AMD Family 1058 * 10h Erratum 383 is enabled if the processor's feature set does not 1059 * include at least one feature that is only supported by older Intel 1060 * or newer AMD processors. 1061 */ 1062 if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 && 1063 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 1064 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 1065 AMDID2_FMA4)) == 0) 1066 workaround_erratum383 = 1; 1067 1068 /* 1069 * Are large page mappings enabled? 1070 */ 1071 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 1072 if (pg_ps_enabled) { 1073 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1074 ("pmap_init: can't assign to pagesizes[1]")); 1075 pagesizes[1] = NBPDR; 1076 } 1077 1078 /* 1079 * Initialize the pv chunk list mutex. 1080 */ 1081 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 1082 1083 /* 1084 * Initialize the pool of pv list locks. 1085 */ 1086 for (i = 0; i < NPV_LIST_LOCKS; i++) 1087 rw_init(&pv_list_locks[i], "pmap pv list"); 1088 1089 /* 1090 * Calculate the size of the pv head table for superpages. 1091 */ 1092 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 1093 1094 /* 1095 * Allocate memory for the pv head table for superpages. 1096 */ 1097 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1098 s = round_page(s); 1099 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 1100 M_WAITOK | M_ZERO); 1101 for (i = 0; i < pv_npg; i++) 1102 TAILQ_INIT(&pv_table[i].pv_list); 1103 1104 mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF); 1105 cpage_a = kva_alloc(PAGE_SIZE); 1106 cpage_b = kva_alloc(PAGE_SIZE); 1107 1108 pmap_initialized = 1; 1109 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 1110 ppim = pmap_preinit_mapping + i; 1111 if (ppim->va == 0) 1112 continue; 1113 /* Make the direct map consistent */ 1114 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) { 1115 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 1116 ppim->sz, ppim->mode); 1117 } 1118 if (!bootverbose) 1119 continue; 1120 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 1121 ppim->pa, ppim->va, ppim->sz, ppim->mode); 1122 } 1123} 1124 1125static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 1126 "2MB page mapping counters"); 1127 1128static u_long pmap_pde_demotions; 1129SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 1130 &pmap_pde_demotions, 0, "2MB page demotions"); 1131 1132static u_long pmap_pde_mappings; 1133SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 1134 &pmap_pde_mappings, 0, "2MB page mappings"); 1135 1136static u_long pmap_pde_p_failures; 1137SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 1138 &pmap_pde_p_failures, 0, "2MB page promotion failures"); 1139 1140static u_long pmap_pde_promotions; 1141SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 1142 &pmap_pde_promotions, 0, "2MB page promotions"); 1143 1144static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, 1145 "1GB page mapping counters"); 1146 1147static u_long pmap_pdpe_demotions; 1148SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 1149 &pmap_pdpe_demotions, 0, "1GB page demotions"); 1150 1151/*************************************************** 1152 * Low level helper routines..... 
1153 ***************************************************/ 1154 1155static pt_entry_t 1156pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 1157{ 1158 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 1159 1160 switch (pmap->pm_type) { 1161 case PT_X86: 1162 case PT_RVI: 1163 /* Verify that both PAT bits are not set at the same time */ 1164 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 1165 ("Invalid PAT bits in entry %#lx", entry)); 1166 1167 /* Swap the PAT bits if one of them is set */ 1168 if ((entry & x86_pat_bits) != 0) 1169 entry ^= x86_pat_bits; 1170 break; 1171 case PT_EPT: 1172 /* 1173 * Nothing to do - the memory attributes are represented 1174 * the same way for regular pages and superpages. 1175 */ 1176 break; 1177 default: 1178 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 1179 } 1180 1181 return (entry); 1182} 1183 1184/* 1185 * Determine the appropriate bits to set in a PTE or PDE for a specified 1186 * caching mode. 1187 */ 1188static int 1189pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 1190{ 1191 int cache_bits, pat_flag, pat_idx; 1192 1193 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 1194 panic("Unknown caching mode %d\n", mode); 1195 1196 switch (pmap->pm_type) { 1197 case PT_X86: 1198 case PT_RVI: 1199 /* The PAT bit is different for PTE's and PDE's. */ 1200 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 1201 1202 /* Map the caching mode to a PAT index. */ 1203 pat_idx = pat_index[mode]; 1204 1205 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 1206 cache_bits = 0; 1207 if (pat_idx & 0x4) 1208 cache_bits |= pat_flag; 1209 if (pat_idx & 0x2) 1210 cache_bits |= PG_NC_PCD; 1211 if (pat_idx & 0x1) 1212 cache_bits |= PG_NC_PWT; 1213 break; 1214 1215 case PT_EPT: 1216 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 1217 break; 1218 1219 default: 1220 panic("unsupported pmap type %d", pmap->pm_type); 1221 } 1222 1223 return (cache_bits); 1224} 1225 1226static int 1227pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 1228{ 1229 int mask; 1230 1231 switch (pmap->pm_type) { 1232 case PT_X86: 1233 case PT_RVI: 1234 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 1235 break; 1236 case PT_EPT: 1237 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 1238 break; 1239 default: 1240 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 1241 } 1242 1243 return (mask); 1244} 1245 1246static __inline boolean_t 1247pmap_ps_enabled(pmap_t pmap) 1248{ 1249 1250 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 1251} 1252 1253static void 1254pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 1255{ 1256 1257 switch (pmap->pm_type) { 1258 case PT_X86: 1259 break; 1260 case PT_RVI: 1261 case PT_EPT: 1262 /* 1263 * XXX 1264 * This is a little bogus since the generation number is 1265 * supposed to be bumped up when a region of the address 1266 * space is invalidated in the page tables. 1267 * 1268 * In this case the old PDE entry is valid but yet we want 1269 * to make sure that any mappings using the old entry are 1270 * invalidated in the TLB. 1271 * 1272 * The reason this works as expected is because we rendezvous 1273 * "all" host cpus and force any vcpu context to exit as a 1274 * side-effect. 
1275 */ 1276 atomic_add_acq_long(&pmap->pm_eptgen, 1); 1277 break; 1278 default: 1279 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 1280 } 1281 pde_store(pde, newpde); 1282} 1283 1284/* 1285 * After changing the page size for the specified virtual address in the page 1286 * table, flush the corresponding entries from the processor's TLB. Only the 1287 * calling processor's TLB is affected. 1288 * 1289 * The calling thread must be pinned to a processor. 1290 */ 1291static void 1292pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 1293{ 1294 pt_entry_t PG_G; 1295 1296 if (pmap_type_guest(pmap)) 1297 return; 1298 1299 KASSERT(pmap->pm_type == PT_X86, 1300 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 1301 1302 PG_G = pmap_global_bit(pmap); 1303 1304 if ((newpde & PG_PS) == 0) 1305 /* Demotion: flush a specific 2MB page mapping. */ 1306 invlpg(va); 1307 else if ((newpde & PG_G) == 0) 1308 /* 1309 * Promotion: flush every 4KB page mapping from the TLB 1310 * because there are too many to flush individually. 1311 */ 1312 invltlb(); 1313 else { 1314 /* 1315 * Promotion: flush every 4KB page mapping from the TLB, 1316 * including any global (PG_G) mappings. 1317 */ 1318 invltlb_globpcid(); 1319 } 1320} 1321#ifdef SMP 1322 1323static void 1324pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va) 1325{ 1326 struct invpcid_descr d; 1327 uint64_t cr3; 1328 1329 if (invpcid_works) { 1330 d.pcid = pmap->pm_pcid; 1331 d.pad = 0; 1332 d.addr = va; 1333 invpcid(&d, INVPCID_ADDR); 1334 return; 1335 } 1336 1337 cr3 = rcr3(); 1338 critical_enter(); 1339 load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE); 1340 invlpg(va); 1341 load_cr3(cr3 | CR3_PCID_SAVE); 1342 critical_exit(); 1343} 1344 1345/* 1346 * For SMP, these functions have to use the IPI mechanism for coherence. 1347 * 1348 * N.B.: Before calling any of the following TLB invalidation functions, 1349 * the calling processor must ensure that all stores updating a non- 1350 * kernel page table are globally performed. Otherwise, another 1351 * processor could cache an old, pre-update entry without being 1352 * invalidated. This can happen one of two ways: (1) The pmap becomes 1353 * active on another processor after its pm_active field is checked by 1354 * one of the following functions but before a store updating the page 1355 * table is globally performed. (2) The pmap becomes active on another 1356 * processor before its pm_active field is checked but due to 1357 * speculative loads one of the following functions stills reads the 1358 * pmap as inactive on the other processor. 1359 * 1360 * The kernel page table is exempt because its pm_active field is 1361 * immutable. The kernel page table is always active on every 1362 * processor. 1363 */ 1364 1365/* 1366 * Interrupt the cpus that are executing in the guest context. 1367 * This will force the vcpu to exit and the cached EPT mappings 1368 * will be invalidated by the host before the next vmresume. 1369 */ 1370static __inline void 1371pmap_invalidate_ept(pmap_t pmap) 1372{ 1373 int ipinum; 1374 1375 sched_pin(); 1376 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 1377 ("pmap_invalidate_ept: absurd pm_active")); 1378 1379 /* 1380 * The TLB mappings associated with a vcpu context are not 1381 * flushed each time a different vcpu is chosen to execute. 1382 * 1383 * This is in contrast with a process's vtop mappings that 1384 * are flushed from the TLB on each context switch. 
1385 * 1386 * Therefore we need to do more than just a TLB shootdown on 1387 * the active cpus in 'pmap->pm_active'. To do this we keep 1388 * track of the number of invalidations performed on this pmap. 1389 * 1390 * Each vcpu keeps a cache of this counter and compares it 1391 * just before a vmresume. If the counter is out-of-date an 1392 * invept will be done to flush stale mappings from the TLB. 1393 */ 1394 atomic_add_acq_long(&pmap->pm_eptgen, 1); 1395 1396 /* 1397 * Force the vcpu to exit and trap back into the hypervisor. 1398 */ 1399 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 1400 ipi_selected(pmap->pm_active, ipinum); 1401 sched_unpin(); 1402} 1403 1404void 1405pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1406{ 1407 cpuset_t other_cpus; 1408 u_int cpuid; 1409 1410 if (pmap_type_guest(pmap)) { 1411 pmap_invalidate_ept(pmap); 1412 return; 1413 } 1414 1415 KASSERT(pmap->pm_type == PT_X86, 1416 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 1417 1418 sched_pin(); 1419 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1420 if (!pmap_pcid_enabled) { 1421 invlpg(va); 1422 } else { 1423 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { 1424 if (pmap == PCPU_GET(curpmap)) 1425 invlpg(va); 1426 else 1427 pmap_invalidate_page_pcid(pmap, va); 1428 } else { 1429 invltlb_globpcid(); 1430 } 1431 } 1432 smp_invlpg(pmap, va); 1433 } else { 1434 cpuid = PCPU_GET(cpuid); 1435 other_cpus = all_cpus; 1436 CPU_CLR(cpuid, &other_cpus); 1437 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1438 invlpg(va); 1439 else if (pmap_pcid_enabled) { 1440 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) 1441 pmap_invalidate_page_pcid(pmap, va); 1442 else 1443 invltlb_globpcid(); 1444 } 1445 if (pmap_pcid_enabled) 1446 CPU_AND(&other_cpus, &pmap->pm_save); 1447 else 1448 CPU_AND(&other_cpus, &pmap->pm_active); 1449 if (!CPU_EMPTY(&other_cpus)) 1450 smp_masked_invlpg(other_cpus, pmap, va); 1451 } 1452 sched_unpin(); 1453} 1454 1455static void 1456pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1457{ 1458 struct invpcid_descr d; 1459 uint64_t cr3; 1460 vm_offset_t addr; 1461 1462 if (invpcid_works) { 1463 d.pcid = pmap->pm_pcid; 1464 d.pad = 0; 1465 for (addr = sva; addr < eva; addr += PAGE_SIZE) { 1466 d.addr = addr; 1467 invpcid(&d, INVPCID_ADDR); 1468 } 1469 return; 1470 } 1471 1472 cr3 = rcr3(); 1473 critical_enter(); 1474 load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE); 1475 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1476 invlpg(addr); 1477 load_cr3(cr3 | CR3_PCID_SAVE); 1478 critical_exit(); 1479} 1480 1481void 1482pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1483{ 1484 cpuset_t other_cpus; 1485 vm_offset_t addr; 1486 u_int cpuid; 1487 1488 if (pmap_type_guest(pmap)) { 1489 pmap_invalidate_ept(pmap); 1490 return; 1491 } 1492 1493 KASSERT(pmap->pm_type == PT_X86, 1494 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 1495 1496 sched_pin(); 1497 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1498 if (!pmap_pcid_enabled) { 1499 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1500 invlpg(addr); 1501 } else { 1502 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { 1503 if (pmap == PCPU_GET(curpmap)) { 1504 for (addr = sva; addr < eva; 1505 addr += PAGE_SIZE) 1506 invlpg(addr); 1507 } else { 1508 pmap_invalidate_range_pcid(pmap, 1509 sva, eva); 1510 } 1511 } else { 1512 invltlb_globpcid(); 1513 } 1514 } 1515 smp_invlpg_range(pmap, sva, eva); 1516 } else { 1517 cpuid = PCPU_GET(cpuid); 1518 other_cpus = all_cpus; 
1519 CPU_CLR(cpuid, &other_cpus); 1520 if (CPU_ISSET(cpuid, &pmap->pm_active)) { 1521 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1522 invlpg(addr); 1523 } else if (pmap_pcid_enabled) { 1524 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) 1525 pmap_invalidate_range_pcid(pmap, sva, eva); 1526 else 1527 invltlb_globpcid(); 1528 } 1529 if (pmap_pcid_enabled) 1530 CPU_AND(&other_cpus, &pmap->pm_save); 1531 else 1532 CPU_AND(&other_cpus, &pmap->pm_active); 1533 if (!CPU_EMPTY(&other_cpus)) 1534 smp_masked_invlpg_range(other_cpus, pmap, sva, eva); 1535 } 1536 sched_unpin(); 1537} 1538 1539void 1540pmap_invalidate_all(pmap_t pmap) 1541{ 1542 cpuset_t other_cpus; 1543 struct invpcid_descr d; 1544 uint64_t cr3; 1545 u_int cpuid; 1546 1547 if (pmap_type_guest(pmap)) { 1548 pmap_invalidate_ept(pmap); 1549 return; 1550 } 1551 1552 KASSERT(pmap->pm_type == PT_X86, 1553 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 1554 1555 sched_pin(); 1556 cpuid = PCPU_GET(cpuid); 1557 if (pmap == kernel_pmap || 1558 (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) || 1559 !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1560 if (invpcid_works) { 1561 bzero(&d, sizeof(d)); 1562 invpcid(&d, INVPCID_CTXGLOB); 1563 } else { 1564 invltlb_globpcid(); 1565 } 1566 if (!CPU_ISSET(cpuid, &pmap->pm_active)) 1567 CPU_CLR_ATOMIC(cpuid, &pmap->pm_save); 1568 smp_invltlb(pmap); 1569 } else { 1570 other_cpus = all_cpus; 1571 CPU_CLR(cpuid, &other_cpus); 1572 1573 /* 1574 * This logic is duplicated in the Xinvltlb shootdown 1575 * IPI handler. 1576 */ 1577 if (pmap_pcid_enabled) { 1578 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { 1579 if (invpcid_works) { 1580 d.pcid = pmap->pm_pcid; 1581 d.pad = 0; 1582 d.addr = 0; 1583 invpcid(&d, INVPCID_CTX); 1584 } else { 1585 cr3 = rcr3(); 1586 critical_enter(); 1587 1588 /* 1589 * Bit 63 is clear, pcid TLB 1590 * entries are invalidated. 1591 */ 1592 load_cr3(pmap->pm_cr3); 1593 load_cr3(cr3 | CR3_PCID_SAVE); 1594 critical_exit(); 1595 } 1596 } else { 1597 invltlb_globpcid(); 1598 } 1599 } else if (CPU_ISSET(cpuid, &pmap->pm_active)) 1600 invltlb(); 1601 if (!CPU_ISSET(cpuid, &pmap->pm_active)) 1602 CPU_CLR_ATOMIC(cpuid, &pmap->pm_save); 1603 if (pmap_pcid_enabled) 1604 CPU_AND(&other_cpus, &pmap->pm_save); 1605 else 1606 CPU_AND(&other_cpus, &pmap->pm_active); 1607 if (!CPU_EMPTY(&other_cpus)) 1608 smp_masked_invltlb(other_cpus, pmap); 1609 } 1610 sched_unpin(); 1611} 1612 1613void 1614pmap_invalidate_cache(void) 1615{ 1616 1617 sched_pin(); 1618 wbinvd(); 1619 smp_cache_flush(); 1620 sched_unpin(); 1621} 1622 1623struct pde_action { 1624 cpuset_t invalidate; /* processors that invalidate their TLB */ 1625 pmap_t pmap; 1626 vm_offset_t va; 1627 pd_entry_t *pde; 1628 pd_entry_t newpde; 1629 u_int store; /* processor that updates the PDE */ 1630}; 1631 1632static void 1633pmap_update_pde_action(void *arg) 1634{ 1635 struct pde_action *act = arg; 1636 1637 if (act->store == PCPU_GET(cpuid)) 1638 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 1639} 1640 1641static void 1642pmap_update_pde_teardown(void *arg) 1643{ 1644 struct pde_action *act = arg; 1645 1646 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1647 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 1648} 1649 1650/* 1651 * Change the page size for the specified virtual address in a way that 1652 * prevents any possibility of the TLB ever having two entries that map the 1653 * same virtual address using different page sizes. 
This is the recommended 1654 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1655 * machine check exception for a TLB state that is improperly diagnosed as a 1656 * hardware error. 1657 */ 1658static void 1659pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1660{ 1661 struct pde_action act; 1662 cpuset_t active, other_cpus; 1663 u_int cpuid; 1664 1665 sched_pin(); 1666 cpuid = PCPU_GET(cpuid); 1667 other_cpus = all_cpus; 1668 CPU_CLR(cpuid, &other_cpus); 1669 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 1670 active = all_cpus; 1671 else { 1672 active = pmap->pm_active; 1673 CPU_AND_ATOMIC(&pmap->pm_save, &active); 1674 } 1675 if (CPU_OVERLAP(&active, &other_cpus)) { 1676 act.store = cpuid; 1677 act.invalidate = active; 1678 act.va = va; 1679 act.pmap = pmap; 1680 act.pde = pde; 1681 act.newpde = newpde; 1682 CPU_SET(cpuid, &active); 1683 smp_rendezvous_cpus(active, 1684 smp_no_rendevous_barrier, pmap_update_pde_action, 1685 pmap_update_pde_teardown, &act); 1686 } else { 1687 pmap_update_pde_store(pmap, pde, newpde); 1688 if (CPU_ISSET(cpuid, &active)) 1689 pmap_update_pde_invalidate(pmap, va, newpde); 1690 } 1691 sched_unpin(); 1692} 1693#else /* !SMP */ 1694/* 1695 * Normal, non-SMP, invalidation functions. 1696 * We inline these within pmap.c for speed. 1697 */ 1698PMAP_INLINE void 1699pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1700{ 1701 1702 switch (pmap->pm_type) { 1703 case PT_X86: 1704 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1705 invlpg(va); 1706 break; 1707 case PT_RVI: 1708 case PT_EPT: 1709 pmap->pm_eptgen++; 1710 break; 1711 default: 1712 panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type); 1713 } 1714} 1715 1716PMAP_INLINE void 1717pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1718{ 1719 vm_offset_t addr; 1720 1721 switch (pmap->pm_type) { 1722 case PT_X86: 1723 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1724 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1725 invlpg(addr); 1726 break; 1727 case PT_RVI: 1728 case PT_EPT: 1729 pmap->pm_eptgen++; 1730 break; 1731 default: 1732 panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type); 1733 } 1734} 1735 1736PMAP_INLINE void 1737pmap_invalidate_all(pmap_t pmap) 1738{ 1739 1740 switch (pmap->pm_type) { 1741 case PT_X86: 1742 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1743 invltlb(); 1744 break; 1745 case PT_RVI: 1746 case PT_EPT: 1747 pmap->pm_eptgen++; 1748 break; 1749 default: 1750 panic("pmap_invalidate_all: unknown type %d", pmap->pm_type); 1751 } 1752} 1753 1754PMAP_INLINE void 1755pmap_invalidate_cache(void) 1756{ 1757 1758 wbinvd(); 1759} 1760 1761static void 1762pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1763{ 1764 1765 pmap_update_pde_store(pmap, pde, newpde); 1766 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1767 pmap_update_pde_invalidate(pmap, va, newpde); 1768 else 1769 CPU_ZERO(&pmap->pm_save); 1770} 1771#endif /* !SMP */ 1772 1773#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1774 1775void 1776pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) 1777{ 1778 1779 if (force) { 1780 sva &= ~(vm_offset_t)cpu_clflush_line_size; 1781 } else { 1782 KASSERT((sva & PAGE_MASK) == 0, 1783 ("pmap_invalidate_cache_range: sva not page-aligned")); 1784 KASSERT((eva & PAGE_MASK) == 0, 1785 ("pmap_invalidate_cache_range: eva not page-aligned")); 1786 } 1787 1788 if ((cpu_feature & CPUID_SS) != 0 && !force) 
1789 ; /* If "Self Snoop" is supported and allowed, do nothing. */ 1790 else if ((cpu_feature & CPUID_CLFSH) != 0 && 1791 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1792 1793 /* 1794 * XXX: Some CPUs fault, hang, or trash the local APIC 1795 * registers if we use CLFLUSH on the local APIC 1796 * range. The local APIC is always uncached, so we 1797 * don't need to flush for that range anyway. 1798 */ 1799 if (pmap_kextract(sva) == lapic_paddr) 1800 return; 1801 1802 /* 1803 * Otherwise, do per-cache line flush. Use the mfence 1804 * instruction to insure that previous stores are 1805 * included in the write-back. The processor 1806 * propagates flush to other processors in the cache 1807 * coherence domain. 1808 */ 1809 mfence(); 1810 for (; sva < eva; sva += cpu_clflush_line_size) 1811 clflush(sva); 1812 mfence(); 1813 } else { 1814 1815 /* 1816 * No targeted cache flush methods are supported by CPU, 1817 * or the supplied range is bigger than 2MB. 1818 * Globally invalidate cache. 1819 */ 1820 pmap_invalidate_cache(); 1821 } 1822} 1823 1824/* 1825 * Remove the specified set of pages from the data and instruction caches. 1826 * 1827 * In contrast to pmap_invalidate_cache_range(), this function does not 1828 * rely on the CPU's self-snoop feature, because it is intended for use 1829 * when moving pages into a different cache domain. 1830 */ 1831void 1832pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1833{ 1834 vm_offset_t daddr, eva; 1835 int i; 1836 1837 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1838 (cpu_feature & CPUID_CLFSH) == 0) 1839 pmap_invalidate_cache(); 1840 else { 1841 mfence(); 1842 for (i = 0; i < count; i++) { 1843 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1844 eva = daddr + PAGE_SIZE; 1845 for (; daddr < eva; daddr += cpu_clflush_line_size) 1846 clflush(daddr); 1847 } 1848 mfence(); 1849 } 1850} 1851 1852/* 1853 * Routine: pmap_extract 1854 * Function: 1855 * Extract the physical page address associated 1856 * with the given map/virtual_address pair. 1857 */ 1858vm_paddr_t 1859pmap_extract(pmap_t pmap, vm_offset_t va) 1860{ 1861 pdp_entry_t *pdpe; 1862 pd_entry_t *pde; 1863 pt_entry_t *pte, PG_V; 1864 vm_paddr_t pa; 1865 1866 pa = 0; 1867 PG_V = pmap_valid_bit(pmap); 1868 PMAP_LOCK(pmap); 1869 pdpe = pmap_pdpe(pmap, va); 1870 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 1871 if ((*pdpe & PG_PS) != 0) 1872 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 1873 else { 1874 pde = pmap_pdpe_to_pde(pdpe, va); 1875 if ((*pde & PG_V) != 0) { 1876 if ((*pde & PG_PS) != 0) { 1877 pa = (*pde & PG_PS_FRAME) | 1878 (va & PDRMASK); 1879 } else { 1880 pte = pmap_pde_to_pte(pde, va); 1881 pa = (*pte & PG_FRAME) | 1882 (va & PAGE_MASK); 1883 } 1884 } 1885 } 1886 } 1887 PMAP_UNLOCK(pmap); 1888 return (pa); 1889} 1890 1891/* 1892 * Routine: pmap_extract_and_hold 1893 * Function: 1894 * Atomically extract and hold the physical page 1895 * with the given pmap and virtual address pair 1896 * if that mapping permits the given protection. 
1897 */ 1898vm_page_t 1899pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1900{ 1901 pd_entry_t pde, *pdep; 1902 pt_entry_t pte, PG_RW, PG_V; 1903 vm_paddr_t pa; 1904 vm_page_t m; 1905 1906 pa = 0; 1907 m = NULL; 1908 PG_RW = pmap_rw_bit(pmap); 1909 PG_V = pmap_valid_bit(pmap); 1910 PMAP_LOCK(pmap); 1911retry: 1912 pdep = pmap_pde(pmap, va); 1913 if (pdep != NULL && (pde = *pdep)) { 1914 if (pde & PG_PS) { 1915 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1916 if (vm_page_pa_tryrelock(pmap, (pde & 1917 PG_PS_FRAME) | (va & PDRMASK), &pa)) 1918 goto retry; 1919 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 1920 (va & PDRMASK)); 1921 vm_page_hold(m); 1922 } 1923 } else { 1924 pte = *pmap_pde_to_pte(pdep, va); 1925 if ((pte & PG_V) && 1926 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1927 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 1928 &pa)) 1929 goto retry; 1930 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1931 vm_page_hold(m); 1932 } 1933 } 1934 } 1935 PA_UNLOCK_COND(pa); 1936 PMAP_UNLOCK(pmap); 1937 return (m); 1938} 1939 1940vm_paddr_t 1941pmap_kextract(vm_offset_t va) 1942{ 1943 pd_entry_t pde; 1944 vm_paddr_t pa; 1945 1946 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1947 pa = DMAP_TO_PHYS(va); 1948 } else { 1949 pde = *vtopde(va); 1950 if (pde & PG_PS) { 1951 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 1952 } else { 1953 /* 1954 * Beware of a concurrent promotion that changes the 1955 * PDE at this point! For example, vtopte() must not 1956 * be used to access the PTE because it would use the 1957 * new PDE. It is, however, safe to use the old PDE 1958 * because the page table page is preserved by the 1959 * promotion. 1960 */ 1961 pa = *pmap_pde_to_pte(&pde, va); 1962 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1963 } 1964 } 1965 return (pa); 1966} 1967 1968/*************************************************** 1969 * Low level mapping routines..... 1970 ***************************************************/ 1971 1972/* 1973 * Add a wired page to the kva. 1974 * Note: not SMP coherent. 1975 */ 1976PMAP_INLINE void 1977pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1978{ 1979 pt_entry_t *pte; 1980 1981 pte = vtopte(va); 1982 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); 1983} 1984 1985static __inline void 1986pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1987{ 1988 pt_entry_t *pte; 1989 int cache_bits; 1990 1991 pte = vtopte(va); 1992 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 1993 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); 1994} 1995 1996/* 1997 * Remove a page from the kernel pagetables. 1998 * Note: not SMP coherent. 1999 */ 2000PMAP_INLINE void 2001pmap_kremove(vm_offset_t va) 2002{ 2003 pt_entry_t *pte; 2004 2005 pte = vtopte(va); 2006 pte_clear(pte); 2007} 2008 2009/* 2010 * Used to map a range of physical addresses into kernel 2011 * virtual address space. 2012 * 2013 * The value passed in '*virt' is a suggested virtual address for 2014 * the mapping. Architectures which can support a direct-mapped 2015 * physical to virtual region can return the appropriate address 2016 * within that region, leaving '*virt' unchanged. Other 2017 * architectures should map the pages starting at '*virt' and 2018 * update '*virt' with the first usable address after the mapped 2019 * region. 
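 *
 * (Editorial note, not in the original source:) amd64 takes the
 * direct-mapped path described above; the body below simply returns
 * PHYS_TO_DMAP(start) and leaves '*virt' unchanged.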
2020 */ 2021vm_offset_t 2022pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2023{ 2024 return PHYS_TO_DMAP(start); 2025} 2026 2027 2028/* 2029 * Add a list of wired pages to the kva 2030 * this routine is only used for temporary 2031 * kernel mappings that do not need to have 2032 * page modification or references recorded. 2033 * Note that old mappings are simply written 2034 * over. The page *must* be wired. 2035 * Note: SMP coherent. Uses a ranged shootdown IPI. 2036 */ 2037void 2038pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2039{ 2040 pt_entry_t *endpte, oldpte, pa, *pte; 2041 vm_page_t m; 2042 int cache_bits; 2043 2044 oldpte = 0; 2045 pte = vtopte(sva); 2046 endpte = pte + count; 2047 while (pte < endpte) { 2048 m = *ma++; 2049 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 2050 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 2051 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 2052 oldpte |= *pte; 2053 pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); 2054 } 2055 pte++; 2056 } 2057 if (__predict_false((oldpte & X86_PG_V) != 0)) 2058 pmap_invalidate_range(kernel_pmap, sva, sva + count * 2059 PAGE_SIZE); 2060} 2061 2062/* 2063 * This routine tears out page mappings from the 2064 * kernel -- it is meant only for temporary mappings. 2065 * Note: SMP coherent. Uses a ranged shootdown IPI. 2066 */ 2067void 2068pmap_qremove(vm_offset_t sva, int count) 2069{ 2070 vm_offset_t va; 2071 2072 va = sva; 2073 while (count-- > 0) { 2074 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 2075 pmap_kremove(va); 2076 va += PAGE_SIZE; 2077 } 2078 pmap_invalidate_range(kernel_pmap, sva, va); 2079} 2080 2081/*************************************************** 2082 * Page table page management routines..... 2083 ***************************************************/ 2084static __inline void 2085pmap_free_zero_pages(struct spglist *free) 2086{ 2087 vm_page_t m; 2088 2089 while ((m = SLIST_FIRST(free)) != NULL) { 2090 SLIST_REMOVE_HEAD(free, plinks.s.ss); 2091 /* Preserve the page's PG_ZERO setting. */ 2092 vm_page_free_toq(m); 2093 } 2094} 2095 2096/* 2097 * Schedule the specified unused page table page to be freed. Specifically, 2098 * add the page to the specified list of pages that will be released to the 2099 * physical memory manager after the TLB has been updated. 2100 */ 2101static __inline void 2102pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 2103 boolean_t set_PG_ZERO) 2104{ 2105 2106 if (set_PG_ZERO) 2107 m->flags |= PG_ZERO; 2108 else 2109 m->flags &= ~PG_ZERO; 2110 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2111} 2112 2113/* 2114 * Inserts the specified page table page into the specified pmap's collection 2115 * of idle page table pages. Each of a pmap's page table pages is responsible 2116 * for mapping a distinct range of virtual addresses. The pmap's collection is 2117 * ordered by this virtual address range. 2118 */ 2119static __inline int 2120pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 2121{ 2122 2123 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2124 return (vm_radix_insert(&pmap->pm_root, mpte)); 2125} 2126 2127/* 2128 * Looks for a page table page mapping the specified virtual address in the 2129 * specified pmap's collection of idle page table pages. Returns NULL if there 2130 * is no page table page corresponding to the specified virtual address. 
2131 */ 2132static __inline vm_page_t 2133pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 2134{ 2135 2136 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2137 return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va))); 2138} 2139 2140/* 2141 * Removes the specified page table page from the specified pmap's collection 2142 * of idle page table pages. The specified page table page must be a member of 2143 * the pmap's collection. 2144 */ 2145static __inline void 2146pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 2147{ 2148 2149 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2150 vm_radix_remove(&pmap->pm_root, mpte->pindex); 2151} 2152 2153/* 2154 * Decrements a page table page's wire count, which is used to record the 2155 * number of valid page table entries within the page. If the wire count 2156 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2157 * page table page was unmapped and FALSE otherwise. 2158 */ 2159static inline boolean_t 2160pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2161{ 2162 2163 --m->wire_count; 2164 if (m->wire_count == 0) { 2165 _pmap_unwire_ptp(pmap, va, m, free); 2166 return (TRUE); 2167 } else 2168 return (FALSE); 2169} 2170 2171static void 2172_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2173{ 2174 2175 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2176 /* 2177 * unmap the page table page 2178 */ 2179 if (m->pindex >= (NUPDE + NUPDPE)) { 2180 /* PDP page */ 2181 pml4_entry_t *pml4; 2182 pml4 = pmap_pml4e(pmap, va); 2183 *pml4 = 0; 2184 } else if (m->pindex >= NUPDE) { 2185 /* PD page */ 2186 pdp_entry_t *pdp; 2187 pdp = pmap_pdpe(pmap, va); 2188 *pdp = 0; 2189 } else { 2190 /* PTE page */ 2191 pd_entry_t *pd; 2192 pd = pmap_pde(pmap, va); 2193 *pd = 0; 2194 } 2195 pmap_resident_count_dec(pmap, 1); 2196 if (m->pindex < NUPDE) { 2197 /* We just released a PT, unhold the matching PD */ 2198 vm_page_t pdpg; 2199 2200 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 2201 pmap_unwire_ptp(pmap, va, pdpg, free); 2202 } 2203 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 2204 /* We just released a PD, unhold the matching PDP */ 2205 vm_page_t pdppg; 2206 2207 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 2208 pmap_unwire_ptp(pmap, va, pdppg, free); 2209 } 2210 2211 /* 2212 * This is a release store so that the ordinary store unmapping 2213 * the page table page is globally performed before TLB shoot- 2214 * down is begun. 2215 */ 2216 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 2217 2218 /* 2219 * Put page on a list so that it is released after 2220 * *ALL* TLB shootdown is done 2221 */ 2222 pmap_add_delayed_free_list(m, free, TRUE); 2223} 2224 2225/* 2226 * After removing a page table entry, this routine is used to 2227 * conditionally free the page, and manage the hold/wire counts. 
2228 */ 2229static int 2230pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2231 struct spglist *free) 2232{ 2233 vm_page_t mpte; 2234 2235 if (va >= VM_MAXUSER_ADDRESS) 2236 return (0); 2237 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2238 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 2239 return (pmap_unwire_ptp(pmap, va, mpte, free)); 2240} 2241 2242void 2243pmap_pinit0(pmap_t pmap) 2244{ 2245 2246 PMAP_LOCK_INIT(pmap); 2247 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 2248 pmap->pm_cr3 = KPML4phys; 2249 pmap->pm_root.rt_root = 0; 2250 CPU_ZERO(&pmap->pm_active); 2251 CPU_ZERO(&pmap->pm_save); 2252 PCPU_SET(curpmap, pmap); 2253 TAILQ_INIT(&pmap->pm_pvchunk); 2254 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2255 pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1; 2256 pmap->pm_flags = pmap_flags; 2257} 2258 2259/* 2260 * Initialize a preallocated and zeroed pmap structure, 2261 * such as one in a vmspace structure. 2262 */ 2263int 2264pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 2265{ 2266 vm_page_t pml4pg; 2267 vm_paddr_t pml4phys; 2268 int i; 2269 2270 /* 2271 * allocate the page directory page 2272 */ 2273 while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2274 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 2275 VM_WAIT; 2276 2277 pml4phys = VM_PAGE_TO_PHYS(pml4pg); 2278 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); 2279 pmap->pm_pcid = -1; 2280 pmap->pm_cr3 = ~0; /* initialize to an invalid value */ 2281 2282 if ((pml4pg->flags & PG_ZERO) == 0) 2283 pagezero(pmap->pm_pml4); 2284 2285 /* 2286 * Do not install the host kernel mappings in the nested page 2287 * tables. These mappings are meaningless in the guest physical 2288 * address space. 2289 */ 2290 if ((pmap->pm_type = pm_type) == PT_X86) { 2291 pmap->pm_cr3 = pml4phys; 2292 2293 /* Wire in kernel global address entries. */ 2294 for (i = 0; i < NKPML4E; i++) { 2295 pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | 2296 X86_PG_RW | X86_PG_V | PG_U; 2297 } 2298 for (i = 0; i < ndmpdpphys; i++) { 2299 pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | 2300 X86_PG_RW | X86_PG_V | PG_U; 2301 } 2302 2303 /* install self-referential address mapping entry(s) */ 2304 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | 2305 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 2306 2307 if (pmap_pcid_enabled) { 2308 pmap->pm_pcid = alloc_unr(&pcid_unr); 2309 if (pmap->pm_pcid != -1) 2310 pmap->pm_cr3 |= pmap->pm_pcid; 2311 } 2312 } 2313 2314 pmap->pm_root.rt_root = 0; 2315 CPU_ZERO(&pmap->pm_active); 2316 TAILQ_INIT(&pmap->pm_pvchunk); 2317 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2318 pmap->pm_flags = flags; 2319 pmap->pm_eptgen = 0; 2320 CPU_ZERO(&pmap->pm_save); 2321 2322 return (1); 2323} 2324 2325int 2326pmap_pinit(pmap_t pmap) 2327{ 2328 2329 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 2330} 2331 2332/* 2333 * This routine is called if the desired page table page does not exist. 2334 * 2335 * If page table page allocation fails, this routine may sleep before 2336 * returning NULL. It sleeps only if a lock pointer was given. 2337 * 2338 * Note: If a page allocation fails at page table level two or three, 2339 * one or two pages may be held during the wait, only to be released 2340 * afterwards. This conservative approach is easily argued to avoid 2341 * race conditions. 
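 *
 * (Editorial note, not in the original source:) "ptepindex" encodes the
 * level of the page table page being requested: an index below NUPDE names
 * a page table (PTE) page, an index in [NUPDE, NUPDE + NUPDPE) names a page
 * directory (PD) page, and an index of NUPDE + NUPDPE or greater names a
 * PDP page.  The same encoding is used by _pmap_unwire_ptp() above when the
 * hierarchy is torn back down.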
2342 */ 2343static vm_page_t 2344_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2345{ 2346 vm_page_t m, pdppg, pdpg; 2347 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 2348 2349 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2350 2351 PG_A = pmap_accessed_bit(pmap); 2352 PG_M = pmap_modified_bit(pmap); 2353 PG_V = pmap_valid_bit(pmap); 2354 PG_RW = pmap_rw_bit(pmap); 2355 2356 /* 2357 * Allocate a page table page. 2358 */ 2359 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 2360 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2361 if (lockp != NULL) { 2362 RELEASE_PV_LIST_LOCK(lockp); 2363 PMAP_UNLOCK(pmap); 2364 rw_runlock(&pvh_global_lock); 2365 VM_WAIT; 2366 rw_rlock(&pvh_global_lock); 2367 PMAP_LOCK(pmap); 2368 } 2369 2370 /* 2371 * Indicate the need to retry. While waiting, the page table 2372 * page may have been allocated. 2373 */ 2374 return (NULL); 2375 } 2376 if ((m->flags & PG_ZERO) == 0) 2377 pmap_zero_page(m); 2378 2379 /* 2380 * Map the pagetable page into the process address space, if 2381 * it isn't already there. 2382 */ 2383 2384 if (ptepindex >= (NUPDE + NUPDPE)) { 2385 pml4_entry_t *pml4; 2386 vm_pindex_t pml4index; 2387 2388 /* Wire up a new PDPE page */ 2389 pml4index = ptepindex - (NUPDE + NUPDPE); 2390 pml4 = &pmap->pm_pml4[pml4index]; 2391 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2392 2393 } else if (ptepindex >= NUPDE) { 2394 vm_pindex_t pml4index; 2395 vm_pindex_t pdpindex; 2396 pml4_entry_t *pml4; 2397 pdp_entry_t *pdp; 2398 2399 /* Wire up a new PDE page */ 2400 pdpindex = ptepindex - NUPDE; 2401 pml4index = pdpindex >> NPML4EPGSHIFT; 2402 2403 pml4 = &pmap->pm_pml4[pml4index]; 2404 if ((*pml4 & PG_V) == 0) { 2405 /* Have to allocate a new pdp, recurse */ 2406 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 2407 lockp) == NULL) { 2408 --m->wire_count; 2409 atomic_subtract_int(&cnt.v_wire_count, 1); 2410 vm_page_free_zero(m); 2411 return (NULL); 2412 } 2413 } else { 2414 /* Add reference to pdp page */ 2415 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 2416 pdppg->wire_count++; 2417 } 2418 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2419 2420 /* Now find the pdp page */ 2421 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2422 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2423 2424 } else { 2425 vm_pindex_t pml4index; 2426 vm_pindex_t pdpindex; 2427 pml4_entry_t *pml4; 2428 pdp_entry_t *pdp; 2429 pd_entry_t *pd; 2430 2431 /* Wire up a new PTE page */ 2432 pdpindex = ptepindex >> NPDPEPGSHIFT; 2433 pml4index = pdpindex >> NPML4EPGSHIFT; 2434 2435 /* First, find the pdp and check that its valid. 
*/ 2436 pml4 = &pmap->pm_pml4[pml4index]; 2437 if ((*pml4 & PG_V) == 0) { 2438 /* Have to allocate a new pd, recurse */ 2439 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2440 lockp) == NULL) { 2441 --m->wire_count; 2442 atomic_subtract_int(&cnt.v_wire_count, 1); 2443 vm_page_free_zero(m); 2444 return (NULL); 2445 } 2446 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2447 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2448 } else { 2449 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2450 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2451 if ((*pdp & PG_V) == 0) { 2452 /* Have to allocate a new pd, recurse */ 2453 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2454 lockp) == NULL) { 2455 --m->wire_count; 2456 atomic_subtract_int(&cnt.v_wire_count, 2457 1); 2458 vm_page_free_zero(m); 2459 return (NULL); 2460 } 2461 } else { 2462 /* Add reference to the pd page */ 2463 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2464 pdpg->wire_count++; 2465 } 2466 } 2467 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 2468 2469 /* Now we know where the page directory page is */ 2470 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 2471 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2472 } 2473 2474 pmap_resident_count_inc(pmap, 1); 2475 2476 return (m); 2477} 2478 2479static vm_page_t 2480pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2481{ 2482 vm_pindex_t pdpindex, ptepindex; 2483 pdp_entry_t *pdpe, PG_V; 2484 vm_page_t pdpg; 2485 2486 PG_V = pmap_valid_bit(pmap); 2487 2488retry: 2489 pdpe = pmap_pdpe(pmap, va); 2490 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2491 /* Add a reference to the pd page. */ 2492 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 2493 pdpg->wire_count++; 2494 } else { 2495 /* Allocate a pd page. */ 2496 ptepindex = pmap_pde_pindex(va); 2497 pdpindex = ptepindex >> NPDPEPGSHIFT; 2498 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 2499 if (pdpg == NULL && lockp != NULL) 2500 goto retry; 2501 } 2502 return (pdpg); 2503} 2504 2505static vm_page_t 2506pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2507{ 2508 vm_pindex_t ptepindex; 2509 pd_entry_t *pd, PG_V; 2510 vm_page_t m; 2511 2512 PG_V = pmap_valid_bit(pmap); 2513 2514 /* 2515 * Calculate pagetable page index 2516 */ 2517 ptepindex = pmap_pde_pindex(va); 2518retry: 2519 /* 2520 * Get the page directory entry 2521 */ 2522 pd = pmap_pde(pmap, va); 2523 2524 /* 2525 * This supports switching from a 2MB page to a 2526 * normal 4K page. 2527 */ 2528 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 2529 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 2530 /* 2531 * Invalidation of the 2MB page mapping may have caused 2532 * the deallocation of the underlying PD page. 2533 */ 2534 pd = NULL; 2535 } 2536 } 2537 2538 /* 2539 * If the page table page is mapped, we just increment the 2540 * hold count, and activate it. 2541 */ 2542 if (pd != NULL && (*pd & PG_V) != 0) { 2543 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2544 m->wire_count++; 2545 } else { 2546 /* 2547 * Here if the pte page isn't mapped, or if it has been 2548 * deallocated. 2549 */ 2550 m = _pmap_allocpte(pmap, ptepindex, lockp); 2551 if (m == NULL && lockp != NULL) 2552 goto retry; 2553 } 2554 return (m); 2555} 2556 2557 2558/*************************************************** 2559 * Pmap allocation/deallocation routines. 2560 ***************************************************/ 2561 2562/* 2563 * Release any resources held by the given physical map. 
2564 * Called when a pmap initialized by pmap_pinit is being released. 2565 * Should only be called if the map contains no valid mappings. 2566 */ 2567void 2568pmap_release(pmap_t pmap) 2569{ 2570 vm_page_t m; 2571 int i; 2572 2573 KASSERT(pmap->pm_stats.resident_count == 0, 2574 ("pmap_release: pmap resident count %ld != 0", 2575 pmap->pm_stats.resident_count)); 2576 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2577 ("pmap_release: pmap has reserved page table page(s)")); 2578 2579 if (pmap_pcid_enabled) { 2580 /* 2581 * Invalidate any left TLB entries, to allow the reuse 2582 * of the pcid. 2583 */ 2584 pmap_invalidate_all(pmap); 2585 } 2586 2587 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); 2588 2589 for (i = 0; i < NKPML4E; i++) /* KVA */ 2590 pmap->pm_pml4[KPML4BASE + i] = 0; 2591 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 2592 pmap->pm_pml4[DMPML4I + i] = 0; 2593 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 2594 2595 m->wire_count--; 2596 atomic_subtract_int(&cnt.v_wire_count, 1); 2597 vm_page_free_zero(m); 2598 if (pmap->pm_pcid != -1) 2599 free_unr(&pcid_unr, pmap->pm_pcid); 2600} 2601 2602static int 2603kvm_size(SYSCTL_HANDLER_ARGS) 2604{ 2605 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 2606 2607 return sysctl_handle_long(oidp, &ksize, 0, req); 2608} 2609SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2610 0, 0, kvm_size, "LU", "Size of KVM"); 2611 2612static int 2613kvm_free(SYSCTL_HANDLER_ARGS) 2614{ 2615 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2616 2617 return sysctl_handle_long(oidp, &kfree, 0, req); 2618} 2619SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2620 0, 0, kvm_free, "LU", "Amount of KVM free"); 2621 2622/* 2623 * grow the number of kernel page table entries, if needed 2624 */ 2625void 2626pmap_growkernel(vm_offset_t addr) 2627{ 2628 vm_paddr_t paddr; 2629 vm_page_t nkpg; 2630 pd_entry_t *pde, newpdir; 2631 pdp_entry_t *pdpe; 2632 2633 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2634 2635 /* 2636 * Return if "addr" is within the range of kernel page table pages 2637 * that were preallocated during pmap bootstrap. Moreover, leave 2638 * "kernel_vm_end" and the kernel page table as they were. 2639 * 2640 * The correctness of this action is based on the following 2641 * argument: vm_map_insert() allocates contiguous ranges of the 2642 * kernel virtual address space. It calls this function if a range 2643 * ends after "kernel_vm_end". If the kernel is mapped between 2644 * "kernel_vm_end" and "addr", then the range cannot begin at 2645 * "kernel_vm_end". In fact, its beginning address cannot be less 2646 * than the kernel. Thus, there is no immediate need to allocate 2647 * any new kernel page table pages between "kernel_vm_end" and 2648 * "KERNBASE". 
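 *
 * (Editorial note, not in the original source:) Beyond that preallocated
 * range, the kernel page table is grown in 2MB (NBPDR) steps: "addr" is
 * rounded up to the next 2MB boundary, and the loop below installs a new
 * page table page for every missing PDE between "kernel_vm_end" and that
 * boundary, allocating a PDP page first whenever one is absent.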
2649 */ 2650 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 2651 return; 2652 2653 addr = roundup2(addr, NBPDR); 2654 if (addr - 1 >= kernel_map->max_offset) 2655 addr = kernel_map->max_offset; 2656 while (kernel_vm_end < addr) { 2657 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 2658 if ((*pdpe & X86_PG_V) == 0) { 2659 /* We need a new PDP entry */ 2660 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 2661 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2662 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2663 if (nkpg == NULL) 2664 panic("pmap_growkernel: no memory to grow kernel"); 2665 if ((nkpg->flags & PG_ZERO) == 0) 2666 pmap_zero_page(nkpg); 2667 paddr = VM_PAGE_TO_PHYS(nkpg); 2668 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 2669 X86_PG_A | X86_PG_M); 2670 continue; /* try again */ 2671 } 2672 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 2673 if ((*pde & X86_PG_V) != 0) { 2674 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2675 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2676 kernel_vm_end = kernel_map->max_offset; 2677 break; 2678 } 2679 continue; 2680 } 2681 2682 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 2683 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2684 VM_ALLOC_ZERO); 2685 if (nkpg == NULL) 2686 panic("pmap_growkernel: no memory to grow kernel"); 2687 if ((nkpg->flags & PG_ZERO) == 0) 2688 pmap_zero_page(nkpg); 2689 paddr = VM_PAGE_TO_PHYS(nkpg); 2690 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 2691 pde_store(pde, newpdir); 2692 2693 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2694 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2695 kernel_vm_end = kernel_map->max_offset; 2696 break; 2697 } 2698 } 2699} 2700 2701 2702/*************************************************** 2703 * page management routines. 
2704 ***************************************************/ 2705 2706CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2707CTASSERT(_NPCM == 3); 2708CTASSERT(_NPCPV == 168); 2709 2710static __inline struct pv_chunk * 2711pv_to_chunk(pv_entry_t pv) 2712{ 2713 2714 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2715} 2716 2717#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2718 2719#define PC_FREE0 0xfffffffffffffffful 2720#define PC_FREE1 0xfffffffffffffffful 2721#define PC_FREE2 0x000000fffffffffful 2722 2723static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 2724 2725#ifdef PV_STATS 2726static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2727 2728SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2729 "Current number of pv entry chunks"); 2730SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2731 "Current number of pv entry chunks allocated"); 2732SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2733 "Current number of pv entry chunks frees"); 2734SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2735 "Number of times tried to get a chunk page but failed."); 2736 2737static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2738static int pv_entry_spare; 2739 2740SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2741 "Current number of pv entry frees"); 2742SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2743 "Current number of pv entry allocs"); 2744SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2745 "Current number of pv entries"); 2746SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2747 "Current number of spare pv entries"); 2748#endif 2749 2750/* 2751 * We are in a serious low memory condition. Resort to 2752 * drastic measures to free some pages so we can allocate 2753 * another pv entry chunk. 2754 * 2755 * Returns NULL if PV entries were reclaimed from the specified pmap. 2756 * 2757 * We do not, however, unmap 2mpages because subsequent accesses will 2758 * allocate per-page pv entries until repromotion occurs, thereby 2759 * exacerbating the shortage of free pv entries. 2760 */ 2761static vm_page_t 2762reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2763{ 2764 struct pch new_tail; 2765 struct pv_chunk *pc; 2766 struct md_page *pvh; 2767 pd_entry_t *pde; 2768 pmap_t pmap; 2769 pt_entry_t *pte, tpte; 2770 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 2771 pv_entry_t pv; 2772 vm_offset_t va; 2773 vm_page_t m, m_pc; 2774 struct spglist free; 2775 uint64_t inuse; 2776 int bit, field, freed; 2777 2778 rw_assert(&pvh_global_lock, RA_LOCKED); 2779 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2780 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2781 pmap = NULL; 2782 m_pc = NULL; 2783 PG_G = PG_A = PG_M = PG_RW = 0; 2784 SLIST_INIT(&free); 2785 TAILQ_INIT(&new_tail); 2786 mtx_lock(&pv_chunks_mutex); 2787 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) { 2788 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2789 mtx_unlock(&pv_chunks_mutex); 2790 if (pmap != pc->pc_pmap) { 2791 if (pmap != NULL) { 2792 pmap_invalidate_all(pmap); 2793 if (pmap != locked_pmap) 2794 PMAP_UNLOCK(pmap); 2795 } 2796 pmap = pc->pc_pmap; 2797 /* Avoid deadlock and lock recursion. 
*/ 2798 if (pmap > locked_pmap) { 2799 RELEASE_PV_LIST_LOCK(lockp); 2800 PMAP_LOCK(pmap); 2801 } else if (pmap != locked_pmap && 2802 !PMAP_TRYLOCK(pmap)) { 2803 pmap = NULL; 2804 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2805 mtx_lock(&pv_chunks_mutex); 2806 continue; 2807 } 2808 PG_G = pmap_global_bit(pmap); 2809 PG_A = pmap_accessed_bit(pmap); 2810 PG_M = pmap_modified_bit(pmap); 2811 PG_RW = pmap_rw_bit(pmap); 2812 } 2813 2814 /* 2815 * Destroy every non-wired, 4 KB page mapping in the chunk. 2816 */ 2817 freed = 0; 2818 for (field = 0; field < _NPCM; field++) { 2819 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2820 inuse != 0; inuse &= ~(1UL << bit)) { 2821 bit = bsfq(inuse); 2822 pv = &pc->pc_pventry[field * 64 + bit]; 2823 va = pv->pv_va; 2824 pde = pmap_pde(pmap, va); 2825 if ((*pde & PG_PS) != 0) 2826 continue; 2827 pte = pmap_pde_to_pte(pde, va); 2828 if ((*pte & PG_W) != 0) 2829 continue; 2830 tpte = pte_load_clear(pte); 2831 if ((tpte & PG_G) != 0) 2832 pmap_invalidate_page(pmap, va); 2833 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2834 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2835 vm_page_dirty(m); 2836 if ((tpte & PG_A) != 0) 2837 vm_page_aflag_set(m, PGA_REFERENCED); 2838 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2839 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2840 m->md.pv_gen++; 2841 if (TAILQ_EMPTY(&m->md.pv_list) && 2842 (m->flags & PG_FICTITIOUS) == 0) { 2843 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2844 if (TAILQ_EMPTY(&pvh->pv_list)) { 2845 vm_page_aflag_clear(m, 2846 PGA_WRITEABLE); 2847 } 2848 } 2849 pc->pc_map[field] |= 1UL << bit; 2850 pmap_unuse_pt(pmap, va, *pde, &free); 2851 freed++; 2852 } 2853 } 2854 if (freed == 0) { 2855 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2856 mtx_lock(&pv_chunks_mutex); 2857 continue; 2858 } 2859 /* Every freed mapping is for a 4 KB page. */ 2860 pmap_resident_count_dec(pmap, freed); 2861 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2862 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2863 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2864 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2865 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2866 pc->pc_map[2] == PC_FREE2) { 2867 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2868 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2869 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2870 /* Entire chunk is free; return it. */ 2871 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2872 dump_drop_page(m_pc->phys_addr); 2873 mtx_lock(&pv_chunks_mutex); 2874 break; 2875 } 2876 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2877 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2878 mtx_lock(&pv_chunks_mutex); 2879 /* One freed pv entry in locked_pmap is sufficient. */ 2880 if (pmap == locked_pmap) 2881 break; 2882 } 2883 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2884 mtx_unlock(&pv_chunks_mutex); 2885 if (pmap != NULL) { 2886 pmap_invalidate_all(pmap); 2887 if (pmap != locked_pmap) 2888 PMAP_UNLOCK(pmap); 2889 } 2890 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2891 m_pc = SLIST_FIRST(&free); 2892 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2893 /* Recycle a freed page table page. 
*/ 2894 m_pc->wire_count = 1; 2895 atomic_add_int(&cnt.v_wire_count, 1); 2896 } 2897 pmap_free_zero_pages(&free); 2898 return (m_pc); 2899} 2900 2901/* 2902 * free the pv_entry back to the free list 2903 */ 2904static void 2905free_pv_entry(pmap_t pmap, pv_entry_t pv) 2906{ 2907 struct pv_chunk *pc; 2908 int idx, field, bit; 2909 2910 rw_assert(&pvh_global_lock, RA_LOCKED); 2911 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2912 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2913 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2914 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2915 pc = pv_to_chunk(pv); 2916 idx = pv - &pc->pc_pventry[0]; 2917 field = idx / 64; 2918 bit = idx % 64; 2919 pc->pc_map[field] |= 1ul << bit; 2920 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 2921 pc->pc_map[2] != PC_FREE2) { 2922 /* 98% of the time, pc is already at the head of the list. */ 2923 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2924 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2925 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2926 } 2927 return; 2928 } 2929 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2930 free_pv_chunk(pc); 2931} 2932 2933static void 2934free_pv_chunk(struct pv_chunk *pc) 2935{ 2936 vm_page_t m; 2937 2938 mtx_lock(&pv_chunks_mutex); 2939 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2940 mtx_unlock(&pv_chunks_mutex); 2941 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2942 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2943 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2944 /* entire chunk is free, return it */ 2945 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2946 dump_drop_page(m->phys_addr); 2947 vm_page_unwire(m, 0); 2948 vm_page_free(m); 2949} 2950 2951/* 2952 * Returns a new PV entry, allocating a new PV chunk from the system when 2953 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2954 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2955 * returned. 2956 * 2957 * The given PV list lock may be released. 
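 *
 * (Editorial note, not in the original source:) A typical caller stores the
 * virtual address in pv->pv_va and links the entry onto the appropriate pv
 * list, as pmap_try_insert_pv_entry() and pmap_pv_insert_pde() below
 * illustrate; passing a NULL "lockp" makes an allocation failure return
 * NULL instead of resorting to reclamation.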
2958 */ 2959static pv_entry_t 2960get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2961{ 2962 int bit, field; 2963 pv_entry_t pv; 2964 struct pv_chunk *pc; 2965 vm_page_t m; 2966 2967 rw_assert(&pvh_global_lock, RA_LOCKED); 2968 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2969 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2970retry: 2971 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2972 if (pc != NULL) { 2973 for (field = 0; field < _NPCM; field++) { 2974 if (pc->pc_map[field]) { 2975 bit = bsfq(pc->pc_map[field]); 2976 break; 2977 } 2978 } 2979 if (field < _NPCM) { 2980 pv = &pc->pc_pventry[field * 64 + bit]; 2981 pc->pc_map[field] &= ~(1ul << bit); 2982 /* If this was the last item, move it to tail */ 2983 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 2984 pc->pc_map[2] == 0) { 2985 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2986 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2987 pc_list); 2988 } 2989 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2990 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2991 return (pv); 2992 } 2993 } 2994 /* No free items, allocate another chunk */ 2995 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2996 VM_ALLOC_WIRED); 2997 if (m == NULL) { 2998 if (lockp == NULL) { 2999 PV_STAT(pc_chunk_tryfail++); 3000 return (NULL); 3001 } 3002 m = reclaim_pv_chunk(pmap, lockp); 3003 if (m == NULL) 3004 goto retry; 3005 } 3006 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3007 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3008 dump_add_page(m->phys_addr); 3009 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3010 pc->pc_pmap = pmap; 3011 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 3012 pc->pc_map[1] = PC_FREE1; 3013 pc->pc_map[2] = PC_FREE2; 3014 mtx_lock(&pv_chunks_mutex); 3015 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3016 mtx_unlock(&pv_chunks_mutex); 3017 pv = &pc->pc_pventry[0]; 3018 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3019 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3020 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3021 return (pv); 3022} 3023 3024/* 3025 * Returns the number of one bits within the given PV chunk map element. 3026 */ 3027static int 3028popcnt_pc_map_elem(uint64_t elem) 3029{ 3030 int count; 3031 3032 /* 3033 * This simple method of counting the one bits performs well because 3034 * the given element typically contains more zero bits than one bits. 3035 */ 3036 count = 0; 3037 for (; elem != 0; elem &= elem - 1) 3038 count++; 3039 return (count); 3040} 3041 3042/* 3043 * Ensure that the number of spare PV entries in the specified pmap meets or 3044 * exceeds the given count, "needed". 3045 * 3046 * The given PV list lock may be released. 3047 */ 3048static void 3049reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3050{ 3051 struct pch new_tail; 3052 struct pv_chunk *pc; 3053 int avail, free; 3054 vm_page_t m; 3055 3056 rw_assert(&pvh_global_lock, RA_LOCKED); 3057 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3058 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3059 3060 /* 3061 * Newly allocated PV chunks must be stored in a private list until 3062 * the required number of PV chunks have been allocated. Otherwise, 3063 * reclaim_pv_chunk() could recycle one of these chunks. In 3064 * contrast, these chunks must be added to the pmap upon allocation. 
3065 */ 3066 TAILQ_INIT(&new_tail); 3067retry: 3068 avail = 0; 3069 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3070 if ((cpu_feature2 & CPUID2_POPCNT) == 0) { 3071 free = popcnt_pc_map_elem(pc->pc_map[0]); 3072 free += popcnt_pc_map_elem(pc->pc_map[1]); 3073 free += popcnt_pc_map_elem(pc->pc_map[2]); 3074 } else { 3075 free = popcntq(pc->pc_map[0]); 3076 free += popcntq(pc->pc_map[1]); 3077 free += popcntq(pc->pc_map[2]); 3078 } 3079 if (free == 0) 3080 break; 3081 avail += free; 3082 if (avail >= needed) 3083 break; 3084 } 3085 for (; avail < needed; avail += _NPCPV) { 3086 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3087 VM_ALLOC_WIRED); 3088 if (m == NULL) { 3089 m = reclaim_pv_chunk(pmap, lockp); 3090 if (m == NULL) 3091 goto retry; 3092 } 3093 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3094 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3095 dump_add_page(m->phys_addr); 3096 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3097 pc->pc_pmap = pmap; 3098 pc->pc_map[0] = PC_FREE0; 3099 pc->pc_map[1] = PC_FREE1; 3100 pc->pc_map[2] = PC_FREE2; 3101 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3102 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 3103 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3104 } 3105 if (!TAILQ_EMPTY(&new_tail)) { 3106 mtx_lock(&pv_chunks_mutex); 3107 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 3108 mtx_unlock(&pv_chunks_mutex); 3109 } 3110} 3111 3112/* 3113 * First find and then remove the pv entry for the specified pmap and virtual 3114 * address from the specified pv list. Returns the pv entry if found and NULL 3115 * otherwise. This operation can be performed on pv lists for either 4KB or 3116 * 2MB page mappings. 3117 */ 3118static __inline pv_entry_t 3119pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3120{ 3121 pv_entry_t pv; 3122 3123 rw_assert(&pvh_global_lock, RA_LOCKED); 3124 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3125 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3126 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3127 pvh->pv_gen++; 3128 break; 3129 } 3130 } 3131 return (pv); 3132} 3133 3134/* 3135 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3136 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3137 * entries for each of the 4KB page mappings. 3138 */ 3139static void 3140pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3141 struct rwlock **lockp) 3142{ 3143 struct md_page *pvh; 3144 struct pv_chunk *pc; 3145 pv_entry_t pv; 3146 vm_offset_t va_last; 3147 vm_page_t m; 3148 int bit, field; 3149 3150 rw_assert(&pvh_global_lock, RA_LOCKED); 3151 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3152 KASSERT((pa & PDRMASK) == 0, 3153 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 3154 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3155 3156 /* 3157 * Transfer the 2mpage's pv entry for this mapping to the first 3158 * page's pv list. Once this transfer begins, the pv list lock 3159 * must not be released until the last pv entry is reinstantiated. 3160 */ 3161 pvh = pa_to_pvh(pa); 3162 va = trunc_2mpage(va); 3163 pv = pmap_pvh_remove(pvh, pmap, va); 3164 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 3165 m = PHYS_TO_VM_PAGE(pa); 3166 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3167 m->md.pv_gen++; 3168 /* Instantiate the remaining NPTEPG - 1 pv entries. 
*/ 3169 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 3170 va_last = va + NBPDR - PAGE_SIZE; 3171 for (;;) { 3172 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3173 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 3174 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 3175 for (field = 0; field < _NPCM; field++) { 3176 while (pc->pc_map[field]) { 3177 bit = bsfq(pc->pc_map[field]); 3178 pc->pc_map[field] &= ~(1ul << bit); 3179 pv = &pc->pc_pventry[field * 64 + bit]; 3180 va += PAGE_SIZE; 3181 pv->pv_va = va; 3182 m++; 3183 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3184 ("pmap_pv_demote_pde: page %p is not managed", m)); 3185 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3186 m->md.pv_gen++; 3187 if (va == va_last) 3188 goto out; 3189 } 3190 } 3191 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3192 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3193 } 3194out: 3195 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 3196 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3197 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3198 } 3199 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 3200 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 3201} 3202 3203/* 3204 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3205 * replace the many pv entries for the 4KB page mappings by a single pv entry 3206 * for the 2MB page mapping. 3207 */ 3208static void 3209pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3210 struct rwlock **lockp) 3211{ 3212 struct md_page *pvh; 3213 pv_entry_t pv; 3214 vm_offset_t va_last; 3215 vm_page_t m; 3216 3217 rw_assert(&pvh_global_lock, RA_LOCKED); 3218 KASSERT((pa & PDRMASK) == 0, 3219 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 3220 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3221 3222 /* 3223 * Transfer the first page's pv entry for this mapping to the 2mpage's 3224 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3225 * a transfer avoids the possibility that get_pv_entry() calls 3226 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3227 * mappings that is being promoted. 3228 */ 3229 m = PHYS_TO_VM_PAGE(pa); 3230 va = trunc_2mpage(va); 3231 pv = pmap_pvh_remove(&m->md, pmap, va); 3232 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 3233 pvh = pa_to_pvh(pa); 3234 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3235 pvh->pv_gen++; 3236 /* Free the remaining NPTEPG - 1 pv entries. */ 3237 va_last = va + NBPDR - PAGE_SIZE; 3238 do { 3239 m++; 3240 va += PAGE_SIZE; 3241 pmap_pvh_free(&m->md, pmap, va); 3242 } while (va < va_last); 3243} 3244 3245/* 3246 * First find and then destroy the pv entry for the specified pmap and virtual 3247 * address. This operation can be performed on pv lists for either 4KB or 2MB 3248 * page mappings. 3249 */ 3250static void 3251pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3252{ 3253 pv_entry_t pv; 3254 3255 pv = pmap_pvh_remove(pvh, pmap, va); 3256 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3257 free_pv_entry(pmap, pv); 3258} 3259 3260/* 3261 * Conditionally create the PV entry for a 4KB page mapping if the required 3262 * memory can be allocated without resorting to reclamation. 
3263 */ 3264static boolean_t 3265pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3266 struct rwlock **lockp) 3267{ 3268 pv_entry_t pv; 3269 3270 rw_assert(&pvh_global_lock, RA_LOCKED); 3271 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3272 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3273 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3274 pv->pv_va = va; 3275 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3276 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3277 m->md.pv_gen++; 3278 return (TRUE); 3279 } else 3280 return (FALSE); 3281} 3282 3283/* 3284 * Conditionally create the PV entry for a 2MB page mapping if the required 3285 * memory can be allocated without resorting to reclamation. 3286 */ 3287static boolean_t 3288pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3289 struct rwlock **lockp) 3290{ 3291 struct md_page *pvh; 3292 pv_entry_t pv; 3293 3294 rw_assert(&pvh_global_lock, RA_LOCKED); 3295 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3296 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3297 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3298 pv->pv_va = va; 3299 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3300 pvh = pa_to_pvh(pa); 3301 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3302 pvh->pv_gen++; 3303 return (TRUE); 3304 } else 3305 return (FALSE); 3306} 3307 3308/* 3309 * Fills a page table page with mappings to consecutive physical pages. 3310 */ 3311static void 3312pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 3313{ 3314 pt_entry_t *pte; 3315 3316 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 3317 *pte = newpte; 3318 newpte += PAGE_SIZE; 3319 } 3320} 3321 3322/* 3323 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 3324 * mapping is invalidated. 3325 */ 3326static boolean_t 3327pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3328{ 3329 struct rwlock *lock; 3330 boolean_t rv; 3331 3332 lock = NULL; 3333 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 3334 if (lock != NULL) 3335 rw_wunlock(lock); 3336 return (rv); 3337} 3338 3339static boolean_t 3340pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 3341 struct rwlock **lockp) 3342{ 3343 pd_entry_t newpde, oldpde; 3344 pt_entry_t *firstpte, newpte; 3345 pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; 3346 vm_paddr_t mptepa; 3347 vm_page_t mpte; 3348 struct spglist free; 3349 int PG_PTE_CACHE; 3350 3351 PG_G = pmap_global_bit(pmap); 3352 PG_A = pmap_accessed_bit(pmap); 3353 PG_M = pmap_modified_bit(pmap); 3354 PG_RW = pmap_rw_bit(pmap); 3355 PG_V = pmap_valid_bit(pmap); 3356 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 3357 3358 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3359 oldpde = *pde; 3360 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 3361 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 3362 if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != 3363 NULL) 3364 pmap_remove_pt_page(pmap, mpte); 3365 else { 3366 KASSERT((oldpde & PG_W) == 0, 3367 ("pmap_demote_pde: page table page for a wired mapping" 3368 " is missing")); 3369 3370 /* 3371 * Invalidate the 2MB page mapping and return "failure" if the 3372 * mapping was never accessed or the allocation of the new 3373 * page table page fails. If the 2MB page mapping belongs to 3374 * the direct map region of the kernel's address space, then 3375 * the page allocation request specifies the highest possible 3376 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 3377 * normal. 
Page table pages are preallocated for every other 3378 * part of the kernel address space, so the direct map region 3379 * is the only part of the kernel address space that must be 3380 * handled here. 3381 */ 3382 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 3383 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 3384 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 3385 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3386 SLIST_INIT(&free); 3387 pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free, 3388 lockp); 3389 pmap_invalidate_page(pmap, trunc_2mpage(va)); 3390 pmap_free_zero_pages(&free); 3391 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 3392 " in pmap %p", va, pmap); 3393 return (FALSE); 3394 } 3395 if (va < VM_MAXUSER_ADDRESS) 3396 pmap_resident_count_inc(pmap, 1); 3397 } 3398 mptepa = VM_PAGE_TO_PHYS(mpte); 3399 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 3400 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 3401 KASSERT((oldpde & PG_A) != 0, 3402 ("pmap_demote_pde: oldpde is missing PG_A")); 3403 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 3404 ("pmap_demote_pde: oldpde is missing PG_M")); 3405 newpte = oldpde & ~PG_PS; 3406 newpte = pmap_swap_pat(pmap, newpte); 3407 3408 /* 3409 * If the page table page is new, initialize it. 3410 */ 3411 if (mpte->wire_count == 1) { 3412 mpte->wire_count = NPTEPG; 3413 pmap_fill_ptp(firstpte, newpte); 3414 } 3415 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 3416 ("pmap_demote_pde: firstpte and newpte map different physical" 3417 " addresses")); 3418 3419 /* 3420 * If the mapping has changed attributes, update the page table 3421 * entries. 3422 */ 3423 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 3424 pmap_fill_ptp(firstpte, newpte); 3425 3426 /* 3427 * The spare PV entries must be reserved prior to demoting the 3428 * mapping, that is, prior to changing the PDE. Otherwise, the state 3429 * of the PDE and the PV lists will be inconsistent, which can result 3430 * in reclaim_pv_chunk() attempting to remove a PV entry from the 3431 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 3432 * PV entry for the 2MB page mapping that is being demoted. 3433 */ 3434 if ((oldpde & PG_MANAGED) != 0) 3435 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 3436 3437 /* 3438 * Demote the mapping. This pmap is locked. The old PDE has 3439 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 3440 * set. Thus, there is no danger of a race with another 3441 * processor changing the setting of PG_A and/or PG_M between 3442 * the read above and the store below. 3443 */ 3444 if (workaround_erratum383) 3445 pmap_update_pde(pmap, va, pde, newpde); 3446 else 3447 pde_store(pde, newpde); 3448 3449 /* 3450 * Invalidate a stale recursive mapping of the page table page. 3451 */ 3452 if (va >= VM_MAXUSER_ADDRESS) 3453 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3454 3455 /* 3456 * Demote the PV entry. 3457 */ 3458 if ((oldpde & PG_MANAGED) != 0) 3459 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 3460 3461 atomic_add_long(&pmap_pde_demotions, 1); 3462 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 3463 " in pmap %p", va, pmap); 3464 return (TRUE); 3465} 3466 3467/* 3468 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
3469 */ 3470static void 3471pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3472{ 3473 pd_entry_t newpde; 3474 vm_paddr_t mptepa; 3475 vm_page_t mpte; 3476 3477 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3478 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3479 mpte = pmap_lookup_pt_page(pmap, va); 3480 if (mpte == NULL) 3481 panic("pmap_remove_kernel_pde: Missing pt page."); 3482 3483 pmap_remove_pt_page(pmap, mpte); 3484 mptepa = VM_PAGE_TO_PHYS(mpte); 3485 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 3486 3487 /* 3488 * Initialize the page table page. 3489 */ 3490 pagezero((void *)PHYS_TO_DMAP(mptepa)); 3491 3492 /* 3493 * Demote the mapping. 3494 */ 3495 if (workaround_erratum383) 3496 pmap_update_pde(pmap, va, pde, newpde); 3497 else 3498 pde_store(pde, newpde); 3499 3500 /* 3501 * Invalidate a stale recursive mapping of the page table page. 3502 */ 3503 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3504} 3505 3506/* 3507 * pmap_remove_pde: do the things to unmap a superpage in a process 3508 */ 3509static int 3510pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 3511 struct spglist *free, struct rwlock **lockp) 3512{ 3513 struct md_page *pvh; 3514 pd_entry_t oldpde; 3515 vm_offset_t eva, va; 3516 vm_page_t m, mpte; 3517 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 3518 3519 PG_G = pmap_global_bit(pmap); 3520 PG_A = pmap_accessed_bit(pmap); 3521 PG_M = pmap_modified_bit(pmap); 3522 PG_RW = pmap_rw_bit(pmap); 3523 3524 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3525 KASSERT((sva & PDRMASK) == 0, 3526 ("pmap_remove_pde: sva is not 2mpage aligned")); 3527 oldpde = pte_load_clear(pdq); 3528 if (oldpde & PG_W) 3529 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 3530 3531 /* 3532 * Machines that don't support invlpg, also don't support 3533 * PG_G. 
3534 */ 3535 if (oldpde & PG_G) 3536 pmap_invalidate_page(kernel_pmap, sva); 3537 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 3538 if (oldpde & PG_MANAGED) { 3539 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 3540 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 3541 pmap_pvh_free(pvh, pmap, sva); 3542 eva = sva + NBPDR; 3543 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3544 va < eva; va += PAGE_SIZE, m++) { 3545 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3546 vm_page_dirty(m); 3547 if (oldpde & PG_A) 3548 vm_page_aflag_set(m, PGA_REFERENCED); 3549 if (TAILQ_EMPTY(&m->md.pv_list) && 3550 TAILQ_EMPTY(&pvh->pv_list)) 3551 vm_page_aflag_clear(m, PGA_WRITEABLE); 3552 } 3553 } 3554 if (pmap == kernel_pmap) { 3555 pmap_remove_kernel_pde(pmap, pdq, sva); 3556 } else { 3557 mpte = pmap_lookup_pt_page(pmap, sva); 3558 if (mpte != NULL) { 3559 pmap_remove_pt_page(pmap, mpte); 3560 pmap_resident_count_dec(pmap, 1); 3561 KASSERT(mpte->wire_count == NPTEPG, 3562 ("pmap_remove_pde: pte page wire count error")); 3563 mpte->wire_count = 0; 3564 pmap_add_delayed_free_list(mpte, free, FALSE); 3565 atomic_subtract_int(&cnt.v_wire_count, 1); 3566 } 3567 } 3568 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 3569} 3570 3571/* 3572 * pmap_remove_pte: do the things to unmap a page in a process 3573 */ 3574static int 3575pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 3576 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 3577{ 3578 struct md_page *pvh; 3579 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 3580 vm_page_t m; 3581 3582 PG_A = pmap_accessed_bit(pmap); 3583 PG_M = pmap_modified_bit(pmap); 3584 PG_RW = pmap_rw_bit(pmap); 3585 3586 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3587 oldpte = pte_load_clear(ptq); 3588 if (oldpte & PG_W) 3589 pmap->pm_stats.wired_count -= 1; 3590 pmap_resident_count_dec(pmap, 1); 3591 if (oldpte & PG_MANAGED) { 3592 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 3593 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3594 vm_page_dirty(m); 3595 if (oldpte & PG_A) 3596 vm_page_aflag_set(m, PGA_REFERENCED); 3597 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3598 pmap_pvh_free(&m->md, pmap, va); 3599 if (TAILQ_EMPTY(&m->md.pv_list) && 3600 (m->flags & PG_FICTITIOUS) == 0) { 3601 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3602 if (TAILQ_EMPTY(&pvh->pv_list)) 3603 vm_page_aflag_clear(m, PGA_WRITEABLE); 3604 } 3605 } 3606 return (pmap_unuse_pt(pmap, va, ptepde, free)); 3607} 3608 3609/* 3610 * Remove a single page from a process address space 3611 */ 3612static void 3613pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 3614 struct spglist *free) 3615{ 3616 struct rwlock *lock; 3617 pt_entry_t *pte, PG_V; 3618 3619 PG_V = pmap_valid_bit(pmap); 3620 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3621 if ((*pde & PG_V) == 0) 3622 return; 3623 pte = pmap_pde_to_pte(pde, va); 3624 if ((*pte & PG_V) == 0) 3625 return; 3626 lock = NULL; 3627 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 3628 if (lock != NULL) 3629 rw_wunlock(lock); 3630 pmap_invalidate_page(pmap, va); 3631} 3632 3633/* 3634 * Remove the given range of addresses from the specified map. 3635 * 3636 * It is assumed that the start and end are properly 3637 * rounded to the page size. 
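 *
 * An illustrative call, not from the original source (the variables shown
 * are assumptions):
 *
 *	pmap_remove(pmap, trunc_page(sva), round_page(eva));
 *
 * A single-page removal (eva == sva + PAGE_SIZE) is special-cased below
 * because it is a very common operation.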
3638 */ 3639void 3640pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3641{ 3642 struct rwlock *lock; 3643 vm_offset_t va, va_next; 3644 pml4_entry_t *pml4e; 3645 pdp_entry_t *pdpe; 3646 pd_entry_t ptpaddr, *pde; 3647 pt_entry_t *pte, PG_G, PG_V; 3648 struct spglist free; 3649 int anyvalid; 3650 3651 PG_G = pmap_global_bit(pmap); 3652 PG_V = pmap_valid_bit(pmap); 3653 3654 /* 3655 * Perform an unsynchronized read. This is, however, safe. 3656 */ 3657 if (pmap->pm_stats.resident_count == 0) 3658 return; 3659 3660 anyvalid = 0; 3661 SLIST_INIT(&free); 3662 3663 rw_rlock(&pvh_global_lock); 3664 PMAP_LOCK(pmap); 3665 3666 /* 3667 * special handling of removing one page. a very 3668 * common operation and easy to short circuit some 3669 * code. 3670 */ 3671 if (sva + PAGE_SIZE == eva) { 3672 pde = pmap_pde(pmap, sva); 3673 if (pde && (*pde & PG_PS) == 0) { 3674 pmap_remove_page(pmap, sva, pde, &free); 3675 goto out; 3676 } 3677 } 3678 3679 lock = NULL; 3680 for (; sva < eva; sva = va_next) { 3681 3682 if (pmap->pm_stats.resident_count == 0) 3683 break; 3684 3685 pml4e = pmap_pml4e(pmap, sva); 3686 if ((*pml4e & PG_V) == 0) { 3687 va_next = (sva + NBPML4) & ~PML4MASK; 3688 if (va_next < sva) 3689 va_next = eva; 3690 continue; 3691 } 3692 3693 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3694 if ((*pdpe & PG_V) == 0) { 3695 va_next = (sva + NBPDP) & ~PDPMASK; 3696 if (va_next < sva) 3697 va_next = eva; 3698 continue; 3699 } 3700 3701 /* 3702 * Calculate index for next page table. 3703 */ 3704 va_next = (sva + NBPDR) & ~PDRMASK; 3705 if (va_next < sva) 3706 va_next = eva; 3707 3708 pde = pmap_pdpe_to_pde(pdpe, sva); 3709 ptpaddr = *pde; 3710 3711 /* 3712 * Weed out invalid mappings. 3713 */ 3714 if (ptpaddr == 0) 3715 continue; 3716 3717 /* 3718 * Check for large page. 3719 */ 3720 if ((ptpaddr & PG_PS) != 0) { 3721 /* 3722 * Are we removing the entire large page? If not, 3723 * demote the mapping and fall through. 3724 */ 3725 if (sva + NBPDR == va_next && eva >= va_next) { 3726 /* 3727 * The TLB entry for a PG_G mapping is 3728 * invalidated by pmap_remove_pde(). 3729 */ 3730 if ((ptpaddr & PG_G) == 0) 3731 anyvalid = 1; 3732 pmap_remove_pde(pmap, pde, sva, &free, &lock); 3733 continue; 3734 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 3735 &lock)) { 3736 /* The large page mapping was destroyed. */ 3737 continue; 3738 } else 3739 ptpaddr = *pde; 3740 } 3741 3742 /* 3743 * Limit our scan to either the end of the va represented 3744 * by the current page table page, or to the end of the 3745 * range being removed. 3746 */ 3747 if (va_next > eva) 3748 va_next = eva; 3749 3750 va = va_next; 3751 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3752 sva += PAGE_SIZE) { 3753 if (*pte == 0) { 3754 if (va != va_next) { 3755 pmap_invalidate_range(pmap, va, sva); 3756 va = va_next; 3757 } 3758 continue; 3759 } 3760 if ((*pte & PG_G) == 0) 3761 anyvalid = 1; 3762 else if (va == va_next) 3763 va = sva; 3764 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free, 3765 &lock)) { 3766 sva += PAGE_SIZE; 3767 break; 3768 } 3769 } 3770 if (va != va_next) 3771 pmap_invalidate_range(pmap, va, sva); 3772 } 3773 if (lock != NULL) 3774 rw_wunlock(lock); 3775out: 3776 if (anyvalid) 3777 pmap_invalidate_all(pmap); 3778 rw_runlock(&pvh_global_lock); 3779 PMAP_UNLOCK(pmap); 3780 pmap_free_zero_pages(&free); 3781} 3782 3783/* 3784 * Routine: pmap_remove_all 3785 * Function: 3786 * Removes this physical page from 3787 * all physical maps in which it resides. 3788 * Reflects back modify bits to the pager. 
3789 * 3790 * Notes: 3791 * Original versions of this routine were very 3792 * inefficient because they iteratively called 3793 * pmap_remove (slow...) 3794 */ 3795 3796void 3797pmap_remove_all(vm_page_t m) 3798{ 3799 struct md_page *pvh; 3800 pv_entry_t pv; 3801 pmap_t pmap; 3802 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 3803 pd_entry_t *pde; 3804 vm_offset_t va; 3805 struct spglist free; 3806 3807 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3808 ("pmap_remove_all: page %p is not managed", m)); 3809 SLIST_INIT(&free); 3810 rw_wlock(&pvh_global_lock); 3811 if ((m->flags & PG_FICTITIOUS) != 0) 3812 goto small_mappings; 3813 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3814 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3815 pmap = PV_PMAP(pv); 3816 PMAP_LOCK(pmap); 3817 va = pv->pv_va; 3818 pde = pmap_pde(pmap, va); 3819 (void)pmap_demote_pde(pmap, pde, va); 3820 PMAP_UNLOCK(pmap); 3821 } 3822small_mappings: 3823 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3824 pmap = PV_PMAP(pv); 3825 PMAP_LOCK(pmap); 3826 PG_A = pmap_accessed_bit(pmap); 3827 PG_M = pmap_modified_bit(pmap); 3828 PG_RW = pmap_rw_bit(pmap); 3829 pmap_resident_count_dec(pmap, 1); 3830 pde = pmap_pde(pmap, pv->pv_va); 3831 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3832 " a 2mpage in page %p's pv list", m)); 3833 pte = pmap_pde_to_pte(pde, pv->pv_va); 3834 tpte = pte_load_clear(pte); 3835 if (tpte & PG_W) 3836 pmap->pm_stats.wired_count--; 3837 if (tpte & PG_A) 3838 vm_page_aflag_set(m, PGA_REFERENCED); 3839 3840 /* 3841 * Update the vm_page_t clean and reference bits. 3842 */ 3843 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3844 vm_page_dirty(m); 3845 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 3846 pmap_invalidate_page(pmap, pv->pv_va); 3847 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3848 m->md.pv_gen++; 3849 free_pv_entry(pmap, pv); 3850 PMAP_UNLOCK(pmap); 3851 } 3852 vm_page_aflag_clear(m, PGA_WRITEABLE); 3853 rw_wunlock(&pvh_global_lock); 3854 pmap_free_zero_pages(&free); 3855} 3856 3857/* 3858 * pmap_protect_pde: do the things to protect a 2mpage in a process 3859 */ 3860static boolean_t 3861pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3862{ 3863 pd_entry_t newpde, oldpde; 3864 vm_offset_t eva, va; 3865 vm_page_t m; 3866 boolean_t anychanged; 3867 pt_entry_t PG_G, PG_M, PG_RW; 3868 3869 PG_G = pmap_global_bit(pmap); 3870 PG_M = pmap_modified_bit(pmap); 3871 PG_RW = pmap_rw_bit(pmap); 3872 3873 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3874 KASSERT((sva & PDRMASK) == 0, 3875 ("pmap_protect_pde: sva is not 2mpage aligned")); 3876 anychanged = FALSE; 3877retry: 3878 oldpde = newpde = *pde; 3879 if (oldpde & PG_MANAGED) { 3880 eva = sva + NBPDR; 3881 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3882 va < eva; va += PAGE_SIZE, m++) 3883 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3884 vm_page_dirty(m); 3885 } 3886 if ((prot & VM_PROT_WRITE) == 0) 3887 newpde &= ~(PG_RW | PG_M); 3888 if ((prot & VM_PROT_EXECUTE) == 0) 3889 newpde |= pg_nx; 3890 if (newpde != oldpde) { 3891 if (!atomic_cmpset_long(pde, oldpde, newpde)) 3892 goto retry; 3893 if (oldpde & PG_G) 3894 pmap_invalidate_page(pmap, sva); 3895 else 3896 anychanged = TRUE; 3897 } 3898 return (anychanged); 3899} 3900 3901/* 3902 * Set the physical protection on the 3903 * specified range of this map as requested. 
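 * Protections are only ever reduced here: VM_PROT_NONE is handled by
 * removing the range outright, and a request that leaves both write
 * and execute permission intact returns without touching anything.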
3904 */ 3905void 3906pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3907{ 3908 vm_offset_t va_next; 3909 pml4_entry_t *pml4e; 3910 pdp_entry_t *pdpe; 3911 pd_entry_t ptpaddr, *pde; 3912 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 3913 boolean_t anychanged, pv_lists_locked; 3914 3915 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3916 if (prot == VM_PROT_NONE) { 3917 pmap_remove(pmap, sva, eva); 3918 return; 3919 } 3920 3921 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3922 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3923 return; 3924 3925 PG_G = pmap_global_bit(pmap); 3926 PG_M = pmap_modified_bit(pmap); 3927 PG_V = pmap_valid_bit(pmap); 3928 PG_RW = pmap_rw_bit(pmap); 3929 pv_lists_locked = FALSE; 3930resume: 3931 anychanged = FALSE; 3932 3933 PMAP_LOCK(pmap); 3934 for (; sva < eva; sva = va_next) { 3935 3936 pml4e = pmap_pml4e(pmap, sva); 3937 if ((*pml4e & PG_V) == 0) { 3938 va_next = (sva + NBPML4) & ~PML4MASK; 3939 if (va_next < sva) 3940 va_next = eva; 3941 continue; 3942 } 3943 3944 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3945 if ((*pdpe & PG_V) == 0) { 3946 va_next = (sva + NBPDP) & ~PDPMASK; 3947 if (va_next < sva) 3948 va_next = eva; 3949 continue; 3950 } 3951 3952 va_next = (sva + NBPDR) & ~PDRMASK; 3953 if (va_next < sva) 3954 va_next = eva; 3955 3956 pde = pmap_pdpe_to_pde(pdpe, sva); 3957 ptpaddr = *pde; 3958 3959 /* 3960 * Weed out invalid mappings. 3961 */ 3962 if (ptpaddr == 0) 3963 continue; 3964 3965 /* 3966 * Check for large page. 3967 */ 3968 if ((ptpaddr & PG_PS) != 0) { 3969 /* 3970 * Are we protecting the entire large page? If not, 3971 * demote the mapping and fall through. 3972 */ 3973 if (sva + NBPDR == va_next && eva >= va_next) { 3974 /* 3975 * The TLB entry for a PG_G mapping is 3976 * invalidated by pmap_protect_pde(). 3977 */ 3978 if (pmap_protect_pde(pmap, pde, sva, prot)) 3979 anychanged = TRUE; 3980 continue; 3981 } else { 3982 if (!pv_lists_locked) { 3983 pv_lists_locked = TRUE; 3984 if (!rw_try_rlock(&pvh_global_lock)) { 3985 if (anychanged) 3986 pmap_invalidate_all( 3987 pmap); 3988 PMAP_UNLOCK(pmap); 3989 rw_rlock(&pvh_global_lock); 3990 goto resume; 3991 } 3992 } 3993 if (!pmap_demote_pde(pmap, pde, sva)) { 3994 /* 3995 * The large page mapping was 3996 * destroyed. 3997 */ 3998 continue; 3999 } 4000 } 4001 } 4002 4003 if (va_next > eva) 4004 va_next = eva; 4005 4006 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4007 sva += PAGE_SIZE) { 4008 pt_entry_t obits, pbits; 4009 vm_page_t m; 4010 4011retry: 4012 obits = pbits = *pte; 4013 if ((pbits & PG_V) == 0) 4014 continue; 4015 4016 if ((prot & VM_PROT_WRITE) == 0) { 4017 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4018 (PG_MANAGED | PG_M | PG_RW)) { 4019 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4020 vm_page_dirty(m); 4021 } 4022 pbits &= ~(PG_RW | PG_M); 4023 } 4024 if ((prot & VM_PROT_EXECUTE) == 0) 4025 pbits |= pg_nx; 4026 4027 if (pbits != obits) { 4028 if (!atomic_cmpset_long(pte, obits, pbits)) 4029 goto retry; 4030 if (obits & PG_G) 4031 pmap_invalidate_page(pmap, sva); 4032 else 4033 anychanged = TRUE; 4034 } 4035 } 4036 } 4037 if (anychanged) 4038 pmap_invalidate_all(pmap); 4039 if (pv_lists_locked) 4040 rw_runlock(&pvh_global_lock); 4041 PMAP_UNLOCK(pmap); 4042} 4043 4044/* 4045 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4046 * single page table page (PTP) to a single 2MB page mapping. 
For promotion 4047 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4048 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4049 * identical characteristics. 4050 */ 4051static void 4052pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 4053 struct rwlock **lockp) 4054{ 4055 pd_entry_t newpde; 4056 pt_entry_t *firstpte, oldpte, pa, *pte; 4057 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; 4058 vm_page_t mpte; 4059 int PG_PTE_CACHE; 4060 4061 PG_A = pmap_accessed_bit(pmap); 4062 PG_G = pmap_global_bit(pmap); 4063 PG_M = pmap_modified_bit(pmap); 4064 PG_V = pmap_valid_bit(pmap); 4065 PG_RW = pmap_rw_bit(pmap); 4066 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4067 4068 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4069 4070 /* 4071 * Examine the first PTE in the specified PTP. Abort if this PTE is 4072 * either invalid, unused, or does not map the first 4KB physical page 4073 * within a 2MB page. 4074 */ 4075 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 4076setpde: 4077 newpde = *firstpte; 4078 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 4079 atomic_add_long(&pmap_pde_p_failures, 1); 4080 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4081 " in pmap %p", va, pmap); 4082 return; 4083 } 4084 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 4085 /* 4086 * When PG_M is already clear, PG_RW can be cleared without 4087 * a TLB invalidation. 4088 */ 4089 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 4090 goto setpde; 4091 newpde &= ~PG_RW; 4092 } 4093 4094 /* 4095 * Examine each of the other PTEs in the specified PTP. Abort if this 4096 * PTE maps an unexpected 4KB physical page or does not have identical 4097 * characteristics to the first PTE. 4098 */ 4099 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 4100 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 4101setpte: 4102 oldpte = *pte; 4103 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 4104 atomic_add_long(&pmap_pde_p_failures, 1); 4105 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4106 " in pmap %p", va, pmap); 4107 return; 4108 } 4109 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 4110 /* 4111 * When PG_M is already clear, PG_RW can be cleared 4112 * without a TLB invalidation. 4113 */ 4114 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 4115 goto setpte; 4116 oldpte &= ~PG_RW; 4117 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 4118 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 4119 (va & ~PDRMASK), pmap); 4120 } 4121 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 4122 atomic_add_long(&pmap_pde_p_failures, 1); 4123 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4124 " in pmap %p", va, pmap); 4125 return; 4126 } 4127 pa -= PAGE_SIZE; 4128 } 4129 4130 /* 4131 * Save the page table page in its current state until the PDE 4132 * mapping the superpage is demoted by pmap_demote_pde() or 4133 * destroyed by pmap_remove_pde(). 4134 */ 4135 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4136 KASSERT(mpte >= vm_page_array && 4137 mpte < &vm_page_array[vm_page_array_size], 4138 ("pmap_promote_pde: page table page is out of range")); 4139 KASSERT(mpte->pindex == pmap_pde_pindex(va), 4140 ("pmap_promote_pde: page table page's pindex is wrong")); 4141 if (pmap_insert_pt_page(pmap, mpte)) { 4142 atomic_add_long(&pmap_pde_p_failures, 1); 4143 CTR2(KTR_PMAP, 4144 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 4145 pmap); 4146 return; 4147 } 4148 4149 /* 4150 * Promote the pv entries. 
4151 */ 4152 if ((newpde & PG_MANAGED) != 0) 4153 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 4154 4155 /* 4156 * Propagate the PAT index to its proper position. 4157 */ 4158 newpde = pmap_swap_pat(pmap, newpde); 4159 4160 /* 4161 * Map the superpage. 4162 */ 4163 if (workaround_erratum383) 4164 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 4165 else 4166 pde_store(pde, PG_PS | newpde); 4167 4168 atomic_add_long(&pmap_pde_promotions, 1); 4169 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 4170 " in pmap %p", va, pmap); 4171} 4172 4173/* 4174 * Insert the given physical page (p) at 4175 * the specified virtual address (v) in the 4176 * target physical map with the protection requested. 4177 * 4178 * If specified, the page will be wired down, meaning 4179 * that the related pte can not be reclaimed. 4180 * 4181 * NB: This is the only routine which MAY NOT lazy-evaluate 4182 * or lose information. That is, this routine must actually 4183 * insert this page into the given map NOW. 4184 */ 4185int 4186pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4187 u_int flags, int8_t psind __unused) 4188{ 4189 struct rwlock *lock; 4190 pd_entry_t *pde; 4191 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 4192 pt_entry_t newpte, origpte; 4193 pv_entry_t pv; 4194 vm_paddr_t opa, pa; 4195 vm_page_t mpte, om; 4196 boolean_t nosleep; 4197 4198 PG_A = pmap_accessed_bit(pmap); 4199 PG_G = pmap_global_bit(pmap); 4200 PG_M = pmap_modified_bit(pmap); 4201 PG_V = pmap_valid_bit(pmap); 4202 PG_RW = pmap_rw_bit(pmap); 4203 4204 va = trunc_page(va); 4205 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 4206 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 4207 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 4208 va)); 4209 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 4210 va >= kmi.clean_eva, 4211 ("pmap_enter: managed mapping within the clean submap")); 4212 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 4213 VM_OBJECT_ASSERT_LOCKED(m->object); 4214 pa = VM_PAGE_TO_PHYS(m); 4215 newpte = (pt_entry_t)(pa | PG_A | PG_V); 4216 if ((flags & VM_PROT_WRITE) != 0) 4217 newpte |= PG_M; 4218 if ((prot & VM_PROT_WRITE) != 0) 4219 newpte |= PG_RW; 4220 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 4221 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 4222 if ((prot & VM_PROT_EXECUTE) == 0) 4223 newpte |= pg_nx; 4224 if ((flags & PMAP_ENTER_WIRED) != 0) 4225 newpte |= PG_W; 4226 if (va < VM_MAXUSER_ADDRESS) 4227 newpte |= PG_U; 4228 if (pmap == kernel_pmap) 4229 newpte |= PG_G; 4230 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0); 4231 4232 /* 4233 * Set modified bit gratuitously for writeable mappings if 4234 * the page is unmanaged. We do not want to take a fault 4235 * to do the dirty bit accounting for these mappings. 4236 */ 4237 if ((m->oflags & VPO_UNMANAGED) != 0) { 4238 if ((newpte & PG_RW) != 0) 4239 newpte |= PG_M; 4240 } 4241 4242 mpte = NULL; 4243 4244 lock = NULL; 4245 rw_rlock(&pvh_global_lock); 4246 PMAP_LOCK(pmap); 4247 4248 /* 4249 * In the case that a page table page is not 4250 * resident, we are creating it here. 
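 * If the page table page cannot be allocated and PMAP_ENTER_NOSLEEP
 * was specified, KERN_RESOURCE_SHORTAGE is returned; otherwise the
 * allocation may sleep and the lookup is simply retried.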
4251 */ 4252retry: 4253 pde = pmap_pde(pmap, va); 4254 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 4255 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 4256 pte = pmap_pde_to_pte(pde, va); 4257 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 4258 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4259 mpte->wire_count++; 4260 } 4261 } else if (va < VM_MAXUSER_ADDRESS) { 4262 /* 4263 * Here if the pte page isn't mapped, or if it has been 4264 * deallocated. 4265 */ 4266 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4267 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), 4268 nosleep ? NULL : &lock); 4269 if (mpte == NULL && nosleep) { 4270 if (lock != NULL) 4271 rw_wunlock(lock); 4272 rw_runlock(&pvh_global_lock); 4273 PMAP_UNLOCK(pmap); 4274 return (KERN_RESOURCE_SHORTAGE); 4275 } 4276 goto retry; 4277 } else 4278 panic("pmap_enter: invalid page directory va=%#lx", va); 4279 4280 origpte = *pte; 4281 4282 /* 4283 * Is the specified virtual address already mapped? 4284 */ 4285 if ((origpte & PG_V) != 0) { 4286 /* 4287 * Wiring change, just update stats. We don't worry about 4288 * wiring PT pages as they remain resident as long as there 4289 * are valid mappings in them. Hence, if a user page is wired, 4290 * the PT page will be also. 4291 */ 4292 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 4293 pmap->pm_stats.wired_count++; 4294 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 4295 pmap->pm_stats.wired_count--; 4296 4297 /* 4298 * Remove the extra PT page reference. 4299 */ 4300 if (mpte != NULL) { 4301 mpte->wire_count--; 4302 KASSERT(mpte->wire_count > 0, 4303 ("pmap_enter: missing reference to page table page," 4304 " va: 0x%lx", va)); 4305 } 4306 4307 /* 4308 * Has the physical page changed? 4309 */ 4310 opa = origpte & PG_FRAME; 4311 if (opa == pa) { 4312 /* 4313 * No, might be a protection or wiring change. 4314 */ 4315 if ((origpte & PG_MANAGED) != 0) { 4316 newpte |= PG_MANAGED; 4317 if ((newpte & PG_RW) != 0) 4318 vm_page_aflag_set(m, PGA_WRITEABLE); 4319 } 4320 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 4321 goto unchanged; 4322 goto validate; 4323 } 4324 } else { 4325 /* 4326 * Increment the counters. 4327 */ 4328 if ((newpte & PG_W) != 0) 4329 pmap->pm_stats.wired_count++; 4330 pmap_resident_count_inc(pmap, 1); 4331 } 4332 4333 /* 4334 * Enter on the PV list if part of our managed memory. 4335 */ 4336 if ((m->oflags & VPO_UNMANAGED) == 0) { 4337 newpte |= PG_MANAGED; 4338 pv = get_pv_entry(pmap, &lock); 4339 pv->pv_va = va; 4340 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4341 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4342 m->md.pv_gen++; 4343 if ((newpte & PG_RW) != 0) 4344 vm_page_aflag_set(m, PGA_WRITEABLE); 4345 } 4346 4347 /* 4348 * Update the PTE. 
4349 */ 4350 if ((origpte & PG_V) != 0) { 4351validate: 4352 origpte = pte_load_store(pte, newpte); 4353 opa = origpte & PG_FRAME; 4354 if (opa != pa) { 4355 if ((origpte & PG_MANAGED) != 0) { 4356 om = PHYS_TO_VM_PAGE(opa); 4357 if ((origpte & (PG_M | PG_RW)) == (PG_M | 4358 PG_RW)) 4359 vm_page_dirty(om); 4360 if ((origpte & PG_A) != 0) 4361 vm_page_aflag_set(om, PGA_REFERENCED); 4362 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 4363 pmap_pvh_free(&om->md, pmap, va); 4364 if ((om->aflags & PGA_WRITEABLE) != 0 && 4365 TAILQ_EMPTY(&om->md.pv_list) && 4366 ((om->flags & PG_FICTITIOUS) != 0 || 4367 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4368 vm_page_aflag_clear(om, PGA_WRITEABLE); 4369 } 4370 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | 4371 PG_RW)) == (PG_M | PG_RW)) { 4372 if ((origpte & PG_MANAGED) != 0) 4373 vm_page_dirty(m); 4374 4375 /* 4376 * Although the PTE may still have PG_RW set, TLB 4377 * invalidation may nonetheless be required because 4378 * the PTE no longer has PG_M set. 4379 */ 4380 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 4381 /* 4382 * This PTE change does not require TLB invalidation. 4383 */ 4384 goto unchanged; 4385 } 4386 if ((origpte & PG_A) != 0) 4387 pmap_invalidate_page(pmap, va); 4388 } else 4389 pte_store(pte, newpte); 4390 4391unchanged: 4392 4393 /* 4394 * If both the page table page and the reservation are fully 4395 * populated, then attempt promotion. 4396 */ 4397 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 4398 pmap_ps_enabled(pmap) && 4399 (m->flags & PG_FICTITIOUS) == 0 && 4400 vm_reserv_level_iffullpop(m) == 0) 4401 pmap_promote_pde(pmap, pde, va, &lock); 4402 4403 if (lock != NULL) 4404 rw_wunlock(lock); 4405 rw_runlock(&pvh_global_lock); 4406 PMAP_UNLOCK(pmap); 4407 return (KERN_SUCCESS); 4408} 4409 4410/* 4411 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE 4412 * otherwise. Fails if (1) a page table page cannot be allocated without 4413 * blocking, (2) a mapping already exists at the specified virtual address, or 4414 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 4415 */ 4416static boolean_t 4417pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4418 struct rwlock **lockp) 4419{ 4420 pd_entry_t *pde, newpde; 4421 pt_entry_t PG_V; 4422 vm_page_t mpde; 4423 struct spglist free; 4424 4425 PG_V = pmap_valid_bit(pmap); 4426 rw_assert(&pvh_global_lock, RA_LOCKED); 4427 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4428 4429 if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { 4430 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4431 " in pmap %p", va, pmap); 4432 return (FALSE); 4433 } 4434 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); 4435 pde = &pde[pmap_pde_index(va)]; 4436 if ((*pde & PG_V) != 0) { 4437 KASSERT(mpde->wire_count > 1, 4438 ("pmap_enter_pde: mpde's wire count is too low")); 4439 mpde->wire_count--; 4440 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4441 " in pmap %p", va, pmap); 4442 return (FALSE); 4443 } 4444 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 4445 PG_PS | PG_V; 4446 if ((m->oflags & VPO_UNMANAGED) == 0) { 4447 newpde |= PG_MANAGED; 4448 4449 /* 4450 * Abort this mapping if its PV entry could not be created. 
4451 */ 4452 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m), 4453 lockp)) { 4454 SLIST_INIT(&free); 4455 if (pmap_unwire_ptp(pmap, va, mpde, &free)) { 4456 pmap_invalidate_page(pmap, va); 4457 pmap_free_zero_pages(&free); 4458 } 4459 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4460 " in pmap %p", va, pmap); 4461 return (FALSE); 4462 } 4463 } 4464 if ((prot & VM_PROT_EXECUTE) == 0) 4465 newpde |= pg_nx; 4466 if (va < VM_MAXUSER_ADDRESS) 4467 newpde |= PG_U; 4468 4469 /* 4470 * Increment counters. 4471 */ 4472 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4473 4474 /* 4475 * Map the superpage. 4476 */ 4477 pde_store(pde, newpde); 4478 4479 atomic_add_long(&pmap_pde_mappings, 1); 4480 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 4481 " in pmap %p", va, pmap); 4482 return (TRUE); 4483} 4484 4485/* 4486 * Maps a sequence of resident pages belonging to the same object. 4487 * The sequence begins with the given page m_start. This page is 4488 * mapped at the given virtual address start. Each subsequent page is 4489 * mapped at a virtual address that is offset from start by the same 4490 * amount as the page is offset from m_start within the object. The 4491 * last page in the sequence is the page with the largest offset from 4492 * m_start that can be mapped at a virtual address less than the given 4493 * virtual address end. Not every virtual page between start and end 4494 * is mapped; only those for which a resident page exists with the 4495 * corresponding offset from m_start are mapped. 4496 */ 4497void 4498pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4499 vm_page_t m_start, vm_prot_t prot) 4500{ 4501 struct rwlock *lock; 4502 vm_offset_t va; 4503 vm_page_t m, mpte; 4504 vm_pindex_t diff, psize; 4505 4506 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4507 4508 psize = atop(end - start); 4509 mpte = NULL; 4510 m = m_start; 4511 lock = NULL; 4512 rw_rlock(&pvh_global_lock); 4513 PMAP_LOCK(pmap); 4514 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4515 va = start + ptoa(diff); 4516 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 4517 m->psind == 1 && pmap_ps_enabled(pmap) && 4518 pmap_enter_pde(pmap, va, m, prot, &lock)) 4519 m = &m[NBPDR / PAGE_SIZE - 1]; 4520 else 4521 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 4522 mpte, &lock); 4523 m = TAILQ_NEXT(m, listq); 4524 } 4525 if (lock != NULL) 4526 rw_wunlock(lock); 4527 rw_runlock(&pvh_global_lock); 4528 PMAP_UNLOCK(pmap); 4529} 4530 4531/* 4532 * this code makes some *MAJOR* assumptions: 4533 * 1. Current pmap & pmap exists. 4534 * 2. Not wired. 4535 * 3. Read access. 4536 * 4. No page table pages. 4537 * but is *MUCH* faster than pmap_enter... 
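 * In practice the mapping created here is read-only and unwired, and
 * it is silently skipped if a page table page or pv entry cannot be
 * allocated without blocking.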
4538 */ 4539 4540void 4541pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4542{ 4543 struct rwlock *lock; 4544 4545 lock = NULL; 4546 rw_rlock(&pvh_global_lock); 4547 PMAP_LOCK(pmap); 4548 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 4549 if (lock != NULL) 4550 rw_wunlock(lock); 4551 rw_runlock(&pvh_global_lock); 4552 PMAP_UNLOCK(pmap); 4553} 4554 4555static vm_page_t 4556pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4557 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 4558{ 4559 struct spglist free; 4560 pt_entry_t *pte, PG_V; 4561 vm_paddr_t pa; 4562 4563 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4564 (m->oflags & VPO_UNMANAGED) != 0, 4565 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4566 PG_V = pmap_valid_bit(pmap); 4567 rw_assert(&pvh_global_lock, RA_LOCKED); 4568 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4569 4570 /* 4571 * In the case that a page table page is not 4572 * resident, we are creating it here. 4573 */ 4574 if (va < VM_MAXUSER_ADDRESS) { 4575 vm_pindex_t ptepindex; 4576 pd_entry_t *ptepa; 4577 4578 /* 4579 * Calculate pagetable page index 4580 */ 4581 ptepindex = pmap_pde_pindex(va); 4582 if (mpte && (mpte->pindex == ptepindex)) { 4583 mpte->wire_count++; 4584 } else { 4585 /* 4586 * Get the page directory entry 4587 */ 4588 ptepa = pmap_pde(pmap, va); 4589 4590 /* 4591 * If the page table page is mapped, we just increment 4592 * the hold count, and activate it. Otherwise, we 4593 * attempt to allocate a page table page. If this 4594 * attempt fails, we don't retry. Instead, we give up. 4595 */ 4596 if (ptepa && (*ptepa & PG_V) != 0) { 4597 if (*ptepa & PG_PS) 4598 return (NULL); 4599 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 4600 mpte->wire_count++; 4601 } else { 4602 /* 4603 * Pass NULL instead of the PV list lock 4604 * pointer, because we don't intend to sleep. 4605 */ 4606 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 4607 if (mpte == NULL) 4608 return (mpte); 4609 } 4610 } 4611 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4612 pte = &pte[pmap_pte_index(va)]; 4613 } else { 4614 mpte = NULL; 4615 pte = vtopte(va); 4616 } 4617 if (*pte) { 4618 if (mpte != NULL) { 4619 mpte->wire_count--; 4620 mpte = NULL; 4621 } 4622 return (mpte); 4623 } 4624 4625 /* 4626 * Enter on the PV list if part of our managed memory. 4627 */ 4628 if ((m->oflags & VPO_UNMANAGED) == 0 && 4629 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4630 if (mpte != NULL) { 4631 SLIST_INIT(&free); 4632 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4633 pmap_invalidate_page(pmap, va); 4634 pmap_free_zero_pages(&free); 4635 } 4636 mpte = NULL; 4637 } 4638 return (mpte); 4639 } 4640 4641 /* 4642 * Increment counters 4643 */ 4644 pmap_resident_count_inc(pmap, 1); 4645 4646 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); 4647 if ((prot & VM_PROT_EXECUTE) == 0) 4648 pa |= pg_nx; 4649 4650 /* 4651 * Now validate mapping with RO protection 4652 */ 4653 if ((m->oflags & VPO_UNMANAGED) != 0) 4654 pte_store(pte, pa | PG_V | PG_U); 4655 else 4656 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 4657 return (mpte); 4658} 4659 4660/* 4661 * Make a temporary mapping for a physical address. This is only intended 4662 * to be used for panic dumps. 
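 * A dump routine would typically map one physical page at a time,
 * e.g. "va = pmap_kenter_temporary(pa, 0)", copy PAGE_SIZE bytes from
 * "va", and then map the next page, reusing the same crashdumpmap slot.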
4663 */ 4664void * 4665pmap_kenter_temporary(vm_paddr_t pa, int i) 4666{ 4667 vm_offset_t va; 4668 4669 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 4670 pmap_kenter(va, pa); 4671 invlpg(va); 4672 return ((void *)crashdumpmap); 4673} 4674 4675/* 4676 * This code maps large physical mmap regions into the 4677 * processor address space. Note that some shortcuts 4678 * are taken, but the code works. 4679 */ 4680void 4681pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4682 vm_pindex_t pindex, vm_size_t size) 4683{ 4684 pd_entry_t *pde; 4685 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4686 vm_paddr_t pa, ptepa; 4687 vm_page_t p, pdpg; 4688 int pat_mode; 4689 4690 PG_A = pmap_accessed_bit(pmap); 4691 PG_M = pmap_modified_bit(pmap); 4692 PG_V = pmap_valid_bit(pmap); 4693 PG_RW = pmap_rw_bit(pmap); 4694 4695 VM_OBJECT_ASSERT_WLOCKED(object); 4696 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4697 ("pmap_object_init_pt: non-device object")); 4698 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 4699 if (!pmap_ps_enabled(pmap)) 4700 return; 4701 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4702 return; 4703 p = vm_page_lookup(object, pindex); 4704 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4705 ("pmap_object_init_pt: invalid page %p", p)); 4706 pat_mode = p->md.pat_mode; 4707 4708 /* 4709 * Abort the mapping if the first page is not physically 4710 * aligned to a 2MB page boundary. 4711 */ 4712 ptepa = VM_PAGE_TO_PHYS(p); 4713 if (ptepa & (NBPDR - 1)) 4714 return; 4715 4716 /* 4717 * Skip the first page. Abort the mapping if the rest of 4718 * the pages are not physically contiguous or have differing 4719 * memory attributes. 4720 */ 4721 p = TAILQ_NEXT(p, listq); 4722 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4723 pa += PAGE_SIZE) { 4724 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4725 ("pmap_object_init_pt: invalid page %p", p)); 4726 if (pa != VM_PAGE_TO_PHYS(p) || 4727 pat_mode != p->md.pat_mode) 4728 return; 4729 p = TAILQ_NEXT(p, listq); 4730 } 4731 4732 /* 4733 * Map using 2MB pages. Since "ptepa" is 2M aligned and 4734 * "size" is a multiple of 2M, adding the PAT setting to "pa" 4735 * will not affect the termination of this loop. 4736 */ 4737 PMAP_LOCK(pmap); 4738 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 4739 pa < ptepa + size; pa += NBPDR) { 4740 pdpg = pmap_allocpde(pmap, addr, NULL); 4741 if (pdpg == NULL) { 4742 /* 4743 * The creation of mappings below is only an 4744 * optimization. If a page directory page 4745 * cannot be allocated without blocking, 4746 * continue on to the next mapping rather than 4747 * blocking. 4748 */ 4749 addr += NBPDR; 4750 continue; 4751 } 4752 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4753 pde = &pde[pmap_pde_index(addr)]; 4754 if ((*pde & PG_V) == 0) { 4755 pde_store(pde, pa | PG_PS | PG_M | PG_A | 4756 PG_U | PG_RW | PG_V); 4757 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4758 atomic_add_long(&pmap_pde_mappings, 1); 4759 } else { 4760 /* Continue on if the PDE is already valid. */ 4761 pdpg->wire_count--; 4762 KASSERT(pdpg->wire_count > 0, 4763 ("pmap_object_init_pt: missing reference " 4764 "to page directory page, va: 0x%lx", addr)); 4765 } 4766 addr += NBPDR; 4767 } 4768 PMAP_UNLOCK(pmap); 4769 } 4770} 4771 4772/* 4773 * Clear the wired attribute from the mappings for the specified range of 4774 * addresses in the given pmap. Every valid mapping within that range 4775 * must have the wired attribute set. 
In contrast, invalid mappings 4776 * cannot have the wired attribute set, so they are ignored. 4777 * 4778 * The wired attribute of the page table entry is not a hardware feature, 4779 * so there is no need to invalidate any TLB entries. 4780 */ 4781void 4782pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4783{ 4784 vm_offset_t va_next; 4785 pml4_entry_t *pml4e; 4786 pdp_entry_t *pdpe; 4787 pd_entry_t *pde; 4788 pt_entry_t *pte, PG_V; 4789 boolean_t pv_lists_locked; 4790 4791 PG_V = pmap_valid_bit(pmap); 4792 pv_lists_locked = FALSE; 4793resume: 4794 PMAP_LOCK(pmap); 4795 for (; sva < eva; sva = va_next) { 4796 pml4e = pmap_pml4e(pmap, sva); 4797 if ((*pml4e & PG_V) == 0) { 4798 va_next = (sva + NBPML4) & ~PML4MASK; 4799 if (va_next < sva) 4800 va_next = eva; 4801 continue; 4802 } 4803 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4804 if ((*pdpe & PG_V) == 0) { 4805 va_next = (sva + NBPDP) & ~PDPMASK; 4806 if (va_next < sva) 4807 va_next = eva; 4808 continue; 4809 } 4810 va_next = (sva + NBPDR) & ~PDRMASK; 4811 if (va_next < sva) 4812 va_next = eva; 4813 pde = pmap_pdpe_to_pde(pdpe, sva); 4814 if ((*pde & PG_V) == 0) 4815 continue; 4816 if ((*pde & PG_PS) != 0) { 4817 if ((*pde & PG_W) == 0) 4818 panic("pmap_unwire: pde %#jx is missing PG_W", 4819 (uintmax_t)*pde); 4820 4821 /* 4822 * Are we unwiring the entire large page? If not, 4823 * demote the mapping and fall through. 4824 */ 4825 if (sva + NBPDR == va_next && eva >= va_next) { 4826 atomic_clear_long(pde, PG_W); 4827 pmap->pm_stats.wired_count -= NBPDR / 4828 PAGE_SIZE; 4829 continue; 4830 } else { 4831 if (!pv_lists_locked) { 4832 pv_lists_locked = TRUE; 4833 if (!rw_try_rlock(&pvh_global_lock)) { 4834 PMAP_UNLOCK(pmap); 4835 rw_rlock(&pvh_global_lock); 4836 /* Repeat sva. */ 4837 goto resume; 4838 } 4839 } 4840 if (!pmap_demote_pde(pmap, pde, sva)) 4841 panic("pmap_unwire: demotion failed"); 4842 } 4843 } 4844 if (va_next > eva) 4845 va_next = eva; 4846 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4847 sva += PAGE_SIZE) { 4848 if ((*pte & PG_V) == 0) 4849 continue; 4850 if ((*pte & PG_W) == 0) 4851 panic("pmap_unwire: pte %#jx is missing PG_W", 4852 (uintmax_t)*pte); 4853 4854 /* 4855 * PG_W must be cleared atomically. Although the pmap 4856 * lock synchronizes access to PG_W, another processor 4857 * could be setting PG_M and/or PG_A concurrently. 4858 */ 4859 atomic_clear_long(pte, PG_W); 4860 pmap->pm_stats.wired_count--; 4861 } 4862 } 4863 if (pv_lists_locked) 4864 rw_runlock(&pvh_global_lock); 4865 PMAP_UNLOCK(pmap); 4866} 4867 4868/* 4869 * Copy the range specified by src_addr/len 4870 * from the source map to the range dst_addr/len 4871 * in the destination map. 4872 * 4873 * This routine is only advisory and need not do anything. 4874 */ 4875 4876void 4877pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4878 vm_offset_t src_addr) 4879{ 4880 struct rwlock *lock; 4881 struct spglist free; 4882 vm_offset_t addr; 4883 vm_offset_t end_addr = src_addr + len; 4884 vm_offset_t va_next; 4885 pt_entry_t PG_A, PG_M, PG_V; 4886 4887 if (dst_addr != src_addr) 4888 return; 4889 4890 if (dst_pmap->pm_type != src_pmap->pm_type) 4891 return; 4892 4893 /* 4894 * EPT page table entries that require emulation of A/D bits are 4895 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 4896 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 4897 * (aka EPT_PG_EXECUTE) could still be set. 
Since some EPT 4898 * implementations flag an EPT misconfiguration for exec-only 4899 * mappings we skip this function entirely for emulated pmaps. 4900 */ 4901 if (pmap_emulate_ad_bits(dst_pmap)) 4902 return; 4903 4904 lock = NULL; 4905 rw_rlock(&pvh_global_lock); 4906 if (dst_pmap < src_pmap) { 4907 PMAP_LOCK(dst_pmap); 4908 PMAP_LOCK(src_pmap); 4909 } else { 4910 PMAP_LOCK(src_pmap); 4911 PMAP_LOCK(dst_pmap); 4912 } 4913 4914 PG_A = pmap_accessed_bit(dst_pmap); 4915 PG_M = pmap_modified_bit(dst_pmap); 4916 PG_V = pmap_valid_bit(dst_pmap); 4917 4918 for (addr = src_addr; addr < end_addr; addr = va_next) { 4919 pt_entry_t *src_pte, *dst_pte; 4920 vm_page_t dstmpde, dstmpte, srcmpte; 4921 pml4_entry_t *pml4e; 4922 pdp_entry_t *pdpe; 4923 pd_entry_t srcptepaddr, *pde; 4924 4925 KASSERT(addr < UPT_MIN_ADDRESS, 4926 ("pmap_copy: invalid to pmap_copy page tables")); 4927 4928 pml4e = pmap_pml4e(src_pmap, addr); 4929 if ((*pml4e & PG_V) == 0) { 4930 va_next = (addr + NBPML4) & ~PML4MASK; 4931 if (va_next < addr) 4932 va_next = end_addr; 4933 continue; 4934 } 4935 4936 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 4937 if ((*pdpe & PG_V) == 0) { 4938 va_next = (addr + NBPDP) & ~PDPMASK; 4939 if (va_next < addr) 4940 va_next = end_addr; 4941 continue; 4942 } 4943 4944 va_next = (addr + NBPDR) & ~PDRMASK; 4945 if (va_next < addr) 4946 va_next = end_addr; 4947 4948 pde = pmap_pdpe_to_pde(pdpe, addr); 4949 srcptepaddr = *pde; 4950 if (srcptepaddr == 0) 4951 continue; 4952 4953 if (srcptepaddr & PG_PS) { 4954 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4955 continue; 4956 dstmpde = pmap_allocpde(dst_pmap, addr, NULL); 4957 if (dstmpde == NULL) 4958 break; 4959 pde = (pd_entry_t *) 4960 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 4961 pde = &pde[pmap_pde_index(addr)]; 4962 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 4963 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4964 PG_PS_FRAME, &lock))) { 4965 *pde = srcptepaddr & ~PG_W; 4966 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 4967 } else 4968 dstmpde->wire_count--; 4969 continue; 4970 } 4971 4972 srcptepaddr &= PG_FRAME; 4973 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 4974 KASSERT(srcmpte->wire_count > 0, 4975 ("pmap_copy: source page table page is unused")); 4976 4977 if (va_next > end_addr) 4978 va_next = end_addr; 4979 4980 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 4981 src_pte = &src_pte[pmap_pte_index(addr)]; 4982 dstmpte = NULL; 4983 while (addr < va_next) { 4984 pt_entry_t ptetemp; 4985 ptetemp = *src_pte; 4986 /* 4987 * we only virtual copy managed pages 4988 */ 4989 if ((ptetemp & PG_MANAGED) != 0) { 4990 if (dstmpte != NULL && 4991 dstmpte->pindex == pmap_pde_pindex(addr)) 4992 dstmpte->wire_count++; 4993 else if ((dstmpte = pmap_allocpte(dst_pmap, 4994 addr, NULL)) == NULL) 4995 goto out; 4996 dst_pte = (pt_entry_t *) 4997 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 4998 dst_pte = &dst_pte[pmap_pte_index(addr)]; 4999 if (*dst_pte == 0 && 5000 pmap_try_insert_pv_entry(dst_pmap, addr, 5001 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 5002 &lock)) { 5003 /* 5004 * Clear the wired, modified, and 5005 * accessed (referenced) bits 5006 * during the copy. 
5007 */ 5008 *dst_pte = ptetemp & ~(PG_W | PG_M | 5009 PG_A); 5010 pmap_resident_count_inc(dst_pmap, 1); 5011 } else { 5012 SLIST_INIT(&free); 5013 if (pmap_unwire_ptp(dst_pmap, addr, 5014 dstmpte, &free)) { 5015 pmap_invalidate_page(dst_pmap, 5016 addr); 5017 pmap_free_zero_pages(&free); 5018 } 5019 goto out; 5020 } 5021 if (dstmpte->wire_count >= srcmpte->wire_count) 5022 break; 5023 } 5024 addr += PAGE_SIZE; 5025 src_pte++; 5026 } 5027 } 5028out: 5029 if (lock != NULL) 5030 rw_wunlock(lock); 5031 rw_runlock(&pvh_global_lock); 5032 PMAP_UNLOCK(src_pmap); 5033 PMAP_UNLOCK(dst_pmap); 5034} 5035 5036/* 5037 * pmap_zero_page zeros the specified hardware page by mapping 5038 * the page into KVM and using bzero to clear its contents. 5039 */ 5040void 5041pmap_zero_page(vm_page_t m) 5042{ 5043 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5044 5045 pagezero((void *)va); 5046} 5047 5048/* 5049 * pmap_zero_page_area zeros the specified hardware page by mapping 5050 * the page into KVM and using bzero to clear its contents. 5051 * 5052 * off and size may not cover an area beyond a single hardware page. 5053 */ 5054void 5055pmap_zero_page_area(vm_page_t m, int off, int size) 5056{ 5057 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5058 5059 if (off == 0 && size == PAGE_SIZE) 5060 pagezero((void *)va); 5061 else 5062 bzero((char *)va + off, size); 5063} 5064 5065/* 5066 * pmap_zero_page_idle zeros the specified hardware page by mapping 5067 * the page into KVM and using bzero to clear its contents. This 5068 * is intended to be called from the vm_pagezero process only and 5069 * outside of Giant. 5070 */ 5071void 5072pmap_zero_page_idle(vm_page_t m) 5073{ 5074 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5075 5076 pagezero((void *)va); 5077} 5078 5079/* 5080 * pmap_copy_page copies the specified (machine independent) 5081 * page by mapping the page into virtual memory and using 5082 * bcopy to copy the page, one machine dependent page at a 5083 * time. 5084 */ 5085void 5086pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 5087{ 5088 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 5089 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 5090 5091 pagecopy((void *)src, (void *)dst); 5092} 5093 5094int unmapped_buf_allowed = 1; 5095 5096void 5097pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5098 vm_offset_t b_offset, int xfersize) 5099{ 5100 void *a_cp, *b_cp; 5101 vm_page_t m_a, m_b; 5102 vm_paddr_t p_a, p_b; 5103 pt_entry_t *pte; 5104 vm_offset_t a_pg_offset, b_pg_offset; 5105 int cnt; 5106 boolean_t pinned; 5107 5108 /* 5109 * NB: The sequence of updating a page table followed by accesses 5110 * to the corresponding pages used in the !DMAP case is subject to 5111 * the situation described in the "AMD64 Architecture Programmer's 5112 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 5113 * Coherency Considerations". Therefore, issuing the INVLPG right 5114 * after modifying the PTE bits is crucial. 
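 * Accordingly, the non-DMAP path below pins the thread to its CPU,
 * writes the temporary PTE, issues invlpg(), and only then touches
 * the page through the cpage mapping.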
5115 */ 5116 pinned = FALSE; 5117 while (xfersize > 0) { 5118 a_pg_offset = a_offset & PAGE_MASK; 5119 m_a = ma[a_offset >> PAGE_SHIFT]; 5120 p_a = m_a->phys_addr; 5121 b_pg_offset = b_offset & PAGE_MASK; 5122 m_b = mb[b_offset >> PAGE_SHIFT]; 5123 p_b = m_b->phys_addr; 5124 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5125 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5126 if (__predict_false(p_a < DMAP_MIN_ADDRESS || 5127 p_a > DMAP_MIN_ADDRESS + dmaplimit)) { 5128 mtx_lock(&cpage_lock); 5129 sched_pin(); 5130 pinned = TRUE; 5131 pte = vtopte(cpage_a); 5132 *pte = p_a | X86_PG_A | X86_PG_V | 5133 pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0); 5134 invlpg(cpage_a); 5135 a_cp = (char *)cpage_a + a_pg_offset; 5136 } else { 5137 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 5138 } 5139 if (__predict_false(p_b < DMAP_MIN_ADDRESS || 5140 p_b > DMAP_MIN_ADDRESS + dmaplimit)) { 5141 if (!pinned) { 5142 mtx_lock(&cpage_lock); 5143 sched_pin(); 5144 pinned = TRUE; 5145 } 5146 pte = vtopte(cpage_b); 5147 *pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW | 5148 X86_PG_V | pmap_cache_bits(kernel_pmap, 5149 m_b->md.pat_mode, 0); 5150 invlpg(cpage_b); 5151 b_cp = (char *)cpage_b + b_pg_offset; 5152 } else { 5153 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 5154 } 5155 bcopy(a_cp, b_cp, cnt); 5156 if (__predict_false(pinned)) { 5157 sched_unpin(); 5158 mtx_unlock(&cpage_lock); 5159 pinned = FALSE; 5160 } 5161 a_offset += cnt; 5162 b_offset += cnt; 5163 xfersize -= cnt; 5164 } 5165} 5166 5167/* 5168 * Returns true if the pmap's pv is one of the first 5169 * 16 pvs linked to from this page. This count may 5170 * be changed upwards or downwards in the future; it 5171 * is only necessary that true be returned for a small 5172 * subset of pmaps for proper page aging. 5173 */ 5174boolean_t 5175pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5176{ 5177 struct md_page *pvh; 5178 struct rwlock *lock; 5179 pv_entry_t pv; 5180 int loops = 0; 5181 boolean_t rv; 5182 5183 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5184 ("pmap_page_exists_quick: page %p is not managed", m)); 5185 rv = FALSE; 5186 rw_rlock(&pvh_global_lock); 5187 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5188 rw_rlock(lock); 5189 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5190 if (PV_PMAP(pv) == pmap) { 5191 rv = TRUE; 5192 break; 5193 } 5194 loops++; 5195 if (loops >= 16) 5196 break; 5197 } 5198 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5199 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5200 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5201 if (PV_PMAP(pv) == pmap) { 5202 rv = TRUE; 5203 break; 5204 } 5205 loops++; 5206 if (loops >= 16) 5207 break; 5208 } 5209 } 5210 rw_runlock(lock); 5211 rw_runlock(&pvh_global_lock); 5212 return (rv); 5213} 5214 5215/* 5216 * pmap_page_wired_mappings: 5217 * 5218 * Return the number of managed mappings to the given physical page 5219 * that are wired. 
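 * Both 4KB and 2MB mappings are counted; if a pmap lock cannot be
 * taken without blocking and the pv lists change meanwhile, the scan
 * is restarted.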
5220 */ 5221int 5222pmap_page_wired_mappings(vm_page_t m) 5223{ 5224 struct rwlock *lock; 5225 struct md_page *pvh; 5226 pmap_t pmap; 5227 pt_entry_t *pte; 5228 pv_entry_t pv; 5229 int count, md_gen, pvh_gen; 5230 5231 if ((m->oflags & VPO_UNMANAGED) != 0) 5232 return (0); 5233 rw_rlock(&pvh_global_lock); 5234 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5235 rw_rlock(lock); 5236restart: 5237 count = 0; 5238 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5239 pmap = PV_PMAP(pv); 5240 if (!PMAP_TRYLOCK(pmap)) { 5241 md_gen = m->md.pv_gen; 5242 rw_runlock(lock); 5243 PMAP_LOCK(pmap); 5244 rw_rlock(lock); 5245 if (md_gen != m->md.pv_gen) { 5246 PMAP_UNLOCK(pmap); 5247 goto restart; 5248 } 5249 } 5250 pte = pmap_pte(pmap, pv->pv_va); 5251 if ((*pte & PG_W) != 0) 5252 count++; 5253 PMAP_UNLOCK(pmap); 5254 } 5255 if ((m->flags & PG_FICTITIOUS) == 0) { 5256 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5257 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5258 pmap = PV_PMAP(pv); 5259 if (!PMAP_TRYLOCK(pmap)) { 5260 md_gen = m->md.pv_gen; 5261 pvh_gen = pvh->pv_gen; 5262 rw_runlock(lock); 5263 PMAP_LOCK(pmap); 5264 rw_rlock(lock); 5265 if (md_gen != m->md.pv_gen || 5266 pvh_gen != pvh->pv_gen) { 5267 PMAP_UNLOCK(pmap); 5268 goto restart; 5269 } 5270 } 5271 pte = pmap_pde(pmap, pv->pv_va); 5272 if ((*pte & PG_W) != 0) 5273 count++; 5274 PMAP_UNLOCK(pmap); 5275 } 5276 } 5277 rw_runlock(lock); 5278 rw_runlock(&pvh_global_lock); 5279 return (count); 5280} 5281 5282/* 5283 * Returns TRUE if the given page is mapped individually or as part of 5284 * a 2mpage. Otherwise, returns FALSE. 5285 */ 5286boolean_t 5287pmap_page_is_mapped(vm_page_t m) 5288{ 5289 struct rwlock *lock; 5290 boolean_t rv; 5291 5292 if ((m->oflags & VPO_UNMANAGED) != 0) 5293 return (FALSE); 5294 rw_rlock(&pvh_global_lock); 5295 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5296 rw_rlock(lock); 5297 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5298 ((m->flags & PG_FICTITIOUS) == 0 && 5299 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5300 rw_runlock(lock); 5301 rw_runlock(&pvh_global_lock); 5302 return (rv); 5303} 5304 5305/* 5306 * Destroy all managed, non-wired mappings in the given user-space 5307 * pmap. This pmap cannot be active on any processor besides the 5308 * caller. 5309 * 5310 * This function cannot be applied to the kernel pmap. Moreover, it 5311 * is not intended for general use. It is only to be used during 5312 * process termination. Consequently, it can be implemented in ways 5313 * that make it faster than pmap_remove(). First, it can more quickly 5314 * destroy mappings by iterating over the pmap's collection of PV 5315 * entries, rather than searching the page table. Second, it doesn't 5316 * have to test and clear the page table entries atomically, because 5317 * no processor is currently accessing the user address space. In 5318 * particular, a page table entry's dirty bit won't change state once 5319 * this function starts. 5320 */ 5321void 5322pmap_remove_pages(pmap_t pmap) 5323{ 5324 pd_entry_t ptepde; 5325 pt_entry_t *pte, tpte; 5326 pt_entry_t PG_M, PG_RW, PG_V; 5327 struct spglist free; 5328 vm_page_t m, mpte, mt; 5329 pv_entry_t pv; 5330 struct md_page *pvh; 5331 struct pv_chunk *pc, *npc; 5332 struct rwlock *lock; 5333 int64_t bit; 5334 uint64_t inuse, bitmask; 5335 int allfree, field, freed, idx; 5336 boolean_t superpage; 5337 vm_paddr_t pa; 5338 5339 /* 5340 * Assert that the given pmap is only active on the current 5341 * CPU. Unfortunately, we cannot block another CPU from 5342 * activating the pmap while this function is executing. 
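 * Under INVARIANTS, the pm_active set is cross-checked below to catch
 * a violation of this requirement.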
5343 */ 5344 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 5345#ifdef INVARIANTS 5346 { 5347 cpuset_t other_cpus; 5348 5349 other_cpus = all_cpus; 5350 critical_enter(); 5351 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 5352 CPU_AND(&other_cpus, &pmap->pm_active); 5353 critical_exit(); 5354 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 5355 } 5356#endif 5357 5358 lock = NULL; 5359 PG_M = pmap_modified_bit(pmap); 5360 PG_V = pmap_valid_bit(pmap); 5361 PG_RW = pmap_rw_bit(pmap); 5362 5363 SLIST_INIT(&free); 5364 rw_rlock(&pvh_global_lock); 5365 PMAP_LOCK(pmap); 5366 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5367 allfree = 1; 5368 freed = 0; 5369 for (field = 0; field < _NPCM; field++) { 5370 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5371 while (inuse != 0) { 5372 bit = bsfq(inuse); 5373 bitmask = 1UL << bit; 5374 idx = field * 64 + bit; 5375 pv = &pc->pc_pventry[idx]; 5376 inuse &= ~bitmask; 5377 5378 pte = pmap_pdpe(pmap, pv->pv_va); 5379 ptepde = *pte; 5380 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 5381 tpte = *pte; 5382 if ((tpte & (PG_PS | PG_V)) == PG_V) { 5383 superpage = FALSE; 5384 ptepde = tpte; 5385 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 5386 PG_FRAME); 5387 pte = &pte[pmap_pte_index(pv->pv_va)]; 5388 tpte = *pte; 5389 } else { 5390 /* 5391 * Keep track whether 'tpte' is a 5392 * superpage explicitly instead of 5393 * relying on PG_PS being set. 5394 * 5395 * This is because PG_PS is numerically 5396 * identical to PG_PTE_PAT and thus a 5397 * regular page could be mistaken for 5398 * a superpage. 5399 */ 5400 superpage = TRUE; 5401 } 5402 5403 if ((tpte & PG_V) == 0) { 5404 panic("bad pte va %lx pte %lx", 5405 pv->pv_va, tpte); 5406 } 5407 5408/* 5409 * We cannot remove wired pages from a process' mapping at this time 5410 */ 5411 if (tpte & PG_W) { 5412 allfree = 0; 5413 continue; 5414 } 5415 5416 if (superpage) 5417 pa = tpte & PG_PS_FRAME; 5418 else 5419 pa = tpte & PG_FRAME; 5420 5421 m = PHYS_TO_VM_PAGE(pa); 5422 KASSERT(m->phys_addr == pa, 5423 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5424 m, (uintmax_t)m->phys_addr, 5425 (uintmax_t)tpte)); 5426 5427 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5428 m < &vm_page_array[vm_page_array_size], 5429 ("pmap_remove_pages: bad tpte %#jx", 5430 (uintmax_t)tpte)); 5431 5432 pte_clear(pte); 5433 5434 /* 5435 * Update the vm_page_t clean/reference bits. 
5436 */ 5437 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5438 if (superpage) { 5439 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5440 vm_page_dirty(mt); 5441 } else 5442 vm_page_dirty(m); 5443 } 5444 5445 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5446 5447 /* Mark free */ 5448 pc->pc_map[field] |= bitmask; 5449 if (superpage) { 5450 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 5451 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5452 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5453 pvh->pv_gen++; 5454 if (TAILQ_EMPTY(&pvh->pv_list)) { 5455 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5456 if ((mt->aflags & PGA_WRITEABLE) != 0 && 5457 TAILQ_EMPTY(&mt->md.pv_list)) 5458 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5459 } 5460 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 5461 if (mpte != NULL) { 5462 pmap_remove_pt_page(pmap, mpte); 5463 pmap_resident_count_dec(pmap, 1); 5464 KASSERT(mpte->wire_count == NPTEPG, 5465 ("pmap_remove_pages: pte page wire count error")); 5466 mpte->wire_count = 0; 5467 pmap_add_delayed_free_list(mpte, &free, FALSE); 5468 atomic_subtract_int(&cnt.v_wire_count, 1); 5469 } 5470 } else { 5471 pmap_resident_count_dec(pmap, 1); 5472 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5473 m->md.pv_gen++; 5474 if ((m->aflags & PGA_WRITEABLE) != 0 && 5475 TAILQ_EMPTY(&m->md.pv_list) && 5476 (m->flags & PG_FICTITIOUS) == 0) { 5477 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5478 if (TAILQ_EMPTY(&pvh->pv_list)) 5479 vm_page_aflag_clear(m, PGA_WRITEABLE); 5480 } 5481 } 5482 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 5483 freed++; 5484 } 5485 } 5486 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5487 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5488 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5489 if (allfree) { 5490 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5491 free_pv_chunk(pc); 5492 } 5493 } 5494 if (lock != NULL) 5495 rw_wunlock(lock); 5496 pmap_invalidate_all(pmap); 5497 rw_runlock(&pvh_global_lock); 5498 PMAP_UNLOCK(pmap); 5499 pmap_free_zero_pages(&free); 5500} 5501 5502static boolean_t 5503pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5504{ 5505 struct rwlock *lock; 5506 pv_entry_t pv; 5507 struct md_page *pvh; 5508 pt_entry_t *pte, mask; 5509 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 5510 pmap_t pmap; 5511 int md_gen, pvh_gen; 5512 boolean_t rv; 5513 5514 rv = FALSE; 5515 rw_rlock(&pvh_global_lock); 5516 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5517 rw_rlock(lock); 5518restart: 5519 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5520 pmap = PV_PMAP(pv); 5521 if (!PMAP_TRYLOCK(pmap)) { 5522 md_gen = m->md.pv_gen; 5523 rw_runlock(lock); 5524 PMAP_LOCK(pmap); 5525 rw_rlock(lock); 5526 if (md_gen != m->md.pv_gen) { 5527 PMAP_UNLOCK(pmap); 5528 goto restart; 5529 } 5530 } 5531 pte = pmap_pte(pmap, pv->pv_va); 5532 mask = 0; 5533 if (modified) { 5534 PG_M = pmap_modified_bit(pmap); 5535 PG_RW = pmap_rw_bit(pmap); 5536 mask |= PG_RW | PG_M; 5537 } 5538 if (accessed) { 5539 PG_A = pmap_accessed_bit(pmap); 5540 PG_V = pmap_valid_bit(pmap); 5541 mask |= PG_V | PG_A; 5542 } 5543 rv = (*pte & mask) == mask; 5544 PMAP_UNLOCK(pmap); 5545 if (rv) 5546 goto out; 5547 } 5548 if ((m->flags & PG_FICTITIOUS) == 0) { 5549 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5550 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5551 pmap = PV_PMAP(pv); 5552 if (!PMAP_TRYLOCK(pmap)) { 5553 md_gen = m->md.pv_gen; 5554 pvh_gen = pvh->pv_gen; 5555 rw_runlock(lock); 5556 PMAP_LOCK(pmap); 5557 rw_rlock(lock); 5558 if (md_gen != m->md.pv_gen || 5559 pvh_gen != pvh->pv_gen) { 5560 
PMAP_UNLOCK(pmap); 5561 goto restart; 5562 } 5563 } 5564 pte = pmap_pde(pmap, pv->pv_va); 5565 mask = 0; 5566 if (modified) { 5567 PG_M = pmap_modified_bit(pmap); 5568 PG_RW = pmap_rw_bit(pmap); 5569 mask |= PG_RW | PG_M; 5570 } 5571 if (accessed) { 5572 PG_A = pmap_accessed_bit(pmap); 5573 PG_V = pmap_valid_bit(pmap); 5574 mask |= PG_V | PG_A; 5575 } 5576 rv = (*pte & mask) == mask; 5577 PMAP_UNLOCK(pmap); 5578 if (rv) 5579 goto out; 5580 } 5581 } 5582out: 5583 rw_runlock(lock); 5584 rw_runlock(&pvh_global_lock); 5585 return (rv); 5586} 5587 5588/* 5589 * pmap_is_modified: 5590 * 5591 * Return whether or not the specified physical page was modified 5592 * in any physical maps. 5593 */ 5594boolean_t 5595pmap_is_modified(vm_page_t m) 5596{ 5597 5598 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5599 ("pmap_is_modified: page %p is not managed", m)); 5600 5601 /* 5602 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5603 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 5604 * is clear, no PTEs can have PG_M set. 5605 */ 5606 VM_OBJECT_ASSERT_WLOCKED(m->object); 5607 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5608 return (FALSE); 5609 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5610} 5611 5612/* 5613 * pmap_is_prefaultable: 5614 * 5615 * Return whether or not the specified virtual address is eligible 5616 * for prefault. 5617 */ 5618boolean_t 5619pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5620{ 5621 pd_entry_t *pde; 5622 pt_entry_t *pte, PG_V; 5623 boolean_t rv; 5624 5625 PG_V = pmap_valid_bit(pmap); 5626 rv = FALSE; 5627 PMAP_LOCK(pmap); 5628 pde = pmap_pde(pmap, addr); 5629 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 5630 pte = pmap_pde_to_pte(pde, addr); 5631 rv = (*pte & PG_V) == 0; 5632 } 5633 PMAP_UNLOCK(pmap); 5634 return (rv); 5635} 5636 5637/* 5638 * pmap_is_referenced: 5639 * 5640 * Return whether or not the specified physical page was referenced 5641 * in any physical maps. 5642 */ 5643boolean_t 5644pmap_is_referenced(vm_page_t m) 5645{ 5646 5647 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5648 ("pmap_is_referenced: page %p is not managed", m)); 5649 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5650} 5651 5652/* 5653 * Clear the write and modified bits in each of the given page's mappings. 5654 */ 5655void 5656pmap_remove_write(vm_page_t m) 5657{ 5658 struct md_page *pvh; 5659 pmap_t pmap; 5660 struct rwlock *lock; 5661 pv_entry_t next_pv, pv; 5662 pd_entry_t *pde; 5663 pt_entry_t oldpte, *pte, PG_M, PG_RW; 5664 vm_offset_t va; 5665 int pvh_gen, md_gen; 5666 5667 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5668 ("pmap_remove_write: page %p is not managed", m)); 5669 5670 /* 5671 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5672 * set by another thread while the object is locked. Thus, 5673 * if PGA_WRITEABLE is clear, no page table entries need updating. 
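 * Otherwise, any writable 2MB mapping is first demoted, and PG_RW and
 * PG_M are then cleared atomically from each 4KB mapping, dirtying
 * the page whenever PG_M was set.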
5674 */ 5675 VM_OBJECT_ASSERT_WLOCKED(m->object); 5676 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5677 return; 5678 rw_rlock(&pvh_global_lock); 5679 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5680 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5681retry_pv_loop: 5682 rw_wlock(lock); 5683 if ((m->flags & PG_FICTITIOUS) != 0) 5684 goto small_mappings; 5685 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5686 pmap = PV_PMAP(pv); 5687 if (!PMAP_TRYLOCK(pmap)) { 5688 pvh_gen = pvh->pv_gen; 5689 rw_wunlock(lock); 5690 PMAP_LOCK(pmap); 5691 rw_wlock(lock); 5692 if (pvh_gen != pvh->pv_gen) { 5693 PMAP_UNLOCK(pmap); 5694 rw_wunlock(lock); 5695 goto retry_pv_loop; 5696 } 5697 } 5698 PG_RW = pmap_rw_bit(pmap); 5699 va = pv->pv_va; 5700 pde = pmap_pde(pmap, va); 5701 if ((*pde & PG_RW) != 0) 5702 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 5703 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5704 ("inconsistent pv lock %p %p for page %p", 5705 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5706 PMAP_UNLOCK(pmap); 5707 } 5708small_mappings: 5709 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5710 pmap = PV_PMAP(pv); 5711 if (!PMAP_TRYLOCK(pmap)) { 5712 pvh_gen = pvh->pv_gen; 5713 md_gen = m->md.pv_gen; 5714 rw_wunlock(lock); 5715 PMAP_LOCK(pmap); 5716 rw_wlock(lock); 5717 if (pvh_gen != pvh->pv_gen || 5718 md_gen != m->md.pv_gen) { 5719 PMAP_UNLOCK(pmap); 5720 rw_wunlock(lock); 5721 goto retry_pv_loop; 5722 } 5723 } 5724 PG_M = pmap_modified_bit(pmap); 5725 PG_RW = pmap_rw_bit(pmap); 5726 pde = pmap_pde(pmap, pv->pv_va); 5727 KASSERT((*pde & PG_PS) == 0, 5728 ("pmap_remove_write: found a 2mpage in page %p's pv list", 5729 m)); 5730 pte = pmap_pde_to_pte(pde, pv->pv_va); 5731retry: 5732 oldpte = *pte; 5733 if (oldpte & PG_RW) { 5734 if (!atomic_cmpset_long(pte, oldpte, oldpte & 5735 ~(PG_RW | PG_M))) 5736 goto retry; 5737 if ((oldpte & PG_M) != 0) 5738 vm_page_dirty(m); 5739 pmap_invalidate_page(pmap, pv->pv_va); 5740 } 5741 PMAP_UNLOCK(pmap); 5742 } 5743 rw_wunlock(lock); 5744 vm_page_aflag_clear(m, PGA_WRITEABLE); 5745 rw_runlock(&pvh_global_lock); 5746} 5747 5748static __inline boolean_t 5749safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 5750{ 5751 5752 if (!pmap_emulate_ad_bits(pmap)) 5753 return (TRUE); 5754 5755 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 5756 5757 /* 5758 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration 5759 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 5760 * if the EPT_PG_WRITE bit is set. 5761 */ 5762 if ((pte & EPT_PG_WRITE) != 0) 5763 return (FALSE); 5764 5765 /* 5766 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 5767 */ 5768 if ((pte & EPT_PG_EXECUTE) == 0 || 5769 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 5770 return (TRUE); 5771 else 5772 return (FALSE); 5773} 5774 5775#define PMAP_TS_REFERENCED_MAX 5 5776 5777/* 5778 * pmap_ts_referenced: 5779 * 5780 * Return a count of reference bits for a page, clearing those bits. 5781 * It is not necessary for every reference bit to be cleared, but it 5782 * is necessary that 0 only be returned when there are truly no 5783 * reference bits set. 5784 * 5785 * XXX: The exact number of bits to check and clear is a matter that 5786 * should be tested and standardized at some point in the future for 5787 * optimal aging of shared pages. 
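 * At present the scan stops after PMAP_TS_REFERENCED_MAX (5)
 * referenced mappings have been processed.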
5788 */ 5789int 5790pmap_ts_referenced(vm_page_t m) 5791{ 5792 struct md_page *pvh; 5793 pv_entry_t pv, pvf; 5794 pmap_t pmap; 5795 struct rwlock *lock; 5796 pd_entry_t oldpde, *pde; 5797 pt_entry_t *pte, PG_A; 5798 vm_offset_t va; 5799 vm_paddr_t pa; 5800 int cleared, md_gen, not_cleared, pvh_gen; 5801 struct spglist free; 5802 boolean_t demoted; 5803 5804 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5805 ("pmap_ts_referenced: page %p is not managed", m)); 5806 SLIST_INIT(&free); 5807 cleared = 0; 5808 pa = VM_PAGE_TO_PHYS(m); 5809 lock = PHYS_TO_PV_LIST_LOCK(pa); 5810 pvh = pa_to_pvh(pa); 5811 rw_rlock(&pvh_global_lock); 5812 rw_wlock(lock); 5813retry: 5814 not_cleared = 0; 5815 if ((m->flags & PG_FICTITIOUS) != 0 || 5816 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5817 goto small_mappings; 5818 pv = pvf; 5819 do { 5820 if (pvf == NULL) 5821 pvf = pv; 5822 pmap = PV_PMAP(pv); 5823 if (!PMAP_TRYLOCK(pmap)) { 5824 pvh_gen = pvh->pv_gen; 5825 rw_wunlock(lock); 5826 PMAP_LOCK(pmap); 5827 rw_wlock(lock); 5828 if (pvh_gen != pvh->pv_gen) { 5829 PMAP_UNLOCK(pmap); 5830 goto retry; 5831 } 5832 } 5833 PG_A = pmap_accessed_bit(pmap); 5834 va = pv->pv_va; 5835 pde = pmap_pde(pmap, pv->pv_va); 5836 oldpde = *pde; 5837 if ((*pde & PG_A) != 0) { 5838 /* 5839 * Since this reference bit is shared by 512 4KB 5840 * pages, it should not be cleared every time it is 5841 * tested. Apply a simple "hash" function on the 5842 * physical page number, the virtual superpage number, 5843 * and the pmap address to select one 4KB page out of 5844 * the 512 on which testing the reference bit will 5845 * result in clearing that reference bit. This 5846 * function is designed to avoid the selection of the 5847 * same 4KB page for every 2MB page mapping. 5848 * 5849 * On demotion, a mapping that hasn't been referenced 5850 * is simply destroyed. To avoid the possibility of a 5851 * subsequent page fault on a demoted wired mapping, 5852 * always leave its reference bit set. Moreover, 5853 * since the superpage is wired, the current state of 5854 * its reference bit won't affect page replacement. 5855 */ 5856 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 5857 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 5858 (*pde & PG_W) == 0) { 5859 if (safe_to_clear_referenced(pmap, oldpde)) { 5860 atomic_clear_long(pde, PG_A); 5861 pmap_invalidate_page(pmap, pv->pv_va); 5862 demoted = FALSE; 5863 } else if (pmap_demote_pde_locked(pmap, pde, 5864 pv->pv_va, &lock)) { 5865 /* 5866 * Remove the mapping to a single page 5867 * so that a subsequent access may 5868 * repromote. Since the underlying 5869 * page table page is fully populated, 5870 * this removal never frees a page 5871 * table page. 5872 */ 5873 demoted = TRUE; 5874 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5875 PG_PS_FRAME); 5876 pte = pmap_pde_to_pte(pde, va); 5877 pmap_remove_pte(pmap, pte, va, *pde, 5878 NULL, &lock); 5879 pmap_invalidate_page(pmap, va); 5880 } else 5881 demoted = TRUE; 5882 5883 if (demoted) { 5884 /* 5885 * The superpage mapping was removed 5886 * entirely and therefore 'pv' is no 5887 * longer valid. 5888 */ 5889 if (pvf == pv) 5890 pvf = NULL; 5891 pv = NULL; 5892 } 5893 cleared++; 5894 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5895 ("inconsistent pv lock %p %p for page %p", 5896 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5897 } else 5898 not_cleared++; 5899 } 5900 PMAP_UNLOCK(pmap); 5901 /* Rotate the PV list if it has more than one entry. 
*/ 5902 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5903 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5904 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5905 pvh->pv_gen++; 5906 } 5907 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 5908 goto out; 5909 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5910small_mappings: 5911 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5912 goto out; 5913 pv = pvf; 5914 do { 5915 if (pvf == NULL) 5916 pvf = pv; 5917 pmap = PV_PMAP(pv); 5918 if (!PMAP_TRYLOCK(pmap)) { 5919 pvh_gen = pvh->pv_gen; 5920 md_gen = m->md.pv_gen; 5921 rw_wunlock(lock); 5922 PMAP_LOCK(pmap); 5923 rw_wlock(lock); 5924 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5925 PMAP_UNLOCK(pmap); 5926 goto retry; 5927 } 5928 } 5929 PG_A = pmap_accessed_bit(pmap); 5930 pde = pmap_pde(pmap, pv->pv_va); 5931 KASSERT((*pde & PG_PS) == 0, 5932 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 5933 m)); 5934 pte = pmap_pde_to_pte(pde, pv->pv_va); 5935 if ((*pte & PG_A) != 0) { 5936 if (safe_to_clear_referenced(pmap, *pte)) { 5937 atomic_clear_long(pte, PG_A); 5938 pmap_invalidate_page(pmap, pv->pv_va); 5939 cleared++; 5940 } else if ((*pte & PG_W) == 0) { 5941 /* 5942 * Wired pages cannot be paged out so 5943 * doing accessed bit emulation for 5944 * them is wasted effort. We do the 5945 * hard work for unwired pages only. 5946 */ 5947 pmap_remove_pte(pmap, pte, pv->pv_va, 5948 *pde, &free, &lock); 5949 pmap_invalidate_page(pmap, pv->pv_va); 5950 cleared++; 5951 if (pvf == pv) 5952 pvf = NULL; 5953 pv = NULL; 5954 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5955 ("inconsistent pv lock %p %p for page %p", 5956 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5957 } else 5958 not_cleared++; 5959 } 5960 PMAP_UNLOCK(pmap); 5961 /* Rotate the PV list if it has more than one entry. */ 5962 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5963 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5964 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5965 m->md.pv_gen++; 5966 } 5967 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 5968 not_cleared < PMAP_TS_REFERENCED_MAX); 5969out: 5970 rw_wunlock(lock); 5971 rw_runlock(&pvh_global_lock); 5972 pmap_free_zero_pages(&free); 5973 return (cleared + not_cleared); 5974} 5975 5976/* 5977 * Apply the given advice to the specified range of addresses within the 5978 * given pmap. Depending on the advice, clear the referenced and/or 5979 * modified flags in each mapping and set the mapped page's dirty field. 5980 */ 5981void 5982pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5983{ 5984 struct rwlock *lock; 5985 pml4_entry_t *pml4e; 5986 pdp_entry_t *pdpe; 5987 pd_entry_t oldpde, *pde; 5988 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 5989 vm_offset_t va_next; 5990 vm_page_t m; 5991 boolean_t anychanged, pv_lists_locked; 5992 5993 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5994 return; 5995 5996 /* 5997 * A/D bit emulation requires an alternate code path when clearing 5998 * the modified and accessed bits below. Since this function is 5999 * advisory in nature we skip it entirely for pmaps that require 6000 * A/D bit emulation. 
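 *
 *	[Editorial aside, not from the original file] For context, this
 *	function is normally reached from a userland madvise(2) call on a
 *	mapped range, roughly
 *
 *		(void)madvise(addr, len, MADV_FREE);
 *
 *	which the MI layer (vm_map_madvise()) forwards here for MADV_FREE
 *	and MADV_DONTNEED.  Because the call is purely advisory, skipping
 *	the work for A/D-emulating pmaps costs only some precision in page
 *	aging, never correctness.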
6001 */ 6002 if (pmap_emulate_ad_bits(pmap)) 6003 return; 6004 6005 PG_A = pmap_accessed_bit(pmap); 6006 PG_G = pmap_global_bit(pmap); 6007 PG_M = pmap_modified_bit(pmap); 6008 PG_V = pmap_valid_bit(pmap); 6009 PG_RW = pmap_rw_bit(pmap); 6010 6011 pv_lists_locked = FALSE; 6012resume: 6013 anychanged = FALSE; 6014 PMAP_LOCK(pmap); 6015 for (; sva < eva; sva = va_next) { 6016 pml4e = pmap_pml4e(pmap, sva); 6017 if ((*pml4e & PG_V) == 0) { 6018 va_next = (sva + NBPML4) & ~PML4MASK; 6019 if (va_next < sva) 6020 va_next = eva; 6021 continue; 6022 } 6023 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6024 if ((*pdpe & PG_V) == 0) { 6025 va_next = (sva + NBPDP) & ~PDPMASK; 6026 if (va_next < sva) 6027 va_next = eva; 6028 continue; 6029 } 6030 va_next = (sva + NBPDR) & ~PDRMASK; 6031 if (va_next < sva) 6032 va_next = eva; 6033 pde = pmap_pdpe_to_pde(pdpe, sva); 6034 oldpde = *pde; 6035 if ((oldpde & PG_V) == 0) 6036 continue; 6037 else if ((oldpde & PG_PS) != 0) { 6038 if ((oldpde & PG_MANAGED) == 0) 6039 continue; 6040 if (!pv_lists_locked) { 6041 pv_lists_locked = TRUE; 6042 if (!rw_try_rlock(&pvh_global_lock)) { 6043 if (anychanged) 6044 pmap_invalidate_all(pmap); 6045 PMAP_UNLOCK(pmap); 6046 rw_rlock(&pvh_global_lock); 6047 goto resume; 6048 } 6049 } 6050 lock = NULL; 6051 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 6052 if (lock != NULL) 6053 rw_wunlock(lock); 6054 6055 /* 6056 * The large page mapping was destroyed. 6057 */ 6058 continue; 6059 } 6060 6061 /* 6062 * Unless the page mappings are wired, remove the 6063 * mapping to a single page so that a subsequent 6064 * access may repromote. Since the underlying page 6065 * table page is fully populated, this removal never 6066 * frees a page table page. 6067 */ 6068 if ((oldpde & PG_W) == 0) { 6069 pte = pmap_pde_to_pte(pde, sva); 6070 KASSERT((*pte & PG_V) != 0, 6071 ("pmap_advise: invalid PTE")); 6072 pmap_remove_pte(pmap, pte, sva, *pde, NULL, 6073 &lock); 6074 anychanged = TRUE; 6075 } 6076 if (lock != NULL) 6077 rw_wunlock(lock); 6078 } 6079 if (va_next > eva) 6080 va_next = eva; 6081 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6082 sva += PAGE_SIZE) { 6083 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | 6084 PG_V)) 6085 continue; 6086 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6087 if (advice == MADV_DONTNEED) { 6088 /* 6089 * Future calls to pmap_is_modified() 6090 * can be avoided by making the page 6091 * dirty now. 6092 */ 6093 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6094 vm_page_dirty(m); 6095 } 6096 atomic_clear_long(pte, PG_M | PG_A); 6097 } else if ((*pte & PG_A) != 0) 6098 atomic_clear_long(pte, PG_A); 6099 else 6100 continue; 6101 if ((*pte & PG_G) != 0) 6102 pmap_invalidate_page(pmap, sva); 6103 else 6104 anychanged = TRUE; 6105 } 6106 } 6107 if (anychanged) 6108 pmap_invalidate_all(pmap); 6109 if (pv_lists_locked) 6110 rw_runlock(&pvh_global_lock); 6111 PMAP_UNLOCK(pmap); 6112} 6113 6114/* 6115 * Clear the modify bits on the specified physical page. 
6116 */ 6117void 6118pmap_clear_modify(vm_page_t m) 6119{ 6120 struct md_page *pvh; 6121 pmap_t pmap; 6122 pv_entry_t next_pv, pv; 6123 pd_entry_t oldpde, *pde; 6124 pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; 6125 struct rwlock *lock; 6126 vm_offset_t va; 6127 int md_gen, pvh_gen; 6128 6129 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6130 ("pmap_clear_modify: page %p is not managed", m)); 6131 VM_OBJECT_ASSERT_WLOCKED(m->object); 6132 KASSERT(!vm_page_xbusied(m), 6133 ("pmap_clear_modify: page %p is exclusive busied", m)); 6134 6135 /* 6136 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 6137 * If the object containing the page is locked and the page is not 6138 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 6139 */ 6140 if ((m->aflags & PGA_WRITEABLE) == 0) 6141 return; 6142 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6143 rw_rlock(&pvh_global_lock); 6144 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6145 rw_wlock(lock); 6146restart: 6147 if ((m->flags & PG_FICTITIOUS) != 0) 6148 goto small_mappings; 6149 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6150 pmap = PV_PMAP(pv); 6151 if (!PMAP_TRYLOCK(pmap)) { 6152 pvh_gen = pvh->pv_gen; 6153 rw_wunlock(lock); 6154 PMAP_LOCK(pmap); 6155 rw_wlock(lock); 6156 if (pvh_gen != pvh->pv_gen) { 6157 PMAP_UNLOCK(pmap); 6158 goto restart; 6159 } 6160 } 6161 PG_M = pmap_modified_bit(pmap); 6162 PG_V = pmap_valid_bit(pmap); 6163 PG_RW = pmap_rw_bit(pmap); 6164 va = pv->pv_va; 6165 pde = pmap_pde(pmap, va); 6166 oldpde = *pde; 6167 if ((oldpde & PG_RW) != 0) { 6168 if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { 6169 if ((oldpde & PG_W) == 0) { 6170 /* 6171 * Write protect the mapping to a 6172 * single page so that a subsequent 6173 * write access may repromote. 6174 */ 6175 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6176 PG_PS_FRAME); 6177 pte = pmap_pde_to_pte(pde, va); 6178 oldpte = *pte; 6179 if ((oldpte & PG_V) != 0) { 6180 while (!atomic_cmpset_long(pte, 6181 oldpte, 6182 oldpte & ~(PG_M | PG_RW))) 6183 oldpte = *pte; 6184 vm_page_dirty(m); 6185 pmap_invalidate_page(pmap, va); 6186 } 6187 } 6188 } 6189 } 6190 PMAP_UNLOCK(pmap); 6191 } 6192small_mappings: 6193 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6194 pmap = PV_PMAP(pv); 6195 if (!PMAP_TRYLOCK(pmap)) { 6196 md_gen = m->md.pv_gen; 6197 pvh_gen = pvh->pv_gen; 6198 rw_wunlock(lock); 6199 PMAP_LOCK(pmap); 6200 rw_wlock(lock); 6201 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6202 PMAP_UNLOCK(pmap); 6203 goto restart; 6204 } 6205 } 6206 PG_M = pmap_modified_bit(pmap); 6207 PG_RW = pmap_rw_bit(pmap); 6208 pde = pmap_pde(pmap, pv->pv_va); 6209 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 6210 " a 2mpage in page %p's pv list", m)); 6211 pte = pmap_pde_to_pte(pde, pv->pv_va); 6212 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6213 atomic_clear_long(pte, PG_M); 6214 pmap_invalidate_page(pmap, pv->pv_va); 6215 } 6216 PMAP_UNLOCK(pmap); 6217 } 6218 rw_wunlock(lock); 6219 rw_runlock(&pvh_global_lock); 6220} 6221 6222/* 6223 * Miscellaneous support routines follow 6224 */ 6225 6226/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 6227static __inline void 6228pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) 6229{ 6230 u_int opte, npte; 6231 6232 /* 6233 * The cache mode bits are all in the low 32-bits of the 6234 * PTE, so we can just spin on updating the low 32-bits. 
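 *
 *	[Editorial note, not from the original source] Concretely, the bits
 *	selected by X86_PG_PTE_CACHE (PWT, PCD and the 4KB PAT bit) and by
 *	X86_PG_PDE_CACHE all lie below bit 32 of an entry, while the upper
 *	word holds only the high physical-frame bits and the no-execute
 *	bit, so a 32-bit compare-and-swap on the low word cannot disturb
 *	them.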
6235 */ 6236 do { 6237 opte = *(u_int *)pte; 6238 npte = opte & ~mask; 6239 npte |= cache_bits; 6240 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 6241} 6242 6243/* Adjust the cache mode for a 2MB page mapped via a PDE. */ 6244static __inline void 6245pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) 6246{ 6247 u_int opde, npde; 6248 6249 /* 6250 * The cache mode bits are all in the low 32-bits of the 6251 * PDE, so we can just spin on updating the low 32-bits. 6252 */ 6253 do { 6254 opde = *(u_int *)pde; 6255 npde = opde & ~mask; 6256 npde |= cache_bits; 6257 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 6258} 6259 6260/* 6261 * Map a set of physical memory pages into the kernel virtual 6262 * address space. Return a pointer to where it is mapped. This 6263 * routine is intended to be used for mapping device memory, 6264 * NOT real memory. 6265 */ 6266void * 6267pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 6268{ 6269 struct pmap_preinit_mapping *ppim; 6270 vm_offset_t va, offset; 6271 vm_size_t tmpsize; 6272 int i; 6273 6274 offset = pa & PAGE_MASK; 6275 size = round_page(offset + size); 6276 pa = trunc_page(pa); 6277 6278 if (!pmap_initialized) { 6279 va = 0; 6280 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6281 ppim = pmap_preinit_mapping + i; 6282 if (ppim->va == 0) { 6283 ppim->pa = pa; 6284 ppim->sz = size; 6285 ppim->mode = mode; 6286 ppim->va = virtual_avail; 6287 virtual_avail += size; 6288 va = ppim->va; 6289 break; 6290 } 6291 } 6292 if (va == 0) 6293 panic("%s: too many preinit mappings", __func__); 6294 } else { 6295 /* 6296 * If we have a preinit mapping, re-use it. 6297 */ 6298 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6299 ppim = pmap_preinit_mapping + i; 6300 if (ppim->pa == pa && ppim->sz == size && 6301 ppim->mode == mode) 6302 return ((void *)(ppim->va + offset)); 6303 } 6304 /* 6305 * If the specified range of physical addresses fits within 6306 * the direct map window, use the direct map. 
6307 */ 6308 if (pa < dmaplimit && pa + size < dmaplimit) { 6309 va = PHYS_TO_DMAP(pa); 6310 if (!pmap_change_attr(va, size, mode)) 6311 return ((void *)(va + offset)); 6312 } 6313 va = kva_alloc(size); 6314 if (va == 0) 6315 panic("%s: Couldn't allocate KVA", __func__); 6316 } 6317 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 6318 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 6319 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 6320 pmap_invalidate_cache_range(va, va + tmpsize, FALSE); 6321 return ((void *)(va + offset)); 6322} 6323 6324void * 6325pmap_mapdev(vm_paddr_t pa, vm_size_t size) 6326{ 6327 6328 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 6329} 6330 6331void * 6332pmap_mapbios(vm_paddr_t pa, vm_size_t size) 6333{ 6334 6335 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 6336} 6337 6338void 6339pmap_unmapdev(vm_offset_t va, vm_size_t size) 6340{ 6341 struct pmap_preinit_mapping *ppim; 6342 vm_offset_t offset; 6343 int i; 6344 6345 /* If we gave a direct map region in pmap_mapdev, do nothing */ 6346 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 6347 return; 6348 offset = va & PAGE_MASK; 6349 size = round_page(offset + size); 6350 va = trunc_page(va); 6351 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6352 ppim = pmap_preinit_mapping + i; 6353 if (ppim->va == va && ppim->sz == size) { 6354 if (pmap_initialized) 6355 return; 6356 ppim->pa = 0; 6357 ppim->va = 0; 6358 ppim->sz = 0; 6359 ppim->mode = 0; 6360 if (va + size == virtual_avail) 6361 virtual_avail = va; 6362 return; 6363 } 6364 } 6365 if (pmap_initialized) 6366 kva_free(va, size); 6367} 6368 6369/* 6370 * Tries to demote a 1GB page mapping. 6371 */ 6372static boolean_t 6373pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 6374{ 6375 pdp_entry_t newpdpe, oldpdpe; 6376 pd_entry_t *firstpde, newpde, *pde; 6377 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 6378 vm_paddr_t mpdepa; 6379 vm_page_t mpde; 6380 6381 PG_A = pmap_accessed_bit(pmap); 6382 PG_M = pmap_modified_bit(pmap); 6383 PG_V = pmap_valid_bit(pmap); 6384 PG_RW = pmap_rw_bit(pmap); 6385 6386 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6387 oldpdpe = *pdpe; 6388 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 6389 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 6390 if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 6391 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 6392 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 6393 " in pmap %p", va, pmap); 6394 return (FALSE); 6395 } 6396 mpdepa = VM_PAGE_TO_PHYS(mpde); 6397 firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa); 6398 newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 6399 KASSERT((oldpdpe & PG_A) != 0, 6400 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 6401 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 6402 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 6403 newpde = oldpdpe; 6404 6405 /* 6406 * Initialize the page directory page. 6407 */ 6408 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 6409 *pde = newpde; 6410 newpde += NBPDR; 6411 } 6412 6413 /* 6414 * Demote the mapping. 6415 */ 6416 *pdpe = newpdpe; 6417 6418 /* 6419 * Invalidate a stale recursive mapping of the page directory page. 6420 */ 6421 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 6422 6423 pmap_pdpe_demotions++; 6424 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 6425 " in pmap %p", va, pmap); 6426 return (TRUE); 6427} 6428 6429/* 6430 * Sets the memory attribute for the specified page. 
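 *
 *	[Editorial aside, illustrative only] On amd64 the vm_memattr_t
 *	values are the PAT modes, so a caller wanting, say, a
 *	write-combining frame-buffer page would do something like
 *
 *		pmap_page_set_memattr(m, VM_MEMATTR_WRITE_COMBINING);
 *
 *	after which both the page's direct mapping (updated below) and any
 *	later pmap_enter() of the page use the new attribute.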
6431 */ 6432void 6433pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6434{ 6435 6436 m->md.pat_mode = ma; 6437 6438 /* 6439 * If "m" is a normal page, update its direct mapping. This update 6440 * can be relied upon to perform any cache operations that are 6441 * required for data coherence. 6442 */ 6443 if ((m->flags & PG_FICTITIOUS) == 0 && 6444 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 6445 m->md.pat_mode)) 6446 panic("memory attribute change on the direct map failed"); 6447} 6448 6449/* 6450 * Changes the specified virtual address range's memory type to that given by 6451 * the parameter "mode". The specified virtual address range must be 6452 * completely contained within either the direct map or the kernel map. If 6453 * the virtual address range is contained within the kernel map, then the 6454 * memory type for each of the corresponding ranges of the direct map is also 6455 * changed. (The corresponding ranges of the direct map are those ranges that 6456 * map the same physical pages as the specified virtual address range.) These 6457 * changes to the direct map are necessary because Intel describes the 6458 * behavior of their processors as "undefined" if two or more mappings to the 6459 * same physical page have different memory types. 6460 * 6461 * Returns zero if the change completed successfully, and either EINVAL or 6462 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 6463 * of the virtual address range was not mapped, and ENOMEM is returned if 6464 * there was insufficient memory available to complete the change. In the 6465 * latter case, the memory type may have been changed on some part of the 6466 * virtual address range or the direct map. 6467 */ 6468int 6469pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 6470{ 6471 int error; 6472 6473 PMAP_LOCK(kernel_pmap); 6474 error = pmap_change_attr_locked(va, size, mode); 6475 PMAP_UNLOCK(kernel_pmap); 6476 return (error); 6477} 6478 6479static int 6480pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 6481{ 6482 vm_offset_t base, offset, tmpva; 6483 vm_paddr_t pa_start, pa_end; 6484 pdp_entry_t *pdpe; 6485 pd_entry_t *pde; 6486 pt_entry_t *pte; 6487 int cache_bits_pte, cache_bits_pde, error; 6488 boolean_t changed; 6489 6490 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6491 base = trunc_page(va); 6492 offset = va & PAGE_MASK; 6493 size = round_page(offset + size); 6494 6495 /* 6496 * Only supported on kernel virtual addresses, including the direct 6497 * map but excluding the recursive map. 6498 */ 6499 if (base < DMAP_MIN_ADDRESS) 6500 return (EINVAL); 6501 6502 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 6503 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 6504 changed = FALSE; 6505 6506 /* 6507 * Pages that aren't mapped aren't supported. Also break down 2MB pages 6508 * into 4KB pages if required. 6509 */ 6510 for (tmpva = base; tmpva < base + size; ) { 6511 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6512 if (*pdpe == 0) 6513 return (EINVAL); 6514 if (*pdpe & PG_PS) { 6515 /* 6516 * If the current 1GB page already has the required 6517 * memory type, then we need not demote this page. Just 6518 * increment tmpva to the next 1GB page frame. 
6519 */ 6520 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { 6521 tmpva = trunc_1gpage(tmpva) + NBPDP; 6522 continue; 6523 } 6524 6525 /* 6526 * If the current offset aligns with a 1GB page frame 6527 * and there is at least 1GB left within the range, then 6528 * we need not break down this page into 2MB pages. 6529 */ 6530 if ((tmpva & PDPMASK) == 0 && 6531 tmpva + PDPMASK < base + size) { 6532 tmpva += NBPDP; 6533 continue; 6534 } 6535 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 6536 return (ENOMEM); 6537 } 6538 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6539 if (*pde == 0) 6540 return (EINVAL); 6541 if (*pde & PG_PS) { 6542 /* 6543 * If the current 2MB page already has the required 6544 * memory type, then we need not demote this page. Just 6545 * increment tmpva to the next 2MB page frame. 6546 */ 6547 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { 6548 tmpva = trunc_2mpage(tmpva) + NBPDR; 6549 continue; 6550 } 6551 6552 /* 6553 * If the current offset aligns with a 2MB page frame 6554 * and there is at least 2MB left within the range, then 6555 * we need not break down this page into 4KB pages. 6556 */ 6557 if ((tmpva & PDRMASK) == 0 && 6558 tmpva + PDRMASK < base + size) { 6559 tmpva += NBPDR; 6560 continue; 6561 } 6562 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 6563 return (ENOMEM); 6564 } 6565 pte = pmap_pde_to_pte(pde, tmpva); 6566 if (*pte == 0) 6567 return (EINVAL); 6568 tmpva += PAGE_SIZE; 6569 } 6570 error = 0; 6571 6572 /* 6573 * Ok, all the pages exist, so run through them updating their 6574 * cache mode if required. 6575 */ 6576 pa_start = pa_end = 0; 6577 for (tmpva = base; tmpva < base + size; ) { 6578 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6579 if (*pdpe & PG_PS) { 6580 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { 6581 pmap_pde_attr(pdpe, cache_bits_pde, 6582 X86_PG_PDE_CACHE); 6583 changed = TRUE; 6584 } 6585 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 6586 if (pa_start == pa_end) { 6587 /* Start physical address run. */ 6588 pa_start = *pdpe & PG_PS_FRAME; 6589 pa_end = pa_start + NBPDP; 6590 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 6591 pa_end += NBPDP; 6592 else { 6593 /* Run ended, update direct map. */ 6594 error = pmap_change_attr_locked( 6595 PHYS_TO_DMAP(pa_start), 6596 pa_end - pa_start, mode); 6597 if (error != 0) 6598 break; 6599 /* Start physical address run. */ 6600 pa_start = *pdpe & PG_PS_FRAME; 6601 pa_end = pa_start + NBPDP; 6602 } 6603 } 6604 tmpva = trunc_1gpage(tmpva) + NBPDP; 6605 continue; 6606 } 6607 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6608 if (*pde & PG_PS) { 6609 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { 6610 pmap_pde_attr(pde, cache_bits_pde, 6611 X86_PG_PDE_CACHE); 6612 changed = TRUE; 6613 } 6614 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 6615 if (pa_start == pa_end) { 6616 /* Start physical address run. */ 6617 pa_start = *pde & PG_PS_FRAME; 6618 pa_end = pa_start + NBPDR; 6619 } else if (pa_end == (*pde & PG_PS_FRAME)) 6620 pa_end += NBPDR; 6621 else { 6622 /* Run ended, update direct map. */ 6623 error = pmap_change_attr_locked( 6624 PHYS_TO_DMAP(pa_start), 6625 pa_end - pa_start, mode); 6626 if (error != 0) 6627 break; 6628 /* Start physical address run. 
*/ 6629 pa_start = *pde & PG_PS_FRAME; 6630 pa_end = pa_start + NBPDR; 6631 } 6632 } 6633 tmpva = trunc_2mpage(tmpva) + NBPDR; 6634 } else { 6635 pte = pmap_pde_to_pte(pde, tmpva); 6636 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { 6637 pmap_pte_attr(pte, cache_bits_pte, 6638 X86_PG_PTE_CACHE); 6639 changed = TRUE; 6640 } 6641 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 6642 if (pa_start == pa_end) { 6643 /* Start physical address run. */ 6644 pa_start = *pte & PG_FRAME; 6645 pa_end = pa_start + PAGE_SIZE; 6646 } else if (pa_end == (*pte & PG_FRAME)) 6647 pa_end += PAGE_SIZE; 6648 else { 6649 /* Run ended, update direct map. */ 6650 error = pmap_change_attr_locked( 6651 PHYS_TO_DMAP(pa_start), 6652 pa_end - pa_start, mode); 6653 if (error != 0) 6654 break; 6655 /* Start physical address run. */ 6656 pa_start = *pte & PG_FRAME; 6657 pa_end = pa_start + PAGE_SIZE; 6658 } 6659 } 6660 tmpva += PAGE_SIZE; 6661 } 6662 } 6663 if (error == 0 && pa_start != pa_end) 6664 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 6665 pa_end - pa_start, mode); 6666 6667 /* 6668 * Flush CPU caches if required to make sure any data isn't cached that 6669 * shouldn't be, etc. 6670 */ 6671 if (changed) { 6672 pmap_invalidate_range(kernel_pmap, base, tmpva); 6673 pmap_invalidate_cache_range(base, tmpva, FALSE); 6674 } 6675 return (error); 6676} 6677 6678/* 6679 * Demotes any mapping within the direct map region that covers more than the 6680 * specified range of physical addresses. This range's size must be a power 6681 * of two and its starting address must be a multiple of its size. Since the 6682 * demotion does not change any attributes of the mapping, a TLB invalidation 6683 * is not mandatory. The caller may, however, request a TLB invalidation. 6684 */ 6685void 6686pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 6687{ 6688 pdp_entry_t *pdpe; 6689 pd_entry_t *pde; 6690 vm_offset_t va; 6691 boolean_t changed; 6692 6693 if (len == 0) 6694 return; 6695 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 6696 KASSERT((base & (len - 1)) == 0, 6697 ("pmap_demote_DMAP: base is not a multiple of len")); 6698 if (len < NBPDP && base < dmaplimit) { 6699 va = PHYS_TO_DMAP(base); 6700 changed = FALSE; 6701 PMAP_LOCK(kernel_pmap); 6702 pdpe = pmap_pdpe(kernel_pmap, va); 6703 if ((*pdpe & X86_PG_V) == 0) 6704 panic("pmap_demote_DMAP: invalid PDPE"); 6705 if ((*pdpe & PG_PS) != 0) { 6706 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 6707 panic("pmap_demote_DMAP: PDPE failed"); 6708 changed = TRUE; 6709 } 6710 if (len < NBPDR) { 6711 pde = pmap_pdpe_to_pde(pdpe, va); 6712 if ((*pde & X86_PG_V) == 0) 6713 panic("pmap_demote_DMAP: invalid PDE"); 6714 if ((*pde & PG_PS) != 0) { 6715 if (!pmap_demote_pde(kernel_pmap, pde, va)) 6716 panic("pmap_demote_DMAP: PDE failed"); 6717 changed = TRUE; 6718 } 6719 } 6720 if (changed && invalidate) 6721 pmap_invalidate_page(kernel_pmap, va); 6722 PMAP_UNLOCK(kernel_pmap); 6723 } 6724} 6725 6726/* 6727 * perform the pmap work for mincore 6728 */ 6729int 6730pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 6731{ 6732 pd_entry_t *pdep; 6733 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 6734 vm_paddr_t pa; 6735 int val; 6736 6737 PG_A = pmap_accessed_bit(pmap); 6738 PG_M = pmap_modified_bit(pmap); 6739 PG_V = pmap_valid_bit(pmap); 6740 PG_RW = pmap_rw_bit(pmap); 6741 6742 PMAP_LOCK(pmap); 6743retry: 6744 pdep = pmap_pde(pmap, addr); 6745 if (pdep != NULL && (*pdep & PG_V)) { 6746 if (*pdep & PG_PS) { 6747 pte = *pdep; 6748 /* Compute 
the physical address of the 4KB page. */ 6749 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 6750 PG_FRAME; 6751 val = MINCORE_SUPER; 6752 } else { 6753 pte = *pmap_pde_to_pte(pdep, addr); 6754 pa = pte & PG_FRAME; 6755 val = 0; 6756 } 6757 } else { 6758 pte = 0; 6759 pa = 0; 6760 val = 0; 6761 } 6762 if ((pte & PG_V) != 0) { 6763 val |= MINCORE_INCORE; 6764 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6765 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6766 if ((pte & PG_A) != 0) 6767 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6768 } 6769 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6770 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 6771 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 6772 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 6773 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 6774 goto retry; 6775 } else 6776 PA_UNLOCK_COND(*locked_pa); 6777 PMAP_UNLOCK(pmap); 6778 return (val); 6779} 6780 6781void 6782pmap_activate(struct thread *td) 6783{ 6784 pmap_t pmap, oldpmap; 6785 u_int cpuid; 6786 6787 critical_enter(); 6788 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6789 oldpmap = PCPU_GET(curpmap); 6790 cpuid = PCPU_GET(cpuid); 6791#ifdef SMP 6792 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6793 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6794 CPU_SET_ATOMIC(cpuid, &pmap->pm_save); 6795#else 6796 CPU_CLR(cpuid, &oldpmap->pm_active); 6797 CPU_SET(cpuid, &pmap->pm_active); 6798 CPU_SET(cpuid, &pmap->pm_save); 6799#endif 6800 td->td_pcb->pcb_cr3 = pmap->pm_cr3; 6801 load_cr3(pmap->pm_cr3); 6802 PCPU_SET(curpmap, pmap); 6803 critical_exit(); 6804} 6805 6806void 6807pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 6808{ 6809} 6810 6811/* 6812 * Increase the starting virtual address of the given mapping if a 6813 * different alignment might result in more superpage mappings. 
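 *
 *	[Editorial worked example, with made-up numbers and ignoring object
 *	coloring] For a 4MB mapping of an object at offset 0x1ff000,
 *	superpage_offset below is 0x1ff000 & PDRMASK = 0x1ff000.  A
 *	proposed *addr of 0x800400000 (2MB aligned, so offset 0 within its
 *	superpage) is advanced to 0x800400000 + 0x1ff000 = 0x8005ff000, so
 *	that virtual address 0x800600000 lines up with object offset
 *	0x200000 and the interior of the mapping can be promoted to 2MB
 *	pages.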
6814 */ 6815void 6816pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6817 vm_offset_t *addr, vm_size_t size) 6818{ 6819 vm_offset_t superpage_offset; 6820 6821 if (size < NBPDR) 6822 return; 6823 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6824 offset += ptoa(object->pg_color); 6825 superpage_offset = offset & PDRMASK; 6826 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 6827 (*addr & PDRMASK) == superpage_offset) 6828 return; 6829 if ((*addr & PDRMASK) < superpage_offset) 6830 *addr = (*addr & ~PDRMASK) + superpage_offset; 6831 else 6832 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 6833} 6834 6835#ifdef INVARIANTS 6836static unsigned long num_dirty_emulations; 6837SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 6838 &num_dirty_emulations, 0, NULL); 6839 6840static unsigned long num_accessed_emulations; 6841SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 6842 &num_accessed_emulations, 0, NULL); 6843 6844static unsigned long num_superpage_accessed_emulations; 6845SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 6846 &num_superpage_accessed_emulations, 0, NULL); 6847 6848static unsigned long ad_emulation_superpage_promotions; 6849SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 6850 &ad_emulation_superpage_promotions, 0, NULL); 6851#endif /* INVARIANTS */ 6852 6853int 6854pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 6855{ 6856 int rv; 6857 struct rwlock *lock; 6858 vm_page_t m, mpte; 6859 pd_entry_t *pde; 6860 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 6861 boolean_t pv_lists_locked; 6862 6863 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 6864 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 6865 6866 if (!pmap_emulate_ad_bits(pmap)) 6867 return (-1); 6868 6869 PG_A = pmap_accessed_bit(pmap); 6870 PG_M = pmap_modified_bit(pmap); 6871 PG_V = pmap_valid_bit(pmap); 6872 PG_RW = pmap_rw_bit(pmap); 6873 6874 rv = -1; 6875 lock = NULL; 6876 pv_lists_locked = FALSE; 6877retry: 6878 PMAP_LOCK(pmap); 6879 6880 pde = pmap_pde(pmap, va); 6881 if (pde == NULL || (*pde & PG_V) == 0) 6882 goto done; 6883 6884 if ((*pde & PG_PS) != 0) { 6885 if (ftype == VM_PROT_READ) { 6886#ifdef INVARIANTS 6887 atomic_add_long(&num_superpage_accessed_emulations, 1); 6888#endif 6889 *pde |= PG_A; 6890 rv = 0; 6891 } 6892 goto done; 6893 } 6894 6895 pte = pmap_pde_to_pte(pde, va); 6896 if ((*pte & PG_V) == 0) 6897 goto done; 6898 6899 if (ftype == VM_PROT_WRITE) { 6900 if ((*pte & PG_RW) == 0) 6901 goto done; 6902 /* 6903 * Set the modified and accessed bits simultaneously. 6904 * 6905 * Intel EPT PTEs that do software emulation of A/D bits map 6906 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 6907 * An EPT misconfiguration is triggered if the PTE is writable 6908 * but not readable (WR=10). This is avoided by setting PG_A 6909 * and PG_M simultaneously. 
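 *
 *	[Editorial note expanding on the above] Under A/D emulation PG_A is
 *	EPT_PG_READ and PG_M is EPT_PG_WRITE, so updating the bits one at a
 *	time, e.g.
 *
 *		*pte |= PG_M;	(entry is momentarily WR = 10)
 *		*pte |= PG_A;
 *
 *	would briefly expose exactly the writable-but-not-readable state
 *	that triggers an EPT misconfiguration; the single combined store
 *	below never produces that intermediate state.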
6910 */ 6911 *pte |= PG_M | PG_A; 6912 } else { 6913 *pte |= PG_A; 6914 } 6915 6916 /* try to promote the mapping */ 6917 if (va < VM_MAXUSER_ADDRESS) 6918 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6919 else 6920 mpte = NULL; 6921 6922 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6923 6924 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 6925 pmap_ps_enabled(pmap) && 6926 (m->flags & PG_FICTITIOUS) == 0 && 6927 vm_reserv_level_iffullpop(m) == 0) { 6928 if (!pv_lists_locked) { 6929 pv_lists_locked = TRUE; 6930 if (!rw_try_rlock(&pvh_global_lock)) { 6931 PMAP_UNLOCK(pmap); 6932 rw_rlock(&pvh_global_lock); 6933 goto retry; 6934 } 6935 } 6936 pmap_promote_pde(pmap, pde, va, &lock); 6937#ifdef INVARIANTS 6938 atomic_add_long(&ad_emulation_superpage_promotions, 1); 6939#endif 6940 } 6941#ifdef INVARIANTS 6942 if (ftype == VM_PROT_WRITE) 6943 atomic_add_long(&num_dirty_emulations, 1); 6944 else 6945 atomic_add_long(&num_accessed_emulations, 1); 6946#endif 6947 rv = 0; /* success */ 6948done: 6949 if (lock != NULL) 6950 rw_wunlock(lock); 6951 if (pv_lists_locked) 6952 rw_runlock(&pvh_global_lock); 6953 PMAP_UNLOCK(pmap); 6954 return (rv); 6955} 6956 6957void 6958pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 6959{ 6960 pml4_entry_t *pml4; 6961 pdp_entry_t *pdp; 6962 pd_entry_t *pde; 6963 pt_entry_t *pte, PG_V; 6964 int idx; 6965 6966 idx = 0; 6967 PG_V = pmap_valid_bit(pmap); 6968 PMAP_LOCK(pmap); 6969 6970 pml4 = pmap_pml4e(pmap, va); 6971 ptr[idx++] = *pml4; 6972 if ((*pml4 & PG_V) == 0) 6973 goto done; 6974 6975 pdp = pmap_pml4e_to_pdpe(pml4, va); 6976 ptr[idx++] = *pdp; 6977 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 6978 goto done; 6979 6980 pde = pmap_pdpe_to_pde(pdp, va); 6981 ptr[idx++] = *pde; 6982 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 6983 goto done; 6984 6985 pte = pmap_pde_to_pte(pde, va); 6986 ptr[idx++] = *pte; 6987 6988done: 6989 PMAP_UNLOCK(pmap); 6990 *num = idx; 6991} 6992 6993#include "opt_ddb.h" 6994#ifdef DDB 6995#include <ddb/ddb.h> 6996 6997DB_SHOW_COMMAND(pte, pmap_print_pte) 6998{ 6999 pmap_t pmap; 7000 pml4_entry_t *pml4; 7001 pdp_entry_t *pdp; 7002 pd_entry_t *pde; 7003 pt_entry_t *pte, PG_V; 7004 vm_offset_t va; 7005 7006 if (have_addr) { 7007 va = (vm_offset_t)addr; 7008 pmap = PCPU_GET(curpmap); /* XXX */ 7009 } else { 7010 db_printf("show pte addr\n"); 7011 return; 7012 } 7013 PG_V = pmap_valid_bit(pmap); 7014 pml4 = pmap_pml4e(pmap, va); 7015 db_printf("VA %#016lx pml4e %#016lx", va, *pml4); 7016 if ((*pml4 & PG_V) == 0) { 7017 db_printf("\n"); 7018 return; 7019 } 7020 pdp = pmap_pml4e_to_pdpe(pml4, va); 7021 db_printf(" pdpe %#016lx", *pdp); 7022 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 7023 db_printf("\n"); 7024 return; 7025 } 7026 pde = pmap_pdpe_to_pde(pdp, va); 7027 db_printf(" pde %#016lx", *pde); 7028 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 7029 db_printf("\n"); 7030 return; 7031 } 7032 pte = pmap_pde_to_pte(pde, va); 7033 db_printf(" pte %#016lx\n", *pte); 7034} 7035 7036DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 7037{ 7038 vm_paddr_t a; 7039 7040 if (have_addr) { 7041 a = (vm_paddr_t)addr; 7042 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 7043 } else { 7044 db_printf("show phys2dmap addr\n"); 7045 } 7046} 7047#endif 7048
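/*
 * [Editorial usage note, not part of the original source] The DDB commands
 * above can be exercised from the kernel debugger roughly as follows; the
 * addresses are invented and the table-entry values elided, but the output
 * format follows the db_printf() calls in pmap_print_pte() and
 * pmap_phys2dmap():
 *
 *	db> show pte 0x7fffffffd000
 *	VA 0x... pml4e 0x... pdpe 0x... pde 0x... pte 0x...
 *	db> show phys2dmap 0x100000
 *	0xfffff80000100000
 *
 * (The second value assumes the conventional amd64 direct-map base of
 * 0xfffff80000000000.)
 */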