/* pmap.c — FreeBSD stable/10, revision 270439 */
/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 270439 2014-08-24 07:53:15Z kib $");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

/*
 * Returns TRUE if the pmap has the PMAP_EMULATE_AD_BITS flag set, i.e.
 * the accessed and modified bits are emulated in software for this pmap.
 */
static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

/*
 * Return the PTE bit that marks a mapping as valid for this pmap type.
 * For EPT pmaps that emulate A/D bits, the emulated valid bit is used
 * instead of EPT_PG_READ.
 */
static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Return the PTE bit that grants write access for this pmap type.
 */
static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Return the global-mapping PTE bit for this pmap type.  EPT page table
 * entries have no global bit, so zero is returned for PT_EPT.
 */
static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_G;
		break;
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Return the PTE "accessed" bit for this pmap type.  When A/D bits are
 * emulated on EPT, the read permission bit stands in for "accessed".
 */
static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Return the PTE "modified" (dirty) bit for this pmap type.  When A/D bits
 * are emulated on EPT, the write permission bit stands in for "modified".
 */
static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

/* pv list lock hashing: one 2MB superpage frame per index */
#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

/*
 * Switch the held pv list lock (if any) to the one covering physical
 * address "pa"; a no-op when the correct lock is already held.
 */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */

static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
static int		ndmpdpphys;	/* number of DMPDPphys pages */

static struct rwlock_padalign pvh_global_lock;

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;

static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */

static struct unrhdr pcid_unr;
static struct mtx pcid_mtx;
int pmap_pcid_enabled = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
    0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");

/*
 * Sum the per-CPU saved-TLB-context counters for the sysctl report.
 */
static int
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
{
	int i;
	uint64_t res;

	res = 0;
	CPU_FOREACH(i) {
		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
	}
	return (sysctl_handle_64(oidp, &res, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
    "Count of saved TLB context on switch");

/* pmap_copy_pages() over non-DMAP */
static struct mtx cpage_lock;
static vm_offset_t cpage_a;
static vm_offset_t cpage_b;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_elem(uint64_t elem);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
                struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return (newaddr);
}

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}


/* Return various clipped indexes for a given VA */
static __inline vm_pindex_t
pmap_pte_index(vm_offset_t va)
{

	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pde_index(vm_offset_t va)
{

	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{

	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{

	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

/*
 * Bump the pmap's resident page count; the pmap lock must be held.
 */
static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

/*
 * Drop the pmap's resident page count; the pmap lock must be held and
 * the count must not underflow.
 */
static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

/* Return the recursive-mapping (PTmap) PTE address for a kernel VA. */
PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

/* Return the recursive-mapping (PDmap) PDE address for a kernel VA. */
static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

/*
 * Carve "n" zeroed pages out of the bootstrap region at *firstaddr and
 * return the physical address of the first one.
 */
static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	u_int64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

CTASSERT(powerof2(NDMPML4E));

/* number of kernel PDP slots */
#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)

/*
 * Compute nkpt, the number of bootstrap kernel page table pages, from the
 * amount of physical memory mapped at boot ("addr").
 */
static void
nkpt_init(vm_paddr_t addr)
{
	int pt_pages;

#ifdef NKPT
	pt_pages = NKPT;
#else
	pt_pages = howmany(addr, 1 << PDRSHIFT);
	pt_pages += NKPDPE(pt_pages);

	/*
	 * Add some slop beyond the bare minimum required for bootstrapping
	 * the kernel.
	 *
	 * This is quite important when allocating KVA for kernel modules.
	 * The modules are required to be linked in the negative 2GB of
	 * the address space.  If we run out of KVA in this region then
	 * pmap_growkernel() will need to allocate page table pages to map
	 * the entire 512GB of KVA space which is an unnecessary tax on
	 * physical memory.
	 */
	pt_pages += 8;		/* 16MB additional slop for kernel modules */
#endif
	nkpt = pt_pages;
}

static void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i, j, ndm1g, nkpdpe;
	pt_entry_t *pt_p;
	pd_entry_t *pd_p;
	pdp_entry_t *pdp_p;
	pml4_entry_t *p4_p;

	/* Allocate page table pages for the direct map */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
	if (ndmpdpphys > NDMPML4E) {
		/*
		 * Each NDMPML4E allows 512 GB, so limit to that,
		 * and then readjust ndmpdp and ndmpdpphys.
		 */
		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
		Maxmem = atop(NDMPML4E * NBPML4);
		ndmpdpphys = NDMPML4E;
		ndmpdp = NDMPML4E * NPDEPG;
	}
	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
	ndm1g = 0;
	if ((amd_feature & AMDID_PAGE1GB) != 0)
		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
	if (ndm1g < ndmpdp)
		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Allocate pages */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);

	/*
	 * Allocate the initial number of kernel page table pages required to
	 * bootstrap.  We defer this until after all memory-size dependent
	 * allocations are done (e.g. direct map), so that we don't have to
	 * build in too much slop in our estimate.
	 *
	 * Note that when NKPML4E > 1, we have an empty page underneath
	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
	 */
	nkpt_init(*firstaddr);
	nkpdpe = NKPDPE(nkpt);

	KPTphys = allocpages(firstaddr, nkpt);
	KPDphys = allocpages(firstaddr, nkpdpe);

	/* Fill in the underlying page table pages */
	/* Nominally read-only (but really R/W) from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	pt_p = (pt_entry_t *)KPTphys;
	for (i = 0; ptoa(i) < *firstaddr; i++)
		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;

	/* Now map the page tables at their location within PTmap */
	pd_p = (pd_entry_t *)KPDphys;
	for (i = 0; i < nkpt; i++)
		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
		    X86_PG_G;

	/* And connect up the PD to the PDP (leaving room for L4 pages) */
	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
	for (i = 0; i < nkpdpe; i++)
		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
		    PG_U;

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	pd_p = (pd_entry_t *)DMPDphys;
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	pdp_p = (pdp_entry_t *)DMPDPphys;
	for (i = 0; i < ndm1g; i++) {
		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	for (j = 0; i < ndmpdp; i++, j++) {
		pdp_p[i] = DMPDphys + ptoa(j);
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	p4_p = (pml4_entry_t *)KPML4phys;
	p4_p[PML4PML4I] = KPML4phys;
	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;

	/* Connect the Direct Map slot(s) up to the PML4. */
	for (i = 0; i < ndmpdpphys; i++) {
		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* Connect the KVA slots up to the PML4 */
	for (i = 0; i < NKPML4E; i++) {
		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On amd64 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
	virtual_avail = pmap_kmem_choose(virtual_avail);

	virtual_end = VM_MAX_KERNEL_ADDRESS;


	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(KPML4phys);
	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
		load_cr4(rcr4() | CR4_SMEP);

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
	kernel_pmap->pm_cr3 = KPML4phys;
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	CPU_FILL(&kernel_pmap->pm_save);	/* always superset of pm_active */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	kernel_pmap->pm_flags = pmap_flags;

	/*
	 * Initialize the global pv list lock.
	 */
	rw_init(&pvh_global_lock, "pmap pv global");

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * Crashdump maps.  The first page is reused as CMAP1 for the
	 * memory test.
	 */
	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
	CADDR1 = crashdumpmap;

	virtual_avail = va;

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* Initialize TLB Context Id. */
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		load_cr4(rcr4() | CR4_PCIDE);
		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
		/* Check for INVPCID support */
		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
		    != 0;
		kernel_pmap->pm_pcid = 0;
#ifndef SMP
		pmap_pcid_enabled = 0;
#endif
	} else
		pmap_pcid_enabled = 0;
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0)
		panic("no PAT??");

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

/*
 *	Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_page_t mpte;
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < nkpt; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	for (i = 0; phys_avail[i + 1]; i += 2);
	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
	cpage_a = kva_alloc(PAGE_SIZE);
	cpage_b = kva_alloc(PAGE_SIZE);
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
    "1GB page mapping counters");

static u_long pmap_pdpe_demotions;
SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, 0, "1GB page demotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Relocate the PAT selector bit in 'entry' between its PTE position
 * (X86_PG_PTE_PAT) and its PDE position (X86_PG_PDE_PAT); the bit lives
 * at different positions in the two entry formats (see
 * pmap_cache_bits()).  EPT entries encode memory attributes the same
 * way for all page sizes, so they are returned unchanged.
 */
static pt_entry_t
pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
{
	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;

	switch (pmap->pm_type) {
	case PT_X86:
		/* Verify that both PAT bits are not set at the same time */
		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
		    ("Invalid PAT bits in entry %#lx", entry));

		/* Swap the PAT bits if one of them is set */
		if ((entry & x86_pat_bits) != 0)
			entry ^= x86_pat_bits;
		break;
	case PT_EPT:
		/*
		 * Nothing to do - the memory attributes are represented
		 * the same way for regular pages and superpages.
		 */
		break;
	default:
		/* NOTE(review): panic string does not match function name. */
		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
	}

	return (entry);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
static int
pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	switch (pmap->pm_type) {
	case PT_X86:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

		/* Map the caching mode to a PAT index. */
		pat_idx = pat_index[mode];

		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
		cache_bits = 0;
		if (pat_idx & 0x4)
			cache_bits |= pat_flag;
		if (pat_idx & 0x2)
			cache_bits |= PG_NC_PCD;
		if (pat_idx & 0x1)
			cache_bits |= PG_NC_PWT;
		break;

	case PT_EPT:
		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
		break;

	default:
		panic("unsupported pmap type %d", pmap->pm_type);
	}

	return (cache_bits);
}

/*
 * Return the mask covering every cache-attribute bit in a PTE or PDE of
 * the given pmap type; used to isolate or clear the caching bits.
 */
static int
pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
{
	int mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
		break;
	case PT_EPT:
		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
		break;
	default:
		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Are superpage (2MB) mappings enabled, both globally and for this pmap?
 */
static __inline boolean_t
pmap_ps_enabled(pmap_t pmap)
{

	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
}

/*
 * Store a new PDE value.  For EPT pmaps, also bump the EPT generation
 * counter so cached guest mappings get invalidated (see below).
 */
static void
pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
{

	switch (pmap->pm_type) {
	case PT_X86:
		break;
	case PT_EPT:
		/*
		 * XXX
		 * This is a little bogus since the generation number is
		 * supposed to be bumped up when a region of the address
		 * space is invalidated in the page tables.
		 *
		 * In this case the old PDE entry is valid but yet we want
		 * to make sure that any mappings using the old entry are
		 * invalidated in the TLB.
		 *
		 * The reason this works as expected is because we rendezvous
		 * "all" host cpus and force any vcpu context to exit as a
		 * side-effect.
		 */
		atomic_add_acq_long(&pmap->pm_eptgen, 1);
		break;
	default:
		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
	}
	pde_store(pde, newpde);
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
	pt_entry_t PG_G;

	if (pmap->pm_type == PT_EPT)
		return;

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));

	PG_G = pmap_global_bit(pmap);

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		invltlb_globpcid();
	}
}
#ifdef SMP

/*
 * Invalidate a single page mapping for the given pmap's PCID on the
 * local CPU.  Uses INVPCID when available; otherwise briefly switches
 * %cr3 to the pmap's value (with CR3_PCID_SAVE set so other PCIDs'
 * entries are preserved) and issues invlpg there.
 */
static void
pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
{
	struct invpcid_descr d;
	uint64_t cr3;

	if (invpcid_works) {
		d.pcid = pmap->pm_pcid;
		d.pad = 0;
		d.addr = va;
		invpcid(&d, INVPCID_ADDR);
		return;
	}

	cr3 = rcr3();
	critical_enter();
	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
	invlpg(va);
	load_cr3(cr3 | CR3_PCID_SAVE);
	critical_exit();
}

/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed.  (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions stills reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */

/*
 * Interrupt the cpus that are executing in the guest context.
 * This will force the vcpu to exit and the cached EPT mappings
 * will be invalidated by the host before the next vmresume.
 */
static __inline void
pmap_invalidate_ept(pmap_t pmap)
{
	int ipinum;

	sched_pin();
	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("pmap_invalidate_ept: absurd pm_active"));

	/*
	 * The TLB mappings associated with a vcpu context are not
	 * flushed each time a different vcpu is chosen to execute.
	 *
	 * This is in contrast with a process's vtop mappings that
	 * are flushed from the TLB on each context switch.
	 *
	 * Therefore we need to do more than just a TLB shootdown on
	 * the active cpus in 'pmap->pm_active'.  To do this we keep
	 * track of the number of invalidations performed on this pmap.
	 *
	 * Each vcpu keeps a cache of this counter and compares it
	 * just before a vmresume.  If the counter is out-of-date an
	 * invept will be done to flush stale mappings from the TLB.
	 */
	atomic_add_acq_long(&pmap->pm_eptgen, 1);

	/*
	 * Force the vcpu to exit and trap back into the hypervisor.
	 */
	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
	ipi_selected(pmap->pm_active, ipinum);
	sched_unpin();
}

/*
 * Invalidate a single page mapping on every processor on which the
 * pmap may be active.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t other_cpus;
	u_int cpuid;

	if (pmap->pm_type == PT_EPT) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		/* Active everywhere: flush locally, then IPI all CPUs. */
		if (!pmap_pcid_enabled) {
			invlpg(va);
		} else {
			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
				if (pmap == PCPU_GET(curpmap))
					invlpg(va);
				else
					pmap_invalidate_page_pcid(pmap, va);
			} else {
				invltlb_globpcid();
			}
		}
		smp_invlpg(pmap, va);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invlpg(va);
		else if (pmap_pcid_enabled) {
			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
				pmap_invalidate_page_pcid(pmap, va);
			else
				invltlb_globpcid();
		}
		/*
		 * With PCID, stale entries may survive on any CPU that
		 * ever loaded the pmap (pm_save), not just where it is
		 * currently active.
		 */
		if (pmap_pcid_enabled)
			CPU_AND(&other_cpus, &pmap->pm_save);
		else
			CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg(other_cpus, pmap, va);
	}
	sched_unpin();
}

/*
 * Page-by-page local invalidation of [sva, eva) for the pmap's PCID;
 * the range counterpart of pmap_invalidate_page_pcid().
 */
static void
pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct invpcid_descr d;
	uint64_t cr3;
	vm_offset_t addr;

	if (invpcid_works) {
		d.pcid = pmap->pm_pcid;
		d.pad = 0;
		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
			d.addr = addr;
			invpcid(&d, INVPCID_ADDR);
		}
		return;
	}

	cr3 = rcr3();
	critical_enter();
	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
	for (addr = sva; addr < eva; addr += PAGE_SIZE)
		invlpg(addr);
	load_cr3(cr3 | CR3_PCID_SAVE);
	critical_exit();
}

/*
 * Invalidate the range [sva, eva) on every processor on which the pmap
 * may be active.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t other_cpus;
	vm_offset_t addr;
	u_int cpuid;

	if (pmap->pm_type == PT_EPT) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		if (!pmap_pcid_enabled) {
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		} else {
			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
				if (pmap == PCPU_GET(curpmap)) {
					for (addr = sva; addr < eva;
					    addr += PAGE_SIZE)
						invlpg(addr);
				} else {
					pmap_invalidate_range_pcid(pmap,
					    sva, eva);
				}
			} else {
				invltlb_globpcid();
			}
		}
		smp_invlpg_range(pmap, sva, eva);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		} else if (pmap_pcid_enabled) {
			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
				pmap_invalidate_range_pcid(pmap, sva, eva);
			else
				invltlb_globpcid();
		}
		if (pmap_pcid_enabled)
			CPU_AND(&other_cpus, &pmap->pm_save);
		else
			CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
	}
	sched_unpin();
}

/*
 * Invalidate all of the pmap's TLB entries on every processor on which
 * it may be active.
 */
void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t other_cpus;
	struct invpcid_descr d;
	uint64_t cr3;
	u_int cpuid;

	if (pmap->pm_type == PT_EPT) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	if (pmap == kernel_pmap ||
	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		/* Global flush locally, then IPI every other CPU. */
		if (invpcid_works) {
			bzero(&d, sizeof(d));
			invpcid(&d, INVPCID_CTXGLOB);
		} else {
			invltlb_globpcid();
		}
		if (!CPU_ISSET(cpuid, &pmap->pm_active))
			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
		smp_invltlb(pmap);
	} else {
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);

		/*
		 * This logic is duplicated in the Xinvltlb shootdown
		 * IPI handler.
		 */
		if (pmap_pcid_enabled) {
			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
				if (invpcid_works) {
					d.pcid = pmap->pm_pcid;
					d.pad = 0;
					d.addr = 0;
					invpcid(&d, INVPCID_CTX);
				} else {
					cr3 = rcr3();
					critical_enter();

					/*
					 * Bit 63 is clear, pcid TLB
					 * entries are invalidated.
					 */
					load_cr3(pmap->pm_cr3);
					load_cr3(cr3 | CR3_PCID_SAVE);
					critical_exit();
				}
			} else {
				invltlb_globpcid();
			}
		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
			invltlb();
		if (!CPU_ISSET(cpuid, &pmap->pm_active))
			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
		if (pmap_pcid_enabled)
			CPU_AND(&other_cpus, &pmap->pm_save);
		else
			CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invltlb(other_cpus, pmap);
	}
	sched_unpin();
}

/*
 * Write back and invalidate all caches, locally and on every other CPU.
 */
void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

/*
 * Argument block passed through the pmap_update_pde() rendezvous.
 */
struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	pmap_t pmap;
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

/*
 * Rendezvous action: only the designated CPU stores the new PDE.
 */
static void
pmap_update_pde_action(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
}

/*
 * Rendezvous teardown: every CPU in 'invalidate' flushes its own TLB.
 */
static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
		active = all_cpus;
	else {
		active = pmap->pm_active;
		CPU_AND_ATOMIC(&pmap->pm_save, &active);
	}
	if (CPU_OVERLAP(&active, &other_cpus)) {
		/*
		 * Another CPU may have the pmap active: rendezvous so the
		 * store and the TLB flushes happen with all of them held.
		 */
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pmap = pmap;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendevous_barrier, pmap_update_pde_action,
		    pmap_update_pde_teardown, &act);
	} else {
		/* Only this CPU is affected: update and flush directly. */
		pmap_update_pde_store(pmap, pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(pmap, va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	switch (pmap->pm_type) {
	case PT_X86:
		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
			invlpg(va);
		break;
	case PT_EPT:
		/* No local TLB entry to flush; bump the EPT generation. */
		pmap->pm_eptgen++;
		break;
	default:
		panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
	}
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	switch (pmap->pm_type) {
	case PT_X86:
		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		break;
	case PT_EPT:
		pmap->pm_eptgen++;
		break;
	default:
		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
	}
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	switch (pmap->pm_type) {
	case PT_X86:
		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
			invltlb();
		break;
	case PT_EPT:
		pmap->pm_eptgen++;
		break;
	default:
		panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
	}
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	pmap_update_pde_store(pmap, pde, newpde);
	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		pmap_update_pde_invalidate(pmap, va, newpde);
	else
		CPU_ZERO(&pmap->pm_save);
}
#endif /* !SMP */

/* Above this size, a full cache flush beats per-line CLFLUSH. */
#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{

	KASSERT((sva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: sva not page-aligned"));
	KASSERT((eva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: eva not page-aligned"));

	if (cpu_feature & CPUID_SS)
		; /* If "Self Snoop" is supported, do nothing. */
	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {

		/*
		 * XXX: Some CPUs fault, hang, or trash the local APIC
		 * registers if we use CLFLUSH on the local APIC
		 * range.  The local APIC is always uncached, so we
		 * don't need to flush for that range anyway.
		 */
		if (pmap_kextract(sva) == lapic_paddr)
			return;

		/*
		 * Otherwise, do per-cache line flush.  Use the mfence
		 * instruction to insure that previous stores are
		 * included in the write-back.  The processor
		 * propagates flush to other processors in the cache
		 * coherence domain.
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
	} else {

		/*
		 * No targeted cache flush methods are supported by CPU,
		 * or the supplied range is bigger than 2MB.
		 * Globally invalidate cache.
		 */
		pmap_invalidate_cache();
	}
}

/*
 * Remove the specified set of pages from the data and instruction caches.
 *
 * In contrast to pmap_invalidate_cache_range(), this function does not
 * rely on the CPU's self-snoop feature, because it is intended for use
 * when moving pages into a different cache domain.
 */
void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
	vm_offset_t daddr, eva;
	int i;

	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
	    (cpu_feature & CPUID_CLFSH) == 0)
		pmap_invalidate_cache();
	else {
		/* Flush each page line-by-line via its direct-map address. */
		mfence();
		for (i = 0; i < count; i++) {
			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
			eva = daddr + PAGE_SIZE;
			for (; daddr < eva; daddr += cpu_clflush_line_size)
				clflush(daddr);
		}
		mfence();
	}
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 *		Returns 0 if no valid mapping exists.  Handles 1GB
 *		(PDPE), 2MB (PDE), and 4KB (PTE) mappings.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	vm_paddr_t pa;

	pa = 0;
	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
		if ((*pdpe & PG_PS) != 0)
			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
		else {
			pde = pmap_pdpe_to_pde(pdpe, va);
			if ((*pde & PG_V) != 0) {
				if ((*pde & PG_PS) != 0) {
					pa = (*pde & PG_PS_FRAME) |
					    (va & PDRMASK);
				} else {
					pte = pmap_pde_to_pte(pde, va);
					pa = (*pte & PG_FRAME) |
					    (va & PAGE_MASK);
				}
			}
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 *		Returns NULL if there is no such mapping.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde, *pdep;
	pt_entry_t pte, PG_RW, PG_V;
	vm_paddr_t pa;
	vm_page_t m;

	pa = 0;
	m = NULL;
	PG_RW = pmap_rw_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK(pmap);
retry:
	pdep = pmap_pde(pmap, va);
	if (pdep != NULL && (pde = *pdep)) {
		if (pde & PG_PS) {
			/* 2MB mapping: writable or read access suffices. */
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
				if (vm_page_pa_tryrelock(pmap, (pde &
				    PG_PS_FRAME) | (va & PDRMASK), &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
				    (va & PDRMASK));
				vm_page_hold(m);
			}
		} else {
			pte = *pmap_pde_to_pte(pdep, va);
			if ((pte & PG_V) &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
				    &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
				vm_page_hold(m);
			}
		}
	}
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}

/*
 * Extract the physical address backing a kernel virtual address.
 * Direct-map addresses are translated arithmetically; other kernel
 * addresses are looked up in the kernel page tables.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t pde;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		pde = *vtopde(va);
		if (pde & PG_PS) {
			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = *pmap_pde_to_pte(&pde, va);
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
		}
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
}

/*
 * As pmap_kenter(), but also applies the caching-mode bits derived from
 * 'mode'.  Note: not SMP coherent.
 */
static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
	pt_entry_t *pte;
	int cache_bits;

	pte = vtopte(va);
	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}

/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	The value passed in '*virt' is a suggested virtual address for
 *	the mapping. Architectures which can support a direct-mapped
 *	physical to virtual region can return the appropriate address
 *	within that region, leaving '*virt' unchanged. Other
 *	architectures should map the pages starting at '*virt' and
 *	update '*virt' with the first usable address after the mapped
 *	region.
 *
 *	amd64 has a direct map of all physical memory, so '*virt' and
 *	'prot' are ignored and the direct-map address is returned.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	return PHYS_TO_DMAP(start);
}


/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *endpte, oldpte, pa, *pte;
	vm_page_t m;
	int cache_bits;

	/*
	 * 'oldpte' accumulates the old PTE contents; the ranged TLB
	 * shootdown at the end is only needed if some replaced entry
	 * was valid.
	 */
	oldpte = 0;
	pte = vtopte(sva);
	endpte = pte + count;
	while (pte < endpte) {
		m = *ma++;
		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
			oldpte |= *pte;
			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
		}
		pte++;
	}
	if (__predict_false((oldpte & X86_PG_V) != 0))
		pmap_invalidate_range(kernel_pmap, sva, sva + count *
		    PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Release every page on 'free' back to the physical memory allocator.
 */
static __inline void
pmap_free_zero_pages(struct spglist *free)
{
	vm_page_t m;

	while ((m = SLIST_FIRST(free)) != NULL) {
		SLIST_REMOVE_HEAD(free, plinks.s.ss);
		/* Preserve the page's PG_ZERO setting. */
		vm_page_free_toq(m);
	}
}

/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_insert(&pmap->pm_root, mpte));
}

/*
 * Looks for a page table page mapping the specified virtual address in the
 * specified pmap's collection of idle page table pages.  Returns NULL if there
 * is no page table page corresponding to the specified virtual address.
 */
static __inline vm_page_t
pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
}

/*
 * Removes the specified page table page from the specified pmap's collection
 * of idle page table pages.  The specified page table page must be a member of
 * the pmap's collection.
 */
static __inline void
pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	vm_radix_remove(&pmap->pm_root, mpte->pindex);
}

/*
 * Decrements a page table page's wire count, which is used to record the
 * number of valid page table entries within the page.  If the wire count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	--m->wire_count;
	if (m->wire_count == 0) {
		_pmap_unwire_ptp(pmap, va, m, free);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Unmap a page table page whose wire count has reached zero, and
 * recursively unwire the next-higher-level page that referenced it.
 * The page's pindex encodes its level: PTE pages are below NUPDE, PD
 * pages below NUPDE + NUPDPE, and PDP pages above that.
 */
static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUPDE + NUPDPE)) {
		/* PDP page */
		pml4_entry_t *pml4;
		pml4 = pmap_pml4e(pmap, va);
		*pml4 = 0;
	} else if (m->pindex >= NUPDE) {
		/* PD page */
		pdp_entry_t *pdp;
		pdp = pmap_pdpe(pmap, va);
		*pdp = 0;
	} else {
		/* PTE page */
		pd_entry_t *pd;
		pd = pmap_pde(pmap, va);
		*pd = 0;
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUPDE) {
		/* We just released a PT, unhold the matching PD */
		vm_page_t pdpg;

		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	}
	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
		/* We just released a PD, unhold the matching PDP */
		vm_page_t pdppg;

		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
		pmap_unwire_ptp(pmap, va, pdppg, free);
	}

	/*
	 * This is a release store so that the ordinary store unmapping
	 * the page table page is globally performed before TLB shoot-
	 * down is begun.
	 */
	atomic_subtract_rel_int(&cnt.v_wire_count, 1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	/* Kernel mappings do not use per-pmap page table page wiring. */
	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
	return (pmap_unwire_ptp(pmap, va, mpte, free));
}

/*
 * Initialize the statically allocated kernel pmap ("pmap0") using the
 * bootstrap-time page tables rooted at KPML4phys.
 */
void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
	pmap->pm_cr3 = KPML4phys;
	pmap->pm_root.rt_root = 0;
	CPU_ZERO(&pmap->pm_active);
	CPU_ZERO(&pmap->pm_save);
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
	pmap->pm_flags = pmap_flags;
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
int
pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
	vm_page_t pml4pg;
	vm_paddr_t pml4phys;
	int i;

	/*
	 * allocate the page directory page; sleep via VM_WAIT until one
	 * becomes available
	 */
	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
		VM_WAIT;

	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
	pmap->pm_pcid = -1;
	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */

	if ((pml4pg->flags & PG_ZERO) == 0)
		pagezero(pmap->pm_pml4);

	/*
	 * Do not install the host kernel mappings in the nested page
	 * tables.  These mappings are meaningless in the guest physical
	 * address space.
	 */
	if ((pmap->pm_type = pm_type) == PT_X86) {
		pmap->pm_cr3 = pml4phys;

		/* Wire in kernel global address entries. */
		for (i = 0; i < NKPML4E; i++) {
			pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
			    X86_PG_RW | X86_PG_V | PG_U;
		}
		for (i = 0; i < ndmpdpphys; i++) {
			pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
			    X86_PG_RW | X86_PG_V | PG_U;
		}

		/* install self-referential address mapping entry(s) */
		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;

		if (pmap_pcid_enabled) {
			pmap->pm_pcid = alloc_unr(&pcid_unr);
			if (pmap->pm_pcid != -1)
				pmap->pm_cr3 |= pmap->pm_pcid;
		}
	}

	pmap->pm_root.rt_root = 0;
	CPU_ZERO(&pmap->pm_active);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	pmap->pm_flags = flags;
	pmap->pm_eptgen = 0;
	CPU_ZERO(&pmap->pm_save);

	return (1);
}

int
pmap_pinit(pmap_t pmap)
{

	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, pdppg, pdpg;
	pt_entry_t PG_A, PG_M, PG_RW, PG_V;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			rw_runlock(&pvh_global_lock);
			VM_WAIT;
			rw_rlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.  The level is selected by ptepindex:
	 * PTE pages are [0, NUPDE), PD pages [NUPDE, NUPDE + NUPDPE),
	 * and PDP pages above that.
	 */

	if (ptepindex >= (NUPDE + NUPDPE)) {
		pml4_entry_t *pml4;
		vm_pindex_t pml4index;

		/* Wire up a new PDPE page */
		pml4index = ptepindex - (NUPDE + NUPDPE);
		pml4 = &pmap->pm_pml4[pml4index];
		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

	} else if (ptepindex >= NUPDE) {
		vm_pindex_t pml4index;
		vm_pindex_t pdpindex;
		pml4_entry_t *pml4;
		pdp_entry_t *pdp;

		/* Wire up a new PDE page */
		pdpindex = ptepindex - NUPDE;
		pml4index = pdpindex >> NPML4EPGSHIFT;

		pml4 = &pmap->pm_pml4[pml4index];
		if ((*pml4 & PG_V) == 0) {
			/* Have to allocate a new pdp, recurse */
			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
			    lockp) == NULL) {
				--m->wire_count;
				atomic_subtract_int(&cnt.v_wire_count, 1);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			/* Add reference to pdp page */
			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
			pdppg->wire_count++;
		}
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);

		/* Now find the pdp page */
		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

	} else {
		vm_pindex_t pml4index;
		vm_pindex_t pdpindex;
		pml4_entry_t *pml4;
		pdp_entry_t *pdp;
		pd_entry_t *pd;

		/* Wire up a new PTE page */
		pdpindex = ptepindex >> NPDPEPGSHIFT;
		pml4index = pdpindex >> NPML4EPGSHIFT;

		/* First, find the pdp and check that its valid. */
		pml4 = &pmap->pm_pml4[pml4index];
		if ((*pml4 & PG_V) == 0) {
			/* Have to allocate a new pd, recurse */
			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
			    lockp) == NULL) {
				--m->wire_count;
				atomic_subtract_int(&cnt.v_wire_count, 1);
				vm_page_free_zero(m);
				return (NULL);
			}
			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
		} else {
			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
			if ((*pdp & PG_V) == 0) {
				/* Have to allocate a new pd, recurse */
				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
				    lockp) == NULL) {
					--m->wire_count;
					atomic_subtract_int(&cnt.v_wire_count,
					    1);
					vm_page_free_zero(m);
					return (NULL);
				}
			} else {
				/* Add reference to the pd page */
				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
				pdpg->wire_count++;
			}
		}
		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);

		/* Now we know where the page directory page is */
		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}

/*
 * Returns the page directory page backing "va", wiring it (or allocating
 * it via _pmap_allocpte()) as needed.  May drop and reacquire locks when
 * "lockp" is non-NULL; in that case the allocation is retried.
 */
static vm_page_t
pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t pdpindex, ptepindex;
	pdp_entry_t *pdpe, PG_V;
	vm_page_t pdpg;

	PG_V = pmap_valid_bit(pmap);

retry:
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
		/* Add a reference to the pd page. */
		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
		pdpg->wire_count++;
	} else {
		/* Allocate a pd page. */
		ptepindex = pmap_pde_pindex(va);
		pdpindex = ptepindex >> NPDPEPGSHIFT;
		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
		if (pdpg == NULL && lockp != NULL)
			goto retry;
	}
	return (pdpg);
}

/*
 * Returns the page table page backing "va", wiring it (or allocating it)
 * as needed.  Demotes an existing 2MB mapping first, so that a 4KB page
 * table page exists for "va".
 */
static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *pd, PG_V;
	vm_page_t m;

	PG_V = pmap_valid_bit(pmap);

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_pde_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	pd = pmap_pde(pmap, va);

	/*
	 * This supports switching from a 2MB page to a
	 * normal 4K page.
	 */
	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
			/*
			 * Invalidation of the 2MB page mapping may have caused
			 * the deallocation of the underlying PD page.
			 */
			pd = NULL;
		}
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (pd != NULL && (*pd & PG_V) != 0) {
		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = _pmap_allocpte(pmap, ptepindex, lockp);
		if (m == NULL && lockp != NULL)
			goto retry;
	}
	return (m);
}


/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;
	int i;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_release: pmap has reserved page table page(s)"));

	if (pmap_pcid_enabled) {
		/*
		 * Invalidate any left TLB entries, to allow the reuse
		 * of the pcid.
		 */
		pmap_invalidate_all(pmap);
	}

	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));

	/* Clear the shared kernel entries before freeing the PML4 page. */
	for (i = 0; i < NKPML4E; i++)	/* KVA */
		pmap->pm_pml4[KPML4BASE + i] = 0;
	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
		pmap->pm_pml4[DMPML4I + i] = 0;
	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */

	m->wire_count--;
	atomic_subtract_int(&cnt.v_wire_count, 1);
	vm_page_free_zero(m);
	if (pmap->pm_pcid != -1)
		free_unr(&pcid_unr, pmap->pm_pcid);
}

/* Sysctl handler: total size of the kernel virtual address space. */
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "LU", "Size of KVM");

/* Sysctl handler: amount of kernel virtual address space still unused. */
static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "LU", "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *pde, newpdir;
	pdp_entry_t *pdpe;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	/*
	 * Return if "addr" is within the range of kernel page table pages
	 * that were preallocated during pmap bootstrap.  Moreover, leave
	 * "kernel_vm_end" and the kernel page table as they were.
	 *
	 * The correctness of this action is based on the following
	 * argument: vm_map_findspace() allocates contiguous ranges of the
	 * kernel virtual address space.  It calls this function if a range
	 * ends after "kernel_vm_end".  If the kernel is mapped between
	 * "kernel_vm_end" and "addr", then the range cannot begin at
	 * "kernel_vm_end".  In fact, its beginning address cannot be less
	 * than the kernel.  Thus, there is no immediate need to allocate
	 * any new kernel page table pages between "kernel_vm_end" and
	 * "KERNBASE".
	 */
	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
		return;

	addr = roundup2(addr, NBPDR);
	if (addr - 1 >= kernel_map->max_offset)
		addr = kernel_map->max_offset;
	while (kernel_vm_end < addr) {
		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
		if ((*pdpe & X86_PG_V) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			if ((nkpg->flags & PG_ZERO) == 0)
				pmap_zero_page(nkpg);
			paddr = VM_PAGE_TO_PHYS(nkpg);
			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
			    X86_PG_A | X86_PG_M);
			continue; /* try again */
		}
		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
		if ((*pde & X86_PG_V) != 0) {
			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
				kernel_vm_end = kernel_map->max_offset;
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		if ((nkpg->flags & PG_ZERO) == 0)
			pmap_zero_page(nkpg);
		paddr = VM_PAGE_TO_PHYS(nkpg);
		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
		pde_store(pde, newpdir);

		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
			kernel_vm_end = kernel_map->max_offset;
			break;
		}
	}
}


/***************************************************
 * page management routines.
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);

/* Map a pv entry back to the page-aligned chunk that contains it. */
static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

/* Free-bitmap values for a completely free chunk (168 entries total). */
#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	0x000000fffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };

#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t pmap;
	pt_entry_t *pte, tpte;
	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
	pmap = NULL;
	m_pc = NULL;
	PG_G = PG_A = PG_M = PG_RW = 0;
	SLIST_INIT(&free);
	TAILQ_INIT(&new_tail);
	mtx_lock(&pv_chunks_mutex);
	/* Walk the global chunk LRU until a page has been freed. */
	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
		mtx_unlock(&pv_chunks_mutex);
		if (pmap != pc->pc_pmap) {
			if (pmap != NULL) {
				pmap_invalidate_all(pmap);
				if (pmap != locked_pmap)
					PMAP_UNLOCK(pmap);
			}
			pmap = pc->pc_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
			} else if (pmap != locked_pmap &&
			    !PMAP_TRYLOCK(pmap)) {
				pmap = NULL;
				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
				mtx_lock(&pv_chunks_mutex);
				continue;
			}
			PG_G = pmap_global_bit(pmap);
			PG_A = pmap_accessed_bit(pmap);
			PG_M = pmap_modified_bit(pmap);
			PG_RW = pmap_rw_bit(pmap);
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = bsfq(inuse);
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va);
				if ((*pde & PG_PS) != 0)
					continue;
				pte = pmap_pde_to_pte(pde, va);
				if ((*pte & PG_W) != 0)
					continue;
				tpte = pte_load_clear(pte);
				if ((tpte & PG_G) != 0)
					pmap_invalidate_page(pmap, va);
				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
					vm_page_dirty(m);
				if ((tpte & PG_A) != 0)
					vm_page_aflag_set(m, PGA_REFERENCED);
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, *pde, &free);
				freed++;
			}
		}
		if (freed == 0) {
			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
			mtx_lock(&pv_chunks_mutex);
			continue;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_dec(pmap, freed);
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
		    pc->pc_map[2] == PC_FREE2) {
			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pv_chunks_mutex);
			break;
		}
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
		mtx_lock(&pv_chunks_mutex);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;
	}
	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	if (pmap != NULL) {
		pmap_invalidate_all(pmap);
		if (pmap != locked_pmap)
			PMAP_UNLOCK(pmap);
	}
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->wire_count = 1;
		atomic_add_int(&cnt.v_wire_count, 1);
	}
	pmap_free_zero_pages(&free);
	return (m_pc);
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
	    pc->pc_map[2] != PC_FREE2) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

/* Return a completely free pv chunk's page to the VM system. */
static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire(m, 0);
	vm_page_free(m);
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = bsfq(pc->pc_map[field]);
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
			    pc->pc_map[2] == 0) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREE1;
	pc->pc_map[2] = PC_FREE2;
	mtx_lock(&pv_chunks_mutex);
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}

/*
 * Returns the number of one bits within the given PV chunk map element.
 */
static int
popcnt_pc_map_elem(uint64_t elem)
{
	int count;

	/*
	 * This simple method of counting the one bits performs well because
	 * the given element typically contains more zero bits than one bits.
	 * (Kernighan's bit-count: each iteration clears the lowest set bit.)
	 */
	count = 0;
	for (; elem != 0; elem &= elem - 1)
		count++;
	return (count);
}

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	int avail, free;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	TAILQ_INIT(&new_tail);
retry:
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		/* Use the hardware popcount when the CPU provides it. */
		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
			free = popcnt_pc_map_elem(pc->pc_map[0]);
			free += popcnt_pc_map_elem(pc->pc_map[1]);
			free += popcnt_pc_map_elem(pc->pc_map[2]);
		} else {
			free = popcntq(pc->pc_map[0]);
			free += popcntq(pc->pc_map[1]);
			free += popcntq(pc->pc_map[2]);
		}
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
		}
		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
		dump_add_page(m->phys_addr);
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		pc->pc_map[0] = PC_FREE0;
		pc->pc_map[1] = PC_FREE1;
		pc->pc_map[2] = PC_FREE2;
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
	}
	if (!TAILQ_EMPTY(&new_tail)) {
		mtx_lock(&pv_chunks_mutex);
		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
		mtx_unlock(&pv_chunks_mutex);
	}
}

/*
 * First find and then remove the pv entry for the specified pmap and virtual
 * address from the specified pv list.  Returns the pv entry if found and NULL
 * otherwise.  This operation can be performed on pv lists for either 4KB or
 * 2MB page mappings.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
			break;
		}
	}
	return (pv);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 */
static void
pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/*
	 * Instantiate the remaining NPTEPG - 1 pv entries.  The caller must
	 * have reserved them; see reserve_pv_entries().
	 */
	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
	va_last = va + NBPDR - PAGE_SIZE;
	for (;;) {
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field]) {
				bit = bsfq(pc->pc_map[field]);
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
			    ("pmap_pv_demote_pde: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
}

/*
 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 * replace the many pv entries for the 4KB page mappings by a single pv entry
 * for the 2MB page mapping.
 */
static void
pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the first page's pv entry for this mapping to the 2mpage's
	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
	 * a transfer avoids the possibility that get_pv_entry() calls
	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
	 * mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	pvh->pv_gen++;
	/* Free the remaining NPTEPG - 1 pv entries. */
	va_last = va + NBPDR - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
	} while (va < va_last);
}

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address.  This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

/*
 * Conditionally create the PV entry for a 4KB page mapping if the required
 * memory can be allocated without resorting to reclamation.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Pass NULL instead of the lock pointer to disable reclamation.
*/ 3206 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3207 pv->pv_va = va; 3208 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3209 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3210 m->md.pv_gen++; 3211 return (TRUE); 3212 } else 3213 return (FALSE); 3214} 3215 3216/* 3217 * Conditionally create the PV entry for a 2MB page mapping if the required 3218 * memory can be allocated without resorting to reclamation. 3219 */ 3220static boolean_t 3221pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3222 struct rwlock **lockp) 3223{ 3224 struct md_page *pvh; 3225 pv_entry_t pv; 3226 3227 rw_assert(&pvh_global_lock, RA_LOCKED); 3228 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3229 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3230 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3231 pv->pv_va = va; 3232 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3233 pvh = pa_to_pvh(pa); 3234 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3235 pvh->pv_gen++; 3236 return (TRUE); 3237 } else 3238 return (FALSE); 3239} 3240 3241/* 3242 * Fills a page table page with mappings to consecutive physical pages. 3243 */ 3244static void 3245pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 3246{ 3247 pt_entry_t *pte; 3248 3249 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 3250 *pte = newpte; 3251 newpte += PAGE_SIZE; 3252 } 3253} 3254 3255/* 3256 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 3257 * mapping is invalidated. 
 */
static boolean_t
pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	struct rwlock *lock;
	boolean_t rv;

	lock = NULL;
	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	return (rv);
}

/*
 * Tries to demote a 2MB page mapping to 512 4KB page mappings.  The caller
 * holds the pmap lock; *lockp tracks the PV list lock, which the functions
 * called here may change (via the CHANGE_PV_LIST_LOCK_* macros).  Returns
 * FALSE and invalidates the 2MB mapping if demotion fails.
 */
static boolean_t
pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp)
{
	pd_entry_t newpde, oldpde;
	pt_entry_t *firstpte, newpte;
	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
	vm_paddr_t mptepa;
	vm_page_t mpte;
	struct spglist free;
	int PG_PTE_CACHE;

	PG_G = pmap_global_bit(pmap);
	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpde = *pde;
	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
	    NULL)
		pmap_remove_pt_page(pmap, mpte);
	else {
		KASSERT((oldpde & PG_W) == 0,
		    ("pmap_demote_pde: page table page for a wired mapping"
		    " is missing"));

		/*
		 * Invalidate the 2MB page mapping and return "failure" if the
		 * mapping was never accessed or the allocation of the new
		 * page table page fails.  If the 2MB page mapping belongs to
		 * the direct map region of the kernel's address space, then
		 * the page allocation request specifies the highest possible
		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
		 * normal.  Page table pages are preallocated for every other
		 * part of the kernel address space, so the direct map region
		 * is the only part of the kernel address space that must be
		 * handled here.
		 */
		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
			SLIST_INIT(&free);
			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
			    lockp);
			pmap_invalidate_page(pmap, trunc_2mpage(va));
			pmap_free_zero_pages(&free);
			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return (FALSE);
		}
		if (va < VM_MAXUSER_ADDRESS)
			pmap_resident_count_inc(pmap, 1);
	}
	mptepa = VM_PAGE_TO_PHYS(mpte);
	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
	KASSERT((oldpde & PG_A) != 0,
	    ("pmap_demote_pde: oldpde is missing PG_A"));
	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pde: oldpde is missing PG_M"));
	newpte = oldpde & ~PG_PS;
	newpte = pmap_swap_pat(pmap, newpte);

	/*
	 * If the page table page is new, initialize it.
	 */
	if (mpte->wire_count == 1) {
		mpte->wire_count = NPTEPG;
		pmap_fill_ptp(firstpte, newpte);
	}
	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
	    ("pmap_demote_pde: firstpte and newpte map different physical"
	    " addresses"));

	/*
	 * If the mapping has changed attributes, update the page table
	 * entries.
	 */
	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
		pmap_fill_ptp(firstpte, newpte);

	/*
	 * The spare PV entries must be reserved prior to demoting the
	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
	 * of the PDE and the PV lists will be inconsistent, which can result
	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
	 * PV entry for the 2MB page mapping that is being demoted.
	 */
	if ((oldpde & PG_MANAGED) != 0)
		reserve_pv_entries(pmap, NPTEPG - 1, lockp);

	/*
	 * Demote the mapping.  This pmap is locked.  The old PDE has
	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
	 * set.  Thus, there is no danger of a race with another
	 * processor changing the setting of PG_A and/or PG_M between
	 * the read above and the store below.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, newpde);
	else
		pde_store(pde, newpde);

	/*
	 * Invalidate a stale recursive mapping of the page table page.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));

	/*
	 * Demote the PV entry.
	 */
	if ((oldpde & PG_MANAGED) != 0)
		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);

	atomic_add_long(&pmap_pde_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
 */
static void
pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde;
	vm_paddr_t mptepa;
	vm_page_t mpte;

	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mpte = pmap_lookup_pt_page(pmap, va);
	if (mpte == NULL)
		panic("pmap_remove_kernel_pde: Missing pt page.");

	pmap_remove_pt_page(pmap, mpte);
	mptepa = VM_PAGE_TO_PHYS(mpte);
	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;

	/*
	 * Initialize the page table page.  The preserved page table page is
	 * zeroed and reinstalled in place of the 2MB mapping rather than
	 * being freed.
	 */
	pagezero((void *)PHYS_TO_DMAP(mptepa));

	/*
	 * Demote the mapping.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, newpde);
	else
		pde_store(pde, newpde);

	/*
	 * Invalidate a stale recursive mapping of the page table page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
}

/*
 * pmap_remove_pde: do the things to unmap a superpage in a process
 */
static int
pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	pd_entry_t oldpde;
	vm_offset_t eva, va;
	vm_page_t m, mpte;
	pt_entry_t PG_G, PG_A, PG_M, PG_RW;

	PG_G = pmap_global_bit(pmap);
	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_remove_pde: sva is not 2mpage aligned"));
	oldpde = pte_load_clear(pdq);
	if (oldpde & PG_W)
		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;

	/*
	 * Machines that don't support invlpg, also don't support
	 * PG_G.
	 */
	if (oldpde & PG_G)
		pmap_invalidate_page(kernel_pmap, sva);
	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
	if (oldpde & PG_MANAGED) {
		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
		pmap_pvh_free(pvh, pmap, sva);
		/*
		 * Propagate the dirty and referenced state of the 2MB mapping
		 * to each of the 512 constituent 4KB pages.
		 */
		eva = sva + NBPDR;
		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
		    va < eva; va += PAGE_SIZE, m++) {
			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
				vm_page_dirty(m);
			if (oldpde & PG_A)
				vm_page_aflag_set(m, PGA_REFERENCED);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
		}
	}
	if (pmap == kernel_pmap) {
		pmap_remove_kernel_pde(pmap, pdq, sva);
	} else {
		mpte = pmap_lookup_pt_page(pmap, sva);
		if (mpte != NULL) {
			pmap_remove_pt_page(pmap, mpte);
			pmap_resident_count_dec(pmap, 1);
			KASSERT(mpte->wire_count == NPTEPG,
			    ("pmap_remove_pde: pte page wire count error"));
			mpte->wire_count = 0;
			pmap_add_delayed_free_list(mpte, free, FALSE);
			atomic_subtract_int(&cnt.v_wire_count, 1);
		}
	}
	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
}

/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
	vm_page_t m;

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpte = pte_load_clear(ptq);
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;
	pmap_resident_count_dec(pmap, 1);
	if (oldpte & PG_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		if (oldpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		pmap_pvh_free(&m->md, pmap, va);
		if (TAILQ_EMPTY(&m->md.pv_list) &&
		    (m->flags & PG_FICTITIOUS) == 0) {
			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
			if (TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
		}
	}
	return (pmap_unuse_pt(pmap, va, ptepde, free));
}

/*
 * Remove a single page from a process address space
 */
static void
pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free)
{
	struct rwlock *lock;
	pt_entry_t *pte, PG_V;

	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((*pde & PG_V) == 0)
		return;
	pte = pmap_pde_to_pte(pde, va);
	if ((*pte & PG_V) == 0)
		return;
	lock = NULL;
	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_page(pmap, va);
}

/*
 * Remove the given range of addresses from the specified map.
 *
 * It is assumed that the start and end are properly
 * rounded to the page size.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rwlock *lock;
	vm_offset_t va, va_next;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t ptpaddr, *pde;
	pt_entry_t *pte, PG_G, PG_V;
	struct spglist free;
	int anyvalid;

	PG_G = pmap_global_bit(pmap);
	PG_V = pmap_valid_bit(pmap);

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	anyvalid = 0;
	SLIST_INIT(&free);

	rw_rlock(&pvh_global_lock);
	PMAP_LOCK(pmap);

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.
	 */
	if (sva + PAGE_SIZE == eva) {
		pde = pmap_pde(pmap, sva);
		if (pde && (*pde & PG_PS) == 0) {
			pmap_remove_page(pmap, sva, pde, &free);
			goto out;
		}
	}

	lock = NULL;
	for (; sva < eva; sva = va_next) {

		if (pmap->pm_stats.resident_count == 0)
			break;

		pml4e = pmap_pml4e(pmap, sva);
		if ((*pml4e & PG_V) == 0) {
			va_next = (sva + NBPML4) & ~PML4MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
		if ((*pdpe & PG_V) == 0) {
			va_next = (sva + NBPDP) & ~PDPMASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		/*
		 * Calculate index for next page table.
		 */
		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, sva);
		ptpaddr = *pde;

		/*
		 * Weed out invalid mappings.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			/*
			 * Are we removing the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == va_next && eva >= va_next) {
				/*
				 * The TLB entry for a PG_G mapping is
				 * invalidated by pmap_remove_pde().
				 */
				if ((ptpaddr & PG_G) == 0)
					anyvalid = 1;
				pmap_remove_pde(pmap, pde, sva, &free, &lock);
				continue;
			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
			    &lock)) {
				/* The large page mapping was destroyed. */
				continue;
			} else
				ptpaddr = *pde;
		}

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (va_next > eva)
			va_next = eva;

		/*
		 * "va" tracks the start of a run of removed mappings that
		 * has not yet been invalidated; va == va_next means no run
		 * is pending.
		 */
		va = va_next;
		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			if (*pte == 0) {
				if (va != va_next) {
					pmap_invalidate_range(pmap, va, sva);
					va = va_next;
				}
				continue;
			}
			if ((*pte & PG_G) == 0)
				anyvalid = 1;
			else if (va == va_next)
				va = sva;
			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
			    &lock)) {
				sva += PAGE_SIZE;
				break;
			}
		}
		if (va != va_next)
			pmap_invalidate_range(pmap, va, sva);
	}
	if (lock != NULL)
		rw_wunlock(lock);
out:
	if (anyvalid)
		pmap_invalidate_all(pmap);
	rw_runlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(&free);
}

/*
 * Routine:	pmap_remove_all
 * Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 * Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 */

void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
	pd_entry_t *pde;
	vm_offset_t va;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	/*
	 * First demote any 2MB mappings of the page so that the loop below
	 * only has to deal with 4KB mappings.
	 */
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, va);
		(void)pmap_demote_pde(pmap, pde, va);
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		PG_A = pmap_accessed_bit(pmap);
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		pmap_resident_count_dec(pmap, 1);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		tpte = pte_load_clear(pte);
		if (tpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (tpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
		pmap_invalidate_page(pmap, pv->pv_va);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(&pvh_global_lock);
	pmap_free_zero_pages(&free);
}

/*
 * pmap_protect_pde: do the things to protect a 2mpage in a process
 */
static boolean_t
pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
{
	pd_entry_t newpde, oldpde;
	vm_offset_t eva, va;
	vm_page_t m;
	boolean_t anychanged;
	pt_entry_t PG_G, PG_M, PG_RW;

	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_protect_pde: sva is not 2mpage aligned"));
	anychanged = FALSE;
retry:
	oldpde = newpde = *pde;
	if (oldpde & PG_MANAGED) {
		eva = sva + NBPDR;
		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
		    va < eva; va += PAGE_SIZE, m++)
			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
				vm_page_dirty(m);
	}
	if ((prot & VM_PROT_WRITE) == 0)
		newpde &= ~(PG_RW | PG_M);
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
	if (newpde != oldpde) {
		/*
		 * Retry from the top if the PDE changed under us; the cmpset
		 * only installs newpde when the PDE still equals oldpde.
		 */
		if (!atomic_cmpset_long(pde, oldpde, newpde))
			goto retry;
		if (oldpde & PG_G)
			pmap_invalidate_page(pmap, sva);
		else
			anychanged = TRUE;
	}
	return (anychanged);
}

/*
 * Set the physical protection on the
 * specified range of this map as requested.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	vm_offset_t va_next;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t ptpaddr, *pde;
	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
	boolean_t anychanged, pv_lists_locked;

	/* Removing all access permissions is the same as removing the range. */
	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

	/* Nothing to revoke if both write and execute remain permitted. */
	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
		return;

	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	pv_lists_locked = FALSE;
resume:
	anychanged = FALSE;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {

		pml4e = pmap_pml4e(pmap, sva);
		if ((*pml4e & PG_V) == 0) {
			va_next = (sva + NBPML4) & ~PML4MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
		if ((*pdpe & PG_V) == 0) {
			va_next = (sva + NBPDP) & ~PDPMASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, sva);
		ptpaddr = *pde;

		/*
		 * Weed out invalid mappings.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			/*
			 * Are we protecting the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == va_next && eva >= va_next) {
				/*
				 * The TLB entry for a PG_G mapping is
				 * invalidated by pmap_protect_pde().
				 */
				if (pmap_protect_pde(pmap, pde, sva, prot))
					anychanged = TRUE;
				continue;
			} else {
				if (!pv_lists_locked) {
					pv_lists_locked = TRUE;
					/*
					 * Demotion needs the pv list lock.
					 * If it cannot be acquired without
					 * blocking, drop the pmap lock,
					 * take it, and restart the scan.
					 */
					if (!rw_try_rlock(&pvh_global_lock)) {
						if (anychanged)
							pmap_invalidate_all(
							    pmap);
						PMAP_UNLOCK(pmap);
						rw_rlock(&pvh_global_lock);
						goto resume;
					}
				}
				if (!pmap_demote_pde(pmap, pde, sva)) {
					/*
					 * The large page mapping was
					 * destroyed.
					 */
					continue;
				}
			}
		}

		if (va_next > eva)
			va_next = eva;

		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			pt_entry_t obits, pbits;
			vm_page_t m;

retry:
			obits = pbits = *pte;
			if ((pbits & PG_V) == 0)
				continue;

			if ((prot & VM_PROT_WRITE) == 0) {
				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
				    (PG_MANAGED | PG_M | PG_RW)) {
					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
					vm_page_dirty(m);
				}
				pbits &= ~(PG_RW | PG_M);
			}
			if ((prot & VM_PROT_EXECUTE) == 0)
				pbits |= pg_nx;

			if (pbits != obits) {
				/* Retry if the PTE changed concurrently. */
				if (!atomic_cmpset_long(pte, obits, pbits))
					goto retry;
				if (obits & PG_G)
					pmap_invalidate_page(pmap, sva);
				else
					anychanged = TRUE;
			}
		}
	}
	if (anychanged)
		pmap_invalidate_all(pmap);
	if (pv_lists_locked)
		rw_runlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Tries to promote the 512, contiguous 4KB page mappings that are within a
 * single page table page (PTP) to a single 2MB page mapping.  For promotion
 * to occur, two conditions must be met: (1) the 4KB page mappings must map
 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 * identical characteristics.
 */
static void
pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp)
{
	pd_entry_t newpde;
	pt_entry_t *firstpte, oldpte, pa, *pte;
	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
	vm_offset_t oldpteva;
	vm_page_t mpte;
	int PG_PTE_CACHE;

	PG_A = pmap_accessed_bit(pmap);
	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
	 * either invalid, unused, or does not map the first 4KB physical page
	 * within a 2MB page.
	 */
	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
setpde:
	newpde = *firstpte;
	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
		atomic_add_long(&pmap_pde_p_failures, 1);
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return;
	}
	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
		/*
		 * When PG_M is already clear, PG_RW can be cleared without
		 * a TLB invalidation.
		 */
		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
			goto setpde;
		newpde &= ~PG_RW;
	}

	/*
	 * Examine each of the other PTEs in the specified PTP.  Abort if this
	 * PTE maps an unexpected 4KB physical page or does not have identical
	 * characteristics to the first PTE.  The scan runs from the last PTE
	 * in the PTP down to the second, with "pa" holding the expected
	 * physical address (plus the attribute bits that must match).
	 */
	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
setpte:
		oldpte = *pte;
		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
			atomic_add_long(&pmap_pde_p_failures, 1);
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return;
		}
		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
			/*
			 * When PG_M is already clear, PG_RW can be cleared
			 * without a TLB invalidation.
			 */
			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
				goto setpte;
			oldpte &= ~PG_RW;
			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
			    (va & ~PDRMASK);
			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
			    " in pmap %p", oldpteva, pmap);
		}
		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
			atomic_add_long(&pmap_pde_p_failures, 1);
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return;
		}
		pa -= PAGE_SIZE;
	}

	/*
	 * Save the page table page in its current state until the PDE
	 * mapping the superpage is demoted by pmap_demote_pde() or
	 * destroyed by pmap_remove_pde().
	 */
	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
	KASSERT(mpte >= vm_page_array &&
	    mpte < &vm_page_array[vm_page_array_size],
	    ("pmap_promote_pde: page table page is out of range"));
	KASSERT(mpte->pindex == pmap_pde_pindex(va),
	    ("pmap_promote_pde: page table page's pindex is wrong"));
	if (pmap_insert_pt_page(pmap, mpte)) {
		atomic_add_long(&pmap_pde_p_failures, 1);
		CTR2(KTR_PMAP,
		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
		    pmap);
		return;
	}

	/*
	 * Promote the pv entries.
	 */
	if ((newpde & PG_MANAGED) != 0)
		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);

	/*
	 * Propagate the PAT index to its proper position.
4091 */ 4092 newpde = pmap_swap_pat(pmap, newpde); 4093 4094 /* 4095 * Map the superpage. 4096 */ 4097 if (workaround_erratum383) 4098 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 4099 else 4100 pde_store(pde, PG_PS | newpde); 4101 4102 atomic_add_long(&pmap_pde_promotions, 1); 4103 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 4104 " in pmap %p", va, pmap); 4105} 4106 4107/* 4108 * Insert the given physical page (p) at 4109 * the specified virtual address (v) in the 4110 * target physical map with the protection requested. 4111 * 4112 * If specified, the page will be wired down, meaning 4113 * that the related pte can not be reclaimed. 4114 * 4115 * NB: This is the only routine which MAY NOT lazy-evaluate 4116 * or lose information. That is, this routine must actually 4117 * insert this page into the given map NOW. 4118 */ 4119int 4120pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4121 u_int flags, int8_t psind __unused) 4122{ 4123 struct rwlock *lock; 4124 pd_entry_t *pde; 4125 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 4126 pt_entry_t newpte, origpte; 4127 pv_entry_t pv; 4128 vm_paddr_t opa, pa; 4129 vm_page_t mpte, om; 4130 boolean_t nosleep; 4131 4132 PG_A = pmap_accessed_bit(pmap); 4133 PG_G = pmap_global_bit(pmap); 4134 PG_M = pmap_modified_bit(pmap); 4135 PG_V = pmap_valid_bit(pmap); 4136 PG_RW = pmap_rw_bit(pmap); 4137 4138 va = trunc_page(va); 4139 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 4140 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 4141 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 4142 va)); 4143 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 4144 va >= kmi.clean_eva, 4145 ("pmap_enter: managed mapping within the clean submap")); 4146 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 4147 VM_OBJECT_ASSERT_LOCKED(m->object); 4148 pa = VM_PAGE_TO_PHYS(m); 4149 newpte = (pt_entry_t)(pa | PG_A | PG_V); 4150 if ((flags & 
VM_PROT_WRITE) != 0) 4151 newpte |= PG_M; 4152 if ((prot & VM_PROT_WRITE) != 0) 4153 newpte |= PG_RW; 4154 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 4155 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 4156 if ((prot & VM_PROT_EXECUTE) == 0) 4157 newpte |= pg_nx; 4158 if ((flags & PMAP_ENTER_WIRED) != 0) 4159 newpte |= PG_W; 4160 if (va < VM_MAXUSER_ADDRESS) 4161 newpte |= PG_U; 4162 if (pmap == kernel_pmap) 4163 newpte |= PG_G; 4164 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0); 4165 4166 /* 4167 * Set modified bit gratuitously for writeable mappings if 4168 * the page is unmanaged. We do not want to take a fault 4169 * to do the dirty bit accounting for these mappings. 4170 */ 4171 if ((m->oflags & VPO_UNMANAGED) != 0) { 4172 if ((newpte & PG_RW) != 0) 4173 newpte |= PG_M; 4174 } 4175 4176 mpte = NULL; 4177 4178 lock = NULL; 4179 rw_rlock(&pvh_global_lock); 4180 PMAP_LOCK(pmap); 4181 4182 /* 4183 * In the case that a page table page is not 4184 * resident, we are creating it here. 4185 */ 4186retry: 4187 pde = pmap_pde(pmap, va); 4188 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 4189 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 4190 pte = pmap_pde_to_pte(pde, va); 4191 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 4192 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4193 mpte->wire_count++; 4194 } 4195 } else if (va < VM_MAXUSER_ADDRESS) { 4196 /* 4197 * Here if the pte page isn't mapped, or if it has been 4198 * deallocated. 4199 */ 4200 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4201 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), 4202 nosleep ? 
NULL : &lock); 4203 if (mpte == NULL && nosleep) { 4204 if (lock != NULL) 4205 rw_wunlock(lock); 4206 rw_runlock(&pvh_global_lock); 4207 PMAP_UNLOCK(pmap); 4208 return (KERN_RESOURCE_SHORTAGE); 4209 } 4210 goto retry; 4211 } else 4212 panic("pmap_enter: invalid page directory va=%#lx", va); 4213 4214 origpte = *pte; 4215 4216 /* 4217 * Is the specified virtual address already mapped? 4218 */ 4219 if ((origpte & PG_V) != 0) { 4220 /* 4221 * Wiring change, just update stats. We don't worry about 4222 * wiring PT pages as they remain resident as long as there 4223 * are valid mappings in them. Hence, if a user page is wired, 4224 * the PT page will be also. 4225 */ 4226 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 4227 pmap->pm_stats.wired_count++; 4228 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 4229 pmap->pm_stats.wired_count--; 4230 4231 /* 4232 * Remove the extra PT page reference. 4233 */ 4234 if (mpte != NULL) { 4235 mpte->wire_count--; 4236 KASSERT(mpte->wire_count > 0, 4237 ("pmap_enter: missing reference to page table page," 4238 " va: 0x%lx", va)); 4239 } 4240 4241 /* 4242 * Has the physical page changed? 4243 */ 4244 opa = origpte & PG_FRAME; 4245 if (opa == pa) { 4246 /* 4247 * No, might be a protection or wiring change. 4248 */ 4249 if ((origpte & PG_MANAGED) != 0) { 4250 newpte |= PG_MANAGED; 4251 if ((newpte & PG_RW) != 0) 4252 vm_page_aflag_set(m, PGA_WRITEABLE); 4253 } 4254 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 4255 goto unchanged; 4256 goto validate; 4257 } 4258 } else { 4259 /* 4260 * Increment the counters. 4261 */ 4262 if ((newpte & PG_W) != 0) 4263 pmap->pm_stats.wired_count++; 4264 pmap_resident_count_inc(pmap, 1); 4265 } 4266 4267 /* 4268 * Enter on the PV list if part of our managed memory. 
4269 */ 4270 if ((m->oflags & VPO_UNMANAGED) == 0) { 4271 newpte |= PG_MANAGED; 4272 pv = get_pv_entry(pmap, &lock); 4273 pv->pv_va = va; 4274 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4275 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4276 m->md.pv_gen++; 4277 if ((newpte & PG_RW) != 0) 4278 vm_page_aflag_set(m, PGA_WRITEABLE); 4279 } 4280 4281 /* 4282 * Update the PTE. 4283 */ 4284 if ((origpte & PG_V) != 0) { 4285validate: 4286 origpte = pte_load_store(pte, newpte); 4287 opa = origpte & PG_FRAME; 4288 if (opa != pa) { 4289 if ((origpte & PG_MANAGED) != 0) { 4290 om = PHYS_TO_VM_PAGE(opa); 4291 if ((origpte & (PG_M | PG_RW)) == (PG_M | 4292 PG_RW)) 4293 vm_page_dirty(om); 4294 if ((origpte & PG_A) != 0) 4295 vm_page_aflag_set(om, PGA_REFERENCED); 4296 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 4297 pmap_pvh_free(&om->md, pmap, va); 4298 if ((om->aflags & PGA_WRITEABLE) != 0 && 4299 TAILQ_EMPTY(&om->md.pv_list) && 4300 ((om->flags & PG_FICTITIOUS) != 0 || 4301 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4302 vm_page_aflag_clear(om, PGA_WRITEABLE); 4303 } 4304 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | 4305 PG_RW)) == (PG_M | PG_RW)) { 4306 if ((origpte & PG_MANAGED) != 0) 4307 vm_page_dirty(m); 4308 4309 /* 4310 * Although the PTE may still have PG_RW set, TLB 4311 * invalidation may nonetheless be required because 4312 * the PTE no longer has PG_M set. 4313 */ 4314 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 4315 /* 4316 * This PTE change does not require TLB invalidation. 4317 */ 4318 goto unchanged; 4319 } 4320 if ((origpte & PG_A) != 0) 4321 pmap_invalidate_page(pmap, va); 4322 } else 4323 pte_store(pte, newpte); 4324 4325unchanged: 4326 4327 /* 4328 * If both the page table page and the reservation are fully 4329 * populated, then attempt promotion. 
 */
	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
	    pmap_ps_enabled(pmap) &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0)
		pmap_promote_pde(pmap, pde, va, &lock);

	if (lock != NULL)
		rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	return (KERN_SUCCESS);
}

/*
 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
 * otherwise.  Fails if (1) a page table page cannot be allocated without
 * blocking, (2) a mapping already exists at the specified virtual address, or
 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
 *
 * The pmap lock and the pvh global lock must be held by the caller (asserted
 * below).  On success the caller's PV list lock pointer, *lockp, may have
 * been changed by pmap_pv_insert_pde().
 */
static boolean_t
pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    struct rwlock **lockp)
{
	pd_entry_t *pde, newpde;
	pt_entry_t PG_V;
	vm_page_t mpde;
	struct spglist free;

	PG_V = pmap_valid_bit(pmap);
	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/* Failure case (1): no page directory page without blocking. */
	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
	pde = &pde[pmap_pde_index(va)];
	if ((*pde & PG_V) != 0) {
		/*
		 * Failure case (2): a mapping (4KB page table or 2MB page)
		 * already exists here; drop the reference that
		 * pmap_allocpde() took on the page directory page.
		 */
		KASSERT(mpde->wire_count > 1,
		    ("pmap_enter_pde: mpde's wire count is too low"));
		mpde->wire_count--;
		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
	    PG_PS | PG_V;
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		newpde |= PG_MANAGED;

		/*
		 * Abort this mapping if its PV entry could not be created.
		 * (Failure case (3).)
		 */
		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
		    lockp)) {
			SLIST_INIT(&free);
			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
				pmap_invalidate_page(pmap, va);
				pmap_free_zero_pages(&free);
			}
			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return (FALSE);
		}
	}
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
	if (va < VM_MAXUSER_ADDRESS)
		newpde |= PG_U;

	/*
	 * Increment counters.
	 */
	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);

	/*
	 * Map the superpage.
	 */
	pde_store(pde, newpde);

	atomic_add_long(&pmap_pde_mappings, 1);
	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.  Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
4430 */ 4431void 4432pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4433 vm_page_t m_start, vm_prot_t prot) 4434{ 4435 struct rwlock *lock; 4436 vm_offset_t va; 4437 vm_page_t m, mpte; 4438 vm_pindex_t diff, psize; 4439 4440 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4441 4442 psize = atop(end - start); 4443 mpte = NULL; 4444 m = m_start; 4445 lock = NULL; 4446 rw_rlock(&pvh_global_lock); 4447 PMAP_LOCK(pmap); 4448 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4449 va = start + ptoa(diff); 4450 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 4451 m->psind == 1 && pmap_ps_enabled(pmap) && 4452 pmap_enter_pde(pmap, va, m, prot, &lock)) 4453 m = &m[NBPDR / PAGE_SIZE - 1]; 4454 else 4455 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 4456 mpte, &lock); 4457 m = TAILQ_NEXT(m, listq); 4458 } 4459 if (lock != NULL) 4460 rw_wunlock(lock); 4461 rw_runlock(&pvh_global_lock); 4462 PMAP_UNLOCK(pmap); 4463} 4464 4465/* 4466 * this code makes some *MAJOR* assumptions: 4467 * 1. Current pmap & pmap exists. 4468 * 2. Not wired. 4469 * 3. Read access. 4470 * 4. No page table pages. 4471 * but is *MUCH* faster than pmap_enter... 
 */

void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	struct rwlock *lock;

	lock = NULL;
	rw_rlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Create a read-only 4KB mapping at "va" for page "m" without blocking.
 * "mpte" is the page table page from the caller's previous call, reused
 * when "va" falls in the same page table; the (possibly new) page table
 * page is returned so the caller can pass it back.  Returns NULL when no
 * page table page reference is held, and silently does nothing if a
 * mapping already exists at "va" or a PV entry cannot be allocated.
 * The pmap lock and the pvh global lock must be held (asserted below).
 */
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
	struct spglist free;
	pt_entry_t *pte, PG_V;
	vm_paddr_t pa;

	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
	PG_V = pmap_valid_bit(pmap);
	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		vm_pindex_t ptepindex;
		pd_entry_t *ptepa;

		/*
		 * Calculate pagetable page index
		 */
		ptepindex = pmap_pde_pindex(va);
		if (mpte && (mpte->pindex == ptepindex)) {
			/* Same page table as the previous call: reuse it. */
			mpte->wire_count++;
		} else {
			/*
			 * Get the page directory entry
			 */
			ptepa = pmap_pde(pmap, va);

			/*
			 * If the page table page is mapped, we just increment
			 * the hold count, and activate it.  Otherwise, we
			 * attempt to allocate a page table page.  If this
			 * attempt fails, we don't retry.  Instead, we give up.
			 */
			if (ptepa && (*ptepa & PG_V) != 0) {
				/* A 2MB mapping is already here: bail. */
				if (*ptepa & PG_PS)
					return (NULL);
				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
				mpte->wire_count++;
			} else {
				/*
				 * Pass NULL instead of the PV list lock
				 * pointer, because we don't intend to sleep.
				 */
				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
				if (mpte == NULL)
					return (mpte);
			}
		}
		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
		pte = &pte[pmap_pte_index(va)];
	} else {
		/* Kernel addresses use the canonical kernel page tables. */
		mpte = NULL;
		pte = vtopte(va);
	}
	if (*pte) {
		/* A mapping already exists; undo the wire taken above. */
		if (mpte != NULL) {
			mpte->wire_count--;
			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
		/* PV allocation failed: release the page table page. */
		if (mpte != NULL) {
			SLIST_INIT(&free);
			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
				pmap_invalidate_page(pmap, va);
				pmap_free_zero_pages(&free);
			}
			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Increment counters
	 */
	pmap_resident_count_inc(pmap, 1);

	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
	if ((prot & VM_PROT_EXECUTE) == 0)
		pa |= pg_nx;

	/*
	 * Now validate mapping with RO protection
	 */
	if ((m->oflags & VPO_UNMANAGED) != 0)
		pte_store(pte, pa | PG_V | PG_U);
	else
		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
	return (mpte);
}

/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 *
 * "i" selects a page-sized slot within the preallocated crashdumpmap.
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	vm_offset_t va;

	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
	pmap_kenter(va, pa);
	invlpg(va);
	return ((void *)crashdumpmap);
}

/*
 * This code maps large physical mmap regions into the
 * processor address space.  Note that some shortcuts
 * are taken, but the code works.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{
	pd_entry_t *pde;
	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
	vm_paddr_t pa, ptepa;
	vm_page_t p, pdpg;
	int pat_mode;

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
	    ("pmap_object_init_pt: non-device object"));
	/* Only 2MB-aligned, 2MB-multiple regions are handled at all. */
	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
		if (!pmap_ps_enabled(pmap))
			return;
		if (!vm_object_populate(object, pindex, pindex + atop(size)))
			return;
		p = vm_page_lookup(object, pindex);
		KASSERT(p->valid == VM_PAGE_BITS_ALL,
		    ("pmap_object_init_pt: invalid page %p", p));
		pat_mode = p->md.pat_mode;

		/*
		 * Abort the mapping if the first page is not physically
		 * aligned to a 2MB page boundary.
		 */
		ptepa = VM_PAGE_TO_PHYS(p);
		if (ptepa & (NBPDR - 1))
			return;

		/*
		 * Skip the first page.  Abort the mapping if the rest of
		 * the pages are not physically contiguous or have differing
		 * memory attributes.
		 */
		p = TAILQ_NEXT(p, listq);
		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
		    pa += PAGE_SIZE) {
			KASSERT(p->valid == VM_PAGE_BITS_ALL,
			    ("pmap_object_init_pt: invalid page %p", p));
			if (pa != VM_PAGE_TO_PHYS(p) ||
			    pat_mode != p->md.pat_mode)
				return;
			p = TAILQ_NEXT(p, listq);
		}

		/*
		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
		 * will not affect the termination of this loop.
		 */
		PMAP_LOCK(pmap);
		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
		    pa < ptepa + size; pa += NBPDR) {
			pdpg = pmap_allocpde(pmap, addr, NULL);
			if (pdpg == NULL) {
				/*
				 * The creation of mappings below is only an
				 * optimization.  If a page directory page
				 * cannot be allocated without blocking,
				 * continue on to the next mapping rather than
				 * blocking.
				 */
				addr += NBPDR;
				continue;
			}
			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
			pde = &pde[pmap_pde_index(addr)];
			if ((*pde & PG_V) == 0) {
				/* Pre-set A/M so no fault is taken later. */
				pde_store(pde, pa | PG_PS | PG_M | PG_A |
				    PG_U | PG_RW | PG_V);
				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
				atomic_add_long(&pmap_pde_mappings, 1);
			} else {
				/* Continue on if the PDE is already valid. */
				pdpg->wire_count--;
				KASSERT(pdpg->wire_count > 0,
				    ("pmap_object_init_pt: missing reference "
				    "to page directory page, va: 0x%lx", addr));
			}
			addr += NBPDR;
		}
		PMAP_UNLOCK(pmap);
	}
}

/*
 *	Routine:	pmap_change_wiring
 *	Function:	Change the wiring attribute for a map/virtual-address
 *			pair.
 *	In/out conditions:
 *			The mapping must already exist in the pmap.
 */
void
pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t pv_lists_locked;

	pv_lists_locked = FALSE;

	/*
	 * Wiring is not a hardware characteristic so there is no need to
	 * invalidate TLB.
 */
retry:
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, va);
	if ((*pde & PG_PS) != 0) {
		/*
		 * A 2MB mapping whose wiring state differs from the request
		 * must be demoted so the wired bit can be changed on the
		 * single 4KB page.  Demotion may touch PV lists, so the pvh
		 * global lock is acquired first (retrying from scratch if it
		 * cannot be taken without blocking while the pmap is locked).
		 */
		if (!wired != ((*pde & PG_W) == 0)) {
			if (!pv_lists_locked) {
				pv_lists_locked = TRUE;
				if (!rw_try_rlock(&pvh_global_lock)) {
					PMAP_UNLOCK(pmap);
					rw_rlock(&pvh_global_lock);
					goto retry;
				}
			}
			if (!pmap_demote_pde(pmap, pde, va))
				panic("pmap_change_wiring: demotion failed");
		} else
			goto out;
	}
	pte = pmap_pde_to_pte(pde, va);
	if (wired && (*pte & PG_W) == 0) {
		pmap->pm_stats.wired_count++;
		atomic_set_long(pte, PG_W);
	} else if (!wired && (*pte & PG_W) != 0) {
		pmap->pm_stats.wired_count--;
		atomic_clear_long(pte, PG_W);
	}
out:
	if (pv_lists_locked)
		rw_runlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 *	Copy the range specified by src_addr/len
 *	from the source map to the range dst_addr/len
 *	in the destination map.
 *
 *	This routine is only advisory and need not do anything.
 */

void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	struct rwlock *lock;
	struct spglist free;
	vm_offset_t addr;
	vm_offset_t end_addr = src_addr + len;
	vm_offset_t va_next;
	pt_entry_t PG_A, PG_M, PG_V;

	if (dst_addr != src_addr)
		return;

	if (dst_pmap->pm_type != src_pmap->pm_type)
		return;

	/*
	 * EPT page table entries that require emulation of A/D bits are
	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ).  Although
	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
	 * (aka EPT_PG_EXECUTE) could still be set.  Since some EPT
	 * implementations flag an EPT misconfiguration for exec-only
	 * mappings we skip this function entirely for emulated pmaps.
	 */
	if (pmap_emulate_ad_bits(dst_pmap))
		return;

	lock = NULL;
	rw_rlock(&pvh_global_lock);
	/* Lock both pmaps in address order to avoid deadlock. */
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}

	PG_A = pmap_accessed_bit(dst_pmap);
	PG_M = pmap_modified_bit(dst_pmap);
	PG_V = pmap_valid_bit(dst_pmap);

	for (addr = src_addr; addr < end_addr; addr = va_next) {
		pt_entry_t *src_pte, *dst_pte;
		vm_page_t dstmpde, dstmpte, srcmpte;
		pml4_entry_t *pml4e;
		pdp_entry_t *pdpe;
		pd_entry_t srcptepaddr, *pde;

		KASSERT(addr < UPT_MIN_ADDRESS,
		    ("pmap_copy: invalid to pmap_copy page tables"));

		pml4e = pmap_pml4e(src_pmap, addr);
		if ((*pml4e & PG_V) == 0) {
			va_next = (addr + NBPML4) & ~PML4MASK;
			if (va_next < addr)
				va_next = end_addr;
			continue;
		}

		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
		if ((*pdpe & PG_V) == 0) {
			va_next = (addr + NBPDP) & ~PDPMASK;
			if (va_next < addr)
				va_next = end_addr;
			continue;
		}

		va_next = (addr + NBPDR) & ~PDRMASK;
		if (va_next < addr)
			va_next = end_addr;

		pde = pmap_pdpe_to_pde(pdpe, addr);
		srcptepaddr = *pde;
		if (srcptepaddr == 0)
			continue;

		if (srcptepaddr & PG_PS) {
			/* Copy a whole 2MB mapping, if fully within range. */
			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
				continue;
			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
			if (dstmpde == NULL)
				break;
			pde = (pd_entry_t *)
			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
			pde = &pde[pmap_pde_index(addr)];
			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
			    PG_PS_FRAME, &lock))) {
				/* Copies are never wired. */
				*pde = srcptepaddr & ~PG_W;
				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
			} else
				dstmpde->wire_count--;
			continue;
		}

		srcptepaddr &= PG_FRAME;
		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
		KASSERT(srcmpte->wire_count > 0,
		    ("pmap_copy: source page table page is unused"));

		if (va_next > end_addr)
			va_next = end_addr;

		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
		src_pte = &src_pte[pmap_pte_index(addr)];
		dstmpte = NULL;
		while (addr < va_next) {
			pt_entry_t ptetemp;
			ptetemp = *src_pte;
			/*
			 * we only virtual copy managed pages
			 */
			if ((ptetemp & PG_MANAGED) != 0) {
				if (dstmpte != NULL &&
				    dstmpte->pindex == pmap_pde_pindex(addr))
					dstmpte->wire_count++;
				else if ((dstmpte = pmap_allocpte(dst_pmap,
				    addr, NULL)) == NULL)
					goto out;
				dst_pte = (pt_entry_t *)
				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
				dst_pte = &dst_pte[pmap_pte_index(addr)];
				if (*dst_pte == 0 &&
				    pmap_try_insert_pv_entry(dst_pmap, addr,
				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
				    &lock)) {
					/*
					 * Clear the wired, modified, and
					 * accessed (referenced) bits
					 * during the copy.
					 */
					*dst_pte = ptetemp & ~(PG_W | PG_M |
					    PG_A);
					pmap_resident_count_inc(dst_pmap, 1);
				} else {
					/* PV entry allocation failed. */
					SLIST_INIT(&free);
					if (pmap_unwire_ptp(dst_pmap, addr,
					    dstmpte, &free)) {
						pmap_invalidate_page(dst_pmap,
						    addr);
						pmap_free_zero_pages(&free);
					}
					goto out;
				}
				if (dstmpte->wire_count >= srcmpte->wire_count)
					break;
			}
			addr += PAGE_SIZE;
			src_pte++;
		}
	}
out:
	if (lock != NULL)
		rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);
}

/*
 * pmap_zero_page zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.
4929 */ 4930void 4931pmap_zero_page(vm_page_t m) 4932{ 4933 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4934 4935 pagezero((void *)va); 4936} 4937 4938/* 4939 * pmap_zero_page_area zeros the specified hardware page by mapping 4940 * the page into KVM and using bzero to clear its contents. 4941 * 4942 * off and size may not cover an area beyond a single hardware page. 4943 */ 4944void 4945pmap_zero_page_area(vm_page_t m, int off, int size) 4946{ 4947 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4948 4949 if (off == 0 && size == PAGE_SIZE) 4950 pagezero((void *)va); 4951 else 4952 bzero((char *)va + off, size); 4953} 4954 4955/* 4956 * pmap_zero_page_idle zeros the specified hardware page by mapping 4957 * the page into KVM and using bzero to clear its contents. This 4958 * is intended to be called from the vm_pagezero process only and 4959 * outside of Giant. 4960 */ 4961void 4962pmap_zero_page_idle(vm_page_t m) 4963{ 4964 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4965 4966 pagezero((void *)va); 4967} 4968 4969/* 4970 * pmap_copy_page copies the specified (machine independent) 4971 * page by mapping the page into virtual memory and using 4972 * bcopy to copy the page, one machine dependent page at a 4973 * time. 
 */
void
pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
{
	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));

	pagecopy((void *)src, (void *)dst);
}

int unmapped_buf_allowed = 1;

/*
 * Copy "xfersize" bytes from offset "a_offset" within the page array "ma"
 * to offset "b_offset" within "mb", one page fragment at a time.  Pages
 * whose physical address falls outside the direct map are temporarily
 * mapped through the preallocated cpage_a/cpage_b KVA slots, serialized
 * by cpage_lock and with the thread pinned to one CPU.
 */
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
    vm_offset_t b_offset, int xfersize)
{
	void *a_cp, *b_cp;
	vm_page_t m_a, m_b;
	vm_paddr_t p_a, p_b;
	pt_entry_t *pte;
	vm_offset_t a_pg_offset, b_pg_offset;
	int cnt;
	boolean_t pinned;

	/*
	 * NB: The sequence of updating a page table followed by accesses
	 * to the corresponding pages used in the !DMAP case is subject to
	 * the situation described in the "AMD64 Architecture Programmer's
	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
	 * Coherency Considerations".  Therefore, issuing the INVLPG right
	 * after modifying the PTE bits is crucial.
	 */
	pinned = FALSE;
	while (xfersize > 0) {
		a_pg_offset = a_offset & PAGE_MASK;
		m_a = ma[a_offset >> PAGE_SHIFT];
		p_a = m_a->phys_addr;
		b_pg_offset = b_offset & PAGE_MASK;
		m_b = mb[b_offset >> PAGE_SHIFT];
		p_b = m_b->phys_addr;
		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
		if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
		    p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
			/* Source page is outside the DMAP: map via cpage_a. */
			mtx_lock(&cpage_lock);
			sched_pin();
			pinned = TRUE;
			pte = vtopte(cpage_a);
			*pte = p_a | X86_PG_A | X86_PG_V |
			    pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
			invlpg(cpage_a);
			a_cp = (char *)cpage_a + a_pg_offset;
		} else {
			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
		}
		if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
		    p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
			/* Destination outside the DMAP: map writable. */
			if (!pinned) {
				mtx_lock(&cpage_lock);
				sched_pin();
				pinned = TRUE;
			}
			pte = vtopte(cpage_b);
			*pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
			    X86_PG_V | pmap_cache_bits(kernel_pmap,
			    m_b->md.pat_mode, 0);
			invlpg(cpage_b);
			b_cp = (char *)cpage_b + b_pg_offset;
		} else {
			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
		}
		bcopy(a_cp, b_cp, cnt);
		if (__predict_false(pinned)) {
			sched_unpin();
			mtx_unlock(&cpage_lock);
			pinned = FALSE;
		}
		a_offset += cnt;
		b_offset += cnt;
		xfersize -= cnt;
	}
}

/*
 * Returns true if the pmap's pv is one of the first
 * 16 pvs linked to from this page.  This count may
 * be changed upwards or downwards in the future; it
 * is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
5063 */ 5064boolean_t 5065pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5066{ 5067 struct md_page *pvh; 5068 struct rwlock *lock; 5069 pv_entry_t pv; 5070 int loops = 0; 5071 boolean_t rv; 5072 5073 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5074 ("pmap_page_exists_quick: page %p is not managed", m)); 5075 rv = FALSE; 5076 rw_rlock(&pvh_global_lock); 5077 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5078 rw_rlock(lock); 5079 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5080 if (PV_PMAP(pv) == pmap) { 5081 rv = TRUE; 5082 break; 5083 } 5084 loops++; 5085 if (loops >= 16) 5086 break; 5087 } 5088 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5089 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5090 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5091 if (PV_PMAP(pv) == pmap) { 5092 rv = TRUE; 5093 break; 5094 } 5095 loops++; 5096 if (loops >= 16) 5097 break; 5098 } 5099 } 5100 rw_runlock(lock); 5101 rw_runlock(&pvh_global_lock); 5102 return (rv); 5103} 5104 5105/* 5106 * pmap_page_wired_mappings: 5107 * 5108 * Return the number of managed mappings to the given physical page 5109 * that are wired. 
5110 */ 5111int 5112pmap_page_wired_mappings(vm_page_t m) 5113{ 5114 struct rwlock *lock; 5115 struct md_page *pvh; 5116 pmap_t pmap; 5117 pt_entry_t *pte; 5118 pv_entry_t pv; 5119 int count, md_gen, pvh_gen; 5120 5121 if ((m->oflags & VPO_UNMANAGED) != 0) 5122 return (0); 5123 rw_rlock(&pvh_global_lock); 5124 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5125 rw_rlock(lock); 5126restart: 5127 count = 0; 5128 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5129 pmap = PV_PMAP(pv); 5130 if (!PMAP_TRYLOCK(pmap)) { 5131 md_gen = m->md.pv_gen; 5132 rw_runlock(lock); 5133 PMAP_LOCK(pmap); 5134 rw_rlock(lock); 5135 if (md_gen != m->md.pv_gen) { 5136 PMAP_UNLOCK(pmap); 5137 goto restart; 5138 } 5139 } 5140 pte = pmap_pte(pmap, pv->pv_va); 5141 if ((*pte & PG_W) != 0) 5142 count++; 5143 PMAP_UNLOCK(pmap); 5144 } 5145 if ((m->flags & PG_FICTITIOUS) == 0) { 5146 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5147 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5148 pmap = PV_PMAP(pv); 5149 if (!PMAP_TRYLOCK(pmap)) { 5150 md_gen = m->md.pv_gen; 5151 pvh_gen = pvh->pv_gen; 5152 rw_runlock(lock); 5153 PMAP_LOCK(pmap); 5154 rw_rlock(lock); 5155 if (md_gen != m->md.pv_gen || 5156 pvh_gen != pvh->pv_gen) { 5157 PMAP_UNLOCK(pmap); 5158 goto restart; 5159 } 5160 } 5161 pte = pmap_pde(pmap, pv->pv_va); 5162 if ((*pte & PG_W) != 0) 5163 count++; 5164 PMAP_UNLOCK(pmap); 5165 } 5166 } 5167 rw_runlock(lock); 5168 rw_runlock(&pvh_global_lock); 5169 return (count); 5170} 5171 5172/* 5173 * Returns TRUE if the given page is mapped individually or as part of 5174 * a 2mpage. Otherwise, returns FALSE. 
5175 */ 5176boolean_t 5177pmap_page_is_mapped(vm_page_t m) 5178{ 5179 struct rwlock *lock; 5180 boolean_t rv; 5181 5182 if ((m->oflags & VPO_UNMANAGED) != 0) 5183 return (FALSE); 5184 rw_rlock(&pvh_global_lock); 5185 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5186 rw_rlock(lock); 5187 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5188 ((m->flags & PG_FICTITIOUS) == 0 && 5189 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5190 rw_runlock(lock); 5191 rw_runlock(&pvh_global_lock); 5192 return (rv); 5193} 5194 5195/* 5196 * Destroy all managed, non-wired mappings in the given user-space 5197 * pmap. This pmap cannot be active on any processor besides the 5198 * caller. 5199 * 5200 * This function cannot be applied to the kernel pmap. Moreover, it 5201 * is not intended for general use. It is only to be used during 5202 * process termination. Consequently, it can be implemented in ways 5203 * that make it faster than pmap_remove(). First, it can more quickly 5204 * destroy mappings by iterating over the pmap's collection of PV 5205 * entries, rather than searching the page table. Second, it doesn't 5206 * have to test and clear the page table entries atomically, because 5207 * no processor is currently accessing the user address space. In 5208 * particular, a page table entry's dirty bit won't change state once 5209 * this function starts. 5210 */ 5211void 5212pmap_remove_pages(pmap_t pmap) 5213{ 5214 pd_entry_t ptepde; 5215 pt_entry_t *pte, tpte; 5216 pt_entry_t PG_M, PG_RW, PG_V; 5217 struct spglist free; 5218 vm_page_t m, mpte, mt; 5219 pv_entry_t pv; 5220 struct md_page *pvh; 5221 struct pv_chunk *pc, *npc; 5222 struct rwlock *lock; 5223 int64_t bit; 5224 uint64_t inuse, bitmask; 5225 int allfree, field, freed, idx; 5226 boolean_t superpage; 5227 vm_paddr_t pa; 5228 5229 /* 5230 * Assert that the given pmap is only active on the current 5231 * CPU. Unfortunately, we cannot block another CPU from 5232 * activating the pmap while this function is executing. 
 */
	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
#ifdef INVARIANTS
	{
		cpuset_t other_cpus;

		other_cpus = all_cpus;
		critical_enter();
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
		CPU_AND(&other_cpus, &pmap->pm_active);
		critical_exit();
		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
	}
#endif

	lock = NULL;
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	SLIST_INIT(&free);
	rw_rlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	/*
	 * Walk the pmap's PV chunks; each set bit in a chunk's inverted
	 * free map identifies a live PV entry.
	 */
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = bsfq(inuse);
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pte = pmap_pdpe(pmap, pv->pv_va);
				ptepde = *pte;
				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
				tpte = *pte;
				if ((tpte & (PG_PS | PG_V)) == PG_V) {
					/* PDE references a 4KB page table. */
					superpage = FALSE;
					ptepde = tpte;
					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
					    PG_FRAME);
					pte = &pte[pmap_pte_index(pv->pv_va)];
					tpte = *pte;
				} else {
					/*
					 * Keep track whether 'tpte' is a
					 * superpage explicitly instead of
					 * relying on PG_PS being set.
					 *
					 * This is because PG_PS is numerically
					 * identical to PG_PTE_PAT and thus a
					 * regular page could be mistaken for
					 * a superpage.
					 */
					superpage = TRUE;
				}

				if ((tpte & PG_V) == 0) {
					panic("bad pte va %lx pte %lx",
					    pv->pv_va, tpte);
				}

/*
 * We cannot remove wired pages from a process' mapping at this time
 */
				if (tpte & PG_W) {
					allfree = 0;
					continue;
				}

				if (superpage)
					pa = tpte & PG_PS_FRAME;
				else
					pa = tpte & PG_FRAME;

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad tpte %#jx",
				    (uintmax_t)tpte));

				/* Non-atomic clear: see function header. */
				pte_clear(pte);

				/*
				 * Update the vm_page_t clean/reference bits.
				 */
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
					if (superpage) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
					} else
						vm_page_dirty(m);
				}

				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				/* Mark free */
				pc->pc_map[field] |= bitmask;
				if (superpage) {
					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
					pvh->pv_gen++;
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/* Also free the unused page table page. */
					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
					if (mpte != NULL) {
						pmap_remove_pt_page(pmap, mpte);
						pmap_resident_count_dec(pmap, 1);
						KASSERT(mpte->wire_count == NPTEPG,
						    ("pmap_remove_pages: pte page wire count error"));
						mpte->wire_count = 0;
						pmap_add_delayed_free_list(mpte, &free, FALSE);
						atomic_subtract_int(&cnt.v_wire_count, 1);
					}
				} else {
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
					m->md.pv_gen++;
					if ((m->aflags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m, PGA_WRITEABLE);
					}
				}
				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_all(pmap);
	rw_runlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(&free);
}

/*
 * Test the referenced ("accessed") and/or modified ("dirty") state of all
 * mappings of page "m", 4KB and 2MB alike.  Returns TRUE as soon as one
 * mapping has every requested bit set.  Uses the same trylock/generation
 * restart protocol as pmap_page_wired_mappings() above.
 */
static boolean_t
pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
{
	struct rwlock *lock;
	pv_entry_t pv;
	struct md_page *pvh;
	pt_entry_t *pte, mask;
	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
	pmap_t pmap;
	int md_gen, pvh_gen;
	boolean_t rv;

	rv = FALSE;
	rw_rlock(&pvh_global_lock);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte(pmap, pv->pv_va);
		/* The PG_* bit positions vary per pmap type (e.g. EPT). */
		mask = 0;
		if (modified) {
			PG_M = pmap_modified_bit(pmap);
			PG_RW = pmap_rw_bit(pmap);
			mask |= PG_RW | PG_M;
		}
		if (accessed) {
			PG_A = pmap_accessed_bit(pmap);
			PG_V = pmap_valid_bit(pmap);
			mask |= PG_V | PG_A;
		}
		rv = (*pte & mask) == mask;
		PMAP_UNLOCK(pmap);
		if (rv)
			goto out;
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			if (!PMAP_TRYLOCK(pmap)) {
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pde(pmap, pv->pv_va);
			mask = 0;
			if (modified) {
				PG_M = pmap_modified_bit(pmap);
				PG_RW = pmap_rw_bit(pmap);
				mask |= PG_RW | PG_M;
			}
			if (accessed) {
				PG_A = pmap_accessed_bit(pmap);
				PG_V = pmap_valid_bit(pmap);
				mask |= PG_V | PG_A;
			}
			rv = (*pte & mask) == mask;
			PMAP_UNLOCK(pmap);
			if (rv)
				goto out;
		}
	}
out:
	rw_runlock(lock);
	rw_runlock(&pvh_global_lock);
	return (rv);
}

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_modified: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
	 * is clear, no PTEs can have PG_M set.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return (FALSE);
	return (pmap_page_test_mappings(m, FALSE, TRUE));
}

/*
 * pmap_is_prefaultable:
 *
 *	Return whether or not the specified virtual address is eligible
 *	for prefault.
 */
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	boolean_t rv;

	PG_V = pmap_valid_bit(pmap);
	rv = FALSE;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr);
	/*
	 * Prefaultable only if the page table page exists, is not a 2MB
	 * mapping, and the 4KB PTE itself is not yet valid.
	 */
	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
		pte = pmap_pde_to_pte(pde, addr);
		rv = (*pte & PG_V) == 0;
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * pmap_is_referenced:
 *
 *	Return whether or not the specified physical page was referenced
 *	in any physical maps.
 */
boolean_t
pmap_is_referenced(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	return (pmap_page_test_mappings(m, TRUE, FALSE));
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pd_entry_t *pde;
	pt_entry_t oldpte, *pte, PG_M, PG_RW;
	vm_offset_t va;
	int pvh_gen, md_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * set by another thread while the object is locked.  Thus,
	 * if PGA_WRITEABLE is clear, no page table entries need updating.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return;
	rw_rlock(&pvh_global_lock);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry_pv_loop:
	rw_wlock(lock);
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	/*
	 * Writeable 2MB mappings are demoted first, turning them into 4KB
	 * mappings that are then write-protected by the loop below.
	 */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Drop/retake locks in order; restart if list moved. */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				rw_wunlock(lock);
				goto retry_pv_loop;
			}
		}
		PG_RW = pmap_rw_bit(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, va);
		if ((*pde & PG_RW) != 0)
			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				rw_wunlock(lock);
				goto retry_pv_loop;
			}
		}
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0,
		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
		    m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
retry:
		oldpte = *pte;
		if (oldpte & PG_RW) {
			/*
			 * Atomically clear PG_RW and PG_M together so that a
			 * concurrent write through this PTE cannot set PG_M
			 * after it has been sampled here.
			 */
			if (!atomic_cmpset_long(pte, oldpte, oldpte &
			    ~(PG_RW | PG_M)))
				goto retry;
			if ((oldpte & PG_M) != 0)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_runlock(&pvh_global_lock);
}

/*
 * Returns TRUE if the accessed bit in the given PTE may be cleared safely.
 * This is always the case when the pmap does not emulate A/D bits; for EPT
 * pmaps that do, clearing EPT_PG_READ is only legal for certain permission
 * combinations (see below).
 */
static __inline boolean_t
safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
{

	if (!pmap_emulate_ad_bits(pmap))
		return (TRUE);

	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));

	/*
	 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
	 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
	 * if the EPT_PG_WRITE bit is set.
	 */
	if ((pte & EPT_PG_WRITE) != 0)
		return (FALSE);

	/*
	 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
	 */
	if ((pte & EPT_PG_EXECUTE) == 0 ||
	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
		return (TRUE);
	else
		return (FALSE);
}

/* Upper bound on the number of reference bits cleared per call. */
#define	PMAP_TS_REFERENCED_MAX	5

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 * XXX: The exact number of bits to check and clear is a matter that
 * should be tested and standardized at some point in the future for
 * optimal aging of shared pages.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte, PG_A;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, pvh_gen;
	struct spglist free;
	boolean_t demoted;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	SLIST_INIT(&free);
	cleared = 0;
	pa = VM_PAGE_TO_PHYS(m);
	lock = PHYS_TO_PV_LIST_LOCK(pa);
	pvh = pa_to_pvh(pa);
	rw_rlock(&pvh_global_lock);
	rw_wlock(lock);
retry:
	not_cleared = 0;
	if ((m->flags & PG_FICTITIOUS) != 0 ||
	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Drop/retake locks in order; restart if list moved. */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		PG_A = pmap_accessed_bit(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, pv->pv_va);
		oldpde = *pde;
		if ((*pde & PG_A) != 0) {
			/*
			 * Since this reference bit is shared by 512 4KB
			 * pages, it should not be cleared every time it is
			 * tested.  Apply a simple "hash" function on the
			 * physical page number, the virtual superpage number,
			 * and the pmap address to select one 4KB page out of
			 * the 512 on which testing the reference bit will
			 * result in clearing that reference bit.  This
			 * function is designed to avoid the selection of the
			 * same 4KB page for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
			    (*pde & PG_W) == 0) {
				if (safe_to_clear_referenced(pmap, oldpde)) {
					atomic_clear_long(pde, PG_A);
					pmap_invalidate_page(pmap, pv->pv_va);
					demoted = FALSE;
				} else if (pmap_demote_pde_locked(pmap, pde,
				    pv->pv_va, &lock)) {
					/*
					 * Remove the mapping to a single page
					 * so that a subsequent access may
					 * repromote.  Since the underlying
					 * page table page is fully populated,
					 * this removal never frees a page
					 * table page.
					 */
					demoted = TRUE;
					va += VM_PAGE_TO_PHYS(m) - (oldpde &
					    PG_PS_FRAME);
					pte = pmap_pde_to_pte(pde, va);
					pmap_remove_pte(pmap, pte, va, *pde,
					    NULL, &lock);
					pmap_invalidate_page(pmap, va);
				} else
					demoted = TRUE;

				if (demoted) {
					/*
					 * The superpage mapping was removed
					 * entirely and therefore 'pv' is no
					 * longer valid.
					 */
					if (pvf == pv)
						pvf = NULL;
					pv = NULL;
				}
				cleared++;
				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
				    ("inconsistent pv lock %p %p for page %p",
				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		/* Stop early once enough reference bits have been sampled. */
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		PG_A = pmap_accessed_bit(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0,
		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
		    m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		if ((*pte & PG_A) != 0) {
			if (safe_to_clear_referenced(pmap, *pte)) {
				atomic_clear_long(pte, PG_A);
				pmap_invalidate_page(pmap, pv->pv_va);
				cleared++;
			} else if ((*pte & PG_W) == 0) {
				/*
				 * Wired pages cannot be paged out so
				 * doing accessed bit emulation for
				 * them is wasted effort. We do the
				 * hard work for unwired pages only.
				 */
				pmap_remove_pte(pmap, pte, pv->pv_va,
				    *pde, &free, &lock);
				pmap_invalidate_page(pmap, pv->pv_va);
				cleared++;
				if (pvf == pv)
					pvf = NULL;
				pv = NULL;
				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
				    ("inconsistent pv lock %p %p for page %p",
				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
	pmap_free_zero_pages(&free);
	return (cleared + not_cleared);
}

/*
 * Apply the given advice to the specified range of addresses within the
 * given pmap.  Depending on the advice, clear the referenced and/or
 * modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	struct rwlock *lock;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
	vm_offset_t va_next;
	vm_page_t m;
	boolean_t anychanged, pv_lists_locked;

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	/*
	 * A/D bit emulation requires an alternate code path when clearing
	 * the modified and accessed bits below.  Since this function is
	 * advisory in nature we skip it entirely for pmaps that require
	 * A/D bit emulation.
	 */
	if (pmap_emulate_ad_bits(pmap))
		return;

	PG_A = pmap_accessed_bit(pmap);
	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	pv_lists_locked = FALSE;
resume:
	anychanged = FALSE;
	PMAP_LOCK(pmap);
	/* Walk the range one page table hierarchy "step" at a time. */
	for (; sva < eva; sva = va_next) {
		pml4e = pmap_pml4e(pmap, sva);
		if ((*pml4e & PG_V) == 0) {
			va_next = (sva + NBPML4) & ~PML4MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}
		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
		if ((*pdpe & PG_V) == 0) {
			va_next = (sva + NBPDP) & ~PDPMASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}
		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;
		pde = pmap_pdpe_to_pde(pdpe, sva);
		oldpde = *pde;
		if ((oldpde & PG_V) == 0)
			continue;
		else if ((oldpde & PG_PS) != 0) {
			if ((oldpde & PG_MANAGED) == 0)
				continue;
			if (!pv_lists_locked) {
				pv_lists_locked = TRUE;
				if (!rw_try_rlock(&pvh_global_lock)) {
					/*
					 * Cannot block on the pv global lock
					 * while holding the pmap lock; restart
					 * the scan with both locks held.
					 */
					if (anychanged)
						pmap_invalidate_all(pmap);
					PMAP_UNLOCK(pmap);
					rw_rlock(&pvh_global_lock);
					goto resume;
				}
			}
			lock = NULL;
			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
				if (lock != NULL)
					rw_wunlock(lock);

				/*
				 * The large page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Since the underlying page
			 * table page is fully populated, this removal never
			 * frees a page table page.
			 */
			if ((oldpde & PG_W) == 0) {
				pte = pmap_pde_to_pte(pde, sva);
				KASSERT((*pte & PG_V) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
				    &lock);
				anychanged = TRUE;
			}
			if (lock != NULL)
				rw_wunlock(lock);
		}
		if (va_next > eva)
			va_next = eva;
		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
			    PG_V))
				continue;
			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
					vm_page_dirty(m);
				}
				atomic_clear_long(pte, PG_M | PG_A);
			} else if ((*pte & PG_A) != 0)
				atomic_clear_long(pte, PG_A);
			else
				continue;
			/*
			 * Global mappings are invalidated immediately rather
			 * than being deferred to the final
			 * pmap_invalidate_all() below.
			 */
			if ((*pte & PG_G) != 0)
				pmap_invalidate_page(pmap, sva);
			else
				anychanged = TRUE;
		}
	}
	if (anychanged)
		pmap_invalidate_all(pmap);
	if (pv_lists_locked)
		rw_runlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t oldpde, *pde;
	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
	struct rwlock *lock;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	KASSERT(!vm_page_xbusied(m),
	    ("pmap_clear_modify: page %p is exclusive busied", m));

	/*
	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
	 * If the object containing the page is locked and the page is not
	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
	 */
	if ((m->aflags & PGA_WRITEABLE) == 0)
		return;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	rw_rlock(&pvh_global_lock);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
restart:
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	/* Handle writeable 2MB mappings of the page by demoting them. */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Drop/retake locks in order; restart if list moved. */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		PG_M = pmap_modified_bit(pmap);
		PG_V = pmap_valid_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_RW) != 0) {
			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
				if ((oldpde & PG_W) == 0) {
					/*
					 * Write protect the mapping to a
					 * single page so that a subsequent
					 * write access may repromote.
					 */
					va += VM_PAGE_TO_PHYS(m) - (oldpde &
					    PG_PS_FRAME);
					pte = pmap_pde_to_pte(pde, va);
					oldpte = *pte;
					if ((oldpte & PG_V) != 0) {
						while (!atomic_cmpset_long(pte,
						    oldpte,
						    oldpte & ~(PG_M | PG_RW)))
							oldpte = *pte;
						vm_page_dirty(m);
						pmap_invalidate_page(pmap, va);
					}
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		/* Only writeable PTEs can have a meaningful PG_M to clear. */
		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
			atomic_clear_long(pte, PG_M);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
}

/*
 * Miscellaneous support routines follow
 */

/* Adjust the cache mode for a 4KB page mapped via a PTE. */
static __inline void
pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
{
	u_int opte, npte;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PTE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opte = *(u_int *)pte;
		npte = opte & ~mask;
		npte |= cache_bits;
	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
}

/* Adjust the cache mode for a 2MB page mapped via a PDE.
 */
static __inline void
pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
{
	u_int opde, npde;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PDE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opde = *(u_int *)pde;
		npde = opde & ~mask;
		npde |= cache_bits;
	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
}

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space.  Return a pointer to where it is mapped.  This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 */
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{
	vm_offset_t va, offset;
	vm_size_t tmpsize;

	/*
	 * If the specified range of physical addresses fits within the direct
	 * map window, use the direct map.
	 */
	if (pa < dmaplimit && pa + size < dmaplimit) {
		va = PHYS_TO_DMAP(pa);
		if (!pmap_change_attr(va, size, mode))
			return ((void *)va);
	}
	/* Otherwise allocate KVA and map each page with the given mode. */
	offset = pa & PAGE_MASK;
	size = round_page(offset + size);
	va = kva_alloc(size);
	if (!va)
		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
	pa = trunc_page(pa);
	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
	pmap_invalidate_cache_range(va, va + tmpsize);
	return ((void *)(va + offset));
}

/* Map device memory uncacheable. */
void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
}

/* Map BIOS memory write-back cacheable. */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
}

void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
	vm_offset_t base, offset;

	/* If we gave a direct map region in pmap_mapdev, do nothing */
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return;
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);
	kva_free(base, size);
}

/*
 * Tries to demote a 1GB page mapping.
 */
static boolean_t
pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
{
	pdp_entry_t newpdpe, oldpdpe;
	pd_entry_t *firstpde, newpde, *pde;
	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
	vm_paddr_t mpdepa;
	vm_page_t mpde;

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpdpe = *pdpe;
	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	mpdepa = VM_PAGE_TO_PHYS(mpde);
	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
	KASSERT((oldpdpe & PG_A) != 0,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
	newpde = oldpdpe;

	/*
	 * Initialize the page directory page: 512 2MB entries that together
	 * cover the same 1GB of physical memory with the same attributes.
	 */
	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
		*pde = newpde;
		newpde += NBPDR;
	}

	/*
	 * Demote the mapping.
	 */
	*pdpe = newpdpe;

	/*
	 * Invalidate a stale recursive mapping of the page directory page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));

	pmap_pdpe_demotions++;
	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	m->md.pat_mode = ma;

	/*
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
	    m->md.pat_mode))
		panic("memory attribute change on the direct map failed");
}

/*
 * Changes the specified virtual address range's memory type to that given by
 * the parameter "mode".  The specified virtual address range must be
 * completely contained within either the direct map or the kernel map.  If
 * the virtual address range is contained within the kernel map, then the
 * memory type for each of the corresponding ranges of the direct map is also
 * changed.  (The corresponding ranges of the direct map are those ranges that
 * map the same physical pages as the specified virtual address range.)  These
 * changes to the direct map are necessary because Intel describes the
 * behavior of their processors as "undefined" if two or more mappings to the
 * same physical page have different memory types.
 *
 * Returns zero if the change completed successfully, and either EINVAL or
 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 * of the virtual address range was not mapped, and ENOMEM is returned if
 * there was insufficient memory available to complete the change.
 * In the latter case, the memory type may have been changed on some part
 * of the virtual address range or the direct map.
 */
int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
	int error;

	PMAP_LOCK(kernel_pmap);
	error = pmap_change_attr_locked(va, size, mode);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

/*
 * Internal, locked form of pmap_change_attr().  The kernel pmap lock must
 * be held; this function recurses on itself to apply the change to the
 * corresponding direct map ranges.
 */
static int
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
{
	vm_offset_t base, offset, tmpva;
	vm_paddr_t pa_start, pa_end;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	int cache_bits_pte, cache_bits_pde, error;
	boolean_t changed;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	/*
	 * Only supported on kernel virtual addresses, including the direct
	 * map but excluding the recursive map.
	 */
	if (base < DMAP_MIN_ADDRESS)
		return (EINVAL);

	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
	changed = FALSE;

	/*
	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
	 * into 4KB pages if required.
	 */
	for (tmpva = base; tmpva < base + size; ) {
		pdpe = pmap_pdpe(kernel_pmap, tmpva);
		if (*pdpe == 0)
			return (EINVAL);
		if (*pdpe & PG_PS) {
			/*
			 * If the current 1GB page already has the required
			 * memory type, then we need not demote this page. Just
			 * increment tmpva to the next 1GB page frame.
			 */
			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
				tmpva = trunc_1gpage(tmpva) + NBPDP;
				continue;
			}

			/*
			 * If the current offset aligns with a 1GB page frame
			 * and there is at least 1GB left within the range, then
			 * we need not break down this page into 2MB pages.
			 */
			if ((tmpva & PDPMASK) == 0 &&
			    tmpva + PDPMASK < base + size) {
				tmpva += NBPDP;
				continue;
			}
			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
				return (ENOMEM);
		}
		pde = pmap_pdpe_to_pde(pdpe, tmpva);
		if (*pde == 0)
			return (EINVAL);
		if (*pde & PG_PS) {
			/*
			 * If the current 2MB page already has the required
			 * memory type, then we need not demote this page. Just
			 * increment tmpva to the next 2MB page frame.
			 */
			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
				tmpva = trunc_2mpage(tmpva) + NBPDR;
				continue;
			}

			/*
			 * If the current offset aligns with a 2MB page frame
			 * and there is at least 2MB left within the range, then
			 * we need not break down this page into 4KB pages.
			 */
			if ((tmpva & PDRMASK) == 0 &&
			    tmpva + PDRMASK < base + size) {
				tmpva += NBPDR;
				continue;
			}
			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
				return (ENOMEM);
		}
		pte = pmap_pde_to_pte(pde, tmpva);
		if (*pte == 0)
			return (EINVAL);
		tmpva += PAGE_SIZE;
	}
	error = 0;

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode if required.
	 */
	pa_start = pa_end = 0;
	for (tmpva = base; tmpva < base + size; ) {
		pdpe = pmap_pdpe(kernel_pmap, tmpva);
		if (*pdpe & PG_PS) {
			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
				pmap_pde_attr(pdpe, cache_bits_pde,
				    X86_PG_PDE_CACHE);
				changed = TRUE;
			}
			/*
			 * Accumulate contiguous physical runs so that the
			 * direct map can be updated with as few recursive
			 * calls as possible.
			 */
			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *pdpe & PG_PS_FRAME;
					pa_end = pa_start + NBPDP;
				} else if (pa_end == (*pdpe & PG_PS_FRAME))
					pa_end += NBPDP;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *pdpe & PG_PS_FRAME;
					pa_end = pa_start + NBPDP;
				}
			}
			tmpva = trunc_1gpage(tmpva) + NBPDP;
			continue;
		}
		pde = pmap_pdpe_to_pde(pdpe, tmpva);
		if (*pde & PG_PS) {
			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
				pmap_pde_attr(pde, cache_bits_pde,
				    X86_PG_PDE_CACHE);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *pde & PG_PS_FRAME;
					pa_end = pa_start + NBPDR;
				} else if (pa_end == (*pde & PG_PS_FRAME))
					pa_end += NBPDR;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *pde & PG_PS_FRAME;
					pa_end = pa_start + NBPDR;
				}
			}
			tmpva = trunc_2mpage(tmpva) + NBPDR;
		} else {
			pte = pmap_pde_to_pte(pde, tmpva);
			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
				pmap_pte_attr(pte, cache_bits_pte,
				    X86_PG_PTE_CACHE);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *pte & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				} else if (pa_end == (*pte & PG_FRAME))
					pa_end += PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *pte & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				}
			}
			tmpva += PAGE_SIZE;
		}
	}
	/* Flush any remaining physical address run to the direct map. */
	if (error == 0 && pa_start != pa_end)
		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
		    pa_end - pa_start, mode);

	/*
	 * Flush CPU caches if required to make sure any data isn't cached that
	 * shouldn't be, etc.
	 */
	if (changed) {
		pmap_invalidate_range(kernel_pmap, base, tmpva);
		pmap_invalidate_cache_range(base, tmpva);
	}
	return (error);
}

/*
 * Demotes any mapping within the direct map region that covers more than the
 * specified range of physical addresses.  This range's size must be a power
 * of two and its starting address must be a multiple of its size.  Since the
 * demotion does not change any attributes of the mapping, a TLB invalidation
 * is not mandatory.  The caller may, however, request a TLB invalidation.
 */
void
pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	vm_offset_t va;
	boolean_t changed;

	if (len == 0)
		return;
	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
	KASSERT((base & (len - 1)) == 0,
	    ("pmap_demote_DMAP: base is not a multiple of len"));
	if (len < NBPDP && base < dmaplimit) {
		va = PHYS_TO_DMAP(base);
		changed = FALSE;
		PMAP_LOCK(kernel_pmap);
		pdpe = pmap_pdpe(kernel_pmap, va);
		if ((*pdpe & X86_PG_V) == 0)
			panic("pmap_demote_DMAP: invalid PDPE");
		if ((*pdpe & PG_PS) != 0) {
			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
				panic("pmap_demote_DMAP: PDPE failed");
			changed = TRUE;
		}
		if (len < NBPDR) {
			pde = pmap_pdpe_to_pde(pdpe, va);
			if ((*pde & X86_PG_V) == 0)
				panic("pmap_demote_DMAP: invalid PDE");
			if ((*pde & PG_PS) != 0) {
				if (!pmap_demote_pde(kernel_pmap, pde, va))
					panic("pmap_demote_DMAP: PDE failed");
				changed = TRUE;
			}
		}
		if (changed && invalidate)
			pmap_invalidate_page(kernel_pmap, va);
		PMAP_UNLOCK(kernel_pmap);
	}
}

/*
 * perform the pmap work for mincore
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
	pd_entry_t *pdep;
	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
	vm_paddr_t pa;
	int val;

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK(pmap);
retry:
	pdep = pmap_pde(pmap, addr);
	if (pdep != NULL && (*pdep & PG_V)) {
		if (*pdep & PG_PS) {
			pte = *pdep;
			/* Compute the physical address of the 4KB page. */
			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
			    PG_FRAME;
			val = MINCORE_SUPER;
		} else {
			pte = *pmap_pde_to_pte(pdep, addr);
			pa = pte & PG_FRAME;
			val = 0;
		}
	} else {
		pte = 0;
		pa = 0;
		val = 0;
	}
	if ((pte & PG_V) != 0) {
		val |= MINCORE_INCORE;
		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((pte & PG_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
			goto retry;
	} else
		PA_UNLOCK_COND(*locked_pa);
	PMAP_UNLOCK(pmap);
	return (val);
}

/*
 * Make the given thread's vmspace pmap the active pmap on the current CPU:
 * update the active/save CPU sets and load the new page table base into
 * %cr3.  Runs in a critical section to prevent migration mid-switch.
 */
void
pmap_activate(struct thread *td)
{
	pmap_t pmap, oldpmap;
	u_int cpuid;

	critical_enter();
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	oldpmap = PCPU_GET(curpmap);
	cpuid = PCPU_GET(cpuid);
#ifdef SMP
	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
#else
	CPU_CLR(cpuid, &oldpmap->pm_active);
	CPU_SET(cpuid, &pmap->pm_active);
	CPU_SET(cpuid, &pmap->pm_save);
#endif
	td->td_pcb->pcb_cr3 = pmap->pm_cr3;
	load_cr3(pmap->pm_cr3);
	PCPU_SET(curpmap, pmap);
	critical_exit();
}

/*
 * Intentionally empty: no explicit instruction cache synchronization is
 * performed here on amd64 (presumably because the hardware keeps the
 * i-cache coherent -- NOTE(review): confirm against the MI pmap contract).
 */
void
pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < NBPDR)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & PDRMASK;
	/*
	 * Nothing to do if the mapping cannot contain a full superpage or is
	 * already aligned consistently with the object offset.
	 */
	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
	    (*addr & PDRMASK) == superpage_offset)
		return;
	if ((*addr & PDRMASK) < superpage_offset)
		*addr = (*addr & ~PDRMASK) + superpage_offset;
	else
		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}

#ifdef INVARIANTS
/* Debug counters for software A/D bit emulation (INVARIANTS kernels only). */
static unsigned long num_dirty_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
	       &num_dirty_emulations, 0, NULL);

static unsigned long num_accessed_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
	       &num_accessed_emulations, 0, NULL);

static unsigned long num_superpage_accessed_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
	       &num_superpage_accessed_emulations, 0, NULL);

static unsigned long ad_emulation_superpage_promotions;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
	       &ad_emulation_superpage_promotions, 0, NULL);
#endif	/* INVARIANTS */

/*
 * Emulate the accessed/dirty bits in software for pmaps that require it
 * (returns -1 immediately for pmaps with hardware-managed A/D bits).
 */
int
pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
{
	int rv;
	struct rwlock *lock;
	vm_page_t m, mpte;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
	boolean_t pv_lists_locked;

	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));

	if (!pmap_emulate_ad_bits(pmap))
		return (-1);

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
6715 PG_RW = pmap_rw_bit(pmap); 6716 6717 rv = -1; 6718 lock = NULL; 6719 pv_lists_locked = FALSE; 6720retry: 6721 PMAP_LOCK(pmap); 6722 6723 pde = pmap_pde(pmap, va); 6724 if (pde == NULL || (*pde & PG_V) == 0) 6725 goto done; 6726 6727 if ((*pde & PG_PS) != 0) { 6728 if (ftype == VM_PROT_READ) { 6729#ifdef INVARIANTS 6730 atomic_add_long(&num_superpage_accessed_emulations, 1); 6731#endif 6732 *pde |= PG_A; 6733 rv = 0; 6734 } 6735 goto done; 6736 } 6737 6738 pte = pmap_pde_to_pte(pde, va); 6739 if ((*pte & PG_V) == 0) 6740 goto done; 6741 6742 if (ftype == VM_PROT_WRITE) { 6743 if ((*pte & PG_RW) == 0) 6744 goto done; 6745 *pte |= PG_M; 6746 } 6747 *pte |= PG_A; 6748 6749 /* try to promote the mapping */ 6750 if (va < VM_MAXUSER_ADDRESS) 6751 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6752 else 6753 mpte = NULL; 6754 6755 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6756 6757 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 6758 pmap_ps_enabled(pmap) && 6759 (m->flags & PG_FICTITIOUS) == 0 && 6760 vm_reserv_level_iffullpop(m) == 0) { 6761 if (!pv_lists_locked) { 6762 pv_lists_locked = TRUE; 6763 if (!rw_try_rlock(&pvh_global_lock)) { 6764 PMAP_UNLOCK(pmap); 6765 rw_rlock(&pvh_global_lock); 6766 goto retry; 6767 } 6768 } 6769 pmap_promote_pde(pmap, pde, va, &lock); 6770#ifdef INVARIANTS 6771 atomic_add_long(&ad_emulation_superpage_promotions, 1); 6772#endif 6773 } 6774#ifdef INVARIANTS 6775 if (ftype == VM_PROT_WRITE) 6776 atomic_add_long(&num_dirty_emulations, 1); 6777 else 6778 atomic_add_long(&num_accessed_emulations, 1); 6779#endif 6780 rv = 0; /* success */ 6781done: 6782 if (lock != NULL) 6783 rw_wunlock(lock); 6784 if (pv_lists_locked) 6785 rw_runlock(&pvh_global_lock); 6786 PMAP_UNLOCK(pmap); 6787 return (rv); 6788} 6789 6790void 6791pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 6792{ 6793 pml4_entry_t *pml4; 6794 pdp_entry_t *pdp; 6795 pd_entry_t *pde; 6796 pt_entry_t *pte, PG_V; 6797 int idx; 6798 6799 idx = 0; 6800 PG_V = 
pmap_valid_bit(pmap); 6801 PMAP_LOCK(pmap); 6802 6803 pml4 = pmap_pml4e(pmap, va); 6804 ptr[idx++] = *pml4; 6805 if ((*pml4 & PG_V) == 0) 6806 goto done; 6807 6808 pdp = pmap_pml4e_to_pdpe(pml4, va); 6809 ptr[idx++] = *pdp; 6810 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 6811 goto done; 6812 6813 pde = pmap_pdpe_to_pde(pdp, va); 6814 ptr[idx++] = *pde; 6815 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 6816 goto done; 6817 6818 pte = pmap_pde_to_pte(pde, va); 6819 ptr[idx++] = *pte; 6820 6821done: 6822 PMAP_UNLOCK(pmap); 6823 *num = idx; 6824} 6825 6826#include "opt_ddb.h" 6827#ifdef DDB 6828#include <ddb/ddb.h> 6829 6830DB_SHOW_COMMAND(pte, pmap_print_pte) 6831{ 6832 pmap_t pmap; 6833 pml4_entry_t *pml4; 6834 pdp_entry_t *pdp; 6835 pd_entry_t *pde; 6836 pt_entry_t *pte, PG_V; 6837 vm_offset_t va; 6838 6839 if (have_addr) { 6840 va = (vm_offset_t)addr; 6841 pmap = PCPU_GET(curpmap); /* XXX */ 6842 } else { 6843 db_printf("show pte addr\n"); 6844 return; 6845 } 6846 PG_V = pmap_valid_bit(pmap); 6847 pml4 = pmap_pml4e(pmap, va); 6848 db_printf("VA %#016lx pml4e %#016lx", va, *pml4); 6849 if ((*pml4 & PG_V) == 0) { 6850 db_printf("\n"); 6851 return; 6852 } 6853 pdp = pmap_pml4e_to_pdpe(pml4, va); 6854 db_printf(" pdpe %#016lx", *pdp); 6855 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 6856 db_printf("\n"); 6857 return; 6858 } 6859 pde = pmap_pdpe_to_pde(pdp, va); 6860 db_printf(" pde %#016lx", *pde); 6861 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 6862 db_printf("\n"); 6863 return; 6864 } 6865 pte = pmap_pde_to_pte(pde, va); 6866 db_printf(" pte %#016lx\n", *pte); 6867} 6868 6869DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 6870{ 6871 vm_paddr_t a; 6872 6873 if (have_addr) { 6874 a = (vm_paddr_t)addr; 6875 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 6876 } else { 6877 db_printf("show phys2dmap addr\n"); 6878 } 6879} 6880#endif 6881