/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/pmap.c 306558 2016-10-01 19:30:28Z alc $");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduce protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{

	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
}

static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_G;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

extern struct pcpu __pcpu[];

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
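
/*
 * Note that pa_index() shifts by PDRSHIFT, so a physical address is
 * hashed to a pv list lock by its 2MB superpage frame: every 4KB page
 * within the same 2MB frame, and the superpage itself, share a single
 * lock.  For example, physical addresses 0x200000 and 0x3ff000 hash to
 * the same lock, while 0x400000 hashes to the next one.
 */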
"VM/pmap parameters"); 346 347static int pat_works = 1; 348SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 349 "Is page attribute table fully functional?"); 350 351static int pg_ps_enabled = 1; 352SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 353 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 354 355#define PAT_INDEX_SIZE 8 356static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 357 358static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 359static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 360u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 361u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 362 363static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 364static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 365static int ndmpdpphys; /* number of DMPDPphys pages */ 366 367/* 368 * pmap_mapdev support pre initialization (i.e. console) 369 */ 370#define PMAP_PREINIT_MAPPING_COUNT 8 371static struct pmap_preinit_mapping { 372 vm_paddr_t pa; 373 vm_offset_t va; 374 vm_size_t sz; 375 int mode; 376} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 377static int pmap_initialized; 378 379/* 380 * Data for the pv entry allocation mechanism. 381 * Updates to pv_invl_gen are protected by the pv_list_locks[] 382 * elements, but reads are not. 383 */ 384static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 385static struct mtx pv_chunks_mutex; 386static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 387static u_long pv_invl_gen[NPV_LIST_LOCKS]; 388static struct md_page *pv_table; 389static struct md_page pv_dummy; 390 391/* 392 * All those kernel PT submaps that BSD is so fond of 393 */ 394pt_entry_t *CMAP1 = 0; 395caddr_t CADDR1 = 0; 396static vm_offset_t qframe = 0; 397static struct mtx qframe_mtx; 398 399static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 400 401int pmap_pcid_enabled = 1; 402SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 403 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 404int invpcid_works = 0; 405SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 406 "Is the invpcid instruction available ?"); 407 408static int 409pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) 410{ 411 int i; 412 uint64_t res; 413 414 res = 0; 415 CPU_FOREACH(i) { 416 res += cpuid_to_pcpu[i]->pc_pm_save_cnt; 417 } 418 return (sysctl_handle_64(oidp, &res, 0, req)); 419} 420SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW | 421 CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", 422 "Count of saved TLB context on switch"); 423 424static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 425 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 426static struct mtx invl_gen_mtx; 427static u_long pmap_invl_gen = 0; 428/* Fake lock object to satisfy turnstiles interface. */ 429static struct lock_object invl_gen_ts = { 430 .lo_name = "invlts", 431}; 432 433#define PMAP_ASSERT_NOT_IN_DI() \ 434 KASSERT(curthread->td_md.md_invl_gen.gen == 0, ("DI already started")) 435 436/* 437 * Start a new Delayed Invalidation (DI) block of code, executed by 438 * the current thread. Within a DI block, the current thread may 439 * destroy both the page table and PV list entries for a mapping and 440 * then release the corresponding PV list lock before ensuring that 441 * the mapping is flushed from the TLBs of any processors with the 442 * pmap active. 
static void
pmap_delayed_invl_started(void)
{
	struct pmap_invl_gen *invl_gen;
	u_long currgen;

	invl_gen = &curthread->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	mtx_lock(&invl_gen_mtx);
	if (LIST_EMPTY(&pmap_invl_gen_tracker))
		currgen = pmap_invl_gen;
	else
		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
	invl_gen->gen = currgen + 1;
	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
}

/*
 * Finish the DI block, previously started by the current thread.  All
 * required TLB flushes for the pages marked by
 * pmap_delayed_invl_page() must be finished before this function is
 * called.
 *
 * This function works by bumping the global DI generation number to
 * the generation number of the current thread's DI, unless there is a
 * pending DI that started earlier.  In the latter case, bumping the
 * global DI generation number would incorrectly signal that the
 * earlier DI had finished.  Instead, this function bumps the earlier
 * DI's generation number to match the generation number of the
 * current thread's DI.
 */
static void
pmap_delayed_invl_finished(void)
{
	struct pmap_invl_gen *invl_gen, *next;
	struct turnstile *ts;

	invl_gen = &curthread->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
	mtx_lock(&invl_gen_mtx);
	next = LIST_NEXT(invl_gen, link);
	if (next == NULL) {
		turnstile_chain_lock(&invl_gen_ts);
		ts = turnstile_lookup(&invl_gen_ts);
		pmap_invl_gen = invl_gen->gen;
		if (ts != NULL) {
			turnstile_broadcast(ts, TS_SHARED_QUEUE);
			turnstile_unpend(ts, TS_SHARED_LOCK);
		}
		turnstile_chain_unlock(&invl_gen_ts);
	} else {
		next->gen = invl_gen->gen;
	}
	LIST_REMOVE(invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
	invl_gen->gen = 0;
}

#ifdef PV_STATS
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
    "Number of times DI invalidation blocked pmap_remove_all/write");
#endif

static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{

	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
}

/*
 * Ensure that all currently executing DI blocks, that need to flush
 * TLB for the given page m, actually flushed the TLB at the time the
 * function returned.  If the page m has an empty PV list and we call
 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
 * valid mapping for the page m in either its page table or TLB.
 *
 * This function works by blocking until the global DI generation
 * number catches up with the generation number associated with the
 * given page m and its PV list.  Since this function's callers
 * typically own an object lock and sometimes own a page lock, it
 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 * processor.
 */
static void
pmap_delayed_invl_wait(vm_page_t m)
{
	struct thread *td;
	struct turnstile *ts;
	u_long *m_gen;
#ifdef PV_STATS
	bool accounted = false;
#endif

	td = curthread;
	m_gen = pmap_delayed_invl_genp(m);
	while (*m_gen > pmap_invl_gen) {
#ifdef PV_STATS
		if (!accounted) {
			atomic_add_long(&invl_wait, 1);
			accounted = true;
		}
#endif
		ts = turnstile_trywait(&invl_gen_ts);
		if (*m_gen > pmap_invl_gen)
			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
		else
			turnstile_cancel(ts);
	}
}

/*
 * Mark the page m's PV list as participating in the current thread's
 * DI block.  Any threads concurrently using m's PV list to remove or
 * restrict all mappings to m will wait for the current thread's DI
 * block to complete before proceeding.
 *
 * The function works by setting the DI generation number for m's PV
 * list to at least the DI generation number of the current thread.
 * This forces a caller of pmap_delayed_invl_wait() to block until
 * the current thread calls pmap_delayed_invl_finished().
 */
static void
pmap_delayed_invl_page(vm_page_t m)
{
	u_long gen, *m_gen;

	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
	gen = curthread->td_md.md_invl_gen.gen;
	if (gen == 0)
		return;
	m_gen = pmap_delayed_invl_genp(m);
	if (*m_gen < gen)
		*m_gen = gen;
}
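
/*
 * Taken together, the functions above implement the DI protocol.  A
 * typical user (e.g., pmap_remove()) follows a pattern like this
 * sketch:
 *
 *	pmap_delayed_invl_started();
 *	PMAP_LOCK(pmap);
 *	... destroy PTEs, call pmap_delayed_invl_page(m) with the PV
 *	    list lock held, then drop the PV list lock ...
 *	PMAP_UNLOCK(pmap);
 *	pmap_invalidate_range(pmap, sva, eva);
 *	pmap_delayed_invl_finished();
 *
 * A thread blocked in pmap_delayed_invl_wait() on one of the marked
 * pages cannot return before the TLB flush completes, because
 * pmap_delayed_invl_finished() runs only after the flush.
 */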
/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_pq(uint64_t *map);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t sva, vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
		struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = roundup2(addr, NBPDR);
	return (newaddr);
}

/********************/
/* Inline functions */
/********************/
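
/*
 * The helpers below walk the 4-level x86-64 page table radix: bits
 * 47-39 of a canonical VA index the PML4, bits 38-30 the PDP, bits
 * 29-21 the PD, and bits 20-12 the PT; the low 12 bits are the byte
 * offset within a 4KB page.  Each level is reached through the direct
 * map rather than a recursive mapping.
 */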
/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));

	return (PDmap + ((va >> PDRSHIFT) & mask));
}
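
/*
 * Unlike the pmap_*() walkers above, vtopte() and vtopde() go through
 * the recursive PML4 slot (PML4PML4I) installed by create_pagetables():
 * because the PML4 maps itself, PTmap and PDmap are windows onto the
 * current page table, and the masks merely clip the sign-extended VA
 * down to the concatenated index fields.
 */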
static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	u_int64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

CTASSERT(powerof2(NDMPML4E));

/* number of kernel PDP slots */
#define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)

static void
nkpt_init(vm_paddr_t addr)
{
	int pt_pages;

#ifdef NKPT
	pt_pages = NKPT;
#else
	pt_pages = howmany(addr, 1 << PDRSHIFT);
	pt_pages += NKPDPE(pt_pages);

	/*
	 * Add some slop beyond the bare minimum required for bootstrapping
	 * the kernel.
	 *
	 * This is quite important when allocating KVA for kernel modules.
	 * The modules are required to be linked in the negative 2GB of
	 * the address space.  If we run out of KVA in this region then
	 * pmap_growkernel() will need to allocate page table pages to map
	 * the entire 512GB of KVA space which is an unnecessary tax on
	 * physical memory.
	 *
	 * Secondly, device memory mapped as part of setting up the low-
	 * level console(s) is taken from KVA, starting at virtual_avail.
	 * This is because cninit() is called after pmap_bootstrap() but
	 * before vm_init() and pmap_init().  20MB for a frame buffer is
	 * not uncommon.
	 */
	pt_pages += 32;		/* 64MB additional slop. */
#endif
	nkpt = pt_pages;
}
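
/*
 * A worked example of the non-NKPT path: if bootstrap allocations end
 * at 100MB, howmany(addr, 2MB) yields 50 page table pages, NKPDPE(50)
 * adds one PD page, and the 32-page slop brings nkpt to 83.  Since
 * each page table page maps 2MB, the slop alone covers 64MB of early
 * KVA demand.
 */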
static void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i, j, ndm1g, nkpdpe;
	pt_entry_t *pt_p;
	pd_entry_t *pd_p;
	pdp_entry_t *pdp_p;
	pml4_entry_t *p4_p;

	/* Allocate page table pages for the direct map */
	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
	if (ndmpdp < 4)		/* Minimum 4GB of direct map */
		ndmpdp = 4;
	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
	if (ndmpdpphys > NDMPML4E) {
		/*
		 * Each NDMPML4E allows 512 GB, so limit to that,
		 * and then readjust ndmpdp and ndmpdpphys.
		 */
		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
		Maxmem = atop(NDMPML4E * NBPML4);
		ndmpdpphys = NDMPML4E;
		ndmpdp = NDMPML4E * NPDEPG;
	}
	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
	ndm1g = 0;
	if ((amd_feature & AMDID_PAGE1GB) != 0)
		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
	if (ndm1g < ndmpdp)
		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Allocate pages */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);

	/*
	 * Allocate the initial number of kernel page table pages required to
	 * bootstrap.  We defer this until after all memory-size dependent
	 * allocations are done (e.g. direct map), so that we don't have to
	 * build in too much slop in our estimate.
	 *
	 * Note that when NKPML4E > 1, we have an empty page underneath
	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
	 */
	nkpt_init(*firstaddr);
	nkpdpe = NKPDPE(nkpt);

	KPTphys = allocpages(firstaddr, nkpt);
	KPDphys = allocpages(firstaddr, nkpdpe);

	/* Fill in the underlying page table pages */
	/* Nominally read-only (but really R/W) from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	pt_p = (pt_entry_t *)KPTphys;
	for (i = 0; ptoa(i) < *firstaddr; i++)
		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;

	/* Now map the page tables at their location within PTmap */
	pd_p = (pd_entry_t *)KPDphys;
	for (i = 0; i < nkpt; i++)
		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
		    X86_PG_G;

	/* And connect up the PD to the PDP (leaving room for L4 pages) */
	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
	for (i = 0; i < nkpdpe; i++)
		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
		    PG_U;

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	pd_p = (pd_entry_t *)DMPDphys;
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	pdp_p = (pdp_entry_t *)DMPDPphys;
	for (i = 0; i < ndm1g; i++) {
		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	for (j = 0; i < ndmpdp; i++, j++) {
		pdp_p[i] = DMPDphys + ptoa(j);
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	p4_p = (pml4_entry_t *)KPML4phys;
	p4_p[PML4PML4I] = KPML4phys;
	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;

	/* Connect the Direct Map slot(s) up to the PML4. */
	for (i = 0; i < ndmpdpphys; i++) {
		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* Connect the KVA slots up to the PML4 */
	for (i = 0; i < NKPML4E; i++) {
		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On amd64 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	int i;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
	virtual_avail = pmap_kmem_choose(virtual_avail);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE);
	load_cr3(KPML4phys);
	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
		load_cr4(rcr4() | CR4_SMEP);

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
	kernel_pmap->pm_cr3 = KPML4phys;
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	kernel_pmap->pm_flags = pmap_flags;

	/*
	 * Initialize the TLB invalidations generation number lock.
	 */
	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * Crashdump maps.  The first page is reused as CMAP1 for the
	 * memory test.
	 */
1038 */ 1039 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 1040 CADDR1 = crashdumpmap; 1041 1042 virtual_avail = va; 1043 1044 /* Initialize the PAT MSR. */ 1045 pmap_init_pat(); 1046 1047 /* Initialize TLB Context Id. */ 1048 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); 1049 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { 1050 /* Check for INVPCID support */ 1051 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) 1052 != 0; 1053 for (i = 0; i < MAXCPU; i++) { 1054 kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; 1055 kernel_pmap->pm_pcids[i].pm_gen = 1; 1056 } 1057 __pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1; 1058 __pcpu[0].pc_pcid_gen = 1; 1059 /* 1060 * pcpu area for APs is zeroed during AP startup. 1061 * pc_pcid_next and pc_pcid_gen are initialized by AP 1062 * during pcpu setup. 1063 */ 1064 load_cr4(rcr4() | CR4_PCIDE); 1065 } else { 1066 pmap_pcid_enabled = 0; 1067 } 1068} 1069 1070/* 1071 * Setup the PAT MSR. 1072 */ 1073void 1074pmap_init_pat(void) 1075{ 1076 int pat_table[PAT_INDEX_SIZE]; 1077 uint64_t pat_msr; 1078 u_long cr0, cr4; 1079 int i; 1080 1081 /* Bail if this CPU doesn't implement PAT. */ 1082 if ((cpu_feature & CPUID_PAT) == 0) 1083 panic("no PAT??"); 1084 1085 /* Set default PAT index table. */ 1086 for (i = 0; i < PAT_INDEX_SIZE; i++) 1087 pat_table[i] = -1; 1088 pat_table[PAT_WRITE_BACK] = 0; 1089 pat_table[PAT_WRITE_THROUGH] = 1; 1090 pat_table[PAT_UNCACHEABLE] = 3; 1091 pat_table[PAT_WRITE_COMBINING] = 3; 1092 pat_table[PAT_WRITE_PROTECTED] = 3; 1093 pat_table[PAT_UNCACHED] = 3; 1094 1095 /* Initialize default PAT entries. */ 1096 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 1097 PAT_VALUE(1, PAT_WRITE_THROUGH) | 1098 PAT_VALUE(2, PAT_UNCACHED) | 1099 PAT_VALUE(3, PAT_UNCACHEABLE) | 1100 PAT_VALUE(4, PAT_WRITE_BACK) | 1101 PAT_VALUE(5, PAT_WRITE_THROUGH) | 1102 PAT_VALUE(6, PAT_UNCACHED) | 1103 PAT_VALUE(7, PAT_UNCACHEABLE); 1104 1105 if (pat_works) { 1106 /* 1107 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 1108 * Program 5 and 6 as WP and WC. 1109 * Leave 4 and 7 as WB and UC. 1110 */ 1111 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 1112 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 1113 PAT_VALUE(6, PAT_WRITE_COMBINING); 1114 pat_table[PAT_UNCACHED] = 2; 1115 pat_table[PAT_WRITE_PROTECTED] = 5; 1116 pat_table[PAT_WRITE_COMBINING] = 6; 1117 } else { 1118 /* 1119 * Just replace PAT Index 2 with WC instead of UC-. 1120 */ 1121 pat_msr &= ~PAT_MASK(2); 1122 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 1123 pat_table[PAT_WRITE_COMBINING] = 2; 1124 } 1125 1126 /* Disable PGE. */ 1127 cr4 = rcr4(); 1128 load_cr4(cr4 & ~CR4_PGE); 1129 1130 /* Disable caches (CD = 1, NW = 0). */ 1131 cr0 = rcr0(); 1132 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1133 1134 /* Flushes caches and TLBs. */ 1135 wbinvd(); 1136 invltlb(); 1137 1138 /* Update PAT and index table. */ 1139 wrmsr(MSR_PAT, pat_msr); 1140 for (i = 0; i < PAT_INDEX_SIZE; i++) 1141 pat_index[i] = pat_table[i]; 1142 1143 /* Flush caches and TLBs again. */ 1144 wbinvd(); 1145 invltlb(); 1146 1147 /* Restore caches and PGE. */ 1148 load_cr0(cr0); 1149 load_cr4(cr4); 1150} 1151 1152/* 1153 * Initialize a vm_page's machine-dependent fields. 1154 */ 1155void 1156pmap_page_init(vm_page_t m) 1157{ 1158 1159 TAILQ_INIT(&m->md.pv_list); 1160 m->md.pat_mode = PAT_WRITE_BACK; 1161} 1162 1163/* 1164 * Initialize the pmap module. 1165 * Called by vm_init, to initialize any structures that the pmap 1166 * system needs to map virtual memory. 
/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t mpte;
	vm_size_t s;
	int error, i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < nkpt; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
1232 */ 1233 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1234 s = round_page(s); 1235 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 1236 M_WAITOK | M_ZERO); 1237 for (i = 0; i < pv_npg; i++) 1238 TAILQ_INIT(&pv_table[i].pv_list); 1239 TAILQ_INIT(&pv_dummy.pv_list); 1240 1241 pmap_initialized = 1; 1242 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 1243 ppim = pmap_preinit_mapping + i; 1244 if (ppim->va == 0) 1245 continue; 1246 /* Make the direct map consistent */ 1247 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) { 1248 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 1249 ppim->sz, ppim->mode); 1250 } 1251 if (!bootverbose) 1252 continue; 1253 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 1254 ppim->pa, ppim->va, ppim->sz, ppim->mode); 1255 } 1256 1257 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 1258 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 1259 (vmem_addr_t *)&qframe); 1260 if (error != 0) 1261 panic("qframe allocation failed"); 1262} 1263 1264static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 1265 "2MB page mapping counters"); 1266 1267static u_long pmap_pde_demotions; 1268SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 1269 &pmap_pde_demotions, 0, "2MB page demotions"); 1270 1271static u_long pmap_pde_mappings; 1272SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 1273 &pmap_pde_mappings, 0, "2MB page mappings"); 1274 1275static u_long pmap_pde_p_failures; 1276SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 1277 &pmap_pde_p_failures, 0, "2MB page promotion failures"); 1278 1279static u_long pmap_pde_promotions; 1280SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 1281 &pmap_pde_promotions, 0, "2MB page promotions"); 1282 1283static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, 1284 "1GB page mapping counters"); 1285 1286static u_long pmap_pdpe_demotions; 1287SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 1288 &pmap_pdpe_demotions, 0, "1GB page demotions"); 1289 1290/*************************************************** 1291 * Low level helper routines..... 1292 ***************************************************/ 1293 1294static pt_entry_t 1295pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 1296{ 1297 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 1298 1299 switch (pmap->pm_type) { 1300 case PT_X86: 1301 case PT_RVI: 1302 /* Verify that both PAT bits are not set at the same time */ 1303 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 1304 ("Invalid PAT bits in entry %#lx", entry)); 1305 1306 /* Swap the PAT bits if one of them is set */ 1307 if ((entry & x86_pat_bits) != 0) 1308 entry ^= x86_pat_bits; 1309 break; 1310 case PT_EPT: 1311 /* 1312 * Nothing to do - the memory attributes are represented 1313 * the same way for regular pages and superpages. 1314 */ 1315 break; 1316 default: 1317 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 1318 } 1319 1320 return (entry); 1321} 1322 1323/* 1324 * Determine the appropriate bits to set in a PTE or PDE for a specified 1325 * caching mode. 1326 */ 1327int 1328pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 1329{ 1330 int cache_bits, pat_flag, pat_idx; 1331 1332 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 1333 panic("Unknown caching mode %d\n", mode); 1334 1335 switch (pmap->pm_type) { 1336 case PT_X86: 1337 case PT_RVI: 1338 /* The PAT bit is different for PTE's and PDE's. */ 1339 pat_flag = is_pde ? 
/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

		/* Map the caching mode to a PAT index. */
		pat_idx = pat_index[mode];

		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
		cache_bits = 0;
		if (pat_idx & 0x4)
			cache_bits |= pat_flag;
		if (pat_idx & 0x2)
			cache_bits |= PG_NC_PCD;
		if (pat_idx & 0x1)
			cache_bits |= PG_NC_PWT;
		break;

	case PT_EPT:
		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
		break;

	default:
		panic("unsupported pmap type %d", pmap->pm_type);
	}

	return (cache_bits);
}
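
/*
 * For example, with the pat_works layout above, PAT_WRITE_COMBINING
 * maps to PAT index 6 (binary 110), so pmap_cache_bits() sets the PAT
 * and PCD bits and leaves PWT clear; in a PDE the PAT bit is bit 12
 * (X86_PG_PDE_PAT) instead of bit 7 (X86_PG_PTE_PAT).
 */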
static int
pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
{
	int mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
		break;
	case PT_EPT:
		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
		break;
	default:
		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline boolean_t
pmap_ps_enabled(pmap_t pmap)
{

	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
}

static void
pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
{

	switch (pmap->pm_type) {
	case PT_X86:
		break;
	case PT_RVI:
	case PT_EPT:
		/*
		 * XXX
		 * This is a little bogus since the generation number is
		 * supposed to be bumped up when a region of the address
		 * space is invalidated in the page tables.
		 *
		 * In this case the old PDE entry is valid but yet we want
		 * to make sure that any mappings using the old entry are
		 * invalidated in the TLB.
		 *
		 * The reason this works as expected is because we rendezvous
		 * "all" host cpus and force any vcpu context to exit as a
		 * side-effect.
		 */
		atomic_add_acq_long(&pmap->pm_eptgen, 1);
		break;
	default:
		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
	}
	pde_store(pde, newpde);
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
	pt_entry_t PG_G;

	if (pmap_type_guest(pmap))
		return;

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));

	PG_G = pmap_global_bit(pmap);

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		invltlb_glob();
	}
}
#ifdef SMP

/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */

/*
 * Interrupt the cpus that are executing in the guest context.
 * This will force the vcpu to exit and the cached EPT mappings
 * will be invalidated by the host before the next vmresume.
 */
static __inline void
pmap_invalidate_ept(pmap_t pmap)
{
	int ipinum;

	sched_pin();
	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("pmap_invalidate_ept: absurd pm_active"));

	/*
	 * The TLB mappings associated with a vcpu context are not
	 * flushed each time a different vcpu is chosen to execute.
	 *
	 * This is in contrast with a process's vtop mappings that
	 * are flushed from the TLB on each context switch.
	 *
	 * Therefore we need to do more than just a TLB shootdown on
	 * the active cpus in 'pmap->pm_active'.  To do this we keep
	 * track of the number of invalidations performed on this pmap.
	 *
	 * Each vcpu keeps a cache of this counter and compares it
	 * just before a vmresume.  If the counter is out-of-date an
	 * invept will be done to flush stale mappings from the TLB.
	 */
	atomic_add_acq_long(&pmap->pm_eptgen, 1);

	/*
	 * Force the vcpu to exit and trap back into the hypervisor.
	 */
	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
	ipi_selected(pmap->pm_active, ipinum);
	sched_unpin();
}

void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t *mask;
	u_int cpuid, i;

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));

	sched_pin();
	if (pmap == kernel_pmap) {
		invlpg(va);
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		if (pmap == PCPU_GET(curpmap))
			invlpg(va);
		else if (pmap_pcid_enabled)
			pmap->pm_pcids[cpuid].pm_gen = 0;
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invlpg(*mask, va);
	sched_unpin();
}

/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t *mask;
	vm_offset_t addr;
	u_int cpuid, i;

	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
		pmap_invalidate_all(pmap);
		return;
	}

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	if (pmap == kernel_pmap) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		mask = &all_cpus;
	} else {
		if (pmap == PCPU_GET(curpmap)) {
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		} else if (pmap_pcid_enabled) {
			pmap->pm_pcids[cpuid].pm_gen = 0;
		}
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invlpg_range(*mask, sva, eva);
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t *mask;
	struct invpcid_descr d;
	u_int cpuid, i;

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));

	sched_pin();
	if (pmap == kernel_pmap) {
		if (pmap_pcid_enabled && invpcid_works) {
			bzero(&d, sizeof(d));
			invpcid(&d, INVPCID_CTXGLOB);
		} else {
			invltlb_glob();
		}
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		if (pmap == PCPU_GET(curpmap)) {
			if (pmap_pcid_enabled) {
				if (invpcid_works) {
					d.pcid = pmap->pm_pcids[cpuid].pm_pcid;
					d.pad = 0;
					d.addr = 0;
					invpcid(&d, INVPCID_CTX);
				} else {
					load_cr3(pmap->pm_cr3 | pmap->pm_pcids
					    [PCPU_GET(cpuid)].pm_pcid);
				}
			} else {
				invltlb();
			}
		} else if (pmap_pcid_enabled) {
			pmap->pm_pcids[cpuid].pm_gen = 0;
		}
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invltlb(*mask, pmap);
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}
struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	pmap_t pmap;
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

static void
pmap_update_pde_action(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap || pmap_type_guest(pmap))
		active = all_cpus;
	else {
		active = pmap->pm_active;
	}
	if (CPU_OVERLAP(&active, &other_cpus)) {
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pmap = pmap;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendevous_barrier, pmap_update_pde_action,
		    pmap_update_pde_teardown, &act);
	} else {
		pmap_update_pde_store(pmap, pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(pmap, va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, invalidation functions.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
		pmap->pm_eptgen++;
		return;
	}
	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_page: unknown type %d", pmap->pm_type));

	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
		invlpg(va);
	else if (pmap_pcid_enabled)
		pmap->pm_pcids[0].pm_gen = 0;
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
		pmap->pm_eptgen++;
		return;
	}
	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));

	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
	} else if (pmap_pcid_enabled) {
		pmap->pm_pcids[0].pm_gen = 0;
	}
}

void
pmap_invalidate_all(pmap_t pmap)
{
	struct invpcid_descr d;

	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
		pmap->pm_eptgen++;
		return;
	}
	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_all: unknown type %d", pmap->pm_type));

	if (pmap == kernel_pmap) {
		if (pmap_pcid_enabled && invpcid_works) {
			bzero(&d, sizeof(d));
			invpcid(&d, INVPCID_CTXGLOB);
		} else {
			invltlb_glob();
		}
	} else if (pmap == PCPU_GET(curpmap)) {
		if (pmap_pcid_enabled) {
			if (invpcid_works) {
				d.pcid = pmap->pm_pcids[0].pm_pcid;
				d.pad = 0;
				d.addr = 0;
				invpcid(&d, INVPCID_CTX);
			} else {
				load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0].
				    pm_pcid);
			}
		} else {
			invltlb();
		}
	} else if (pmap_pcid_enabled) {
		pmap->pm_pcids[0].pm_gen = 0;
	}
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	pmap_update_pde_store(pmap, pde, newpde);
	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
		pmap_update_pde_invalidate(pmap, va, newpde);
	else
		pmap->pm_pcids[0].pm_gen = 0;
}
#endif /* !SMP */

#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
{

	if (force) {
		sva &= ~(vm_offset_t)cpu_clflush_line_size;
	} else {
		KASSERT((sva & PAGE_MASK) == 0,
		    ("pmap_invalidate_cache_range: sva not page-aligned"));
		KASSERT((eva & PAGE_MASK) == 0,
		    ("pmap_invalidate_cache_range: eva not page-aligned"));
	}

	if ((cpu_feature & CPUID_SS) != 0 && !force)
		; /* If "Self Snoop" is supported and allowed, do nothing. */
	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
		/*
		 * XXX: Some CPUs fault, hang, or trash the local APIC
		 * registers if we use CLFLUSH on the local APIC
		 * range.  The local APIC is always uncached, so we
		 * don't need to flush for that range anyway.
		 */
		if (pmap_kextract(sva) == lapic_paddr)
			return;

		/*
		 * Otherwise, do per-cache line flush.  Use the mfence
		 * instruction to ensure that previous stores are
The processor 1869 * propagates flush to other processors in the cache 1870 * coherence domain. 1871 */ 1872 mfence(); 1873 for (; sva < eva; sva += cpu_clflush_line_size) 1874 clflushopt(sva); 1875 mfence(); 1876 } else if ((cpu_feature & CPUID_CLFSH) != 0 && 1877 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1878 if (pmap_kextract(sva) == lapic_paddr) 1879 return; 1880 /* 1881 * Writes are ordered by CLFLUSH on Intel CPUs. 1882 */ 1883 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1884 mfence(); 1885 for (; sva < eva; sva += cpu_clflush_line_size) 1886 clflush(sva); 1887 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1888 mfence(); 1889 } else { 1890 1891 /* 1892 * No targeted cache flush methods are supported by CPU, 1893 * or the supplied range is bigger than 2MB. 1894 * Globally invalidate cache. 1895 */ 1896 pmap_invalidate_cache(); 1897 } 1898} 1899 1900/* 1901 * Remove the specified set of pages from the data and instruction caches. 1902 * 1903 * In contrast to pmap_invalidate_cache_range(), this function does not 1904 * rely on the CPU's self-snoop feature, because it is intended for use 1905 * when moving pages into a different cache domain. 1906 */ 1907void 1908pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1909{ 1910 vm_offset_t daddr, eva; 1911 int i; 1912 bool useclflushopt; 1913 1914 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 1915 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1916 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 1917 pmap_invalidate_cache(); 1918 else { 1919 if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL) 1920 mfence(); 1921 for (i = 0; i < count; i++) { 1922 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1923 eva = daddr + PAGE_SIZE; 1924 for (; daddr < eva; daddr += cpu_clflush_line_size) { 1925 if (useclflushopt) 1926 clflushopt(daddr); 1927 else 1928 clflush(daddr); 1929 } 1930 } 1931 if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL) 1932 mfence(); 1933 } 1934} 1935 1936/* 1937 * Routine: pmap_extract 1938 * Function: 1939 * Extract the physical page address associated 1940 * with the given map/virtual_address pair. 1941 */ 1942vm_paddr_t 1943pmap_extract(pmap_t pmap, vm_offset_t va) 1944{ 1945 pdp_entry_t *pdpe; 1946 pd_entry_t *pde; 1947 pt_entry_t *pte, PG_V; 1948 vm_paddr_t pa; 1949 1950 pa = 0; 1951 PG_V = pmap_valid_bit(pmap); 1952 PMAP_LOCK(pmap); 1953 pdpe = pmap_pdpe(pmap, va); 1954 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 1955 if ((*pdpe & PG_PS) != 0) 1956 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 1957 else { 1958 pde = pmap_pdpe_to_pde(pdpe, va); 1959 if ((*pde & PG_V) != 0) { 1960 if ((*pde & PG_PS) != 0) { 1961 pa = (*pde & PG_PS_FRAME) | 1962 (va & PDRMASK); 1963 } else { 1964 pte = pmap_pde_to_pte(pde, va); 1965 pa = (*pte & PG_FRAME) | 1966 (va & PAGE_MASK); 1967 } 1968 } 1969 } 1970 } 1971 PMAP_UNLOCK(pmap); 1972 return (pa); 1973} 1974 1975/* 1976 * Routine: pmap_extract_and_hold 1977 * Function: 1978 * Atomically extract and hold the physical page 1979 * with the given pmap and virtual address pair 1980 * if that mapping permits the given protection. 
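 *
 * A usage sketch (hypothetical, not part of this file; "buf" is an
 * assumed kernel buffer):
 *
 *	m = pmap_extract_and_hold(pmap, va, VM_PROT_READ);
 *	if (m != NULL) {
 *		memcpy(buf, (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
 *		    PAGE_SIZE);
 *		vm_page_lock(m);
 *		vm_page_unhold(m);
 *		vm_page_unlock(m);
 *	}
 *
 * The hold keeps the page from being freed; the mapping itself may
 * still be destroyed once the pmap lock is dropped.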
1981 */ 1982vm_page_t 1983pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1984{ 1985 pd_entry_t pde, *pdep; 1986 pt_entry_t pte, PG_RW, PG_V; 1987 vm_paddr_t pa; 1988 vm_page_t m; 1989 1990 pa = 0; 1991 m = NULL; 1992 PG_RW = pmap_rw_bit(pmap); 1993 PG_V = pmap_valid_bit(pmap); 1994 PMAP_LOCK(pmap); 1995retry: 1996 pdep = pmap_pde(pmap, va); 1997 if (pdep != NULL && (pde = *pdep)) { 1998 if (pde & PG_PS) { 1999 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 2000 if (vm_page_pa_tryrelock(pmap, (pde & 2001 PG_PS_FRAME) | (va & PDRMASK), &pa)) 2002 goto retry; 2003 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 2004 (va & PDRMASK)); 2005 vm_page_hold(m); 2006 } 2007 } else { 2008 pte = *pmap_pde_to_pte(pdep, va); 2009 if ((pte & PG_V) && 2010 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 2011 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 2012 &pa)) 2013 goto retry; 2014 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 2015 vm_page_hold(m); 2016 } 2017 } 2018 } 2019 PA_UNLOCK_COND(pa); 2020 PMAP_UNLOCK(pmap); 2021 return (m); 2022} 2023 2024vm_paddr_t 2025pmap_kextract(vm_offset_t va) 2026{ 2027 pd_entry_t pde; 2028 vm_paddr_t pa; 2029 2030 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 2031 pa = DMAP_TO_PHYS(va); 2032 } else { 2033 pde = *vtopde(va); 2034 if (pde & PG_PS) { 2035 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 2036 } else { 2037 /* 2038 * Beware of a concurrent promotion that changes the 2039 * PDE at this point! For example, vtopte() must not 2040 * be used to access the PTE because it would use the 2041 * new PDE. It is, however, safe to use the old PDE 2042 * because the page table page is preserved by the 2043 * promotion. 2044 */ 2045 pa = *pmap_pde_to_pte(&pde, va); 2046 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 2047 } 2048 } 2049 return (pa); 2050} 2051 2052/*************************************************** 2053 * Low level mapping routines..... 2054 ***************************************************/ 2055 2056/* 2057 * Add a wired page to the kva. 2058 * Note: not SMP coherent. 2059 */ 2060PMAP_INLINE void 2061pmap_kenter(vm_offset_t va, vm_paddr_t pa) 2062{ 2063 pt_entry_t *pte; 2064 2065 pte = vtopte(va); 2066 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); 2067} 2068 2069static __inline void 2070pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 2071{ 2072 pt_entry_t *pte; 2073 int cache_bits; 2074 2075 pte = vtopte(va); 2076 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 2077 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); 2078} 2079 2080/* 2081 * Remove a page from the kernel pagetables. 2082 * Note: not SMP coherent. 2083 */ 2084PMAP_INLINE void 2085pmap_kremove(vm_offset_t va) 2086{ 2087 pt_entry_t *pte; 2088 2089 pte = vtopte(va); 2090 pte_clear(pte); 2091} 2092 2093/* 2094 * Used to map a range of physical addresses into kernel 2095 * virtual address space. 2096 * 2097 * The value passed in '*virt' is a suggested virtual address for 2098 * the mapping. Architectures which can support a direct-mapped 2099 * physical to virtual region can return the appropriate address 2100 * within that region, leaving '*virt' unchanged. Other 2101 * architectures should map the pages starting at '*virt' and 2102 * update '*virt' with the first usable address after the mapped 2103 * region. 
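 *
 * On amd64 every physical page is reachable through the direct map,
 * so the implementation below simply returns the DMAP address of
 * "start" and leaves '*virt' unchanged.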
2104 */ 2105vm_offset_t 2106pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2107{ 2108 return PHYS_TO_DMAP(start); 2109} 2110 2111 2112/* 2113 * Add a list of wired pages to the kva. 2114 * This routine is only used for temporary 2115 * kernel mappings that do not need to have 2116 * page modification or references recorded. 2117 * Note that old mappings are simply written 2118 * over. The page *must* be wired. 2119 * Note: SMP coherent. Uses a ranged shootdown IPI. 2120 */ 2121void 2122pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2123{ 2124 pt_entry_t *endpte, oldpte, pa, *pte; 2125 vm_page_t m; 2126 int cache_bits; 2127 2128 oldpte = 0; 2129 pte = vtopte(sva); 2130 endpte = pte + count; 2131 while (pte < endpte) { 2132 m = *ma++; 2133 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 2134 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 2135 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 2136 oldpte |= *pte; 2137 pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); 2138 } 2139 pte++; 2140 } 2141 if (__predict_false((oldpte & X86_PG_V) != 0)) 2142 pmap_invalidate_range(kernel_pmap, sva, sva + count * 2143 PAGE_SIZE); 2144} 2145 2146/* 2147 * This routine tears out page mappings from the 2148 * kernel -- it is meant only for temporary mappings. 2149 * Note: SMP coherent. Uses a ranged shootdown IPI. 2150 */ 2151void 2152pmap_qremove(vm_offset_t sva, int count) 2153{ 2154 vm_offset_t va; 2155 2156 va = sva; 2157 while (count-- > 0) { 2158 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 2159 pmap_kremove(va); 2160 va += PAGE_SIZE; 2161 } 2162 pmap_invalidate_range(kernel_pmap, sva, va); 2163} 2164 2165/*************************************************** 2166 * Page table page management routines..... 2167 ***************************************************/ 2168static __inline void 2169pmap_free_zero_pages(struct spglist *free) 2170{ 2171 vm_page_t m; 2172 2173 while ((m = SLIST_FIRST(free)) != NULL) { 2174 SLIST_REMOVE_HEAD(free, plinks.s.ss); 2175 /* Preserve the page's PG_ZERO setting. */ 2176 vm_page_free_toq(m); 2177 } 2178} 2179 2180/* 2181 * Schedule the specified unused page table page to be freed. Specifically, 2182 * add the page to the specified list of pages that will be released to the 2183 * physical memory manager after the TLB has been updated. 2184 */ 2185static __inline void 2186pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 2187 boolean_t set_PG_ZERO) 2188{ 2189 2190 if (set_PG_ZERO) 2191 m->flags |= PG_ZERO; 2192 else 2193 m->flags &= ~PG_ZERO; 2194 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2195} 2196 2197/* 2198 * Inserts the specified page table page into the specified pmap's collection 2199 * of idle page table pages. Each of a pmap's page table pages is responsible 2200 * for mapping a distinct range of virtual addresses. The pmap's collection is 2201 * ordered by this virtual address range. 2202 */ 2203static __inline int 2204pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 2205{ 2206 2207 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2208 return (vm_radix_insert(&pmap->pm_root, mpte)); 2209} 2210 2211/* 2212 * Looks for a page table page mapping the specified virtual address in the 2213 * specified pmap's collection of idle page table pages. Returns NULL if there 2214 * is no page table page corresponding to the specified virtual address.
2215 */ 2216static __inline vm_page_t 2217pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 2218{ 2219 2220 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2221 return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va))); 2222} 2223 2224/* 2225 * Removes the specified page table page from the specified pmap's collection 2226 * of idle page table pages. The specified page table page must be a member of 2227 * the pmap's collection. 2228 */ 2229static __inline void 2230pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 2231{ 2232 2233 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2234 vm_radix_remove(&pmap->pm_root, mpte->pindex); 2235} 2236 2237/* 2238 * Decrements a page table page's wire count, which is used to record the 2239 * number of valid page table entries within the page. If the wire count 2240 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2241 * page table page was unmapped and FALSE otherwise. 2242 */ 2243static inline boolean_t 2244pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2245{ 2246 2247 --m->wire_count; 2248 if (m->wire_count == 0) { 2249 _pmap_unwire_ptp(pmap, va, m, free); 2250 return (TRUE); 2251 } else 2252 return (FALSE); 2253} 2254 2255static void 2256_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2257{ 2258 2259 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2260 /* 2261 * unmap the page table page 2262 */ 2263 if (m->pindex >= (NUPDE + NUPDPE)) { 2264 /* PDP page */ 2265 pml4_entry_t *pml4; 2266 pml4 = pmap_pml4e(pmap, va); 2267 *pml4 = 0; 2268 } else if (m->pindex >= NUPDE) { 2269 /* PD page */ 2270 pdp_entry_t *pdp; 2271 pdp = pmap_pdpe(pmap, va); 2272 *pdp = 0; 2273 } else { 2274 /* PTE page */ 2275 pd_entry_t *pd; 2276 pd = pmap_pde(pmap, va); 2277 *pd = 0; 2278 } 2279 pmap_resident_count_dec(pmap, 1); 2280 if (m->pindex < NUPDE) { 2281 /* We just released a PT, unhold the matching PD */ 2282 vm_page_t pdpg; 2283 2284 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 2285 pmap_unwire_ptp(pmap, va, pdpg, free); 2286 } 2287 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 2288 /* We just released a PD, unhold the matching PDP */ 2289 vm_page_t pdppg; 2290 2291 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 2292 pmap_unwire_ptp(pmap, va, pdppg, free); 2293 } 2294 2295 /* 2296 * This is a release store so that the ordinary store unmapping 2297 * the page table page is globally performed before TLB shoot- 2298 * down is begun. 2299 */ 2300 atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); 2301 2302 /* 2303 * Put page on a list so that it is released after 2304 * *ALL* TLB shootdown is done 2305 */ 2306 pmap_add_delayed_free_list(m, free, TRUE); 2307} 2308 2309/* 2310 * After removing a page table entry, this routine is used to 2311 * conditionally free the page, and manage the hold/wire counts. 
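 *
 * The wire counts cascade: a page table page's wire count records the
 * number of valid entries within it (at most NPTEPG == 512), and
 * freeing the last entry of a page table page in turn drops the hold
 * that it had on its page directory page, and so on up to the PML4.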
2312 */ 2313static int 2314pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2315 struct spglist *free) 2316{ 2317 vm_page_t mpte; 2318 2319 if (va >= VM_MAXUSER_ADDRESS) 2320 return (0); 2321 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2322 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 2323 return (pmap_unwire_ptp(pmap, va, mpte, free)); 2324} 2325 2326void 2327pmap_pinit0(pmap_t pmap) 2328{ 2329 int i; 2330 2331 PMAP_LOCK_INIT(pmap); 2332 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 2333 pmap->pm_cr3 = KPML4phys; 2334 pmap->pm_root.rt_root = 0; 2335 CPU_ZERO(&pmap->pm_active); 2336 TAILQ_INIT(&pmap->pm_pvchunk); 2337 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2338 pmap->pm_flags = pmap_flags; 2339 CPU_FOREACH(i) { 2340 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 2341 pmap->pm_pcids[i].pm_gen = 0; 2342 } 2343 PCPU_SET(curpmap, kernel_pmap); 2344 pmap_activate(curthread); 2345 CPU_FILL(&kernel_pmap->pm_active); 2346} 2347 2348void 2349pmap_pinit_pml4(vm_page_t pml4pg) 2350{ 2351 pml4_entry_t *pm_pml4; 2352 int i; 2353 2354 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 2355 2356 /* Wire in kernel global address entries. */ 2357 for (i = 0; i < NKPML4E; i++) { 2358 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 2359 X86_PG_V | PG_U; 2360 } 2361 for (i = 0; i < ndmpdpphys; i++) { 2362 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 2363 X86_PG_V | PG_U; 2364 } 2365 2366 /* install self-referential address mapping entry(s) */ 2367 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 2368 X86_PG_A | X86_PG_M; 2369} 2370 2371/* 2372 * Initialize a preallocated and zeroed pmap structure, 2373 * such as one in a vmspace structure. 2374 */ 2375int 2376pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 2377{ 2378 vm_page_t pml4pg; 2379 vm_paddr_t pml4phys; 2380 int i; 2381 2382 /* 2383 * allocate the page directory page 2384 */ 2385 while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2386 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 2387 VM_WAIT; 2388 2389 pml4phys = VM_PAGE_TO_PHYS(pml4pg); 2390 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); 2391 CPU_FOREACH(i) { 2392 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 2393 pmap->pm_pcids[i].pm_gen = 0; 2394 } 2395 pmap->pm_cr3 = ~0; /* initialize to an invalid value */ 2396 2397 if ((pml4pg->flags & PG_ZERO) == 0) 2398 pagezero(pmap->pm_pml4); 2399 2400 /* 2401 * Do not install the host kernel mappings in the nested page 2402 * tables. These mappings are meaningless in the guest physical 2403 * address space. 2404 */ 2405 if ((pmap->pm_type = pm_type) == PT_X86) { 2406 pmap->pm_cr3 = pml4phys; 2407 pmap_pinit_pml4(pml4pg); 2408 } 2409 2410 pmap->pm_root.rt_root = 0; 2411 CPU_ZERO(&pmap->pm_active); 2412 TAILQ_INIT(&pmap->pm_pvchunk); 2413 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2414 pmap->pm_flags = flags; 2415 pmap->pm_eptgen = 0; 2416 2417 return (1); 2418} 2419 2420int 2421pmap_pinit(pmap_t pmap) 2422{ 2423 2424 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 2425} 2426 2427/* 2428 * This routine is called if the desired page table page does not exist. 2429 * 2430 * If page table page allocation fails, this routine may sleep before 2431 * returning NULL. It sleeps only if a lock pointer was given. 2432 * 2433 * Note: If a page allocation fails at page table level two or three, 2434 * one or two pages may be held during the wait, only to be released 2435 * afterwards. 
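 * For example, allocating a page table page may first recurse to
 * allocate a missing PDP page and PD page; if the final allocation
 * then sleeps, those intermediate pages remain held until the caller
 * retries.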
This conservative approach is easily argued to avoid 2436 * race conditions. 2437 */ 2438static vm_page_t 2439_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2440{ 2441 vm_page_t m, pdppg, pdpg; 2442 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 2443 2444 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2445 2446 PG_A = pmap_accessed_bit(pmap); 2447 PG_M = pmap_modified_bit(pmap); 2448 PG_V = pmap_valid_bit(pmap); 2449 PG_RW = pmap_rw_bit(pmap); 2450 2451 /* 2452 * Allocate a page table page. 2453 */ 2454 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 2455 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2456 if (lockp != NULL) { 2457 RELEASE_PV_LIST_LOCK(lockp); 2458 PMAP_UNLOCK(pmap); 2459 PMAP_ASSERT_NOT_IN_DI(); 2460 VM_WAIT; 2461 PMAP_LOCK(pmap); 2462 } 2463 2464 /* 2465 * Indicate the need to retry. While waiting, the page table 2466 * page may have been allocated. 2467 */ 2468 return (NULL); 2469 } 2470 if ((m->flags & PG_ZERO) == 0) 2471 pmap_zero_page(m); 2472 2473 /* 2474 * Map the pagetable page into the process address space, if 2475 * it isn't already there. 2476 */ 2477 2478 if (ptepindex >= (NUPDE + NUPDPE)) { 2479 pml4_entry_t *pml4; 2480 vm_pindex_t pml4index; 2481 2482 /* Wire up a new PDPE page */ 2483 pml4index = ptepindex - (NUPDE + NUPDPE); 2484 pml4 = &pmap->pm_pml4[pml4index]; 2485 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2486 2487 } else if (ptepindex >= NUPDE) { 2488 vm_pindex_t pml4index; 2489 vm_pindex_t pdpindex; 2490 pml4_entry_t *pml4; 2491 pdp_entry_t *pdp; 2492 2493 /* Wire up a new PDE page */ 2494 pdpindex = ptepindex - NUPDE; 2495 pml4index = pdpindex >> NPML4EPGSHIFT; 2496 2497 pml4 = &pmap->pm_pml4[pml4index]; 2498 if ((*pml4 & PG_V) == 0) { 2499 /* Have to allocate a new pdp, recurse */ 2500 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 2501 lockp) == NULL) { 2502 --m->wire_count; 2503 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2504 vm_page_free_zero(m); 2505 return (NULL); 2506 } 2507 } else { 2508 /* Add reference to pdp page */ 2509 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 2510 pdppg->wire_count++; 2511 } 2512 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2513 2514 /* Now find the pdp page */ 2515 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2516 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2517 2518 } else { 2519 vm_pindex_t pml4index; 2520 vm_pindex_t pdpindex; 2521 pml4_entry_t *pml4; 2522 pdp_entry_t *pdp; 2523 pd_entry_t *pd; 2524 2525 /* Wire up a new PTE page */ 2526 pdpindex = ptepindex >> NPDPEPGSHIFT; 2527 pml4index = pdpindex >> NPML4EPGSHIFT; 2528 2529 /* First, find the pdp and check that it's valid.
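 * Each level of the page table hierarchy holds 512 entries, so the
 * successive 9-bit right shifts above (NPDPEPGSHIFT and
 * NPML4EPGSHIFT) convert a page table page's index into the indices
 * of its parent directory pages.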
*/ 2530 pml4 = &pmap->pm_pml4[pml4index]; 2531 if ((*pml4 & PG_V) == 0) { 2532 /* Have to allocate a new pd, recurse */ 2533 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2534 lockp) == NULL) { 2535 --m->wire_count; 2536 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2537 vm_page_free_zero(m); 2538 return (NULL); 2539 } 2540 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2541 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2542 } else { 2543 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2544 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2545 if ((*pdp & PG_V) == 0) { 2546 /* Have to allocate a new pd, recurse */ 2547 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2548 lockp) == NULL) { 2549 --m->wire_count; 2550 atomic_subtract_int(&vm_cnt.v_wire_count, 2551 1); 2552 vm_page_free_zero(m); 2553 return (NULL); 2554 } 2555 } else { 2556 /* Add reference to the pd page */ 2557 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2558 pdpg->wire_count++; 2559 } 2560 } 2561 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 2562 2563 /* Now we know where the page directory page is */ 2564 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 2565 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2566 } 2567 2568 pmap_resident_count_inc(pmap, 1); 2569 2570 return (m); 2571} 2572 2573static vm_page_t 2574pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2575{ 2576 vm_pindex_t pdpindex, ptepindex; 2577 pdp_entry_t *pdpe, PG_V; 2578 vm_page_t pdpg; 2579 2580 PG_V = pmap_valid_bit(pmap); 2581 2582retry: 2583 pdpe = pmap_pdpe(pmap, va); 2584 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2585 /* Add a reference to the pd page. */ 2586 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 2587 pdpg->wire_count++; 2588 } else { 2589 /* Allocate a pd page. */ 2590 ptepindex = pmap_pde_pindex(va); 2591 pdpindex = ptepindex >> NPDPEPGSHIFT; 2592 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 2593 if (pdpg == NULL && lockp != NULL) 2594 goto retry; 2595 } 2596 return (pdpg); 2597} 2598 2599static vm_page_t 2600pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2601{ 2602 vm_pindex_t ptepindex; 2603 pd_entry_t *pd, PG_V; 2604 vm_page_t m; 2605 2606 PG_V = pmap_valid_bit(pmap); 2607 2608 /* 2609 * Calculate pagetable page index 2610 */ 2611 ptepindex = pmap_pde_pindex(va); 2612retry: 2613 /* 2614 * Get the page directory entry 2615 */ 2616 pd = pmap_pde(pmap, va); 2617 2618 /* 2619 * This supports switching from a 2MB page to a 2620 * normal 4K page. 2621 */ 2622 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 2623 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 2624 /* 2625 * Invalidation of the 2MB page mapping may have caused 2626 * the deallocation of the underlying PD page. 2627 */ 2628 pd = NULL; 2629 } 2630 } 2631 2632 /* 2633 * If the page table page is mapped, we just increment the 2634 * hold count, and activate it. 2635 */ 2636 if (pd != NULL && (*pd & PG_V) != 0) { 2637 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2638 m->wire_count++; 2639 } else { 2640 /* 2641 * Here if the pte page isn't mapped, or if it has been 2642 * deallocated. 2643 */ 2644 m = _pmap_allocpte(pmap, ptepindex, lockp); 2645 if (m == NULL && lockp != NULL) 2646 goto retry; 2647 } 2648 return (m); 2649} 2650 2651 2652/*************************************************** 2653 * Pmap allocation/deallocation routines. 2654 ***************************************************/ 2655 2656/* 2657 * Release any resources held by the given physical map. 
2658 * Called when a pmap initialized by pmap_pinit is being released. 2659 * Should only be called if the map contains no valid mappings. 2660 */ 2661void 2662pmap_release(pmap_t pmap) 2663{ 2664 vm_page_t m; 2665 int i; 2666 2667 KASSERT(pmap->pm_stats.resident_count == 0, 2668 ("pmap_release: pmap resident count %ld != 0", 2669 pmap->pm_stats.resident_count)); 2670 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2671 ("pmap_release: pmap has reserved page table page(s)")); 2672 KASSERT(CPU_EMPTY(&pmap->pm_active), 2673 ("releasing active pmap %p", pmap)); 2674 2675 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); 2676 2677 for (i = 0; i < NKPML4E; i++) /* KVA */ 2678 pmap->pm_pml4[KPML4BASE + i] = 0; 2679 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 2680 pmap->pm_pml4[DMPML4I + i] = 0; 2681 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 2682 2683 m->wire_count--; 2684 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2685 vm_page_free_zero(m); 2686} 2687 2688static int 2689kvm_size(SYSCTL_HANDLER_ARGS) 2690{ 2691 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 2692 2693 return sysctl_handle_long(oidp, &ksize, 0, req); 2694} 2695SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2696 0, 0, kvm_size, "LU", "Size of KVM"); 2697 2698static int 2699kvm_free(SYSCTL_HANDLER_ARGS) 2700{ 2701 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2702 2703 return sysctl_handle_long(oidp, &kfree, 0, req); 2704} 2705SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2706 0, 0, kvm_free, "LU", "Amount of KVM free"); 2707 2708/* 2709 * grow the number of kernel page table entries, if needed 2710 */ 2711void 2712pmap_growkernel(vm_offset_t addr) 2713{ 2714 vm_paddr_t paddr; 2715 vm_page_t nkpg; 2716 pd_entry_t *pde, newpdir; 2717 pdp_entry_t *pdpe; 2718 2719 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2720 2721 /* 2722 * Return if "addr" is within the range of kernel page table pages 2723 * that were preallocated during pmap bootstrap. Moreover, leave 2724 * "kernel_vm_end" and the kernel page table as they were. 2725 * 2726 * The correctness of this action is based on the following 2727 * argument: vm_map_insert() allocates contiguous ranges of the 2728 * kernel virtual address space. It calls this function if a range 2729 * ends after "kernel_vm_end". If the kernel is mapped between 2730 * "kernel_vm_end" and "addr", then the range cannot begin at 2731 * "kernel_vm_end". In fact, its beginning address cannot be less 2732 * than the kernel. Thus, there is no immediate need to allocate 2733 * any new kernel page table pages between "kernel_vm_end" and 2734 * "KERNBASE". 
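 * Concretely, the bootstrap page tables already cover the 2MB pages
 * in the range (KERNBASE, KERNBASE + nkpt * NBPDR], which is exactly
 * what the first test below checks.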
2735 */ 2736 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 2737 return; 2738 2739 addr = roundup2(addr, NBPDR); 2740 if (addr - 1 >= kernel_map->max_offset) 2741 addr = kernel_map->max_offset; 2742 while (kernel_vm_end < addr) { 2743 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 2744 if ((*pdpe & X86_PG_V) == 0) { 2745 /* We need a new PDP entry */ 2746 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 2747 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2748 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2749 if (nkpg == NULL) 2750 panic("pmap_growkernel: no memory to grow kernel"); 2751 if ((nkpg->flags & PG_ZERO) == 0) 2752 pmap_zero_page(nkpg); 2753 paddr = VM_PAGE_TO_PHYS(nkpg); 2754 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 2755 X86_PG_A | X86_PG_M); 2756 continue; /* try again */ 2757 } 2758 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 2759 if ((*pde & X86_PG_V) != 0) { 2760 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2761 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2762 kernel_vm_end = kernel_map->max_offset; 2763 break; 2764 } 2765 continue; 2766 } 2767 2768 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 2769 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2770 VM_ALLOC_ZERO); 2771 if (nkpg == NULL) 2772 panic("pmap_growkernel: no memory to grow kernel"); 2773 if ((nkpg->flags & PG_ZERO) == 0) 2774 pmap_zero_page(nkpg); 2775 paddr = VM_PAGE_TO_PHYS(nkpg); 2776 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 2777 pde_store(pde, newpdir); 2778 2779 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2780 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2781 kernel_vm_end = kernel_map->max_offset; 2782 break; 2783 } 2784 } 2785} 2786 2787 2788/*************************************************** 2789 * page management routines. 
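 *
 * A note on the assertions that follow: a pv_chunk occupies exactly
 * one page and supplies _NPCPV == 168 pv entries, whose allocation
 * state is tracked by _NPCM == 3 64-bit bitmap words.  Since
 * 168 == 2 * 64 + 40, PC_FREE0 and PC_FREE1 have all 64 bits set,
 * while PC_FREE2 sets only its low 40 bits.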
2790 ***************************************************/ 2791 2792CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2793CTASSERT(_NPCM == 3); 2794CTASSERT(_NPCPV == 168); 2795 2796static __inline struct pv_chunk * 2797pv_to_chunk(pv_entry_t pv) 2798{ 2799 2800 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2801} 2802 2803#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2804 2805#define PC_FREE0 0xfffffffffffffffful 2806#define PC_FREE1 0xfffffffffffffffful 2807#define PC_FREE2 0x000000fffffffffful 2808 2809static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 2810 2811#ifdef PV_STATS 2812static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2813 2814SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2815 "Current number of pv entry chunks"); 2816SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2817 "Current number of pv entry chunks allocated"); 2818SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2819 "Current number of pv entry chunks frees"); 2820SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2821 "Number of times tried to get a chunk page but failed."); 2822 2823static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2824static int pv_entry_spare; 2825 2826SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2827 "Current number of pv entry frees"); 2828SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2829 "Current number of pv entry allocs"); 2830SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2831 "Current number of pv entries"); 2832SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2833 "Current number of spare pv entries"); 2834#endif 2835 2836/* 2837 * We are in a serious low memory condition. Resort to 2838 * drastic measures to free some pages so we can allocate 2839 * another pv entry chunk. 2840 * 2841 * Returns NULL if PV entries were reclaimed from the specified pmap. 2842 * 2843 * We do not, however, unmap 2mpages because subsequent accesses will 2844 * allocate per-page pv entries until repromotion occurs, thereby 2845 * exacerbating the shortage of free pv entries. 2846 */ 2847static vm_page_t 2848reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2849{ 2850 struct pch new_tail; 2851 struct pv_chunk *pc; 2852 struct md_page *pvh; 2853 pd_entry_t *pde; 2854 pmap_t pmap; 2855 pt_entry_t *pte, tpte; 2856 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 2857 pv_entry_t pv; 2858 vm_offset_t va; 2859 vm_page_t m, m_pc; 2860 struct spglist free; 2861 uint64_t inuse; 2862 int bit, field, freed; 2863 2864 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2865 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2866 pmap = NULL; 2867 m_pc = NULL; 2868 PG_G = PG_A = PG_M = PG_RW = 0; 2869 SLIST_INIT(&free); 2870 TAILQ_INIT(&new_tail); 2871 pmap_delayed_invl_started(); 2872 mtx_lock(&pv_chunks_mutex); 2873 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) { 2874 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2875 mtx_unlock(&pv_chunks_mutex); 2876 if (pmap != pc->pc_pmap) { 2877 if (pmap != NULL) { 2878 pmap_invalidate_all(pmap); 2879 if (pmap != locked_pmap) 2880 PMAP_UNLOCK(pmap); 2881 } 2882 pmap_delayed_invl_finished(); 2883 pmap_delayed_invl_started(); 2884 pmap = pc->pc_pmap; 2885 /* Avoid deadlock and lock recursion. 
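 * Pmap locks are taken in address order: a pmap whose address
 * compares higher than locked_pmap's may be locked unconditionally,
 * while any other pmap is merely trylocked and skipped on failure.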
*/ 2886 if (pmap > locked_pmap) { 2887 RELEASE_PV_LIST_LOCK(lockp); 2888 PMAP_LOCK(pmap); 2889 } else if (pmap != locked_pmap && 2890 !PMAP_TRYLOCK(pmap)) { 2891 pmap = NULL; 2892 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2893 mtx_lock(&pv_chunks_mutex); 2894 continue; 2895 } 2896 PG_G = pmap_global_bit(pmap); 2897 PG_A = pmap_accessed_bit(pmap); 2898 PG_M = pmap_modified_bit(pmap); 2899 PG_RW = pmap_rw_bit(pmap); 2900 } 2901 2902 /* 2903 * Destroy every non-wired, 4 KB page mapping in the chunk. 2904 */ 2905 freed = 0; 2906 for (field = 0; field < _NPCM; field++) { 2907 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2908 inuse != 0; inuse &= ~(1UL << bit)) { 2909 bit = bsfq(inuse); 2910 pv = &pc->pc_pventry[field * 64 + bit]; 2911 va = pv->pv_va; 2912 pde = pmap_pde(pmap, va); 2913 if ((*pde & PG_PS) != 0) 2914 continue; 2915 pte = pmap_pde_to_pte(pde, va); 2916 if ((*pte & PG_W) != 0) 2917 continue; 2918 tpte = pte_load_clear(pte); 2919 if ((tpte & PG_G) != 0) 2920 pmap_invalidate_page(pmap, va); 2921 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2922 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2923 vm_page_dirty(m); 2924 if ((tpte & PG_A) != 0) 2925 vm_page_aflag_set(m, PGA_REFERENCED); 2926 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2927 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2928 m->md.pv_gen++; 2929 if (TAILQ_EMPTY(&m->md.pv_list) && 2930 (m->flags & PG_FICTITIOUS) == 0) { 2931 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2932 if (TAILQ_EMPTY(&pvh->pv_list)) { 2933 vm_page_aflag_clear(m, 2934 PGA_WRITEABLE); 2935 } 2936 } 2937 pmap_delayed_invl_page(m); 2938 pc->pc_map[field] |= 1UL << bit; 2939 pmap_unuse_pt(pmap, va, *pde, &free); 2940 freed++; 2941 } 2942 } 2943 if (freed == 0) { 2944 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2945 mtx_lock(&pv_chunks_mutex); 2946 continue; 2947 } 2948 /* Every freed mapping is for a 4 KB page. */ 2949 pmap_resident_count_dec(pmap, freed); 2950 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2951 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2952 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2953 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2954 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2955 pc->pc_map[2] == PC_FREE2) { 2956 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2957 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2958 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2959 /* Entire chunk is free; return it. */ 2960 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2961 dump_drop_page(m_pc->phys_addr); 2962 mtx_lock(&pv_chunks_mutex); 2963 break; 2964 } 2965 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2966 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2967 mtx_lock(&pv_chunks_mutex); 2968 /* One freed pv entry in locked_pmap is sufficient. */ 2969 if (pmap == locked_pmap) 2970 break; 2971 } 2972 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2973 mtx_unlock(&pv_chunks_mutex); 2974 if (pmap != NULL) { 2975 pmap_invalidate_all(pmap); 2976 if (pmap != locked_pmap) 2977 PMAP_UNLOCK(pmap); 2978 } 2979 pmap_delayed_invl_finished(); 2980 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2981 m_pc = SLIST_FIRST(&free); 2982 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2983 /* Recycle a freed page table page. 
*/ 2984 m_pc->wire_count = 1; 2985 atomic_add_int(&vm_cnt.v_wire_count, 1); 2986 } 2987 pmap_free_zero_pages(&free); 2988 return (m_pc); 2989} 2990 2991/* 2992 * free the pv_entry back to the free list 2993 */ 2994static void 2995free_pv_entry(pmap_t pmap, pv_entry_t pv) 2996{ 2997 struct pv_chunk *pc; 2998 int idx, field, bit; 2999 3000 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3001 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3002 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3003 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3004 pc = pv_to_chunk(pv); 3005 idx = pv - &pc->pc_pventry[0]; 3006 field = idx / 64; 3007 bit = idx % 64; 3008 pc->pc_map[field] |= 1ul << bit; 3009 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 3010 pc->pc_map[2] != PC_FREE2) { 3011 /* 98% of the time, pc is already at the head of the list. */ 3012 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3013 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3014 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3015 } 3016 return; 3017 } 3018 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3019 free_pv_chunk(pc); 3020} 3021 3022static void 3023free_pv_chunk(struct pv_chunk *pc) 3024{ 3025 vm_page_t m; 3026 3027 mtx_lock(&pv_chunks_mutex); 3028 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3029 mtx_unlock(&pv_chunks_mutex); 3030 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3031 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3032 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3033 /* entire chunk is free, return it */ 3034 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3035 dump_drop_page(m->phys_addr); 3036 vm_page_unwire(m, PQ_NONE); 3037 vm_page_free(m); 3038} 3039 3040/* 3041 * Returns a new PV entry, allocating a new PV chunk from the system when 3042 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3043 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3044 * returned. 3045 * 3046 * The given PV list lock may be released. 
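 *
 * A worked example of the bitmap scan below (hypothetical values):
 * if pc_map[0] == 0 and pc_map[1] == 0x30, then bsfq() returns 4 and
 * the entry at index 1 * 64 + 4 == 68 is handed out, after which bit
 * 4 of pc_map[1] is cleared.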
3047 */ 3048static pv_entry_t 3049get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3050{ 3051 int bit, field; 3052 pv_entry_t pv; 3053 struct pv_chunk *pc; 3054 vm_page_t m; 3055 3056 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3057 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3058retry: 3059 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3060 if (pc != NULL) { 3061 for (field = 0; field < _NPCM; field++) { 3062 if (pc->pc_map[field]) { 3063 bit = bsfq(pc->pc_map[field]); 3064 break; 3065 } 3066 } 3067 if (field < _NPCM) { 3068 pv = &pc->pc_pventry[field * 64 + bit]; 3069 pc->pc_map[field] &= ~(1ul << bit); 3070 /* If this was the last item, move it to tail */ 3071 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 3072 pc->pc_map[2] == 0) { 3073 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3074 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3075 pc_list); 3076 } 3077 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3078 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3079 return (pv); 3080 } 3081 } 3082 /* No free items, allocate another chunk */ 3083 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3084 VM_ALLOC_WIRED); 3085 if (m == NULL) { 3086 if (lockp == NULL) { 3087 PV_STAT(pc_chunk_tryfail++); 3088 return (NULL); 3089 } 3090 m = reclaim_pv_chunk(pmap, lockp); 3091 if (m == NULL) 3092 goto retry; 3093 } 3094 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3095 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3096 dump_add_page(m->phys_addr); 3097 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3098 pc->pc_pmap = pmap; 3099 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 3100 pc->pc_map[1] = PC_FREE1; 3101 pc->pc_map[2] = PC_FREE2; 3102 mtx_lock(&pv_chunks_mutex); 3103 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3104 mtx_unlock(&pv_chunks_mutex); 3105 pv = &pc->pc_pventry[0]; 3106 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3107 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3108 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3109 return (pv); 3110} 3111 3112/* 3113 * Returns the number of one bits within the given PV chunk map. 3114 * 3115 * The errata for Intel processors state that "POPCNT Instruction May 3116 * Take Longer to Execute Than Expected". It is believed that the 3117 * issue is the spurious dependency on the destination register. 3118 * Provide a hint to the register rename logic that the destination 3119 * value is overwritten, by clearing it, as suggested in the 3120 * optimization manual. It should be cheap for unaffected processors 3121 * as well. 3122 * 3123 * Reference numbers for these errata are 3124 * 4th Gen Core: HSD146 3125 * 5th Gen Core: BDM85 3126 * 6th Gen Core: SKL029 3127 */ 3128static int 3129popcnt_pc_map_pq(uint64_t *map) 3130{ 3131 u_long result, tmp; 3132 3133 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 3134 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 3135 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 3136 : "=&r" (result), "=&r" (tmp) 3137 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 3138 return (result); 3139} 3140 3141/* 3142 * Ensure that the number of spare PV entries in the specified pmap meets or 3143 * exceeds the given count, "needed". 3144 * 3145 * The given PV list lock may be released.
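 * For instance, demoting a 2MB page mapping requires NPTEPG - 1 == 511
 * spare entries (see pmap_demote_pde_locked() below), which is more
 * than a single 168-entry chunk can provide.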
3146 */ 3147static void 3148reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3149{ 3150 struct pch new_tail; 3151 struct pv_chunk *pc; 3152 int avail, free; 3153 vm_page_t m; 3154 3155 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3156 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3157 3158 /* 3159 * Newly allocated PV chunks must be stored in a private list until 3160 * the required number of PV chunks have been allocated. Otherwise, 3161 * reclaim_pv_chunk() could recycle one of these chunks. In 3162 * contrast, these chunks must be added to the pmap upon allocation. 3163 */ 3164 TAILQ_INIT(&new_tail); 3165retry: 3166 avail = 0; 3167 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3168#ifndef __POPCNT__ 3169 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 3170 bit_count((bitstr_t *)pc->pc_map, 0, 3171 sizeof(pc->pc_map) * NBBY, &free); 3172 else 3173#endif 3174 free = popcnt_pc_map_pq(pc->pc_map); 3175 if (free == 0) 3176 break; 3177 avail += free; 3178 if (avail >= needed) 3179 break; 3180 } 3181 for (; avail < needed; avail += _NPCPV) { 3182 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3183 VM_ALLOC_WIRED); 3184 if (m == NULL) { 3185 m = reclaim_pv_chunk(pmap, lockp); 3186 if (m == NULL) 3187 goto retry; 3188 } 3189 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3190 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3191 dump_add_page(m->phys_addr); 3192 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3193 pc->pc_pmap = pmap; 3194 pc->pc_map[0] = PC_FREE0; 3195 pc->pc_map[1] = PC_FREE1; 3196 pc->pc_map[2] = PC_FREE2; 3197 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3198 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 3199 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3200 } 3201 if (!TAILQ_EMPTY(&new_tail)) { 3202 mtx_lock(&pv_chunks_mutex); 3203 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 3204 mtx_unlock(&pv_chunks_mutex); 3205 } 3206} 3207 3208/* 3209 * First find and then remove the pv entry for the specified pmap and virtual 3210 * address from the specified pv list. Returns the pv entry if found and NULL 3211 * otherwise. This operation can be performed on pv lists for either 4KB or 3212 * 2MB page mappings. 3213 */ 3214static __inline pv_entry_t 3215pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3216{ 3217 pv_entry_t pv; 3218 3219 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3220 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3221 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3222 pvh->pv_gen++; 3223 break; 3224 } 3225 } 3226 return (pv); 3227} 3228 3229/* 3230 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3231 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3232 * entries for each of the 4KB page mappings. 3233 */ 3234static void 3235pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3236 struct rwlock **lockp) 3237{ 3238 struct md_page *pvh; 3239 struct pv_chunk *pc; 3240 pv_entry_t pv; 3241 vm_offset_t va_last; 3242 vm_page_t m; 3243 int bit, field; 3244 3245 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3246 KASSERT((pa & PDRMASK) == 0, 3247 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 3248 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3249 3250 /* 3251 * Transfer the 2mpage's pv entry for this mapping to the first 3252 * page's pv list. Once this transfer begins, the pv list lock 3253 * must not be released until the last pv entry is reinstantiated. 
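 * Only NPTEPG - 1 == 511 new pv entries are consumed here, because
 * the 2mpage's own pv entry is reused for the first 4KB page; the
 * caller is expected to have reserved them via reserve_pv_entries().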
3254 */ 3255 pvh = pa_to_pvh(pa); 3256 va = trunc_2mpage(va); 3257 pv = pmap_pvh_remove(pvh, pmap, va); 3258 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 3259 m = PHYS_TO_VM_PAGE(pa); 3260 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3261 m->md.pv_gen++; 3262 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 3263 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 3264 va_last = va + NBPDR - PAGE_SIZE; 3265 for (;;) { 3266 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3267 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 3268 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 3269 for (field = 0; field < _NPCM; field++) { 3270 while (pc->pc_map[field]) { 3271 bit = bsfq(pc->pc_map[field]); 3272 pc->pc_map[field] &= ~(1ul << bit); 3273 pv = &pc->pc_pventry[field * 64 + bit]; 3274 va += PAGE_SIZE; 3275 pv->pv_va = va; 3276 m++; 3277 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3278 ("pmap_pv_demote_pde: page %p is not managed", m)); 3279 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3280 m->md.pv_gen++; 3281 if (va == va_last) 3282 goto out; 3283 } 3284 } 3285 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3286 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3287 } 3288out: 3289 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 3290 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3291 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3292 } 3293 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 3294 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 3295} 3296 3297/* 3298 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3299 * replace the many pv entries for the 4KB page mappings by a single pv entry 3300 * for the 2MB page mapping. 3301 */ 3302static void 3303pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3304 struct rwlock **lockp) 3305{ 3306 struct md_page *pvh; 3307 pv_entry_t pv; 3308 vm_offset_t va_last; 3309 vm_page_t m; 3310 3311 KASSERT((pa & PDRMASK) == 0, 3312 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 3313 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3314 3315 /* 3316 * Transfer the first page's pv entry for this mapping to the 2mpage's 3317 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3318 * a transfer avoids the possibility that get_pv_entry() calls 3319 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3320 * mappings that is being promoted. 3321 */ 3322 m = PHYS_TO_VM_PAGE(pa); 3323 va = trunc_2mpage(va); 3324 pv = pmap_pvh_remove(&m->md, pmap, va); 3325 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 3326 pvh = pa_to_pvh(pa); 3327 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3328 pvh->pv_gen++; 3329 /* Free the remaining NPTEPG - 1 pv entries. */ 3330 va_last = va + NBPDR - PAGE_SIZE; 3331 do { 3332 m++; 3333 va += PAGE_SIZE; 3334 pmap_pvh_free(&m->md, pmap, va); 3335 } while (va < va_last); 3336} 3337 3338/* 3339 * First find and then destroy the pv entry for the specified pmap and virtual 3340 * address. This operation can be performed on pv lists for either 4KB or 2MB 3341 * page mappings. 3342 */ 3343static void 3344pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3345{ 3346 pv_entry_t pv; 3347 3348 pv = pmap_pvh_remove(pvh, pmap, va); 3349 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3350 free_pv_entry(pmap, pv); 3351} 3352 3353/* 3354 * Conditionally create the PV entry for a 4KB page mapping if the required 3355 * memory can be allocated without resorting to reclamation. 
3356 */ 3357static boolean_t 3358pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3359 struct rwlock **lockp) 3360{ 3361 pv_entry_t pv; 3362 3363 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3364 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3365 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3366 pv->pv_va = va; 3367 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3368 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3369 m->md.pv_gen++; 3370 return (TRUE); 3371 } else 3372 return (FALSE); 3373} 3374 3375/* 3376 * Conditionally create the PV entry for a 2MB page mapping if the required 3377 * memory can be allocated without resorting to reclamation. 3378 */ 3379static boolean_t 3380pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3381 struct rwlock **lockp) 3382{ 3383 struct md_page *pvh; 3384 pv_entry_t pv; 3385 3386 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3387 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3388 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3389 pv->pv_va = va; 3390 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3391 pvh = pa_to_pvh(pa); 3392 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3393 pvh->pv_gen++; 3394 return (TRUE); 3395 } else 3396 return (FALSE); 3397} 3398 3399/* 3400 * Fills a page table page with mappings to consecutive physical pages. 3401 */ 3402static void 3403pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 3404{ 3405 pt_entry_t *pte; 3406 3407 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 3408 *pte = newpte; 3409 newpte += PAGE_SIZE; 3410 } 3411} 3412 3413/* 3414 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 3415 * mapping is invalidated. 3416 */ 3417static boolean_t 3418pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3419{ 3420 struct rwlock *lock; 3421 boolean_t rv; 3422 3423 lock = NULL; 3424 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 3425 if (lock != NULL) 3426 rw_wunlock(lock); 3427 return (rv); 3428} 3429 3430static boolean_t 3431pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 3432 struct rwlock **lockp) 3433{ 3434 pd_entry_t newpde, oldpde; 3435 pt_entry_t *firstpte, newpte; 3436 pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; 3437 vm_paddr_t mptepa; 3438 vm_page_t mpte; 3439 struct spglist free; 3440 int PG_PTE_CACHE; 3441 3442 PG_G = pmap_global_bit(pmap); 3443 PG_A = pmap_accessed_bit(pmap); 3444 PG_M = pmap_modified_bit(pmap); 3445 PG_RW = pmap_rw_bit(pmap); 3446 PG_V = pmap_valid_bit(pmap); 3447 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 3448 3449 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3450 oldpde = *pde; 3451 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 3452 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 3453 if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != 3454 NULL) 3455 pmap_remove_pt_page(pmap, mpte); 3456 else { 3457 KASSERT((oldpde & PG_W) == 0, 3458 ("pmap_demote_pde: page table page for a wired mapping" 3459 " is missing")); 3460 3461 /* 3462 * Invalidate the 2MB page mapping and return "failure" if the 3463 * mapping was never accessed or the allocation of the new 3464 * page table page fails. If the 2MB page mapping belongs to 3465 * the direct map region of the kernel's address space, then 3466 * the page allocation request specifies the highest possible 3467 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 3468 * normal. 
Page table pages are preallocated for every other 3469 * part of the kernel address space, so the direct map region 3470 * is the only part of the kernel address space that must be 3471 * handled here. 3472 */ 3473 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 3474 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 3475 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 3476 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3477 SLIST_INIT(&free); 3478 pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free, 3479 lockp); 3480 pmap_invalidate_page(pmap, trunc_2mpage(va)); 3481 pmap_free_zero_pages(&free); 3482 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 3483 " in pmap %p", va, pmap); 3484 return (FALSE); 3485 } 3486 if (va < VM_MAXUSER_ADDRESS) 3487 pmap_resident_count_inc(pmap, 1); 3488 } 3489 mptepa = VM_PAGE_TO_PHYS(mpte); 3490 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 3491 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 3492 KASSERT((oldpde & PG_A) != 0, 3493 ("pmap_demote_pde: oldpde is missing PG_A")); 3494 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 3495 ("pmap_demote_pde: oldpde is missing PG_M")); 3496 newpte = oldpde & ~PG_PS; 3497 newpte = pmap_swap_pat(pmap, newpte); 3498 3499 /* 3500 * If the page table page is new, initialize it. 3501 */ 3502 if (mpte->wire_count == 1) { 3503 mpte->wire_count = NPTEPG; 3504 pmap_fill_ptp(firstpte, newpte); 3505 } 3506 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 3507 ("pmap_demote_pde: firstpte and newpte map different physical" 3508 " addresses")); 3509 3510 /* 3511 * If the mapping has changed attributes, update the page table 3512 * entries. 3513 */ 3514 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 3515 pmap_fill_ptp(firstpte, newpte); 3516 3517 /* 3518 * The spare PV entries must be reserved prior to demoting the 3519 * mapping, that is, prior to changing the PDE. Otherwise, the state 3520 * of the PDE and the PV lists will be inconsistent, which can result 3521 * in reclaim_pv_chunk() attempting to remove a PV entry from the 3522 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 3523 * PV entry for the 2MB page mapping that is being demoted. 3524 */ 3525 if ((oldpde & PG_MANAGED) != 0) 3526 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 3527 3528 /* 3529 * Demote the mapping. This pmap is locked. The old PDE has 3530 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 3531 * set. Thus, there is no danger of a race with another 3532 * processor changing the setting of PG_A and/or PG_M between 3533 * the read above and the store below. 3534 */ 3535 if (workaround_erratum383) 3536 pmap_update_pde(pmap, va, pde, newpde); 3537 else 3538 pde_store(pde, newpde); 3539 3540 /* 3541 * Invalidate a stale recursive mapping of the page table page. 3542 */ 3543 if (va >= VM_MAXUSER_ADDRESS) 3544 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3545 3546 /* 3547 * Demote the PV entry. 3548 */ 3549 if ((oldpde & PG_MANAGED) != 0) 3550 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 3551 3552 atomic_add_long(&pmap_pde_demotions, 1); 3553 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 3554 " in pmap %p", va, pmap); 3555 return (TRUE); 3556} 3557 3558/* 3559 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
3560 */ 3561static void 3562pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3563{ 3564 pd_entry_t newpde; 3565 vm_paddr_t mptepa; 3566 vm_page_t mpte; 3567 3568 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3569 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3570 mpte = pmap_lookup_pt_page(pmap, va); 3571 if (mpte == NULL) 3572 panic("pmap_remove_kernel_pde: Missing pt page."); 3573 3574 pmap_remove_pt_page(pmap, mpte); 3575 mptepa = VM_PAGE_TO_PHYS(mpte); 3576 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 3577 3578 /* 3579 * Initialize the page table page. 3580 */ 3581 pagezero((void *)PHYS_TO_DMAP(mptepa)); 3582 3583 /* 3584 * Demote the mapping. 3585 */ 3586 if (workaround_erratum383) 3587 pmap_update_pde(pmap, va, pde, newpde); 3588 else 3589 pde_store(pde, newpde); 3590 3591 /* 3592 * Invalidate a stale recursive mapping of the page table page. 3593 */ 3594 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3595} 3596 3597/* 3598 * pmap_remove_pde: unmap a 2MB superpage from the given pmap. 3599 */ 3600static int 3601pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 3602 struct spglist *free, struct rwlock **lockp) 3603{ 3604 struct md_page *pvh; 3605 pd_entry_t oldpde; 3606 vm_offset_t eva, va; 3607 vm_page_t m, mpte; 3608 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 3609 3610 PG_G = pmap_global_bit(pmap); 3611 PG_A = pmap_accessed_bit(pmap); 3612 PG_M = pmap_modified_bit(pmap); 3613 PG_RW = pmap_rw_bit(pmap); 3614 3615 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3616 KASSERT((sva & PDRMASK) == 0, 3617 ("pmap_remove_pde: sva is not 2mpage aligned")); 3618 oldpde = pte_load_clear(pdq); 3619 if (oldpde & PG_W) 3620 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 3621 3622 /* 3623 * Machines that don't support invlpg also don't support 3624 * PG_G.
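 * A PG_G mapping can stay cached in the TLB across a CR3 reload, so
 * it is invalidated against kernel_pmap immediately below rather than
 * being deferred to the caller's batched invalidation.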
3625 */ 3626 if (oldpde & PG_G) 3627 pmap_invalidate_page(kernel_pmap, sva); 3628 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 3629 if (oldpde & PG_MANAGED) { 3630 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 3631 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 3632 pmap_pvh_free(pvh, pmap, sva); 3633 eva = sva + NBPDR; 3634 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3635 va < eva; va += PAGE_SIZE, m++) { 3636 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3637 vm_page_dirty(m); 3638 if (oldpde & PG_A) 3639 vm_page_aflag_set(m, PGA_REFERENCED); 3640 if (TAILQ_EMPTY(&m->md.pv_list) && 3641 TAILQ_EMPTY(&pvh->pv_list)) 3642 vm_page_aflag_clear(m, PGA_WRITEABLE); 3643 pmap_delayed_invl_page(m); 3644 } 3645 } 3646 if (pmap == kernel_pmap) { 3647 pmap_remove_kernel_pde(pmap, pdq, sva); 3648 } else { 3649 mpte = pmap_lookup_pt_page(pmap, sva); 3650 if (mpte != NULL) { 3651 pmap_remove_pt_page(pmap, mpte); 3652 pmap_resident_count_dec(pmap, 1); 3653 KASSERT(mpte->wire_count == NPTEPG, 3654 ("pmap_remove_pde: pte page wire count error")); 3655 mpte->wire_count = 0; 3656 pmap_add_delayed_free_list(mpte, free, FALSE); 3657 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 3658 } 3659 } 3660 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 3661} 3662 3663/* 3664 * pmap_remove_pte: do the things to unmap a page in a process 3665 */ 3666static int 3667pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 3668 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 3669{ 3670 struct md_page *pvh; 3671 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 3672 vm_page_t m; 3673 3674 PG_A = pmap_accessed_bit(pmap); 3675 PG_M = pmap_modified_bit(pmap); 3676 PG_RW = pmap_rw_bit(pmap); 3677 3678 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3679 oldpte = pte_load_clear(ptq); 3680 if (oldpte & PG_W) 3681 pmap->pm_stats.wired_count -= 1; 3682 pmap_resident_count_dec(pmap, 1); 3683 if (oldpte & PG_MANAGED) { 3684 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 3685 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3686 vm_page_dirty(m); 3687 if (oldpte & PG_A) 3688 vm_page_aflag_set(m, PGA_REFERENCED); 3689 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3690 pmap_pvh_free(&m->md, pmap, va); 3691 if (TAILQ_EMPTY(&m->md.pv_list) && 3692 (m->flags & PG_FICTITIOUS) == 0) { 3693 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3694 if (TAILQ_EMPTY(&pvh->pv_list)) 3695 vm_page_aflag_clear(m, PGA_WRITEABLE); 3696 } 3697 pmap_delayed_invl_page(m); 3698 } 3699 return (pmap_unuse_pt(pmap, va, ptepde, free)); 3700} 3701 3702/* 3703 * Remove a single page from a process address space 3704 */ 3705static void 3706pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 3707 struct spglist *free) 3708{ 3709 struct rwlock *lock; 3710 pt_entry_t *pte, PG_V; 3711 3712 PG_V = pmap_valid_bit(pmap); 3713 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3714 if ((*pde & PG_V) == 0) 3715 return; 3716 pte = pmap_pde_to_pte(pde, va); 3717 if ((*pte & PG_V) == 0) 3718 return; 3719 lock = NULL; 3720 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 3721 if (lock != NULL) 3722 rw_wunlock(lock); 3723 pmap_invalidate_page(pmap, va); 3724} 3725 3726/* 3727 * Remove the given range of addresses from the specified map. 3728 * 3729 * It is assumed that the start and end are properly 3730 * rounded to the page size. 
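 *
 * The removal loop below batches TLB invalidations: runs that contain
 * global (PG_G) mappings are flushed eagerly by range, while the
 * removal of any non-global mapping merely sets "anyvalid" so that a
 * single pmap_invalidate_all() can be performed at the end.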
3731 */ 3732void 3733pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3734{ 3735 struct rwlock *lock; 3736 vm_offset_t va, va_next; 3737 pml4_entry_t *pml4e; 3738 pdp_entry_t *pdpe; 3739 pd_entry_t ptpaddr, *pde; 3740 pt_entry_t *pte, PG_G, PG_V; 3741 struct spglist free; 3742 int anyvalid; 3743 3744 PG_G = pmap_global_bit(pmap); 3745 PG_V = pmap_valid_bit(pmap); 3746 3747 /* 3748 * Perform an unsynchronized read. This is, however, safe. 3749 */ 3750 if (pmap->pm_stats.resident_count == 0) 3751 return; 3752 3753 anyvalid = 0; 3754 SLIST_INIT(&free); 3755 3756 pmap_delayed_invl_started(); 3757 PMAP_LOCK(pmap); 3758 3759 /* 3760 * special handling of removing one page. a very 3761 * common operation and easy to short circuit some 3762 * code. 3763 */ 3764 if (sva + PAGE_SIZE == eva) { 3765 pde = pmap_pde(pmap, sva); 3766 if (pde && (*pde & PG_PS) == 0) { 3767 pmap_remove_page(pmap, sva, pde, &free); 3768 goto out; 3769 } 3770 } 3771 3772 lock = NULL; 3773 for (; sva < eva; sva = va_next) { 3774 3775 if (pmap->pm_stats.resident_count == 0) 3776 break; 3777 3778 pml4e = pmap_pml4e(pmap, sva); 3779 if ((*pml4e & PG_V) == 0) { 3780 va_next = (sva + NBPML4) & ~PML4MASK; 3781 if (va_next < sva) 3782 va_next = eva; 3783 continue; 3784 } 3785 3786 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3787 if ((*pdpe & PG_V) == 0) { 3788 va_next = (sva + NBPDP) & ~PDPMASK; 3789 if (va_next < sva) 3790 va_next = eva; 3791 continue; 3792 } 3793 3794 /* 3795 * Calculate index for next page table. 3796 */ 3797 va_next = (sva + NBPDR) & ~PDRMASK; 3798 if (va_next < sva) 3799 va_next = eva; 3800 3801 pde = pmap_pdpe_to_pde(pdpe, sva); 3802 ptpaddr = *pde; 3803 3804 /* 3805 * Weed out invalid mappings. 3806 */ 3807 if (ptpaddr == 0) 3808 continue; 3809 3810 /* 3811 * Check for large page. 3812 */ 3813 if ((ptpaddr & PG_PS) != 0) { 3814 /* 3815 * Are we removing the entire large page? If not, 3816 * demote the mapping and fall through. 3817 */ 3818 if (sva + NBPDR == va_next && eva >= va_next) { 3819 /* 3820 * The TLB entry for a PG_G mapping is 3821 * invalidated by pmap_remove_pde(). 3822 */ 3823 if ((ptpaddr & PG_G) == 0) 3824 anyvalid = 1; 3825 pmap_remove_pde(pmap, pde, sva, &free, &lock); 3826 continue; 3827 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 3828 &lock)) { 3829 /* The large page mapping was destroyed. */ 3830 continue; 3831 } else 3832 ptpaddr = *pde; 3833 } 3834 3835 /* 3836 * Limit our scan to either the end of the va represented 3837 * by the current page table page, or to the end of the 3838 * range being removed. 3839 */ 3840 if (va_next > eva) 3841 va_next = eva; 3842 3843 va = va_next; 3844 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3845 sva += PAGE_SIZE) { 3846 if (*pte == 0) { 3847 if (va != va_next) { 3848 pmap_invalidate_range(pmap, va, sva); 3849 va = va_next; 3850 } 3851 continue; 3852 } 3853 if ((*pte & PG_G) == 0) 3854 anyvalid = 1; 3855 else if (va == va_next) 3856 va = sva; 3857 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free, 3858 &lock)) { 3859 sva += PAGE_SIZE; 3860 break; 3861 } 3862 } 3863 if (va != va_next) 3864 pmap_invalidate_range(pmap, va, sva); 3865 } 3866 if (lock != NULL) 3867 rw_wunlock(lock); 3868out: 3869 if (anyvalid) 3870 pmap_invalidate_all(pmap); 3871 PMAP_UNLOCK(pmap); 3872 pmap_delayed_invl_finished(); 3873 pmap_free_zero_pages(&free); 3874} 3875 3876/* 3877 * Routine: pmap_remove_all 3878 * Function: 3879 * Removes this physical page from 3880 * all physical maps in which it resides. 3881 * Reflects back modify bits to the pager. 
3882 * 3883 * Notes: 3884 * Original versions of this routine were very 3885 * inefficient because they iteratively called 3886 * pmap_remove (slow...) 3887 */ 3888 3889void 3890pmap_remove_all(vm_page_t m) 3891{ 3892 struct md_page *pvh; 3893 pv_entry_t pv; 3894 pmap_t pmap; 3895 struct rwlock *lock; 3896 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 3897 pd_entry_t *pde; 3898 vm_offset_t va; 3899 struct spglist free; 3900 int pvh_gen, md_gen; 3901 3902 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3903 ("pmap_remove_all: page %p is not managed", m)); 3904 SLIST_INIT(&free); 3905 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3906 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 3907 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3908retry: 3909 rw_wlock(lock); 3910 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3911 pmap = PV_PMAP(pv); 3912 if (!PMAP_TRYLOCK(pmap)) { 3913 pvh_gen = pvh->pv_gen; 3914 rw_wunlock(lock); 3915 PMAP_LOCK(pmap); 3916 rw_wlock(lock); 3917 if (pvh_gen != pvh->pv_gen) { 3918 rw_wunlock(lock); 3919 PMAP_UNLOCK(pmap); 3920 goto retry; 3921 } 3922 } 3923 va = pv->pv_va; 3924 pde = pmap_pde(pmap, va); 3925 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 3926 PMAP_UNLOCK(pmap); 3927 } 3928 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3929 pmap = PV_PMAP(pv); 3930 if (!PMAP_TRYLOCK(pmap)) { 3931 pvh_gen = pvh->pv_gen; 3932 md_gen = m->md.pv_gen; 3933 rw_wunlock(lock); 3934 PMAP_LOCK(pmap); 3935 rw_wlock(lock); 3936 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3937 rw_wunlock(lock); 3938 PMAP_UNLOCK(pmap); 3939 goto retry; 3940 } 3941 } 3942 PG_A = pmap_accessed_bit(pmap); 3943 PG_M = pmap_modified_bit(pmap); 3944 PG_RW = pmap_rw_bit(pmap); 3945 pmap_resident_count_dec(pmap, 1); 3946 pde = pmap_pde(pmap, pv->pv_va); 3947 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3948 " a 2mpage in page %p's pv list", m)); 3949 pte = pmap_pde_to_pte(pde, pv->pv_va); 3950 tpte = pte_load_clear(pte); 3951 if (tpte & PG_W) 3952 pmap->pm_stats.wired_count--; 3953 if (tpte & PG_A) 3954 vm_page_aflag_set(m, PGA_REFERENCED); 3955 3956 /* 3957 * Update the vm_page_t clean and reference bits. 
3958 */ 3959 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3960 vm_page_dirty(m); 3961 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 3962 pmap_invalidate_page(pmap, pv->pv_va); 3963 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3964 m->md.pv_gen++; 3965 free_pv_entry(pmap, pv); 3966 PMAP_UNLOCK(pmap); 3967 } 3968 vm_page_aflag_clear(m, PGA_WRITEABLE); 3969 rw_wunlock(lock); 3970 pmap_delayed_invl_wait(m); 3971 pmap_free_zero_pages(&free); 3972} 3973 3974/* 3975 * pmap_protect_pde: do the things to protect a 2mpage in a process 3976 */ 3977static boolean_t 3978pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3979{ 3980 pd_entry_t newpde, oldpde; 3981 vm_offset_t eva, va; 3982 vm_page_t m; 3983 boolean_t anychanged; 3984 pt_entry_t PG_G, PG_M, PG_RW; 3985 3986 PG_G = pmap_global_bit(pmap); 3987 PG_M = pmap_modified_bit(pmap); 3988 PG_RW = pmap_rw_bit(pmap); 3989 3990 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3991 KASSERT((sva & PDRMASK) == 0, 3992 ("pmap_protect_pde: sva is not 2mpage aligned")); 3993 anychanged = FALSE; 3994retry: 3995 oldpde = newpde = *pde; 3996 if (oldpde & PG_MANAGED) { 3997 eva = sva + NBPDR; 3998 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3999 va < eva; va += PAGE_SIZE, m++) 4000 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4001 vm_page_dirty(m); 4002 } 4003 if ((prot & VM_PROT_WRITE) == 0) 4004 newpde &= ~(PG_RW | PG_M); 4005 if ((prot & VM_PROT_EXECUTE) == 0) 4006 newpde |= pg_nx; 4007 if (newpde != oldpde) { 4008 if (!atomic_cmpset_long(pde, oldpde, newpde)) 4009 goto retry; 4010 if (oldpde & PG_G) 4011 pmap_invalidate_page(pmap, sva); 4012 else 4013 anychanged = TRUE; 4014 } 4015 return (anychanged); 4016} 4017 4018/* 4019 * Set the physical protection on the 4020 * specified range of this map as requested. 4021 */ 4022void 4023pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4024{ 4025 vm_offset_t va_next; 4026 pml4_entry_t *pml4e; 4027 pdp_entry_t *pdpe; 4028 pd_entry_t ptpaddr, *pde; 4029 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 4030 boolean_t anychanged; 4031 4032 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4033 if (prot == VM_PROT_NONE) { 4034 pmap_remove(pmap, sva, eva); 4035 return; 4036 } 4037 4038 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4039 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4040 return; 4041 4042 PG_G = pmap_global_bit(pmap); 4043 PG_M = pmap_modified_bit(pmap); 4044 PG_V = pmap_valid_bit(pmap); 4045 PG_RW = pmap_rw_bit(pmap); 4046 anychanged = FALSE; 4047 4048 PMAP_LOCK(pmap); 4049 for (; sva < eva; sva = va_next) { 4050 4051 pml4e = pmap_pml4e(pmap, sva); 4052 if ((*pml4e & PG_V) == 0) { 4053 va_next = (sva + NBPML4) & ~PML4MASK; 4054 if (va_next < sva) 4055 va_next = eva; 4056 continue; 4057 } 4058 4059 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4060 if ((*pdpe & PG_V) == 0) { 4061 va_next = (sva + NBPDP) & ~PDPMASK; 4062 if (va_next < sva) 4063 va_next = eva; 4064 continue; 4065 } 4066 4067 va_next = (sva + NBPDR) & ~PDRMASK; 4068 if (va_next < sva) 4069 va_next = eva; 4070 4071 pde = pmap_pdpe_to_pde(pdpe, sva); 4072 ptpaddr = *pde; 4073 4074 /* 4075 * Weed out invalid mappings. 4076 */ 4077 if (ptpaddr == 0) 4078 continue; 4079 4080 /* 4081 * Check for large page. 4082 */ 4083 if ((ptpaddr & PG_PS) != 0) { 4084 /* 4085 * Are we protecting the entire large page? If not, 4086 * demote the mapping and fall through. 
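 * The superpage is handled whole only when sva lies on a 2MB
 * boundary (sva + NBPDR == va_next) and the range covers the rest
 * of it (eva >= va_next); e.g., sva == 0x400000 and eva == 0x800000
 * protect the superpages at 0x400000 and 0x600000 without demoting
 * either one.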
4087 */ 4088 if (sva + NBPDR == va_next && eva >= va_next) { 4089 /* 4090 * The TLB entry for a PG_G mapping is 4091 * invalidated by pmap_protect_pde(). 4092 */ 4093 if (pmap_protect_pde(pmap, pde, sva, prot)) 4094 anychanged = TRUE; 4095 continue; 4096 } else if (!pmap_demote_pde(pmap, pde, sva)) { 4097 /* 4098 * The large page mapping was destroyed. 4099 */ 4100 continue; 4101 } 4102 } 4103 4104 if (va_next > eva) 4105 va_next = eva; 4106 4107 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4108 sva += PAGE_SIZE) { 4109 pt_entry_t obits, pbits; 4110 vm_page_t m; 4111 4112retry: 4113 obits = pbits = *pte; 4114 if ((pbits & PG_V) == 0) 4115 continue; 4116 4117 if ((prot & VM_PROT_WRITE) == 0) { 4118 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4119 (PG_MANAGED | PG_M | PG_RW)) { 4120 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4121 vm_page_dirty(m); 4122 } 4123 pbits &= ~(PG_RW | PG_M); 4124 } 4125 if ((prot & VM_PROT_EXECUTE) == 0) 4126 pbits |= pg_nx; 4127 4128 if (pbits != obits) { 4129 if (!atomic_cmpset_long(pte, obits, pbits)) 4130 goto retry; 4131 if (obits & PG_G) 4132 pmap_invalidate_page(pmap, sva); 4133 else 4134 anychanged = TRUE; 4135 } 4136 } 4137 } 4138 if (anychanged) 4139 pmap_invalidate_all(pmap); 4140 PMAP_UNLOCK(pmap); 4141} 4142 4143/* 4144 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4145 * single page table page (PTP) to a single 2MB page mapping. For promotion 4146 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4147 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4148 * identical characteristics. 4149 */ 4150static void 4151pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 4152 struct rwlock **lockp) 4153{ 4154 pd_entry_t newpde; 4155 pt_entry_t *firstpte, oldpte, pa, *pte; 4156 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; 4157 vm_page_t mpte; 4158 int PG_PTE_CACHE; 4159 4160 PG_A = pmap_accessed_bit(pmap); 4161 PG_G = pmap_global_bit(pmap); 4162 PG_M = pmap_modified_bit(pmap); 4163 PG_V = pmap_valid_bit(pmap); 4164 PG_RW = pmap_rw_bit(pmap); 4165 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4166 4167 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4168 4169 /* 4170 * Examine the first PTE in the specified PTP. Abort if this PTE is 4171 * either invalid, unused, or does not map the first 4KB physical page 4172 * within a 2MB page. 4173 */ 4174 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 4175setpde: 4176 newpde = *firstpte; 4177 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 4178 atomic_add_long(&pmap_pde_p_failures, 1); 4179 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4180 " in pmap %p", va, pmap); 4181 return; 4182 } 4183 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 4184 /* 4185 * When PG_M is already clear, PG_RW can be cleared without 4186 * a TLB invalidation. 4187 */ 4188 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 4189 goto setpde; 4190 newpde &= ~PG_RW; 4191 } 4192 4193 /* 4194 * Examine each of the other PTEs in the specified PTP. Abort if this 4195 * PTE maps an unexpected 4KB physical page or does not have identical 4196 * characteristics to the first PTE. 
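 * Note that (PG_FRAME & PDRMASK) == 0x1ff000 in the test above, so
 * requiring those frame bits of the first PTE to be zero means that
 * it maps the first 4KB page of a 2MB-aligned physical run.  The
 * loop below scans the remaining 511 PTEs backwards, comparing each
 * against "pa", a template combining the expected frame with PG_A
 * and PG_V.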
4197 */ 4198 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 4199 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 4200setpte: 4201 oldpte = *pte; 4202 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 4203 atomic_add_long(&pmap_pde_p_failures, 1); 4204 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4205 " in pmap %p", va, pmap); 4206 return; 4207 } 4208 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 4209 /* 4210 * When PG_M is already clear, PG_RW can be cleared 4211 * without a TLB invalidation. 4212 */ 4213 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 4214 goto setpte; 4215 oldpte &= ~PG_RW; 4216 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 4217 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 4218 (va & ~PDRMASK), pmap); 4219 } 4220 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 4221 atomic_add_long(&pmap_pde_p_failures, 1); 4222 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4223 " in pmap %p", va, pmap); 4224 return; 4225 } 4226 pa -= PAGE_SIZE; 4227 } 4228 4229 /* 4230 * Save the page table page in its current state until the PDE 4231 * mapping the superpage is demoted by pmap_demote_pde() or 4232 * destroyed by pmap_remove_pde(). 4233 */ 4234 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4235 KASSERT(mpte >= vm_page_array && 4236 mpte < &vm_page_array[vm_page_array_size], 4237 ("pmap_promote_pde: page table page is out of range")); 4238 KASSERT(mpte->pindex == pmap_pde_pindex(va), 4239 ("pmap_promote_pde: page table page's pindex is wrong")); 4240 if (pmap_insert_pt_page(pmap, mpte)) { 4241 atomic_add_long(&pmap_pde_p_failures, 1); 4242 CTR2(KTR_PMAP, 4243 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 4244 pmap); 4245 return; 4246 } 4247 4248 /* 4249 * Promote the pv entries. 4250 */ 4251 if ((newpde & PG_MANAGED) != 0) 4252 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 4253 4254 /* 4255 * Propagate the PAT index to its proper position. 4256 */ 4257 newpde = pmap_swap_pat(pmap, newpde); 4258 4259 /* 4260 * Map the superpage. 4261 */ 4262 if (workaround_erratum383) 4263 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 4264 else 4265 pde_store(pde, PG_PS | newpde); 4266 4267 atomic_add_long(&pmap_pde_promotions, 1); 4268 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 4269 " in pmap %p", va, pmap); 4270} 4271 4272/* 4273 * Insert the given physical page (p) at 4274 * the specified virtual address (v) in the 4275 * target physical map with the protection requested. 4276 * 4277 * If specified, the page will be wired down, meaning 4278 * that the related pte can not be reclaimed. 4279 * 4280 * NB: This is the only routine which MAY NOT lazy-evaluate 4281 * or lose information. That is, this routine must actually 4282 * insert this page into the given map NOW. 4283 * 4284 * When destroying both a page table and PV entry, this function 4285 * performs the TLB invalidation before releasing the PV list 4286 * lock, so we do not need pmap_delayed_invl_page() calls here. 
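 *
 * Returns KERN_SUCCESS on success, or KERN_RESOURCE_SHORTAGE when
 * PMAP_ENTER_NOSLEEP is specified and a required page table page
 * cannot be allocated.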
4287 */ 4288int 4289pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4290 u_int flags, int8_t psind __unused) 4291{ 4292 struct rwlock *lock; 4293 pd_entry_t *pde; 4294 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 4295 pt_entry_t newpte, origpte; 4296 pv_entry_t pv; 4297 vm_paddr_t opa, pa; 4298 vm_page_t mpte, om; 4299 boolean_t nosleep; 4300 4301 PG_A = pmap_accessed_bit(pmap); 4302 PG_G = pmap_global_bit(pmap); 4303 PG_M = pmap_modified_bit(pmap); 4304 PG_V = pmap_valid_bit(pmap); 4305 PG_RW = pmap_rw_bit(pmap); 4306 4307 va = trunc_page(va); 4308 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 4309 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 4310 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 4311 va)); 4312 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 4313 va >= kmi.clean_eva, 4314 ("pmap_enter: managed mapping within the clean submap")); 4315 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 4316 VM_OBJECT_ASSERT_LOCKED(m->object); 4317 pa = VM_PAGE_TO_PHYS(m); 4318 newpte = (pt_entry_t)(pa | PG_A | PG_V); 4319 if ((flags & VM_PROT_WRITE) != 0) 4320 newpte |= PG_M; 4321 if ((prot & VM_PROT_WRITE) != 0) 4322 newpte |= PG_RW; 4323 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 4324 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 4325 if ((prot & VM_PROT_EXECUTE) == 0) 4326 newpte |= pg_nx; 4327 if ((flags & PMAP_ENTER_WIRED) != 0) 4328 newpte |= PG_W; 4329 if (va < VM_MAXUSER_ADDRESS) 4330 newpte |= PG_U; 4331 if (pmap == kernel_pmap) 4332 newpte |= PG_G; 4333 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0); 4334 4335 /* 4336 * Set modified bit gratuitously for writeable mappings if 4337 * the page is unmanaged. We do not want to take a fault 4338 * to do the dirty bit accounting for these mappings. 4339 */ 4340 if ((m->oflags & VPO_UNMANAGED) != 0) { 4341 if ((newpte & PG_RW) != 0) 4342 newpte |= PG_M; 4343 } 4344 4345 mpte = NULL; 4346 4347 lock = NULL; 4348 PMAP_LOCK(pmap); 4349 4350 /* 4351 * In the case that a page table page is not 4352 * resident, we are creating it here. 4353 */ 4354retry: 4355 pde = pmap_pde(pmap, va); 4356 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 4357 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 4358 pte = pmap_pde_to_pte(pde, va); 4359 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 4360 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4361 mpte->wire_count++; 4362 } 4363 } else if (va < VM_MAXUSER_ADDRESS) { 4364 /* 4365 * Here if the pte page isn't mapped, or if it has been 4366 * deallocated. 4367 */ 4368 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4369 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), 4370 nosleep ? NULL : &lock); 4371 if (mpte == NULL && nosleep) { 4372 if (lock != NULL) 4373 rw_wunlock(lock); 4374 PMAP_UNLOCK(pmap); 4375 return (KERN_RESOURCE_SHORTAGE); 4376 } 4377 goto retry; 4378 } else 4379 panic("pmap_enter: invalid page directory va=%#lx", va); 4380 4381 origpte = *pte; 4382 4383 /* 4384 * Is the specified virtual address already mapped? 4385 */ 4386 if ((origpte & PG_V) != 0) { 4387 /* 4388 * Wiring change, just update stats. We don't worry about 4389 * wiring PT pages as they remain resident as long as there 4390 * are valid mappings in them. Hence, if a user page is wired, 4391 * the PT page will be also. 
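 * PG_W is a software-defined bit, so a wiring change needs no TLB
 * maintenance; only pm_stats.wired_count is adjusted.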
4392 */ 4393 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 4394 pmap->pm_stats.wired_count++; 4395 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 4396 pmap->pm_stats.wired_count--; 4397 4398 /* 4399 * Remove the extra PT page reference. 4400 */ 4401 if (mpte != NULL) { 4402 mpte->wire_count--; 4403 KASSERT(mpte->wire_count > 0, 4404 ("pmap_enter: missing reference to page table page," 4405 " va: 0x%lx", va)); 4406 } 4407 4408 /* 4409 * Has the physical page changed? 4410 */ 4411 opa = origpte & PG_FRAME; 4412 if (opa == pa) { 4413 /* 4414 * No, might be a protection or wiring change. 4415 */ 4416 if ((origpte & PG_MANAGED) != 0) { 4417 newpte |= PG_MANAGED; 4418 if ((newpte & PG_RW) != 0) 4419 vm_page_aflag_set(m, PGA_WRITEABLE); 4420 } 4421 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 4422 goto unchanged; 4423 goto validate; 4424 } 4425 } else { 4426 /* 4427 * Increment the counters. 4428 */ 4429 if ((newpte & PG_W) != 0) 4430 pmap->pm_stats.wired_count++; 4431 pmap_resident_count_inc(pmap, 1); 4432 } 4433 4434 /* 4435 * Enter on the PV list if part of our managed memory. 4436 */ 4437 if ((m->oflags & VPO_UNMANAGED) == 0) { 4438 newpte |= PG_MANAGED; 4439 pv = get_pv_entry(pmap, &lock); 4440 pv->pv_va = va; 4441 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4442 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4443 m->md.pv_gen++; 4444 if ((newpte & PG_RW) != 0) 4445 vm_page_aflag_set(m, PGA_WRITEABLE); 4446 } 4447 4448 /* 4449 * Update the PTE. 4450 */ 4451 if ((origpte & PG_V) != 0) { 4452validate: 4453 origpte = pte_load_store(pte, newpte); 4454 opa = origpte & PG_FRAME; 4455 if (opa != pa) { 4456 if ((origpte & PG_MANAGED) != 0) { 4457 om = PHYS_TO_VM_PAGE(opa); 4458 if ((origpte & (PG_M | PG_RW)) == (PG_M | 4459 PG_RW)) 4460 vm_page_dirty(om); 4461 if ((origpte & PG_A) != 0) 4462 vm_page_aflag_set(om, PGA_REFERENCED); 4463 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 4464 pmap_pvh_free(&om->md, pmap, va); 4465 if ((om->aflags & PGA_WRITEABLE) != 0 && 4466 TAILQ_EMPTY(&om->md.pv_list) && 4467 ((om->flags & PG_FICTITIOUS) != 0 || 4468 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4469 vm_page_aflag_clear(om, PGA_WRITEABLE); 4470 } 4471 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | 4472 PG_RW)) == (PG_M | PG_RW)) { 4473 if ((origpte & PG_MANAGED) != 0) 4474 vm_page_dirty(m); 4475 4476 /* 4477 * Although the PTE may still have PG_RW set, TLB 4478 * invalidation may nonetheless be required because 4479 * the PTE no longer has PG_M set. 4480 */ 4481 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 4482 /* 4483 * This PTE change does not require TLB invalidation. 4484 */ 4485 goto unchanged; 4486 } 4487 if ((origpte & PG_A) != 0) 4488 pmap_invalidate_page(pmap, va); 4489 } else 4490 pte_store(pte, newpte); 4491 4492unchanged: 4493 4494 /* 4495 * If both the page table page and the reservation are fully 4496 * populated, then attempt promotion. 4497 */ 4498 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 4499 pmap_ps_enabled(pmap) && 4500 (m->flags & PG_FICTITIOUS) == 0 && 4501 vm_reserv_level_iffullpop(m) == 0) 4502 pmap_promote_pde(pmap, pde, va, &lock); 4503 4504 if (lock != NULL) 4505 rw_wunlock(lock); 4506 PMAP_UNLOCK(pmap); 4507 return (KERN_SUCCESS); 4508} 4509 4510/* 4511 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE 4512 * otherwise. 
Fails if (1) a page table page cannot be allocated without 4513 * blocking, (2) a mapping already exists at the specified virtual address, or 4514 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 4515 */ 4516static boolean_t 4517pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4518 struct rwlock **lockp) 4519{ 4520 pd_entry_t *pde, newpde; 4521 pt_entry_t PG_V; 4522 vm_page_t mpde; 4523 struct spglist free; 4524 4525 PG_V = pmap_valid_bit(pmap); 4526 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4527 4528 if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { 4529 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4530 " in pmap %p", va, pmap); 4531 return (FALSE); 4532 } 4533 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); 4534 pde = &pde[pmap_pde_index(va)]; 4535 if ((*pde & PG_V) != 0) { 4536 KASSERT(mpde->wire_count > 1, 4537 ("pmap_enter_pde: mpde's wire count is too low")); 4538 mpde->wire_count--; 4539 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4540 " in pmap %p", va, pmap); 4541 return (FALSE); 4542 } 4543 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 4544 PG_PS | PG_V; 4545 if ((m->oflags & VPO_UNMANAGED) == 0) { 4546 newpde |= PG_MANAGED; 4547 4548 /* 4549 * Abort this mapping if its PV entry could not be created. 4550 */ 4551 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m), 4552 lockp)) { 4553 SLIST_INIT(&free); 4554 if (pmap_unwire_ptp(pmap, va, mpde, &free)) { 4555 /* 4556 * Although "va" is not mapped, paging- 4557 * structure caches could nonetheless have 4558 * entries that refer to the freed page table 4559 * pages. Invalidate those entries. 4560 */ 4561 pmap_invalidate_page(pmap, va); 4562 pmap_free_zero_pages(&free); 4563 } 4564 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4565 " in pmap %p", va, pmap); 4566 return (FALSE); 4567 } 4568 } 4569 if ((prot & VM_PROT_EXECUTE) == 0) 4570 newpde |= pg_nx; 4571 if (va < VM_MAXUSER_ADDRESS) 4572 newpde |= PG_U; 4573 4574 /* 4575 * Increment counters. 4576 */ 4577 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4578 4579 /* 4580 * Map the superpage. 4581 */ 4582 pde_store(pde, newpde); 4583 4584 atomic_add_long(&pmap_pde_mappings, 1); 4585 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 4586 " in pmap %p", va, pmap); 4587 return (TRUE); 4588} 4589 4590/* 4591 * Maps a sequence of resident pages belonging to the same object. 4592 * The sequence begins with the given page m_start. This page is 4593 * mapped at the given virtual address start. Each subsequent page is 4594 * mapped at a virtual address that is offset from start by the same 4595 * amount as the page is offset from m_start within the object. The 4596 * last page in the sequence is the page with the largest offset from 4597 * m_start that can be mapped at a virtual address less than the given 4598 * virtual address end. Not every virtual page between start and end 4599 * is mapped; only those for which a resident page exists with the 4600 * corresponding offset from m_start are mapped. 
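 *
 * A run of pages is mapped with a single 2MB page when the candidate
 * va is superpage aligned ((va & PDRMASK) == 0), the run fits
 * (va + NBPDR <= end), the reservation is fully populated
 * (m->psind == 1), and superpages are enabled; e.g., start ==
 * 0x400000 and end == 0x800000 over populated reservations become
 * two pmap_enter_pde() calls instead of 1024 individual PTE stores.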
4601 */ 4602void 4603pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4604 vm_page_t m_start, vm_prot_t prot) 4605{ 4606 struct rwlock *lock; 4607 vm_offset_t va; 4608 vm_page_t m, mpte; 4609 vm_pindex_t diff, psize; 4610 4611 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4612 4613 psize = atop(end - start); 4614 mpte = NULL; 4615 m = m_start; 4616 lock = NULL; 4617 PMAP_LOCK(pmap); 4618 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4619 va = start + ptoa(diff); 4620 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 4621 m->psind == 1 && pmap_ps_enabled(pmap) && 4622 pmap_enter_pde(pmap, va, m, prot, &lock)) 4623 m = &m[NBPDR / PAGE_SIZE - 1]; 4624 else 4625 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 4626 mpte, &lock); 4627 m = TAILQ_NEXT(m, listq); 4628 } 4629 if (lock != NULL) 4630 rw_wunlock(lock); 4631 PMAP_UNLOCK(pmap); 4632} 4633 4634/* 4635 * this code makes some *MAJOR* assumptions: 4636 * 1. Current pmap & pmap exists. 4637 * 2. Not wired. 4638 * 3. Read access. 4639 * 4. No page table pages. 4640 * but is *MUCH* faster than pmap_enter... 4641 */ 4642 4643void 4644pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4645{ 4646 struct rwlock *lock; 4647 4648 lock = NULL; 4649 PMAP_LOCK(pmap); 4650 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 4651 if (lock != NULL) 4652 rw_wunlock(lock); 4653 PMAP_UNLOCK(pmap); 4654} 4655 4656static vm_page_t 4657pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4658 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 4659{ 4660 struct spglist free; 4661 pt_entry_t *pte, PG_V; 4662 vm_paddr_t pa; 4663 4664 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4665 (m->oflags & VPO_UNMANAGED) != 0, 4666 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4667 PG_V = pmap_valid_bit(pmap); 4668 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4669 4670 /* 4671 * In the case that a page table page is not 4672 * resident, we are creating it here. 4673 */ 4674 if (va < VM_MAXUSER_ADDRESS) { 4675 vm_pindex_t ptepindex; 4676 pd_entry_t *ptepa; 4677 4678 /* 4679 * Calculate pagetable page index 4680 */ 4681 ptepindex = pmap_pde_pindex(va); 4682 if (mpte && (mpte->pindex == ptepindex)) { 4683 mpte->wire_count++; 4684 } else { 4685 /* 4686 * Get the page directory entry 4687 */ 4688 ptepa = pmap_pde(pmap, va); 4689 4690 /* 4691 * If the page table page is mapped, we just increment 4692 * the hold count, and activate it. Otherwise, we 4693 * attempt to allocate a page table page. If this 4694 * attempt fails, we don't retry. Instead, we give up. 4695 */ 4696 if (ptepa && (*ptepa & PG_V) != 0) { 4697 if (*ptepa & PG_PS) 4698 return (NULL); 4699 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 4700 mpte->wire_count++; 4701 } else { 4702 /* 4703 * Pass NULL instead of the PV list lock 4704 * pointer, because we don't intend to sleep. 4705 */ 4706 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 4707 if (mpte == NULL) 4708 return (mpte); 4709 } 4710 } 4711 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4712 pte = &pte[pmap_pte_index(va)]; 4713 } else { 4714 mpte = NULL; 4715 pte = vtopte(va); 4716 } 4717 if (*pte) { 4718 if (mpte != NULL) { 4719 mpte->wire_count--; 4720 mpte = NULL; 4721 } 4722 return (mpte); 4723 } 4724 4725 /* 4726 * Enter on the PV list if part of our managed memory. 
4727 */ 4728 if ((m->oflags & VPO_UNMANAGED) == 0 && 4729 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4730 if (mpte != NULL) { 4731 SLIST_INIT(&free); 4732 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4733 /* 4734 * Although "va" is not mapped, paging- 4735 * structure caches could nonetheless have 4736 * entries that refer to the freed page table 4737 * pages. Invalidate those entries. 4738 */ 4739 pmap_invalidate_page(pmap, va); 4740 pmap_free_zero_pages(&free); 4741 } 4742 mpte = NULL; 4743 } 4744 return (mpte); 4745 } 4746 4747 /* 4748 * Increment counters 4749 */ 4750 pmap_resident_count_inc(pmap, 1); 4751 4752 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); 4753 if ((prot & VM_PROT_EXECUTE) == 0) 4754 pa |= pg_nx; 4755 4756 /* 4757 * Now validate mapping with RO protection 4758 */ 4759 if ((m->oflags & VPO_UNMANAGED) != 0) 4760 pte_store(pte, pa | PG_V | PG_U); 4761 else 4762 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 4763 return (mpte); 4764} 4765 4766/* 4767 * Make a temporary mapping for a physical address. This is only intended 4768 * to be used for panic dumps. 4769 */ 4770void * 4771pmap_kenter_temporary(vm_paddr_t pa, int i) 4772{ 4773 vm_offset_t va; 4774 4775 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 4776 pmap_kenter(va, pa); 4777 invlpg(va); 4778 return ((void *)crashdumpmap); 4779} 4780 4781/* 4782 * This code maps large physical mmap regions into the 4783 * processor address space. Note that some shortcuts 4784 * are taken, but the code works. 4785 */ 4786void 4787pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4788 vm_pindex_t pindex, vm_size_t size) 4789{ 4790 pd_entry_t *pde; 4791 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4792 vm_paddr_t pa, ptepa; 4793 vm_page_t p, pdpg; 4794 int pat_mode; 4795 4796 PG_A = pmap_accessed_bit(pmap); 4797 PG_M = pmap_modified_bit(pmap); 4798 PG_V = pmap_valid_bit(pmap); 4799 PG_RW = pmap_rw_bit(pmap); 4800 4801 VM_OBJECT_ASSERT_WLOCKED(object); 4802 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4803 ("pmap_object_init_pt: non-device object")); 4804 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 4805 if (!pmap_ps_enabled(pmap)) 4806 return; 4807 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4808 return; 4809 p = vm_page_lookup(object, pindex); 4810 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4811 ("pmap_object_init_pt: invalid page %p", p)); 4812 pat_mode = p->md.pat_mode; 4813 4814 /* 4815 * Abort the mapping if the first page is not physically 4816 * aligned to a 2MB page boundary. 4817 */ 4818 ptepa = VM_PAGE_TO_PHYS(p); 4819 if (ptepa & (NBPDR - 1)) 4820 return; 4821 4822 /* 4823 * Skip the first page. Abort the mapping if the rest of 4824 * the pages are not physically contiguous or have differing 4825 * memory attributes. 4826 */ 4827 p = TAILQ_NEXT(p, listq); 4828 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4829 pa += PAGE_SIZE) { 4830 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4831 ("pmap_object_init_pt: invalid page %p", p)); 4832 if (pa != VM_PAGE_TO_PHYS(p) || 4833 pat_mode != p->md.pat_mode) 4834 return; 4835 p = TAILQ_NEXT(p, listq); 4836 } 4837 4838 /* 4839 * Map using 2MB pages. Since "ptepa" is 2M aligned and 4840 * "size" is a multiple of 2M, adding the PAT setting to "pa" 4841 * will not affect the termination of this loop. 
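 * The PAT, PCD, and PWT bits that pmap_cache_bits() returns for a
 * 2MB PDE all lie below bit 21, so OR'ing them into "pa" perturbs it
 * by less than NBPDR and can never carry it past a 2MB-aligned loop
 * bound.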
4842 */ 4843 PMAP_LOCK(pmap); 4844 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 4845 pa < ptepa + size; pa += NBPDR) { 4846 pdpg = pmap_allocpde(pmap, addr, NULL); 4847 if (pdpg == NULL) { 4848 /* 4849 * The creation of mappings below is only an 4850 * optimization. If a page directory page 4851 * cannot be allocated without blocking, 4852 * continue on to the next mapping rather than 4853 * blocking. 4854 */ 4855 addr += NBPDR; 4856 continue; 4857 } 4858 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4859 pde = &pde[pmap_pde_index(addr)]; 4860 if ((*pde & PG_V) == 0) { 4861 pde_store(pde, pa | PG_PS | PG_M | PG_A | 4862 PG_U | PG_RW | PG_V); 4863 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4864 atomic_add_long(&pmap_pde_mappings, 1); 4865 } else { 4866 /* Continue on if the PDE is already valid. */ 4867 pdpg->wire_count--; 4868 KASSERT(pdpg->wire_count > 0, 4869 ("pmap_object_init_pt: missing reference " 4870 "to page directory page, va: 0x%lx", addr)); 4871 } 4872 addr += NBPDR; 4873 } 4874 PMAP_UNLOCK(pmap); 4875 } 4876} 4877 4878/* 4879 * Clear the wired attribute from the mappings for the specified range of 4880 * addresses in the given pmap. Every valid mapping within that range 4881 * must have the wired attribute set. In contrast, invalid mappings 4882 * cannot have the wired attribute set, so they are ignored. 4883 * 4884 * The wired attribute of the page table entry is not a hardware 4885 * feature, so there is no need to invalidate any TLB entries. 4886 * Since pmap_demote_pde() for the wired entry must never fail, 4887 * pmap_delayed_invl_started()/finished() calls around the 4888 * function are not needed. 4889 */ 4890void 4891pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4892{ 4893 vm_offset_t va_next; 4894 pml4_entry_t *pml4e; 4895 pdp_entry_t *pdpe; 4896 pd_entry_t *pde; 4897 pt_entry_t *pte, PG_V; 4898 4899 PG_V = pmap_valid_bit(pmap); 4900 PMAP_LOCK(pmap); 4901 for (; sva < eva; sva = va_next) { 4902 pml4e = pmap_pml4e(pmap, sva); 4903 if ((*pml4e & PG_V) == 0) { 4904 va_next = (sva + NBPML4) & ~PML4MASK; 4905 if (va_next < sva) 4906 va_next = eva; 4907 continue; 4908 } 4909 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4910 if ((*pdpe & PG_V) == 0) { 4911 va_next = (sva + NBPDP) & ~PDPMASK; 4912 if (va_next < sva) 4913 va_next = eva; 4914 continue; 4915 } 4916 va_next = (sva + NBPDR) & ~PDRMASK; 4917 if (va_next < sva) 4918 va_next = eva; 4919 pde = pmap_pdpe_to_pde(pdpe, sva); 4920 if ((*pde & PG_V) == 0) 4921 continue; 4922 if ((*pde & PG_PS) != 0) { 4923 if ((*pde & PG_W) == 0) 4924 panic("pmap_unwire: pde %#jx is missing PG_W", 4925 (uintmax_t)*pde); 4926 4927 /* 4928 * Are we unwiring the entire large page? If not, 4929 * demote the mapping and fall through. 4930 */ 4931 if (sva + NBPDR == va_next && eva >= va_next) { 4932 atomic_clear_long(pde, PG_W); 4933 pmap->pm_stats.wired_count -= NBPDR / 4934 PAGE_SIZE; 4935 continue; 4936 } else if (!pmap_demote_pde(pmap, pde, sva)) 4937 panic("pmap_unwire: demotion failed"); 4938 } 4939 if (va_next > eva) 4940 va_next = eva; 4941 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4942 sva += PAGE_SIZE) { 4943 if ((*pte & PG_V) == 0) 4944 continue; 4945 if ((*pte & PG_W) == 0) 4946 panic("pmap_unwire: pte %#jx is missing PG_W", 4947 (uintmax_t)*pte); 4948 4949 /* 4950 * PG_W must be cleared atomically. Although the pmap 4951 * lock synchronizes access to PG_W, another processor 4952 * could be setting PG_M and/or PG_A concurrently. 
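 * A plain load/modify/store sequence could read the PTE, race with
 * the MMU setting PG_M or PG_A in it, and then write back the stale
 * value, silently discarding the hardware's update.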
4953 */ 4954 atomic_clear_long(pte, PG_W); 4955 pmap->pm_stats.wired_count--; 4956 } 4957 } 4958 PMAP_UNLOCK(pmap); 4959} 4960 4961/* 4962 * Copy the range specified by src_addr/len 4963 * from the source map to the range dst_addr/len 4964 * in the destination map. 4965 * 4966 * This routine is only advisory and need not do anything. 4967 */ 4968 4969void 4970pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4971 vm_offset_t src_addr) 4972{ 4973 struct rwlock *lock; 4974 struct spglist free; 4975 vm_offset_t addr; 4976 vm_offset_t end_addr = src_addr + len; 4977 vm_offset_t va_next; 4978 pt_entry_t PG_A, PG_M, PG_V; 4979 4980 if (dst_addr != src_addr) 4981 return; 4982 4983 if (dst_pmap->pm_type != src_pmap->pm_type) 4984 return; 4985 4986 /* 4987 * EPT page table entries that require emulation of A/D bits are 4988 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 4989 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 4990 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT 4991 * implementations flag an EPT misconfiguration for exec-only 4992 * mappings we skip this function entirely for emulated pmaps. 4993 */ 4994 if (pmap_emulate_ad_bits(dst_pmap)) 4995 return; 4996 4997 lock = NULL; 4998 if (dst_pmap < src_pmap) { 4999 PMAP_LOCK(dst_pmap); 5000 PMAP_LOCK(src_pmap); 5001 } else { 5002 PMAP_LOCK(src_pmap); 5003 PMAP_LOCK(dst_pmap); 5004 } 5005 5006 PG_A = pmap_accessed_bit(dst_pmap); 5007 PG_M = pmap_modified_bit(dst_pmap); 5008 PG_V = pmap_valid_bit(dst_pmap); 5009 5010 for (addr = src_addr; addr < end_addr; addr = va_next) { 5011 pt_entry_t *src_pte, *dst_pte; 5012 vm_page_t dstmpde, dstmpte, srcmpte; 5013 pml4_entry_t *pml4e; 5014 pdp_entry_t *pdpe; 5015 pd_entry_t srcptepaddr, *pde; 5016 5017 KASSERT(addr < UPT_MIN_ADDRESS, 5018 ("pmap_copy: invalid to pmap_copy page tables")); 5019 5020 pml4e = pmap_pml4e(src_pmap, addr); 5021 if ((*pml4e & PG_V) == 0) { 5022 va_next = (addr + NBPML4) & ~PML4MASK; 5023 if (va_next < addr) 5024 va_next = end_addr; 5025 continue; 5026 } 5027 5028 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 5029 if ((*pdpe & PG_V) == 0) { 5030 va_next = (addr + NBPDP) & ~PDPMASK; 5031 if (va_next < addr) 5032 va_next = end_addr; 5033 continue; 5034 } 5035 5036 va_next = (addr + NBPDR) & ~PDRMASK; 5037 if (va_next < addr) 5038 va_next = end_addr; 5039 5040 pde = pmap_pdpe_to_pde(pdpe, addr); 5041 srcptepaddr = *pde; 5042 if (srcptepaddr == 0) 5043 continue; 5044 5045 if (srcptepaddr & PG_PS) { 5046 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 5047 continue; 5048 dstmpde = pmap_allocpde(dst_pmap, addr, NULL); 5049 if (dstmpde == NULL) 5050 break; 5051 pde = (pd_entry_t *) 5052 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 5053 pde = &pde[pmap_pde_index(addr)]; 5054 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 5055 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 5056 PG_PS_FRAME, &lock))) { 5057 *pde = srcptepaddr & ~PG_W; 5058 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 5059 atomic_add_long(&pmap_pde_mappings, 1); 5060 } else 5061 dstmpde->wire_count--; 5062 continue; 5063 } 5064 5065 srcptepaddr &= PG_FRAME; 5066 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 5067 KASSERT(srcmpte->wire_count > 0, 5068 ("pmap_copy: source page table page is unused")); 5069 5070 if (va_next > end_addr) 5071 va_next = end_addr; 5072 5073 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 5074 src_pte = &src_pte[pmap_pte_index(addr)]; 5075 dstmpte = NULL; 5076 while (addr < va_next) { 5077 pt_entry_t ptetemp; 5078 
ptetemp = *src_pte; 5079 /* 5080 * we only virtual copy managed pages 5081 */ 5082 if ((ptetemp & PG_MANAGED) != 0) { 5083 if (dstmpte != NULL && 5084 dstmpte->pindex == pmap_pde_pindex(addr)) 5085 dstmpte->wire_count++; 5086 else if ((dstmpte = pmap_allocpte(dst_pmap, 5087 addr, NULL)) == NULL) 5088 goto out; 5089 dst_pte = (pt_entry_t *) 5090 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 5091 dst_pte = &dst_pte[pmap_pte_index(addr)]; 5092 if (*dst_pte == 0 && 5093 pmap_try_insert_pv_entry(dst_pmap, addr, 5094 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 5095 &lock)) { 5096 /* 5097 * Clear the wired, modified, and 5098 * accessed (referenced) bits 5099 * during the copy. 5100 */ 5101 *dst_pte = ptetemp & ~(PG_W | PG_M | 5102 PG_A); 5103 pmap_resident_count_inc(dst_pmap, 1); 5104 } else { 5105 SLIST_INIT(&free); 5106 if (pmap_unwire_ptp(dst_pmap, addr, 5107 dstmpte, &free)) { 5108 /* 5109 * Although "addr" is not 5110 * mapped, paging-structure 5111 * caches could nonetheless 5112 * have entries that refer to 5113 * the freed page table pages. 5114 * Invalidate those entries. 5115 */ 5116 pmap_invalidate_page(dst_pmap, 5117 addr); 5118 pmap_free_zero_pages(&free); 5119 } 5120 goto out; 5121 } 5122 if (dstmpte->wire_count >= srcmpte->wire_count) 5123 break; 5124 } 5125 addr += PAGE_SIZE; 5126 src_pte++; 5127 } 5128 } 5129out: 5130 if (lock != NULL) 5131 rw_wunlock(lock); 5132 PMAP_UNLOCK(src_pmap); 5133 PMAP_UNLOCK(dst_pmap); 5134} 5135 5136/* 5137 * pmap_zero_page zeros the specified hardware page by mapping 5138 * the page into KVM and using bzero to clear its contents. 5139 */ 5140void 5141pmap_zero_page(vm_page_t m) 5142{ 5143 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5144 5145 pagezero((void *)va); 5146} 5147 5148/* 5149 * pmap_zero_page_area zeros the specified hardware page by mapping 5150 * the page into KVM and using bzero to clear its contents. 5151 * 5152 * off and size may not cover an area beyond a single hardware page. 5153 */ 5154void 5155pmap_zero_page_area(vm_page_t m, int off, int size) 5156{ 5157 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5158 5159 if (off == 0 && size == PAGE_SIZE) 5160 pagezero((void *)va); 5161 else 5162 bzero((char *)va + off, size); 5163} 5164 5165/* 5166 * pmap_zero_page_idle zeros the specified hardware page by mapping 5167 * the page into KVM and using bzero to clear its contents. This 5168 * is intended to be called from the vm_pagezero process only and 5169 * outside of Giant. 5170 */ 5171void 5172pmap_zero_page_idle(vm_page_t m) 5173{ 5174 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5175 5176 pagezero((void *)va); 5177} 5178 5179/* 5180 * pmap_copy_page copies the specified (machine independent) 5181 * page by mapping the page into virtual memory and using 5182 * bcopy to copy the page, one machine dependent page at a 5183 * time. 
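 * On amd64 no transient mapping is actually created: every physical
 * page is permanently addressable through the direct map, so
 * pagecopy() below simply operates on PHYS_TO_DMAP() addresses.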
5184 */ 5185void 5186pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 5187{ 5188 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 5189 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 5190 5191 pagecopy((void *)src, (void *)dst); 5192} 5193 5194int unmapped_buf_allowed = 1; 5195 5196void 5197pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5198 vm_offset_t b_offset, int xfersize) 5199{ 5200 void *a_cp, *b_cp; 5201 vm_page_t pages[2]; 5202 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 5203 int cnt; 5204 boolean_t mapped; 5205 5206 while (xfersize > 0) { 5207 a_pg_offset = a_offset & PAGE_MASK; 5208 pages[0] = ma[a_offset >> PAGE_SHIFT]; 5209 b_pg_offset = b_offset & PAGE_MASK; 5210 pages[1] = mb[b_offset >> PAGE_SHIFT]; 5211 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5212 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5213 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 5214 a_cp = (char *)vaddr[0] + a_pg_offset; 5215 b_cp = (char *)vaddr[1] + b_pg_offset; 5216 bcopy(a_cp, b_cp, cnt); 5217 if (__predict_false(mapped)) 5218 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 5219 a_offset += cnt; 5220 b_offset += cnt; 5221 xfersize -= cnt; 5222 } 5223} 5224 5225/* 5226 * Returns true if the pmap's pv is one of the first 5227 * 16 pvs linked to from this page. This count may 5228 * be changed upwards or downwards in the future; it 5229 * is only necessary that true be returned for a small 5230 * subset of pmaps for proper page aging. 5231 */ 5232boolean_t 5233pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5234{ 5235 struct md_page *pvh; 5236 struct rwlock *lock; 5237 pv_entry_t pv; 5238 int loops = 0; 5239 boolean_t rv; 5240 5241 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5242 ("pmap_page_exists_quick: page %p is not managed", m)); 5243 rv = FALSE; 5244 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5245 rw_rlock(lock); 5246 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5247 if (PV_PMAP(pv) == pmap) { 5248 rv = TRUE; 5249 break; 5250 } 5251 loops++; 5252 if (loops >= 16) 5253 break; 5254 } 5255 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5256 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5257 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5258 if (PV_PMAP(pv) == pmap) { 5259 rv = TRUE; 5260 break; 5261 } 5262 loops++; 5263 if (loops >= 16) 5264 break; 5265 } 5266 } 5267 rw_runlock(lock); 5268 return (rv); 5269} 5270 5271/* 5272 * pmap_page_wired_mappings: 5273 * 5274 * Return the number of managed mappings to the given physical page 5275 * that are wired. 
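 *
 * The body also illustrates the PV list locking protocol used
 * throughout this file: when PMAP_TRYLOCK() fails, record the list's
 * generation count, drop the PV list lock, block on the pmap lock,
 * retake the list lock, and restart if the generation has changed:
 *
 *	md_gen = m->md.pv_gen;
 *	rw_runlock(lock);
 *	PMAP_LOCK(pmap);
 *	rw_rlock(lock);
 *	if (md_gen != m->md.pv_gen) {
 *		PMAP_UNLOCK(pmap);
 *		goto restart;
 *	}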
5276 */ 5277int 5278pmap_page_wired_mappings(vm_page_t m) 5279{ 5280 struct rwlock *lock; 5281 struct md_page *pvh; 5282 pmap_t pmap; 5283 pt_entry_t *pte; 5284 pv_entry_t pv; 5285 int count, md_gen, pvh_gen; 5286 5287 if ((m->oflags & VPO_UNMANAGED) != 0) 5288 return (0); 5289 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5290 rw_rlock(lock); 5291restart: 5292 count = 0; 5293 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5294 pmap = PV_PMAP(pv); 5295 if (!PMAP_TRYLOCK(pmap)) { 5296 md_gen = m->md.pv_gen; 5297 rw_runlock(lock); 5298 PMAP_LOCK(pmap); 5299 rw_rlock(lock); 5300 if (md_gen != m->md.pv_gen) { 5301 PMAP_UNLOCK(pmap); 5302 goto restart; 5303 } 5304 } 5305 pte = pmap_pte(pmap, pv->pv_va); 5306 if ((*pte & PG_W) != 0) 5307 count++; 5308 PMAP_UNLOCK(pmap); 5309 } 5310 if ((m->flags & PG_FICTITIOUS) == 0) { 5311 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5312 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5313 pmap = PV_PMAP(pv); 5314 if (!PMAP_TRYLOCK(pmap)) { 5315 md_gen = m->md.pv_gen; 5316 pvh_gen = pvh->pv_gen; 5317 rw_runlock(lock); 5318 PMAP_LOCK(pmap); 5319 rw_rlock(lock); 5320 if (md_gen != m->md.pv_gen || 5321 pvh_gen != pvh->pv_gen) { 5322 PMAP_UNLOCK(pmap); 5323 goto restart; 5324 } 5325 } 5326 pte = pmap_pde(pmap, pv->pv_va); 5327 if ((*pte & PG_W) != 0) 5328 count++; 5329 PMAP_UNLOCK(pmap); 5330 } 5331 } 5332 rw_runlock(lock); 5333 return (count); 5334} 5335 5336/* 5337 * Returns TRUE if the given page is mapped individually or as part of 5338 * a 2mpage. Otherwise, returns FALSE. 5339 */ 5340boolean_t 5341pmap_page_is_mapped(vm_page_t m) 5342{ 5343 struct rwlock *lock; 5344 boolean_t rv; 5345 5346 if ((m->oflags & VPO_UNMANAGED) != 0) 5347 return (FALSE); 5348 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5349 rw_rlock(lock); 5350 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5351 ((m->flags & PG_FICTITIOUS) == 0 && 5352 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5353 rw_runlock(lock); 5354 return (rv); 5355} 5356 5357/* 5358 * Destroy all managed, non-wired mappings in the given user-space 5359 * pmap. This pmap cannot be active on any processor besides the 5360 * caller. 5361 * 5362 * This function cannot be applied to the kernel pmap. Moreover, it 5363 * is not intended for general use. It is only to be used during 5364 * process termination. Consequently, it can be implemented in ways 5365 * that make it faster than pmap_remove(). First, it can more quickly 5366 * destroy mappings by iterating over the pmap's collection of PV 5367 * entries, rather than searching the page table. Second, it doesn't 5368 * have to test and clear the page table entries atomically, because 5369 * no processor is currently accessing the user address space. In 5370 * particular, a page table entry's dirty bit won't change state once 5371 * this function starts. 5372 */ 5373void 5374pmap_remove_pages(pmap_t pmap) 5375{ 5376 pd_entry_t ptepde; 5377 pt_entry_t *pte, tpte; 5378 pt_entry_t PG_M, PG_RW, PG_V; 5379 struct spglist free; 5380 vm_page_t m, mpte, mt; 5381 pv_entry_t pv; 5382 struct md_page *pvh; 5383 struct pv_chunk *pc, *npc; 5384 struct rwlock *lock; 5385 int64_t bit; 5386 uint64_t inuse, bitmask; 5387 int allfree, field, freed, idx; 5388 boolean_t superpage; 5389 vm_paddr_t pa; 5390 5391 /* 5392 * Assert that the given pmap is only active on the current 5393 * CPU. Unfortunately, we cannot block another CPU from 5394 * activating the pmap while this function is executing. 
5395 */ 5396 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 5397#ifdef INVARIANTS 5398 { 5399 cpuset_t other_cpus; 5400 5401 other_cpus = all_cpus; 5402 critical_enter(); 5403 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 5404 CPU_AND(&other_cpus, &pmap->pm_active); 5405 critical_exit(); 5406 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 5407 } 5408#endif 5409 5410 lock = NULL; 5411 PG_M = pmap_modified_bit(pmap); 5412 PG_V = pmap_valid_bit(pmap); 5413 PG_RW = pmap_rw_bit(pmap); 5414 5415 SLIST_INIT(&free); 5416 PMAP_LOCK(pmap); 5417 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5418 allfree = 1; 5419 freed = 0; 5420 for (field = 0; field < _NPCM; field++) { 5421 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5422 while (inuse != 0) { 5423 bit = bsfq(inuse); 5424 bitmask = 1UL << bit; 5425 idx = field * 64 + bit; 5426 pv = &pc->pc_pventry[idx]; 5427 inuse &= ~bitmask; 5428 5429 pte = pmap_pdpe(pmap, pv->pv_va); 5430 ptepde = *pte; 5431 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 5432 tpte = *pte; 5433 if ((tpte & (PG_PS | PG_V)) == PG_V) { 5434 superpage = FALSE; 5435 ptepde = tpte; 5436 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 5437 PG_FRAME); 5438 pte = &pte[pmap_pte_index(pv->pv_va)]; 5439 tpte = *pte; 5440 } else { 5441 /* 5442 * Keep track whether 'tpte' is a 5443 * superpage explicitly instead of 5444 * relying on PG_PS being set. 5445 * 5446 * This is because PG_PS is numerically 5447 * identical to PG_PTE_PAT and thus a 5448 * regular page could be mistaken for 5449 * a superpage. 5450 */ 5451 superpage = TRUE; 5452 } 5453 5454 if ((tpte & PG_V) == 0) { 5455 panic("bad pte va %lx pte %lx", 5456 pv->pv_va, tpte); 5457 } 5458 5459/* 5460 * We cannot remove wired pages from a process' mapping at this time 5461 */ 5462 if (tpte & PG_W) { 5463 allfree = 0; 5464 continue; 5465 } 5466 5467 if (superpage) 5468 pa = tpte & PG_PS_FRAME; 5469 else 5470 pa = tpte & PG_FRAME; 5471 5472 m = PHYS_TO_VM_PAGE(pa); 5473 KASSERT(m->phys_addr == pa, 5474 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5475 m, (uintmax_t)m->phys_addr, 5476 (uintmax_t)tpte)); 5477 5478 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5479 m < &vm_page_array[vm_page_array_size], 5480 ("pmap_remove_pages: bad tpte %#jx", 5481 (uintmax_t)tpte)); 5482 5483 pte_clear(pte); 5484 5485 /* 5486 * Update the vm_page_t clean/reference bits. 
5487 */ 5488 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5489 if (superpage) { 5490 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5491 vm_page_dirty(mt); 5492 } else 5493 vm_page_dirty(m); 5494 } 5495 5496 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5497 5498 /* Mark free */ 5499 pc->pc_map[field] |= bitmask; 5500 if (superpage) { 5501 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 5502 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5503 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5504 pvh->pv_gen++; 5505 if (TAILQ_EMPTY(&pvh->pv_list)) { 5506 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5507 if ((mt->aflags & PGA_WRITEABLE) != 0 && 5508 TAILQ_EMPTY(&mt->md.pv_list)) 5509 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5510 } 5511 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 5512 if (mpte != NULL) { 5513 pmap_remove_pt_page(pmap, mpte); 5514 pmap_resident_count_dec(pmap, 1); 5515 KASSERT(mpte->wire_count == NPTEPG, 5516 ("pmap_remove_pages: pte page wire count error")); 5517 mpte->wire_count = 0; 5518 pmap_add_delayed_free_list(mpte, &free, FALSE); 5519 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 5520 } 5521 } else { 5522 pmap_resident_count_dec(pmap, 1); 5523 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5524 m->md.pv_gen++; 5525 if ((m->aflags & PGA_WRITEABLE) != 0 && 5526 TAILQ_EMPTY(&m->md.pv_list) && 5527 (m->flags & PG_FICTITIOUS) == 0) { 5528 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5529 if (TAILQ_EMPTY(&pvh->pv_list)) 5530 vm_page_aflag_clear(m, PGA_WRITEABLE); 5531 } 5532 } 5533 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 5534 freed++; 5535 } 5536 } 5537 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5538 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5539 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5540 if (allfree) { 5541 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5542 free_pv_chunk(pc); 5543 } 5544 } 5545 if (lock != NULL) 5546 rw_wunlock(lock); 5547 pmap_invalidate_all(pmap); 5548 PMAP_UNLOCK(pmap); 5549 pmap_free_zero_pages(&free); 5550} 5551 5552static boolean_t 5553pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5554{ 5555 struct rwlock *lock; 5556 pv_entry_t pv; 5557 struct md_page *pvh; 5558 pt_entry_t *pte, mask; 5559 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 5560 pmap_t pmap; 5561 int md_gen, pvh_gen; 5562 boolean_t rv; 5563 5564 rv = FALSE; 5565 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5566 rw_rlock(lock); 5567restart: 5568 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5569 pmap = PV_PMAP(pv); 5570 if (!PMAP_TRYLOCK(pmap)) { 5571 md_gen = m->md.pv_gen; 5572 rw_runlock(lock); 5573 PMAP_LOCK(pmap); 5574 rw_rlock(lock); 5575 if (md_gen != m->md.pv_gen) { 5576 PMAP_UNLOCK(pmap); 5577 goto restart; 5578 } 5579 } 5580 pte = pmap_pte(pmap, pv->pv_va); 5581 mask = 0; 5582 if (modified) { 5583 PG_M = pmap_modified_bit(pmap); 5584 PG_RW = pmap_rw_bit(pmap); 5585 mask |= PG_RW | PG_M; 5586 } 5587 if (accessed) { 5588 PG_A = pmap_accessed_bit(pmap); 5589 PG_V = pmap_valid_bit(pmap); 5590 mask |= PG_V | PG_A; 5591 } 5592 rv = (*pte & mask) == mask; 5593 PMAP_UNLOCK(pmap); 5594 if (rv) 5595 goto out; 5596 } 5597 if ((m->flags & PG_FICTITIOUS) == 0) { 5598 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5599 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5600 pmap = PV_PMAP(pv); 5601 if (!PMAP_TRYLOCK(pmap)) { 5602 md_gen = m->md.pv_gen; 5603 pvh_gen = pvh->pv_gen; 5604 rw_runlock(lock); 5605 PMAP_LOCK(pmap); 5606 rw_rlock(lock); 5607 if (md_gen != m->md.pv_gen || 5608 pvh_gen != pvh->pv_gen) { 5609 PMAP_UNLOCK(pmap); 5610 goto restart; 5611 } 5612 } 5613 pte = pmap_pde(pmap, 
pv->pv_va); 5614 mask = 0; 5615 if (modified) { 5616 PG_M = pmap_modified_bit(pmap); 5617 PG_RW = pmap_rw_bit(pmap); 5618 mask |= PG_RW | PG_M; 5619 } 5620 if (accessed) { 5621 PG_A = pmap_accessed_bit(pmap); 5622 PG_V = pmap_valid_bit(pmap); 5623 mask |= PG_V | PG_A; 5624 } 5625 rv = (*pte & mask) == mask; 5626 PMAP_UNLOCK(pmap); 5627 if (rv) 5628 goto out; 5629 } 5630 } 5631out: 5632 rw_runlock(lock); 5633 return (rv); 5634} 5635 5636/* 5637 * pmap_is_modified: 5638 * 5639 * Return whether or not the specified physical page was modified 5640 * in any physical maps. 5641 */ 5642boolean_t 5643pmap_is_modified(vm_page_t m) 5644{ 5645 5646 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5647 ("pmap_is_modified: page %p is not managed", m)); 5648 5649 /* 5650 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5651 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 5652 * is clear, no PTEs can have PG_M set. 5653 */ 5654 VM_OBJECT_ASSERT_WLOCKED(m->object); 5655 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5656 return (FALSE); 5657 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5658} 5659 5660/* 5661 * pmap_is_prefaultable: 5662 * 5663 * Return whether or not the specified virtual address is eligible 5664 * for prefault. 5665 */ 5666boolean_t 5667pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5668{ 5669 pd_entry_t *pde; 5670 pt_entry_t *pte, PG_V; 5671 boolean_t rv; 5672 5673 PG_V = pmap_valid_bit(pmap); 5674 rv = FALSE; 5675 PMAP_LOCK(pmap); 5676 pde = pmap_pde(pmap, addr); 5677 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 5678 pte = pmap_pde_to_pte(pde, addr); 5679 rv = (*pte & PG_V) == 0; 5680 } 5681 PMAP_UNLOCK(pmap); 5682 return (rv); 5683} 5684 5685/* 5686 * pmap_is_referenced: 5687 * 5688 * Return whether or not the specified physical page was referenced 5689 * in any physical maps. 5690 */ 5691boolean_t 5692pmap_is_referenced(vm_page_t m) 5693{ 5694 5695 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5696 ("pmap_is_referenced: page %p is not managed", m)); 5697 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5698} 5699 5700/* 5701 * Clear the write and modified bits in each of the given page's mappings. 5702 */ 5703void 5704pmap_remove_write(vm_page_t m) 5705{ 5706 struct md_page *pvh; 5707 pmap_t pmap; 5708 struct rwlock *lock; 5709 pv_entry_t next_pv, pv; 5710 pd_entry_t *pde; 5711 pt_entry_t oldpte, *pte, PG_M, PG_RW; 5712 vm_offset_t va; 5713 int pvh_gen, md_gen; 5714 5715 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5716 ("pmap_remove_write: page %p is not managed", m)); 5717 5718 /* 5719 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5720 * set by another thread while the object is locked. Thus, 5721 * if PGA_WRITEABLE is clear, no page table entries need updating. 5722 */ 5723 VM_OBJECT_ASSERT_WLOCKED(m->object); 5724 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5725 return; 5726 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5727 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 5728 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5729retry_pv_loop: 5730 rw_wlock(lock); 5731 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5732 pmap = PV_PMAP(pv); 5733 if (!PMAP_TRYLOCK(pmap)) { 5734 pvh_gen = pvh->pv_gen; 5735 rw_wunlock(lock); 5736 PMAP_LOCK(pmap); 5737 rw_wlock(lock); 5738 if (pvh_gen != pvh->pv_gen) { 5739 PMAP_UNLOCK(pmap); 5740 rw_wunlock(lock); 5741 goto retry_pv_loop; 5742 } 5743 } 5744 PG_RW = pmap_rw_bit(pmap); 5745 va = pv->pv_va; 5746 pde = pmap_pde(pmap, va); 5747 if ((*pde & PG_RW) != 0) 5748 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 5749 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5750 ("inconsistent pv lock %p %p for page %p", 5751 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5752 PMAP_UNLOCK(pmap); 5753 } 5754 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5755 pmap = PV_PMAP(pv); 5756 if (!PMAP_TRYLOCK(pmap)) { 5757 pvh_gen = pvh->pv_gen; 5758 md_gen = m->md.pv_gen; 5759 rw_wunlock(lock); 5760 PMAP_LOCK(pmap); 5761 rw_wlock(lock); 5762 if (pvh_gen != pvh->pv_gen || 5763 md_gen != m->md.pv_gen) { 5764 PMAP_UNLOCK(pmap); 5765 rw_wunlock(lock); 5766 goto retry_pv_loop; 5767 } 5768 } 5769 PG_M = pmap_modified_bit(pmap); 5770 PG_RW = pmap_rw_bit(pmap); 5771 pde = pmap_pde(pmap, pv->pv_va); 5772 KASSERT((*pde & PG_PS) == 0, 5773 ("pmap_remove_write: found a 2mpage in page %p's pv list", 5774 m)); 5775 pte = pmap_pde_to_pte(pde, pv->pv_va); 5776retry: 5777 oldpte = *pte; 5778 if (oldpte & PG_RW) { 5779 if (!atomic_cmpset_long(pte, oldpte, oldpte & 5780 ~(PG_RW | PG_M))) 5781 goto retry; 5782 if ((oldpte & PG_M) != 0) 5783 vm_page_dirty(m); 5784 pmap_invalidate_page(pmap, pv->pv_va); 5785 } 5786 PMAP_UNLOCK(pmap); 5787 } 5788 rw_wunlock(lock); 5789 vm_page_aflag_clear(m, PGA_WRITEABLE); 5790 pmap_delayed_invl_wait(m); 5791} 5792 5793static __inline boolean_t 5794safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 5795{ 5796 5797 if (!pmap_emulate_ad_bits(pmap)) 5798 return (TRUE); 5799 5800 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 5801 5802 /* 5803 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration, 5804 * so we don't let the referenced (aka EPT_PG_READ) bit be cleared 5805 * if the EPT_PG_WRITE bit is set. 5806 */ 5807 if ((pte & EPT_PG_WRITE) != 0) 5808 return (FALSE); 5809 5810 /* 5811 * XWR = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set. 5812 */ 5813 if ((pte & EPT_PG_EXECUTE) == 0 || 5814 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 5815 return (TRUE); 5816 else 5817 return (FALSE); 5818} 5819 5820#define PMAP_TS_REFERENCED_MAX 5 5821 5822/* 5823 * pmap_ts_referenced: 5824 * 5825 * Return a count of reference bits for a page, clearing those bits. 5826 * It is not necessary for every reference bit to be cleared, but it 5827 * is necessary that 0 only be returned when there are truly no 5828 * reference bits set. 5829 * 5830 * XXX: The exact number of bits to check and clear is a matter that 5831 * should be tested and standardized at some point in the future for 5832 * optimal aging of shared pages. 5833 * 5834 * As an optimization, update the page's dirty field if a modified bit is 5835 * found while counting reference bits. This opportunistic update can be 5836 * performed at low cost and can eliminate the need for some future calls 5837 * to pmap_is_modified(). However, since this function stops after 5838 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5839 * dirty pages.
Those dirty pages will only be detected by a future call 5840 * to pmap_is_modified(). 5841 * 5842 * A DI block is not needed within this function, because 5843 * invalidations are performed before the PV list lock is 5844 * released. 5845 */ 5846int 5847pmap_ts_referenced(vm_page_t m) 5848{ 5849 struct md_page *pvh; 5850 pv_entry_t pv, pvf; 5851 pmap_t pmap; 5852 struct rwlock *lock; 5853 pd_entry_t oldpde, *pde; 5854 pt_entry_t *pte, PG_A, PG_M, PG_RW; 5855 vm_offset_t va; 5856 vm_paddr_t pa; 5857 int cleared, md_gen, not_cleared, pvh_gen; 5858 struct spglist free; 5859 boolean_t demoted; 5860 5861 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5862 ("pmap_ts_referenced: page %p is not managed", m)); 5863 SLIST_INIT(&free); 5864 cleared = 0; 5865 pa = VM_PAGE_TO_PHYS(m); 5866 lock = PHYS_TO_PV_LIST_LOCK(pa); 5867 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 5868 rw_wlock(lock); 5869retry: 5870 not_cleared = 0; 5871 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5872 goto small_mappings; 5873 pv = pvf; 5874 do { 5875 if (pvf == NULL) 5876 pvf = pv; 5877 pmap = PV_PMAP(pv); 5878 if (!PMAP_TRYLOCK(pmap)) { 5879 pvh_gen = pvh->pv_gen; 5880 rw_wunlock(lock); 5881 PMAP_LOCK(pmap); 5882 rw_wlock(lock); 5883 if (pvh_gen != pvh->pv_gen) { 5884 PMAP_UNLOCK(pmap); 5885 goto retry; 5886 } 5887 } 5888 PG_A = pmap_accessed_bit(pmap); 5889 PG_M = pmap_modified_bit(pmap); 5890 PG_RW = pmap_rw_bit(pmap); 5891 va = pv->pv_va; 5892 pde = pmap_pde(pmap, pv->pv_va); 5893 oldpde = *pde; 5894 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5895 /* 5896 * Although "oldpde" is mapping a 2MB page, because 5897 * this function is called at a 4KB page granularity, 5898 * we only update the 4KB page under test. 5899 */ 5900 vm_page_dirty(m); 5901 } 5902 if ((*pde & PG_A) != 0) { 5903 /* 5904 * Since this reference bit is shared by 512 4KB 5905 * pages, it should not be cleared every time it is 5906 * tested. Apply a simple "hash" function on the 5907 * physical page number, the virtual superpage number, 5908 * and the pmap address to select one 4KB page out of 5909 * the 512 on which testing the reference bit will 5910 * result in clearing that reference bit. This 5911 * function is designed to avoid the selection of the 5912 * same 4KB page for every 2MB page mapping. 5913 * 5914 * On demotion, a mapping that hasn't been referenced 5915 * is simply destroyed. To avoid the possibility of a 5916 * subsequent page fault on a demoted wired mapping, 5917 * always leave its reference bit set. Moreover, 5918 * since the superpage is wired, the current state of 5919 * its reference bit won't affect page replacement. 5920 */ 5921 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 5922 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 5923 (*pde & PG_W) == 0) { 5924 if (safe_to_clear_referenced(pmap, oldpde)) { 5925 atomic_clear_long(pde, PG_A); 5926 pmap_invalidate_page(pmap, pv->pv_va); 5927 demoted = FALSE; 5928 } else if (pmap_demote_pde_locked(pmap, pde, 5929 pv->pv_va, &lock)) { 5930 /* 5931 * Remove the mapping to a single page 5932 * so that a subsequent access may 5933 * repromote. Since the underlying 5934 * page table page is fully populated, 5935 * this removal never frees a page 5936 * table page. 
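 * (Editor's note: a page table page created by demotion starts
 * fully populated, with its wire_count accounting for all 512
 * valid PTEs, so removing this single PTE cannot drop the count
 * to zero and free the page.)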
5937 */ 5938 demoted = TRUE; 5939 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5940 PG_PS_FRAME); 5941 pte = pmap_pde_to_pte(pde, va); 5942 pmap_remove_pte(pmap, pte, va, *pde, 5943 NULL, &lock); 5944 pmap_invalidate_page(pmap, va); 5945 } else 5946 demoted = TRUE; 5947 5948 if (demoted) { 5949 /* 5950 * The superpage mapping was removed 5951 * entirely and therefore 'pv' is no 5952 * longer valid. 5953 */ 5954 if (pvf == pv) 5955 pvf = NULL; 5956 pv = NULL; 5957 } 5958 cleared++; 5959 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5960 ("inconsistent pv lock %p %p for page %p", 5961 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5962 } else 5963 not_cleared++; 5964 } 5965 PMAP_UNLOCK(pmap); 5966 /* Rotate the PV list if it has more than one entry. */ 5967 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5968 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5969 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5970 pvh->pv_gen++; 5971 } 5972 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 5973 goto out; 5974 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5975small_mappings: 5976 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5977 goto out; 5978 pv = pvf; 5979 do { 5980 if (pvf == NULL) 5981 pvf = pv; 5982 pmap = PV_PMAP(pv); 5983 if (!PMAP_TRYLOCK(pmap)) { 5984 pvh_gen = pvh->pv_gen; 5985 md_gen = m->md.pv_gen; 5986 rw_wunlock(lock); 5987 PMAP_LOCK(pmap); 5988 rw_wlock(lock); 5989 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5990 PMAP_UNLOCK(pmap); 5991 goto retry; 5992 } 5993 } 5994 PG_A = pmap_accessed_bit(pmap); 5995 PG_M = pmap_modified_bit(pmap); 5996 PG_RW = pmap_rw_bit(pmap); 5997 pde = pmap_pde(pmap, pv->pv_va); 5998 KASSERT((*pde & PG_PS) == 0, 5999 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 6000 m)); 6001 pte = pmap_pde_to_pte(pde, pv->pv_va); 6002 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6003 vm_page_dirty(m); 6004 if ((*pte & PG_A) != 0) { 6005 if (safe_to_clear_referenced(pmap, *pte)) { 6006 atomic_clear_long(pte, PG_A); 6007 pmap_invalidate_page(pmap, pv->pv_va); 6008 cleared++; 6009 } else if ((*pte & PG_W) == 0) { 6010 /* 6011 * Wired pages cannot be paged out so 6012 * doing accessed bit emulation for 6013 * them is wasted effort. We do the 6014 * hard work for unwired pages only. 6015 */ 6016 pmap_remove_pte(pmap, pte, pv->pv_va, 6017 *pde, &free, &lock); 6018 pmap_invalidate_page(pmap, pv->pv_va); 6019 cleared++; 6020 if (pvf == pv) 6021 pvf = NULL; 6022 pv = NULL; 6023 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6024 ("inconsistent pv lock %p %p for page %p", 6025 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6026 } else 6027 not_cleared++; 6028 } 6029 PMAP_UNLOCK(pmap); 6030 /* Rotate the PV list if it has more than one entry. */ 6031 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 6032 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6033 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 6034 m->md.pv_gen++; 6035 } 6036 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 6037 not_cleared < PMAP_TS_REFERENCED_MAX); 6038out: 6039 rw_wunlock(lock); 6040 pmap_free_zero_pages(&free); 6041 return (cleared + not_cleared); 6042} 6043 6044/* 6045 * Apply the given advice to the specified range of addresses within the 6046 * given pmap. Depending on the advice, clear the referenced and/or 6047 * modified flags in each mapping and set the mapped page's dirty field. 
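 *
 * Editor's sketch, not part of the original comment: for
 * MADV_DONTNEED the page's dirty field is set before PG_M is
 * cleared, so a prior modification is not lost; for MADV_FREE,
 * PG_M is simply cleared and the modification may be discarded.
 * Under the usual FreeBSD call structure this function is expected
 * to be reached roughly as:
 *
 *	madvise(2) -> vm_map_madvise() -> pmap_advise(pmap, sva, eva,
 *	    MADV_FREE)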
6048 */ 6049void 6050pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 6051{ 6052 struct rwlock *lock; 6053 pml4_entry_t *pml4e; 6054 pdp_entry_t *pdpe; 6055 pd_entry_t oldpde, *pde; 6056 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 6057 vm_offset_t va_next; 6058 vm_page_t m; 6059 boolean_t anychanged; 6060 6061 if (advice != MADV_DONTNEED && advice != MADV_FREE) 6062 return; 6063 6064 /* 6065 * A/D bit emulation requires an alternate code path when clearing 6066 * the modified and accessed bits below. Since this function is 6067 * advisory in nature we skip it entirely for pmaps that require 6068 * A/D bit emulation. 6069 */ 6070 if (pmap_emulate_ad_bits(pmap)) 6071 return; 6072 6073 PG_A = pmap_accessed_bit(pmap); 6074 PG_G = pmap_global_bit(pmap); 6075 PG_M = pmap_modified_bit(pmap); 6076 PG_V = pmap_valid_bit(pmap); 6077 PG_RW = pmap_rw_bit(pmap); 6078 anychanged = FALSE; 6079 pmap_delayed_invl_started(); 6080 PMAP_LOCK(pmap); 6081 for (; sva < eva; sva = va_next) { 6082 pml4e = pmap_pml4e(pmap, sva); 6083 if ((*pml4e & PG_V) == 0) { 6084 va_next = (sva + NBPML4) & ~PML4MASK; 6085 if (va_next < sva) 6086 va_next = eva; 6087 continue; 6088 } 6089 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6090 if ((*pdpe & PG_V) == 0) { 6091 va_next = (sva + NBPDP) & ~PDPMASK; 6092 if (va_next < sva) 6093 va_next = eva; 6094 continue; 6095 } 6096 va_next = (sva + NBPDR) & ~PDRMASK; 6097 if (va_next < sva) 6098 va_next = eva; 6099 pde = pmap_pdpe_to_pde(pdpe, sva); 6100 oldpde = *pde; 6101 if ((oldpde & PG_V) == 0) 6102 continue; 6103 else if ((oldpde & PG_PS) != 0) { 6104 if ((oldpde & PG_MANAGED) == 0) 6105 continue; 6106 lock = NULL; 6107 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 6108 if (lock != NULL) 6109 rw_wunlock(lock); 6110 6111 /* 6112 * The large page mapping was destroyed. 6113 */ 6114 continue; 6115 } 6116 6117 /* 6118 * Unless the page mappings are wired, remove the 6119 * mapping to a single page so that a subsequent 6120 * access may repromote. Since the underlying page 6121 * table page is fully populated, this removal never 6122 * frees a page table page. 6123 */ 6124 if ((oldpde & PG_W) == 0) { 6125 pte = pmap_pde_to_pte(pde, sva); 6126 KASSERT((*pte & PG_V) != 0, 6127 ("pmap_advise: invalid PTE")); 6128 pmap_remove_pte(pmap, pte, sva, *pde, NULL, 6129 &lock); 6130 anychanged = TRUE; 6131 } 6132 if (lock != NULL) 6133 rw_wunlock(lock); 6134 } 6135 if (va_next > eva) 6136 va_next = eva; 6137 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6138 sva += PAGE_SIZE) { 6139 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | 6140 PG_V)) 6141 continue; 6142 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6143 if (advice == MADV_DONTNEED) { 6144 /* 6145 * Future calls to pmap_is_modified() 6146 * can be avoided by making the page 6147 * dirty now. 6148 */ 6149 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6150 vm_page_dirty(m); 6151 } 6152 atomic_clear_long(pte, PG_M | PG_A); 6153 } else if ((*pte & PG_A) != 0) 6154 atomic_clear_long(pte, PG_A); 6155 else 6156 continue; 6157 if ((*pte & PG_G) != 0) 6158 pmap_invalidate_page(pmap, sva); 6159 else 6160 anychanged = TRUE; 6161 } 6162 } 6163 if (anychanged) 6164 pmap_invalidate_all(pmap); 6165 PMAP_UNLOCK(pmap); 6166 pmap_delayed_invl_finished(); 6167} 6168 6169/* 6170 * Clear the modify bits on the specified physical page. 
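 *
 * In outline (editor's summary): writable 2MB mappings are first
 * demoted, with the lone 4KB mapping of this page write-protected so
 * that a later write access may repromote; the remaining 4KB mappings
 * then have PG_M cleared atomically, each followed by a TLB
 * invalidation of the affected page.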
6171 */ 6172void 6173pmap_clear_modify(vm_page_t m) 6174{ 6175 struct md_page *pvh; 6176 pmap_t pmap; 6177 pv_entry_t next_pv, pv; 6178 pd_entry_t oldpde, *pde; 6179 pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; 6180 struct rwlock *lock; 6181 vm_offset_t va; 6182 int md_gen, pvh_gen; 6183 6184 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6185 ("pmap_clear_modify: page %p is not managed", m)); 6186 VM_OBJECT_ASSERT_WLOCKED(m->object); 6187 KASSERT(!vm_page_xbusied(m), 6188 ("pmap_clear_modify: page %p is exclusive busied", m)); 6189 6190 /* 6191 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 6192 * If the object containing the page is locked and the page is not 6193 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 6194 */ 6195 if ((m->aflags & PGA_WRITEABLE) == 0) 6196 return; 6197 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6198 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6199 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6200 rw_wlock(lock); 6201restart: 6202 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6203 pmap = PV_PMAP(pv); 6204 if (!PMAP_TRYLOCK(pmap)) { 6205 pvh_gen = pvh->pv_gen; 6206 rw_wunlock(lock); 6207 PMAP_LOCK(pmap); 6208 rw_wlock(lock); 6209 if (pvh_gen != pvh->pv_gen) { 6210 PMAP_UNLOCK(pmap); 6211 goto restart; 6212 } 6213 } 6214 PG_M = pmap_modified_bit(pmap); 6215 PG_V = pmap_valid_bit(pmap); 6216 PG_RW = pmap_rw_bit(pmap); 6217 va = pv->pv_va; 6218 pde = pmap_pde(pmap, va); 6219 oldpde = *pde; 6220 if ((oldpde & PG_RW) != 0) { 6221 if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { 6222 if ((oldpde & PG_W) == 0) { 6223 /* 6224 * Write protect the mapping to a 6225 * single page so that a subsequent 6226 * write access may repromote. 6227 */ 6228 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6229 PG_PS_FRAME); 6230 pte = pmap_pde_to_pte(pde, va); 6231 oldpte = *pte; 6232 if ((oldpte & PG_V) != 0) { 6233 while (!atomic_cmpset_long(pte, 6234 oldpte, 6235 oldpte & ~(PG_M | PG_RW))) 6236 oldpte = *pte; 6237 vm_page_dirty(m); 6238 pmap_invalidate_page(pmap, va); 6239 } 6240 } 6241 } 6242 } 6243 PMAP_UNLOCK(pmap); 6244 } 6245 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6246 pmap = PV_PMAP(pv); 6247 if (!PMAP_TRYLOCK(pmap)) { 6248 md_gen = m->md.pv_gen; 6249 pvh_gen = pvh->pv_gen; 6250 rw_wunlock(lock); 6251 PMAP_LOCK(pmap); 6252 rw_wlock(lock); 6253 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6254 PMAP_UNLOCK(pmap); 6255 goto restart; 6256 } 6257 } 6258 PG_M = pmap_modified_bit(pmap); 6259 PG_RW = pmap_rw_bit(pmap); 6260 pde = pmap_pde(pmap, pv->pv_va); 6261 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 6262 " a 2mpage in page %p's pv list", m)); 6263 pte = pmap_pde_to_pte(pde, pv->pv_va); 6264 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6265 atomic_clear_long(pte, PG_M); 6266 pmap_invalidate_page(pmap, pv->pv_va); 6267 } 6268 PMAP_UNLOCK(pmap); 6269 } 6270 rw_wunlock(lock); 6271} 6272 6273/* 6274 * Miscellaneous support routines follow 6275 */ 6276 6277/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 6278static __inline void 6279pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) 6280{ 6281 u_int opte, npte; 6282 6283 /* 6284 * The cache mode bits are all in the low 32-bits of the 6285 * PTE, so we can just spin on updating the low 32-bits. 6286 */ 6287 do { 6288 opte = *(u_int *)pte; 6289 npte = opte & ~mask; 6290 npte |= cache_bits; 6291 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 6292} 6293 6294/* Adjust the cache mode for a 2MB page mapped via a PDE. 
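 * Editor's note: as with pmap_pte_attr() above, the PWT, PCD, and PAT
 * bits (and the hardware-maintained accessed and modified bits) all
 * live in the low 32 bits of the entry, so a 32-bit compare-and-swap
 * suffices. If the hardware sets PG_A or PG_M concurrently, the
 * cmpset fails and the loop re-reads the entry and retries.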
*/ 6295static __inline void 6296pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) 6297{ 6298 u_int opde, npde; 6299 6300 /* 6301 * The cache mode bits are all in the low 32-bits of the 6302 * PDE, so we can just spin on updating the low 32-bits. 6303 */ 6304 do { 6305 opde = *(u_int *)pde; 6306 npde = opde & ~mask; 6307 npde |= cache_bits; 6308 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 6309} 6310 6311/* 6312 * Map a set of physical memory pages into the kernel virtual 6313 * address space. Return a pointer to where it is mapped. This 6314 * routine is intended to be used for mapping device memory, 6315 * NOT real memory. 6316 */ 6317void * 6318pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 6319{ 6320 struct pmap_preinit_mapping *ppim; 6321 vm_offset_t va, offset; 6322 vm_size_t tmpsize; 6323 int i; 6324 6325 offset = pa & PAGE_MASK; 6326 size = round_page(offset + size); 6327 pa = trunc_page(pa); 6328 6329 if (!pmap_initialized) { 6330 va = 0; 6331 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6332 ppim = pmap_preinit_mapping + i; 6333 if (ppim->va == 0) { 6334 ppim->pa = pa; 6335 ppim->sz = size; 6336 ppim->mode = mode; 6337 ppim->va = virtual_avail; 6338 virtual_avail += size; 6339 va = ppim->va; 6340 break; 6341 } 6342 } 6343 if (va == 0) 6344 panic("%s: too many preinit mappings", __func__); 6345 } else { 6346 /* 6347 * If we have a preinit mapping, re-use it. 6348 */ 6349 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6350 ppim = pmap_preinit_mapping + i; 6351 if (ppim->pa == pa && ppim->sz == size && 6352 ppim->mode == mode) 6353 return ((void *)(ppim->va + offset)); 6354 } 6355 /* 6356 * If the specified range of physical addresses fits within 6357 * the direct map window, use the direct map. 6358 */ 6359 if (pa < dmaplimit && pa + size < dmaplimit) { 6360 va = PHYS_TO_DMAP(pa); 6361 if (!pmap_change_attr(va, size, mode)) 6362 return ((void *)(va + offset)); 6363 } 6364 va = kva_alloc(size); 6365 if (va == 0) 6366 panic("%s: Couldn't allocate KVA", __func__); 6367 } 6368 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 6369 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 6370 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 6371 pmap_invalidate_cache_range(va, va + tmpsize, FALSE); 6372 return ((void *)(va + offset)); 6373} 6374 6375void * 6376pmap_mapdev(vm_paddr_t pa, vm_size_t size) 6377{ 6378 6379 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 6380} 6381 6382void * 6383pmap_mapbios(vm_paddr_t pa, vm_size_t size) 6384{ 6385 6386 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 6387} 6388 6389void 6390pmap_unmapdev(vm_offset_t va, vm_size_t size) 6391{ 6392 struct pmap_preinit_mapping *ppim; 6393 vm_offset_t offset; 6394 int i; 6395 6396 /* If we gave a direct map region in pmap_mapdev, do nothing */ 6397 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 6398 return; 6399 offset = va & PAGE_MASK; 6400 size = round_page(offset + size); 6401 va = trunc_page(va); 6402 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6403 ppim = pmap_preinit_mapping + i; 6404 if (ppim->va == va && ppim->sz == size) { 6405 if (pmap_initialized) 6406 return; 6407 ppim->pa = 0; 6408 ppim->va = 0; 6409 ppim->sz = 0; 6410 ppim->mode = 0; 6411 if (va + size == virtual_avail) 6412 virtual_avail = va; 6413 return; 6414 } 6415 } 6416 if (pmap_initialized) 6417 kva_free(va, size); 6418} 6419 6420/* 6421 * Tries to demote a 1GB page mapping. 
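 * Editor's illustration with a hypothetical address: demoting the
 * 1GB mapping at PA 0x40000000 allocates one page directory page and
 * fills its 512 PDEs with 2MB mappings of PA 0x40000000, 0x40200000,
 * ..., stepping by NBPDR, while preserving the original mapping's
 * attributes.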
6422 */ 6423static boolean_t 6424pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 6425{ 6426 pdp_entry_t newpdpe, oldpdpe; 6427 pd_entry_t *firstpde, newpde, *pde; 6428 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 6429 vm_paddr_t mpdepa; 6430 vm_page_t mpde; 6431 6432 PG_A = pmap_accessed_bit(pmap); 6433 PG_M = pmap_modified_bit(pmap); 6434 PG_V = pmap_valid_bit(pmap); 6435 PG_RW = pmap_rw_bit(pmap); 6436 6437 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6438 oldpdpe = *pdpe; 6439 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 6440 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 6441 if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 6442 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 6443 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 6444 " in pmap %p", va, pmap); 6445 return (FALSE); 6446 } 6447 mpdepa = VM_PAGE_TO_PHYS(mpde); 6448 firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa); 6449 newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 6450 KASSERT((oldpdpe & PG_A) != 0, 6451 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 6452 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 6453 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 6454 newpde = oldpdpe; 6455 6456 /* 6457 * Initialize the page directory page. 6458 */ 6459 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 6460 *pde = newpde; 6461 newpde += NBPDR; 6462 } 6463 6464 /* 6465 * Demote the mapping. 6466 */ 6467 *pdpe = newpdpe; 6468 6469 /* 6470 * Invalidate a stale recursive mapping of the page directory page. 6471 */ 6472 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 6473 6474 pmap_pdpe_demotions++; 6475 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 6476 " in pmap %p", va, pmap); 6477 return (TRUE); 6478} 6479 6480/* 6481 * Sets the memory attribute for the specified page. 6482 */ 6483void 6484pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6485{ 6486 6487 m->md.pat_mode = ma; 6488 6489 /* 6490 * If "m" is a normal page, update its direct mapping. This update 6491 * can be relied upon to perform any cache operations that are 6492 * required for data coherence. 6493 */ 6494 if ((m->flags & PG_FICTITIOUS) == 0 && 6495 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 6496 m->md.pat_mode)) 6497 panic("memory attribute change on the direct map failed"); 6498} 6499 6500/* 6501 * Changes the specified virtual address range's memory type to that given by 6502 * the parameter "mode". The specified virtual address range must be 6503 * completely contained within either the direct map or the kernel map. If 6504 * the virtual address range is contained within the kernel map, then the 6505 * memory type for each of the corresponding ranges of the direct map is also 6506 * changed. (The corresponding ranges of the direct map are those ranges that 6507 * map the same physical pages as the specified virtual address range.) These 6508 * changes to the direct map are necessary because Intel describes the 6509 * behavior of their processors as "undefined" if two or more mappings to the 6510 * same physical page have different memory types. 6511 * 6512 * Returns zero if the change completed successfully, and either EINVAL or 6513 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 6514 * of the virtual address range was not mapped, and ENOMEM is returned if 6515 * there was insufficient memory available to complete the change. 
In the 6516 * latter case, the memory type may have been changed on some part of the 6517 * virtual address range or the direct map. 6518 */ 6519int 6520pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 6521{ 6522 int error; 6523 6524 PMAP_LOCK(kernel_pmap); 6525 error = pmap_change_attr_locked(va, size, mode); 6526 PMAP_UNLOCK(kernel_pmap); 6527 return (error); 6528} 6529 6530static int 6531pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 6532{ 6533 vm_offset_t base, offset, tmpva; 6534 vm_paddr_t pa_start, pa_end, pa_end1; 6535 pdp_entry_t *pdpe; 6536 pd_entry_t *pde; 6537 pt_entry_t *pte; 6538 int cache_bits_pte, cache_bits_pde, error; 6539 boolean_t changed; 6540 6541 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6542 base = trunc_page(va); 6543 offset = va & PAGE_MASK; 6544 size = round_page(offset + size); 6545 6546 /* 6547 * Only supported on kernel virtual addresses, including the direct 6548 * map but excluding the recursive map. 6549 */ 6550 if (base < DMAP_MIN_ADDRESS) 6551 return (EINVAL); 6552 6553 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 6554 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 6555 changed = FALSE; 6556 6557 /* 6558 * Pages that aren't mapped aren't supported. Also break down 2MB pages 6559 * into 4KB pages if required. 6560 */ 6561 for (tmpva = base; tmpva < base + size; ) { 6562 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6563 if (pdpe == NULL || *pdpe == 0) 6564 return (EINVAL); 6565 if (*pdpe & PG_PS) { 6566 /* 6567 * If the current 1GB page already has the required 6568 * memory type, then we need not demote this page. Just 6569 * increment tmpva to the next 1GB page frame. 6570 */ 6571 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { 6572 tmpva = trunc_1gpage(tmpva) + NBPDP; 6573 continue; 6574 } 6575 6576 /* 6577 * If the current offset aligns with a 1GB page frame 6578 * and there is at least 1GB left within the range, then 6579 * we need not break down this page into 2MB pages. 6580 */ 6581 if ((tmpva & PDPMASK) == 0 && 6582 tmpva + PDPMASK < base + size) { 6583 tmpva += NBPDP; 6584 continue; 6585 } 6586 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 6587 return (ENOMEM); 6588 } 6589 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6590 if (*pde == 0) 6591 return (EINVAL); 6592 if (*pde & PG_PS) { 6593 /* 6594 * If the current 2MB page already has the required 6595 * memory type, then we need not demote this page. Just 6596 * increment tmpva to the next 2MB page frame. 6597 */ 6598 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { 6599 tmpva = trunc_2mpage(tmpva) + NBPDR; 6600 continue; 6601 } 6602 6603 /* 6604 * If the current offset aligns with a 2MB page frame 6605 * and there is at least 2MB left within the range, then 6606 * we need not break down this page into 4KB pages. 6607 */ 6608 if ((tmpva & PDRMASK) == 0 && 6609 tmpva + PDRMASK < base + size) { 6610 tmpva += NBPDR; 6611 continue; 6612 } 6613 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 6614 return (ENOMEM); 6615 } 6616 pte = pmap_pde_to_pte(pde, tmpva); 6617 if (*pte == 0) 6618 return (EINVAL); 6619 tmpva += PAGE_SIZE; 6620 } 6621 error = 0; 6622 6623 /* 6624 * Ok, all the pages exist, so run through them updating their 6625 * cache mode if required. 
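 * (Editor's note: the loop below coalesces physically contiguous
 * kernel-map pages into runs so that each corresponding direct map
 * range is updated by a single recursive call. For example, with
 * hypothetical PAs, three 4KB pages backed by 0x5000, 0x6000, and
 * 0x7000 would produce one pmap_change_attr_locked() call on the
 * DMAP addresses covering [0x5000, 0x8000).)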
6626 */ 6627 pa_start = pa_end = 0; 6628 for (tmpva = base; tmpva < base + size; ) { 6629 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6630 if (*pdpe & PG_PS) { 6631 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { 6632 pmap_pde_attr(pdpe, cache_bits_pde, 6633 X86_PG_PDE_CACHE); 6634 changed = TRUE; 6635 } 6636 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6637 (*pdpe & PG_PS_FRAME) < dmaplimit) { 6638 if (pa_start == pa_end) { 6639 /* Start physical address run. */ 6640 pa_start = *pdpe & PG_PS_FRAME; 6641 pa_end = pa_start + NBPDP; 6642 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 6643 pa_end += NBPDP; 6644 else { 6645 /* Run ended, update direct map. */ 6646 error = pmap_change_attr_locked( 6647 PHYS_TO_DMAP(pa_start), 6648 pa_end - pa_start, mode); 6649 if (error != 0) 6650 break; 6651 /* Start physical address run. */ 6652 pa_start = *pdpe & PG_PS_FRAME; 6653 pa_end = pa_start + NBPDP; 6654 } 6655 } 6656 tmpva = trunc_1gpage(tmpva) + NBPDP; 6657 continue; 6658 } 6659 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6660 if (*pde & PG_PS) { 6661 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { 6662 pmap_pde_attr(pde, cache_bits_pde, 6663 X86_PG_PDE_CACHE); 6664 changed = TRUE; 6665 } 6666 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6667 (*pde & PG_PS_FRAME) < dmaplimit) { 6668 if (pa_start == pa_end) { 6669 /* Start physical address run. */ 6670 pa_start = *pde & PG_PS_FRAME; 6671 pa_end = pa_start + NBPDR; 6672 } else if (pa_end == (*pde & PG_PS_FRAME)) 6673 pa_end += NBPDR; 6674 else { 6675 /* Run ended, update direct map. */ 6676 error = pmap_change_attr_locked( 6677 PHYS_TO_DMAP(pa_start), 6678 pa_end - pa_start, mode); 6679 if (error != 0) 6680 break; 6681 /* Start physical address run. */ 6682 pa_start = *pde & PG_PS_FRAME; 6683 pa_end = pa_start + NBPDR; 6684 } 6685 } 6686 tmpva = trunc_2mpage(tmpva) + NBPDR; 6687 } else { 6688 pte = pmap_pde_to_pte(pde, tmpva); 6689 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { 6690 pmap_pte_attr(pte, cache_bits_pte, 6691 X86_PG_PTE_CACHE); 6692 changed = TRUE; 6693 } 6694 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6695 (*pte & PG_PS_FRAME) < dmaplimit) { 6696 if (pa_start == pa_end) { 6697 /* Start physical address run. */ 6698 pa_start = *pte & PG_FRAME; 6699 pa_end = pa_start + PAGE_SIZE; 6700 } else if (pa_end == (*pte & PG_FRAME)) 6701 pa_end += PAGE_SIZE; 6702 else { 6703 /* Run ended, update direct map. */ 6704 error = pmap_change_attr_locked( 6705 PHYS_TO_DMAP(pa_start), 6706 pa_end - pa_start, mode); 6707 if (error != 0) 6708 break; 6709 /* Start physical address run. */ 6710 pa_start = *pte & PG_FRAME; 6711 pa_end = pa_start + PAGE_SIZE; 6712 } 6713 } 6714 tmpva += PAGE_SIZE; 6715 } 6716 } 6717 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 6718 pa_end1 = MIN(pa_end, dmaplimit); 6719 if (pa_start != pa_end1) 6720 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 6721 pa_end1 - pa_start, mode); 6722 } 6723 6724 /* 6725 * Flush CPU caches if required to make sure any data isn't cached that 6726 * shouldn't be, etc. 6727 */ 6728 if (changed) { 6729 pmap_invalidate_range(kernel_pmap, base, tmpva); 6730 pmap_invalidate_cache_range(base, tmpva, FALSE); 6731 } 6732 return (error); 6733} 6734 6735/* 6736 * Demotes any mapping within the direct map region that covers more than the 6737 * specified range of physical addresses. This range's size must be a power 6738 * of two and its starting address must be a multiple of its size. Since the 6739 * demotion does not change any attributes of the mapping, a TLB invalidation 6740 * is not mandatory. 
The caller may, however, request a TLB invalidation. 6741 */ 6742void 6743pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 6744{ 6745 pdp_entry_t *pdpe; 6746 pd_entry_t *pde; 6747 vm_offset_t va; 6748 boolean_t changed; 6749 6750 if (len == 0) 6751 return; 6752 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 6753 KASSERT((base & (len - 1)) == 0, 6754 ("pmap_demote_DMAP: base is not a multiple of len")); 6755 if (len < NBPDP && base < dmaplimit) { 6756 va = PHYS_TO_DMAP(base); 6757 changed = FALSE; 6758 PMAP_LOCK(kernel_pmap); 6759 pdpe = pmap_pdpe(kernel_pmap, va); 6760 if ((*pdpe & X86_PG_V) == 0) 6761 panic("pmap_demote_DMAP: invalid PDPE"); 6762 if ((*pdpe & PG_PS) != 0) { 6763 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 6764 panic("pmap_demote_DMAP: PDPE failed"); 6765 changed = TRUE; 6766 } 6767 if (len < NBPDR) { 6768 pde = pmap_pdpe_to_pde(pdpe, va); 6769 if ((*pde & X86_PG_V) == 0) 6770 panic("pmap_demote_DMAP: invalid PDE"); 6771 if ((*pde & PG_PS) != 0) { 6772 if (!pmap_demote_pde(kernel_pmap, pde, va)) 6773 panic("pmap_demote_DMAP: PDE failed"); 6774 changed = TRUE; 6775 } 6776 } 6777 if (changed && invalidate) 6778 pmap_invalidate_page(kernel_pmap, va); 6779 PMAP_UNLOCK(kernel_pmap); 6780 } 6781} 6782 6783/* 6784 * perform the pmap work for mincore 6785 */ 6786int 6787pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 6788{ 6789 pd_entry_t *pdep; 6790 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 6791 vm_paddr_t pa; 6792 int val; 6793 6794 PG_A = pmap_accessed_bit(pmap); 6795 PG_M = pmap_modified_bit(pmap); 6796 PG_V = pmap_valid_bit(pmap); 6797 PG_RW = pmap_rw_bit(pmap); 6798 6799 PMAP_LOCK(pmap); 6800retry: 6801 pdep = pmap_pde(pmap, addr); 6802 if (pdep != NULL && (*pdep & PG_V)) { 6803 if (*pdep & PG_PS) { 6804 pte = *pdep; 6805 /* Compute the physical address of the 4KB page. */ 6806 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 6807 PG_FRAME; 6808 val = MINCORE_SUPER; 6809 } else { 6810 pte = *pmap_pde_to_pte(pdep, addr); 6811 pa = pte & PG_FRAME; 6812 val = 0; 6813 } 6814 } else { 6815 pte = 0; 6816 pa = 0; 6817 val = 0; 6818 } 6819 if ((pte & PG_V) != 0) { 6820 val |= MINCORE_INCORE; 6821 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6822 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6823 if ((pte & PG_A) != 0) 6824 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6825 } 6826 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6827 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 6828 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 6829 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
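 * (Editor's note: vm_page_pa_tryrelock() may have to drop the pmap
 * lock in order to acquire the page lock; it returns nonzero in that
 * case, and the lookup is then repeated from "retry" because the PTE
 * may have changed while the pmap lock was dropped.)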
*/ 6830 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 6831 goto retry; 6832 } else 6833 PA_UNLOCK_COND(*locked_pa); 6834 PMAP_UNLOCK(pmap); 6835 return (val); 6836} 6837 6838static uint64_t 6839pmap_pcid_alloc(pmap_t pmap, u_int cpuid) 6840{ 6841 uint32_t gen, new_gen, pcid_next; 6842 6843 CRITICAL_ASSERT(curthread); 6844 gen = PCPU_GET(pcid_gen); 6845 if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN || 6846 pmap->pm_pcids[cpuid].pm_gen == gen) 6847 return (CR3_PCID_SAVE); 6848 pcid_next = PCPU_GET(pcid_next); 6849 KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x", 6850 cpuid, pcid_next)); 6851 if (pcid_next == PMAP_PCID_OVERMAX) { 6852 new_gen = gen + 1; 6853 if (new_gen == 0) 6854 new_gen = 1; 6855 PCPU_SET(pcid_gen, new_gen); 6856 pcid_next = PMAP_PCID_KERN + 1; 6857 } else { 6858 new_gen = gen; 6859 } 6860 pmap->pm_pcids[cpuid].pm_pcid = pcid_next; 6861 pmap->pm_pcids[cpuid].pm_gen = new_gen; 6862 PCPU_SET(pcid_next, pcid_next + 1); 6863 return (0); 6864} 6865 6866void 6867pmap_activate_sw(struct thread *td) 6868{ 6869 pmap_t oldpmap, pmap; 6870 uint64_t cached, cr3; 6871 u_int cpuid; 6872 6873 oldpmap = PCPU_GET(curpmap); 6874 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6875 if (oldpmap == pmap) 6876 return; 6877 cpuid = PCPU_GET(cpuid); 6878#ifdef SMP 6879 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6880#else 6881 CPU_SET(cpuid, &pmap->pm_active); 6882#endif 6883 cr3 = rcr3(); 6884 if (pmap_pcid_enabled) { 6885 cached = pmap_pcid_alloc(pmap, cpuid); 6886 KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 && 6887 pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, 6888 ("pmap %p cpu %d pcid %#x", pmap, cpuid, 6889 pmap->pm_pcids[cpuid].pm_pcid)); 6890 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || 6891 pmap == kernel_pmap, 6892 ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x", 6893 td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); 6894 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) { 6895 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | 6896 cached); 6897 if (cached) 6898 PCPU_INC(pm_save_cnt); 6899 } 6900 } else if (cr3 != pmap->pm_cr3) { 6901 load_cr3(pmap->pm_cr3); 6902 } 6903 PCPU_SET(curpmap, pmap); 6904#ifdef SMP 6905 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6906#else 6907 CPU_CLR(cpuid, &oldpmap->pm_active); 6908#endif 6909} 6910 6911void 6912pmap_activate(struct thread *td) 6913{ 6914 6915 critical_enter(); 6916 pmap_activate_sw(td); 6917 critical_exit(); 6918} 6919 6920void 6921pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 6922{ 6923} 6924 6925/* 6926 * Increase the starting virtual address of the given mapping if a 6927 * different alignment might result in more superpage mappings. 
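 *
 * Editor's worked example with hypothetical values: for offset
 * 0x201000 and size 4MB, superpage_offset = 0x201000 & PDRMASK =
 * 0x1000, so a hint of *addr = 0x7f0000000000 is advanced to
 * 0x7f0000001000, making *addr and offset congruent modulo NBPDR and
 * allowing the interior of the mapping to use 2MB pages.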
6928 */ 6929void 6930pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6931 vm_offset_t *addr, vm_size_t size) 6932{ 6933 vm_offset_t superpage_offset; 6934 6935 if (size < NBPDR) 6936 return; 6937 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6938 offset += ptoa(object->pg_color); 6939 superpage_offset = offset & PDRMASK; 6940 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 6941 (*addr & PDRMASK) == superpage_offset) 6942 return; 6943 if ((*addr & PDRMASK) < superpage_offset) 6944 *addr = (*addr & ~PDRMASK) + superpage_offset; 6945 else 6946 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 6947} 6948 6949#ifdef INVARIANTS 6950static unsigned long num_dirty_emulations; 6951SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 6952 &num_dirty_emulations, 0, NULL); 6953 6954static unsigned long num_accessed_emulations; 6955SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 6956 &num_accessed_emulations, 0, NULL); 6957 6958static unsigned long num_superpage_accessed_emulations; 6959SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 6960 &num_superpage_accessed_emulations, 0, NULL); 6961 6962static unsigned long ad_emulation_superpage_promotions; 6963SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 6964 &ad_emulation_superpage_promotions, 0, NULL); 6965#endif /* INVARIANTS */ 6966 6967int 6968pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 6969{ 6970 int rv; 6971 struct rwlock *lock; 6972 vm_page_t m, mpte; 6973 pd_entry_t *pde; 6974 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 6975 6976 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 6977 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 6978 6979 if (!pmap_emulate_ad_bits(pmap)) 6980 return (-1); 6981 6982 PG_A = pmap_accessed_bit(pmap); 6983 PG_M = pmap_modified_bit(pmap); 6984 PG_V = pmap_valid_bit(pmap); 6985 PG_RW = pmap_rw_bit(pmap); 6986 6987 rv = -1; 6988 lock = NULL; 6989 PMAP_LOCK(pmap); 6990 6991 pde = pmap_pde(pmap, va); 6992 if (pde == NULL || (*pde & PG_V) == 0) 6993 goto done; 6994 6995 if ((*pde & PG_PS) != 0) { 6996 if (ftype == VM_PROT_READ) { 6997#ifdef INVARIANTS 6998 atomic_add_long(&num_superpage_accessed_emulations, 1); 6999#endif 7000 *pde |= PG_A; 7001 rv = 0; 7002 } 7003 goto done; 7004 } 7005 7006 pte = pmap_pde_to_pte(pde, va); 7007 if ((*pte & PG_V) == 0) 7008 goto done; 7009 7010 if (ftype == VM_PROT_WRITE) { 7011 if ((*pte & PG_RW) == 0) 7012 goto done; 7013 /* 7014 * Set the modified and accessed bits simultaneously. 7015 * 7016 * Intel EPT PTEs that do software emulation of A/D bits map 7017 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 7018 * An EPT misconfiguration is triggered if the PTE is writable 7019 * but not readable (WR=10). This is avoided by setting PG_A 7020 * and PG_M simultaneously. 
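 * (Editor's illustration: under A/D bit emulation PG_A is
 * EPT_PG_READ and PG_M is EPT_PG_WRITE, so the single "|=" below
 * moves the PTE directly to XWR = x11, never passing through the
 * invalid writable-but-not-readable state.)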
7021 */ 7022 *pte |= PG_M | PG_A; 7023 } else { 7024 *pte |= PG_A; 7025 } 7026 7027 /* try to promote the mapping */ 7028 if (va < VM_MAXUSER_ADDRESS) 7029 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7030 else 7031 mpte = NULL; 7032 7033 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 7034 7035 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 7036 pmap_ps_enabled(pmap) && 7037 (m->flags & PG_FICTITIOUS) == 0 && 7038 vm_reserv_level_iffullpop(m) == 0) { 7039 pmap_promote_pde(pmap, pde, va, &lock); 7040#ifdef INVARIANTS 7041 atomic_add_long(&ad_emulation_superpage_promotions, 1); 7042#endif 7043 } 7044#ifdef INVARIANTS 7045 if (ftype == VM_PROT_WRITE) 7046 atomic_add_long(&num_dirty_emulations, 1); 7047 else 7048 atomic_add_long(&num_accessed_emulations, 1); 7049#endif 7050 rv = 0; /* success */ 7051done: 7052 if (lock != NULL) 7053 rw_wunlock(lock); 7054 PMAP_UNLOCK(pmap); 7055 return (rv); 7056} 7057 7058void 7059pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 7060{ 7061 pml4_entry_t *pml4; 7062 pdp_entry_t *pdp; 7063 pd_entry_t *pde; 7064 pt_entry_t *pte, PG_V; 7065 int idx; 7066 7067 idx = 0; 7068 PG_V = pmap_valid_bit(pmap); 7069 PMAP_LOCK(pmap); 7070 7071 pml4 = pmap_pml4e(pmap, va); 7072 ptr[idx++] = *pml4; 7073 if ((*pml4 & PG_V) == 0) 7074 goto done; 7075 7076 pdp = pmap_pml4e_to_pdpe(pml4, va); 7077 ptr[idx++] = *pdp; 7078 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 7079 goto done; 7080 7081 pde = pmap_pdpe_to_pde(pdp, va); 7082 ptr[idx++] = *pde; 7083 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 7084 goto done; 7085 7086 pte = pmap_pde_to_pte(pde, va); 7087 ptr[idx++] = *pte; 7088 7089done: 7090 PMAP_UNLOCK(pmap); 7091 *num = idx; 7092} 7093 7094/** 7095 * Get the kernel virtual address of a set of physical pages. If there are 7096 * physical addresses not covered by the DMAP, perform a transient mapping 7097 * that will be removed when calling pmap_unmap_io_transient. 7098 * 7099 * \param page The pages for which the caller wishes to obtain the 7100 * kernel virtual addresses. 7101 * \param vaddr On return contains the kernel virtual memory address 7102 * of the pages passed in the page parameter. 7103 * \param count Number of pages passed in. 7104 * \param can_fault TRUE if the thread using the mapped pages can take 7105 * page faults, FALSE otherwise. 7106 * 7107 * \returns TRUE if the caller must call pmap_unmap_io_transient when 7108 * finished, or FALSE otherwise. 7109 * 7110 */ 7111boolean_t 7112pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7113 boolean_t can_fault) 7114{ 7115 vm_paddr_t paddr; 7116 boolean_t needs_mapping; 7117 pt_entry_t *pte; 7118 int cache_bits, error, i; 7119 7120 /* 7121 * Allocate any KVA space that we need; this is done in a separate 7122 * loop to prevent calling vmem_alloc while pinned.
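 * (Editor's note: vmem_alloc() is called with M_WAITOK and so may
 * sleep; hoisting the allocations into this first loop keeps any
 * sleeping out of the sched_pin()/sched_unpin() window used below.)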
7123 */ 7124 needs_mapping = FALSE; 7125 for (i = 0; i < count; i++) { 7126 paddr = VM_PAGE_TO_PHYS(page[i]); 7127 if (__predict_false(paddr >= dmaplimit)) { 7128 error = vmem_alloc(kernel_arena, PAGE_SIZE, 7129 M_BESTFIT | M_WAITOK, &vaddr[i]); 7130 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 7131 needs_mapping = TRUE; 7132 } else { 7133 vaddr[i] = PHYS_TO_DMAP(paddr); 7134 } 7135 } 7136 7137 /* Exit early if everything is covered by the DMAP */ 7138 if (!needs_mapping) 7139 return (FALSE); 7140 7141 /* 7142 * NB: The sequence of updating a page table followed by accesses 7143 * to the corresponding pages used in the !DMAP case is subject to 7144 * the situation described in the "AMD64 Architecture Programmer's 7145 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 7146 * Coherency Considerations". Therefore, issuing the INVLPG right 7147 * after modifying the PTE bits is crucial. 7148 */ 7149 if (!can_fault) 7150 sched_pin(); 7151 for (i = 0; i < count; i++) { 7152 paddr = VM_PAGE_TO_PHYS(page[i]); 7153 if (paddr >= dmaplimit) { 7154 if (can_fault) { 7155 /* 7156 * Slow path: since we can get page faults 7157 * while mappings are active, don't pin the 7158 * thread to the CPU; instead add a global 7159 * mapping visible to all CPUs. 7160 */ 7161 pmap_qenter(vaddr[i], &page[i], 1); 7162 } else { 7163 pte = vtopte(vaddr[i]); 7164 cache_bits = pmap_cache_bits(kernel_pmap, 7165 page[i]->md.pat_mode, 0); 7166 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 7167 cache_bits); 7168 invlpg(vaddr[i]); 7169 } 7170 } 7171 } 7172 7173 return (needs_mapping); 7174} 7175 7176void 7177pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7178 boolean_t can_fault) 7179{ 7180 vm_paddr_t paddr; 7181 int i; 7182 7183 if (!can_fault) 7184 sched_unpin(); 7185 for (i = 0; i < count; i++) { 7186 paddr = VM_PAGE_TO_PHYS(page[i]); 7187 if (paddr >= dmaplimit) { 7188 if (can_fault) 7189 pmap_qremove(vaddr[i], 1); 7190 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 7191 } 7192 } 7193} 7194 7195vm_offset_t 7196pmap_quick_enter_page(vm_page_t m) 7197{ 7198 vm_paddr_t paddr; 7199 7200 paddr = VM_PAGE_TO_PHYS(m); 7201 if (paddr < dmaplimit) 7202 return (PHYS_TO_DMAP(paddr)); 7203 mtx_lock_spin(&qframe_mtx); 7204 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 7205 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 7206 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 7207 return (qframe); 7208} 7209 7210void 7211pmap_quick_remove_page(vm_offset_t addr) 7212{ 7213 7214 if (addr != qframe) 7215 return; 7216 pte_store(vtopte(qframe), 0); 7217 invlpg(qframe); 7218 mtx_unlock_spin(&qframe_mtx); 7219} 7220 7221#include "opt_ddb.h" 7222#ifdef DDB 7223#include <ddb/ddb.h> 7224 7225DB_SHOW_COMMAND(pte, pmap_print_pte) 7226{ 7227 pmap_t pmap; 7228 pml4_entry_t *pml4; 7229 pdp_entry_t *pdp; 7230 pd_entry_t *pde; 7231 pt_entry_t *pte, PG_V; 7232 vm_offset_t va; 7233 7234 if (have_addr) { 7235 va = (vm_offset_t)addr; 7236 pmap = PCPU_GET(curpmap); /* XXX */ 7237 } else { 7238 db_printf("show pte addr\n"); 7239 return; 7240 } 7241 PG_V = pmap_valid_bit(pmap); 7242 pml4 = pmap_pml4e(pmap, va); 7243 db_printf("VA %#016lx pml4e %#016lx", va, *pml4); 7244 if ((*pml4 & PG_V) == 0) { 7245 db_printf("\n"); 7246 return; 7247 } 7248 pdp = pmap_pml4e_to_pdpe(pml4, va); 7249 db_printf(" pdpe %#016lx", *pdp); 7250 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 7251 db_printf("\n"); 7252 return; 7253 } 7254 pde = pmap_pdpe_to_pde(pdp, va); 7255 db_printf(" pde %#016lx",
*pde); 7256 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 7257 db_printf("\n"); 7258 return; 7259 } 7260 pte = pmap_pde_to_pte(pde, va); 7261 db_printf(" pte %#016lx\n", *pte); 7262} 7263 7264DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 7265{ 7266 vm_paddr_t a; 7267 7268 if (have_addr) { 7269 a = (vm_paddr_t)addr; 7270 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 7271 } else { 7272 db_printf("show phys2dmap addr\n"); 7273 } 7274} 7275#endif 7276
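/*
 * Editor's usage sketch for the DDB commands above, with hypothetical
 * addresses and entry values; the output line mirrors the db_printf()
 * formats in pmap_print_pte():
 *
 *	db> show pte 0xffffffff80400000
 *	VA 0xffffffff80400000 pml4e 0x... pdpe 0x... pde 0x... pte 0x...
 *	db> show phys2dmap 0x1000
 *	0xfffff80000001000
 */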