pmap.c revision 328386
/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/i386/i386/pmap.c 328386 2018-01-25 02:45:21Z pkelsey $");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidation or reduced-protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and as to when physical maps must be made correct.
 */

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_pmap.h"
#include "opt_smp.h"
#include "opt_vm.h"
#include "opt_xbox.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#ifdef DEV_APIC
#include <sys/bus.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#endif
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifdef XBOX
#include <machine/xbox.h>
#endif

#ifndef PMAP_SHPGPERPROC
#define	PMAP_SHPGPERPROC 200
#endif

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define	pdir_pde(m, v)	(m[(vm_offset_t)(v) >> PDRSHIFT])
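/*
 * Illustration (informational only, not part of the original source):
 * pmap_pde() above decodes a virtual address purely with shifts and masks.
 * Assuming the non-PAE layout (PDRSHIFT == 22, 4KB pages), a va splits as
 *
 *	pd index = va >> PDRSHIFT;			(top 10 bits)
 *	pt index = (va >> PAGE_SHIFT) & (NPTEPG - 1);	(next 10 bits)
 *	offset   = va & PAGE_MASK;			(low 12 bits)
 *
 * which is the same arithmetic that backs pdir_pde() and the vtopte()
 * lookups used throughout this file.
 */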
#define	pmap_pde_v(pte)	((*(int *)pte & PG_V) != 0)
#define	pmap_pte_w(pte)	((*(int *)pte & PG_W) != 0)
#define	pmap_pte_m(pte)	((*(int *)pte & PG_M) != 0)
#define	pmap_pte_u(pte)	((*(int *)pte & PG_A) != 0)
#define	pmap_pte_v(pte)	((*(int *)pte & PG_V) != 0)

#define	pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define	pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))

struct pmap kernel_pmap_store;
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
int pgeflag = 0;		/* PG_G or-in */
int pseflag = 0;		/* PG_PS or-in */

static int nkpt = NKPT;
vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
extern u_int32_t KERNend;
extern u_int32_t KPTphys;

#if defined(PAE) || defined(PAE_TABLES)
pt_entry_t pg_nx;
static uma_zone_t pdptzone;
#endif

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pg_ps_enabled, 0, "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

/*
 * pmap_mapdev support prior to pmap initialization (i.e., the console)
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

static struct rwlock_padalign pvh_global_lock;

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table;
static int shpgperproc = PMAP_SHPGPERPROC;

struct pv_chunk *pv_chunkbase;	/* KVA block for pv_chunks */
int pv_maxchunks;		/* How many chunks we have KVA for */
vm_offset_t pv_vafree;		/* freelist stored in the PTE */

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP3;
static pd_entry_t *KPTD;
caddr_t ptvmmap = 0;
caddr_t CADDR3;
struct msgbuf *msgbufp = NULL;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static pt_entry_t *PMAP1 = NULL, *PMAP2;
static pt_entry_t *PADDR1 = NULL, *PADDR2;
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
    &PMAP1changedcpu, 0,
    "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
    &PMAP1changed, 0,
    "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
    &PMAP1unchanged, 0,
    "Number of times pmap_pte_quick didn't change PMAP1");
static struct mtx PMAP2mutex;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
#if VM_NRESERVLEVEL > 0
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
#endif
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_flush_page(vm_page_t m);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
    pd_entry_t pde);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
#if VM_NRESERVLEVEL > 0
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t sva, vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    struct spglist *free);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
    struct spglist *free);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
    vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
#if defined(PAE) || defined(PAE_TABLES)
static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
    int wait);
#endif
static void pmap_set_pg(void);

static __inline void pagezero(void *page);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * If you get an error here, then you set KVA_PAGES wrong! See the
 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be a
 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
 */
CTASSERT(KERNBASE % (1 << 24) == 0);

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On the i386 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte, *unused;
	struct pcpu *pc;
	int i;

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	/*
	 * Initialize the first available kernel virtual address.  However,
	 * using "firstaddr" may waste a few pages of the kernel virtual
	 * address space, because locore may not have mapped every physical
	 * page that it allocated.  Preferably, locore would provide a first
	 * unused virtual address in addition to "firstaddr".
	 */
	virtual_avail = (vm_offset_t)KERNBASE + firstaddr;

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (u_int)IdlePTD);
#if defined(PAE) || defined(PAE_TABLES)
	kernel_pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (u_int)IdlePDPT);
#endif
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);

	/*
	 * Initialize the global pv list lock.
	 */
	rw_init(&pvh_global_lock, "pmap pv global");

	LIST_INIT(&allpmaps);

	/*
	 * Request a spin mutex so that changes to allpmaps cannot be
	 * preempted by smp_rendezvous_cpus().  Otherwise,
	 * pmap_update_pde_kernel() could access allpmaps while it is
	 * being changed.
	 */
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * Initialize temporary map objects on the current CPU for use
	 * during early boot.
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 * CMAP3 is used for the idle process page zeroing.
	 */
	pc = get_pcpu();
	mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
	SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
	SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
	SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)

	SYSMAP(caddr_t, CMAP3, CADDR3, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	/*
	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))

	/*
	 * KPTmap is used by pmap_kextract().
	 *
	 * KPTmap is first initialized by locore.  However, that initial
	 * KPTmap can only support NKPT page table pages.  Here, a larger
	 * KPTmap is created that can support KVA_PAGES page table pages.
	 */
	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)

	for (i = 0; i < NKPT; i++)
		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;

	/*
	 * Adjust the start of the KPTD and KPTmap so that the implementation
	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
	 */
	KPTD -= KPTDI;
	KPTmap -= i386_btop(KPTDI << PDRSHIFT);

	/*
	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
	 * respectively.
	 */
	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)

	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

	virtual_avail = va;

	/*
	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
	 * physical memory region that is used by the ACPI wakeup code.  This
	 * mapping must not have PG_G set.
	 */
#ifdef XBOX
	/* FIXME: This is gross, but needed for the XBOX.  Since we are at
	 * such an early stage of the boot, we cannot yet neatly map video
	 * memory ... :-(  Better fixes are very welcome! */
	if (!arch_i386_is_xbox)
#endif
	for (i = 1; i < NKPT; i++)
		PTD[i] = 0;

	/*
	 * Initialize the PAT MSR if present.
	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
	 * side-effect, invalidates stale PG_G TLB entries that might
	 * have been created in our pre-boot environment.  We assume
	 * that PAT support implies PGE and in reverse, PGE presence
	 * comes with PAT.  Both features were added for Pentium Pro.
	 */
	pmap_init_pat();

	/* Turn on PG_G on kernel page(s) */
	pmap_set_pg();
}
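/*
 * Illustration (informational only, not part of the original source): the
 * SYSMAP() macro used in pmap_bootstrap() above simply carves "n" pages out
 * of the reserved KVA and remembers both the VA and the PTE that maps it.
 * For example,
 *
 *	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
 *
 * expands to
 *
 *	CADDR3 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP3 = pte; pte += (1);
 */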
static void
pmap_init_reserved_pages(void)
{
	struct pcpu *pc;
	vm_offset_t pages;
	int i;

	CPU_FOREACH(i) {
		pc = pcpu_find(i);
		/*
		 * Skip if the mapping has already been initialized,
		 * i.e. this is the BSP.
		 */
		if (pc->pc_cmap_addr1 != 0)
			continue;
		mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
		pages = kva_alloc(PAGE_SIZE * 3);
		if (pages == 0)
			panic("%s: unable to allocate KVA", __func__);
		pc->pc_cmap_pte1 = vtopte(pages);
		pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
		pc->pc_cmap_addr1 = (caddr_t)pages;
		pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
		pc->pc_qmap_addr = pages + (PAGE_SIZE * 2);
	}
}

SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/*
	 * Bail if this CPU doesn't implement PAT.
	 * We assume that PAT support implies PGE.
	 */
	if ((cpu_feature & CPUID_PAT) == 0) {
		for (i = 0; i < PAT_INDEX_SIZE; i++)
			pat_index[i] = pat_table[i];
		pat_works = 0;
		return;
	}

	/*
	 * Due to some Intel errata, we can only safely use the lower 4
	 * PAT entries.
	 *
	 *   Intel Pentium III Processor Specification Update
	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
	 * or Mode C Paging)
	 *
	 *   Intel Pentium IV Processor Specification Update
	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
		pat_works = 0;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

/*
 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 */
static void
pmap_set_pg(void)
{
	pt_entry_t *pte;
	vm_offset_t va, endva;

	if (pgeflag == 0)
		return;

	endva = KERNBASE + KERNend;

	if (pseflag) {
		va = KERNBASE + KERNLOAD;
		while (va < endva) {
			pdir_pde(PTD, va) |= pgeflag;
			invltlb();	/* Flush non-PG_G entries. */
			va += NBPDR;
		}
	} else {
		va = (vm_offset_t)btext;
		while (va < endva) {
			pte = vtopte(va);
			if (*pte)
				*pte |= pgeflag;
			invltlb();	/* Flush non-PG_G entries. */
			va += PAGE_SIZE;
		}
	}
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

#if defined(PAE) || defined(PAE_TABLES)
static void *
pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif

/*
 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
 * Requirements:
 *  - Must deal with pages in order to ensure that none of the PG_* bits
 *    are ever set, PG_V in particular.
 *  - Assumes we can write to ptes without pte_store() atomic ops, even
 *    on PAE systems.  This should be ok.
 *  - Assumes nothing will ever test these addresses for 0 to indicate
 *    no mapping instead of correctly checking PG_V.
 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
 * Because PG_V is never set, there can be no mappings to invalidate.
 */
static vm_offset_t
pmap_ptelist_alloc(vm_offset_t *head)
{
	pt_entry_t *pte;
	vm_offset_t va;

	va = *head;
	if (va == 0)
		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
	pte = vtopte(va);
	*head = *pte;
	if (*head & PG_V)
		panic("pmap_ptelist_alloc: va with PG_V set!");
	*pte = 0;
	return (va);
}

static void
pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
{
	pt_entry_t *pte;

	if (va & PG_V)
		panic("pmap_ptelist_free: freeing va with PG_V set!");
	pte = vtopte(va);
	*pte = *head;		/* virtual! PG_V is 0 though */
	*head = va;
}

static void
pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
{
	int i;
	vm_offset_t va;

	*head = 0;
	for (i = npages - 1; i >= 0; i--) {
		va = (vm_offset_t)base + i * PAGE_SIZE;
		pmap_ptelist_free(head, va);
	}
}
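/*
 * Usage sketch (illustrative only, not part of the original source): the
 * ptelist routines above thread a free list of page-sized KVA blocks
 * through otherwise-unused PTEs.  pmap_init() seeds it with the pv chunk
 * KVA, after which a consumer does roughly
 *
 *	vm_offset_t va;
 *
 *	va = pmap_ptelist_alloc(&pv_vafree);	(pop one page of KVA)
 *	(... map a page there, e.g. with pmap_qenter(), and use it ...)
 *	pmap_ptelist_free(&pv_vafree, va);	(push it back)
 */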
/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t mpte;
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < NKPT; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = i + KPTDI;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_max = roundup(pv_entry_max, _NPCPV);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings supported and enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pseflag == 0)
		pg_ps_enabled = 0;
	else if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Calculate the size of the pv head table for superpages.
	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
	 */
	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
	    PAGE_SIZE) / NBPDR + 1;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
	if (pv_chunkbase == NULL)
		panic("pmap_init: not enough kvm for pv chunks");
	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
#if defined(PAE) || defined(PAE_TABLES)
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif

	pmap_initialized = 1;
	if (!bootverbose)
		return;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
	}
}

SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
    "Max number of PV entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
    "Page share factor per proc");
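/*
 * Note (illustrative, values are examples only): the read-only sysctls
 * above report limits derived from boot-time tunables that pmap_init()
 * fetches with TUNABLE_INT_FETCH(); they can be overridden from
 * loader.conf, e.g.
 *
 *	vm.pmap.shpgperproc="400"
 *	vm.pmap.pv_entries="4000000"
 */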
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2/4MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2/4MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2/4MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2/4MB page promotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	/* The PAT bit is different for PTEs and PDEs. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	/* Map the caching mode to a PAT index. */
	pat_idx = pat_index[mode];

	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
	cache_bits = 0;
	if (pat_idx & 0x4)
		cache_bits |= pat_flag;
	if (pat_idx & 0x2)
		cache_bits |= PG_NC_PCD;
	if (pat_idx & 0x1)
		cache_bits |= PG_NC_PWT;
	return (cache_bits);
}

/*
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
	pd_entry_t *pde;
	pmap_t pmap;
	boolean_t PTD_updated;

	PTD_updated = FALSE;
	mtx_lock_spin(&allpmaps_lock);
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
		    PG_FRAME))
			PTD_updated = TRUE;
		pde = pmap_pde(pmap, va);
		pde_store(pde, newpde);
	}
	mtx_unlock_spin(&allpmaps_lock);
	KASSERT(PTD_updated,
	    ("pmap_kenter_pde: current page table is not in allpmaps"));
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
	u_long cr4;

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);
		/*
		 * Although preemption at this point could be detrimental to
		 * performance, it would not lead to an error.  PG_G is simply
		 * ignored if CR4.PGE is clear.  Moreover, in case this block
		 * is re-entered, the load_cr4() either above or below will
		 * modify CR4.PGE flushing the TLB.
		 */
		load_cr4(cr4 | CR4_PGE);
	}
}

void
invltlb_glob(void)
{
	uint64_t cr4;

	if (pgeflag == 0) {
		invltlb();
	} else {
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);
		load_cr4(cr4 | CR4_PGE);
	}
}

#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t *mask, other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invlpg(va);
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invlpg(va);
		CPU_AND(&other_cpus, &pmap->pm_active);
		mask = &other_cpus;
	}
	smp_masked_invlpg(*mask, va);
	sched_unpin();
}

/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t *mask, other_cpus;
	vm_offset_t addr;
	u_int cpuid;

	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
		pmap_invalidate_all(pmap);
		return;
	}

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		CPU_AND(&other_cpus, &pmap->pm_active);
		mask = &other_cpus;
	}
	smp_masked_invlpg_range(*mask, sva, eva);
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t *mask, other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap) {
		invltlb_glob();
		mask = &all_cpus;
	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invltlb();
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invltlb();
		CPU_AND(&other_cpus, &pmap->pm_active);
		mask = &other_cpus;
	}
	smp_masked_invltlb(*mask, pmap);
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}
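/*
 * Usage pattern (illustrative only, not part of the original source): the
 * invalidation routines above are paired with page table updates throughout
 * this file; per the N.B. above, the store must be globally performed
 * before the shootdown is issued, e.g.
 *
 *	pte_store(pte, newpte);
 *	pmap_invalidate_page(pmap, va);
 *
 * or, for a range of mappings that was just torn down,
 *
 *	pmap_invalidate_range(kernel_pmap, sva, eva);
 */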
struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

static void
pmap_update_pde_kernel(void *arg)
{
	struct pde_action *act = arg;
	pd_entry_t *pde;
	pmap_t pmap;

	if (act->store == PCPU_GET(cpuid)) {

		/*
		 * Elsewhere, this operation requires allpmaps_lock for
		 * synchronization.  Here, it does not because it is being
		 * performed in the context of an all_cpus rendezvous.
		 */
		LIST_FOREACH(pmap, &allpmaps, pm_list) {
			pde = pmap_pde(pmap, act->va);
			pde_store(pde, act->newpde);
		}
	}
}

static void
pmap_update_pde_user(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap)
		active = all_cpus;
	else
		active = pmap->pm_active;
	if (CPU_OVERLAP(&active, &other_cpus)) {
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendezvous_barrier, pmap == kernel_pmap ?
		    pmap_update_pde_kernel : pmap_update_pde_user,
		    pmap_update_pde_teardown, &act);
	} else {
		if (pmap == kernel_pmap)
			pmap_kenter_pde(va, newpde);
		else
			pde_store(pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap)
		invltlb_glob();
	else if (!CPU_EMPTY(&pmap->pm_active))
		invltlb();
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	if (pmap == kernel_pmap)
		pmap_kenter_pde(va, newpde);
	else
		pde_store(pde, newpde);
	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		pmap_update_pde_invalidate(va, newpde);
}
#endif /* !SMP */

static void
pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
{

	/*
	 * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was
	 * created by a promotion that did not invalidate the 512 or 1024 4KB
	 * page mappings that might exist in the TLB.  Consequently, at this
	 * point, the TLB may hold both 4KB and 2- or 4MB page mappings for
	 * the address range [va, va + NBPDR).  Therefore, the entire range
	 * must be invalidated here.  In contrast, when PG_PROMOTED is clear,
	 * the TLB will not hold any 4KB page mappings for the address range
	 * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the
	 * 2- or 4MB page mapping from the TLB.
	 */
	if ((pde & PG_PROMOTED) != 0)
		pmap_invalidate_range(pmap, va, va + NBPDR - 1);
	else
		pmap_invalidate_page(pmap, va);
}

#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
{

	if (force) {
		sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
	} else {
		KASSERT((sva & PAGE_MASK) == 0,
		    ("pmap_invalidate_cache_range: sva not page-aligned"));
		KASSERT((eva & PAGE_MASK) == 0,
		    ("pmap_invalidate_cache_range: eva not page-aligned"));
	}

	if ((cpu_feature & CPUID_SS) != 0 && !force)
		; /* If "Self Snoop" is supported and allowed, do nothing. */
	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
#ifdef DEV_APIC
		/*
		 * XXX: Some CPUs fault, hang, or trash the local APIC
		 * registers if we use CLFLUSH on the local APIC
		 * range.  The local APIC is always uncached, so we
		 * don't need to flush for that range anyway.
		 */
		if (pmap_kextract(sva) == lapic_paddr)
			return;
#endif
		/*
		 * Otherwise, do per-cache line flush.  Use the sfence
		 * instruction to ensure that previous stores are
		 * included in the write-back.  The processor
		 * propagates flush to other processors in the cache
		 * coherence domain.
		 */
		sfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflushopt(sva);
		sfence();
	} else if ((cpu_feature & CPUID_CLFSH) != 0 &&
	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
#ifdef DEV_APIC
		if (pmap_kextract(sva) == lapic_paddr)
			return;
#endif
		/*
		 * Writes are ordered by CLFLUSH on Intel CPUs.
		 */
		if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
	} else {

		/*
		 * No targeted cache flush methods are supported by CPU,
		 * or the supplied range is bigger than 2MB.
		 * Globally invalidate cache.
		 */
		pmap_invalidate_cache();
	}
}

void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
	int i;

	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
	    (cpu_feature & CPUID_CLFSH) == 0) {
		pmap_invalidate_cache();
	} else {
		for (i = 0; i < count; i++)
			pmap_flush_page(pages[i]);
	}
}

/*
 * Are we current address space or kernel?
 */
static __inline int
pmap_is_current(pmap_t pmap)
{

	return (pmap == kernel_pmap || pmap ==
	    vmspace_pmap(curthread->td_proc->p_vmspace));
}

/*
 * If the given pmap is not the current or kernel pmap, the returned pte must
 * be released by passing it to pmap_pte_release().
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_lock(&PMAP2mutex);
		newpf = *pde & PG_FRAME;
		if ((*PMAP2 & PG_FRAME) != newpf) {
			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
		}
		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (NULL);
}

/*
 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 * being NULL.
 */
static __inline void
pmap_pte_release(pt_entry_t *pte)
{

	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
		mtx_unlock(&PMAP2mutex);
}

/*
 * NB: The sequence of updating a page table followed by accesses to the
 * corresponding pages is subject to the situation described in the "AMD64
 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
 * right after modifying the PTE bits is crucial.
 */
static __inline void
invlcaddr(void *caddr)
{

	invlpg((u_int)caddr);
}

/*
 * Super fast pmap_pte routine best used when scanning
 * the pv lists.  This eliminates many coarse-grained
 * invltlb calls.  Note that many of the pv list
 * scans are across different pmaps.  It is very wasteful
 * to do an entire invltlb for checking a single mapping.
 *
 * If the given pmap is not the current pmap, pvh_global_lock
 * must be held and curthread pinned to a CPU.
 */
static pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		rw_assert(&pvh_global_lock, RA_WLOCKED);
		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
		newpf = *pde & PG_FRAME;
		if ((*PMAP1 & PG_FRAME) != newpf) {
			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR1);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde;

	rtval = 0;
	PMAP_LOCK(pmap);
	pde = pmap->pm_pdir[va >> PDRSHIFT];
	if (pde != 0) {
		if ((pde & PG_PS) != 0)
			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
		else {
			pte = pmap_pte(pmap, va);
			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
			pmap_pte_release(pte);
		}
	}
	PMAP_UNLOCK(pmap);
	return (rtval);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde;
	pt_entry_t pte, *ptep;
	vm_page_t m;
	vm_paddr_t pa;

	pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	pde = *pmap_pde(pmap, va);
	if (pde != 0) {
		if (pde & PG_PS) {
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
				if (vm_page_pa_tryrelock(pmap, (pde &
				    PG_PS_FRAME) | (va & PDRMASK), &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
				    (va & PDRMASK));
				vm_page_hold(m);
			}
		} else {
			ptep = pmap_pte(pmap, va);
			pte = *ptep;
			pmap_pte_release(ptep);
			if (pte != 0 &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
				    &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
				vm_page_hold(m);
			}
		}
	}
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}
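/*
 * Example (illustrative only, not part of the original source): a
 * straightforward VA-to-PA translation using pmap_extract() above,
 *
 *	vm_paddr_t pa = pmap_extract(kernel_pmap, va);
 *
 * yields 0 when no mapping exists; otherwise it returns the physical
 * address, including the offset within the 4KB page or 2/4MB superpage.
 */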
/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
}

static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}

/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	The value passed in '*virt' is a suggested virtual address for
 *	the mapping.  Architectures which can support a direct-mapped
 *	physical to virtual region can return the appropriate address
 *	within that region, leaving '*virt' unchanged.  Other
 *	architectures should map the pages starting at '*virt' and
 *	update '*virt' with the first usable address after the mapped
 *	region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va, sva;
	vm_paddr_t superpage_offset;
	pd_entry_t newpde;

	va = *virt;
	/*
	 * Does the physical address range's size and alignment permit at
	 * least one superpage mapping to be created?
	 */
	superpage_offset = start & PDRMASK;
	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
		/*
		 * Increase the starting virtual address so that its alignment
		 * does not preclude the use of superpage mappings.
		 */
		if ((va & PDRMASK) < superpage_offset)
			va = (va & ~PDRMASK) + superpage_offset;
		else if ((va & PDRMASK) > superpage_offset)
			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
	}
	sva = va;
	while (start < end) {
		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
		    pseflag) {
			KASSERT((va & PDRMASK) == 0,
			    ("pmap_map: misaligned va %#x", va));
			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
			pmap_kenter_pde(va, newpde);
			va += NBPDR;
			start += NBPDR;
		} else {
			pmap_kenter(va, start);
			va += PAGE_SIZE;
			start += PAGE_SIZE;
		}
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
	*virt = va;
	return (sva);
}
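/*
 * Usage sketch (illustrative only, not part of the original source):
 * pmap_kenter()/pmap_kremove() above install and remove a single wired
 * kernel mapping without any TLB shootdown, so the caller owns
 * invalidation, e.g.
 *
 *	pmap_kenter(va, VM_PAGE_TO_PHYS(m));
 *	(... use the mapping at "va" ...)
 *	pmap_kremove(va);
 *	pmap_invalidate_page(kernel_pmap, va);
 */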
/*
 * Add a list of wired pages to the kva;
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *endpte, oldpte, pa, *pte;
	vm_page_t m;

	oldpte = 0;
	pte = vtopte(sva);
	endpte = pte + count;
	while (pte < endpte) {
		m = *ma++;
		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
			oldpte |= *pte;
			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
		}
		pte++;
	}
	if (__predict_false((oldpte & PG_V) != 0))
		pmap_invalidate_range(kernel_pmap, sva, sva + count *
		    PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
static __inline void
pmap_free_zero_pages(struct spglist *free)
{
	vm_page_t m;
	int count;

	for (count = 0; (m = SLIST_FIRST(free)) != NULL; count++) {
		SLIST_REMOVE_HEAD(free, plinks.s.ss);
		/* Preserve the page's PG_ZERO setting. */
		vm_page_free_toq(m);
	}
	atomic_subtract_int(&vm_cnt.v_wire_count, count);
}

/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_insert(&pmap->pm_root, mpte));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
}
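/*
 * Usage sketch (illustrative only, not part of the original source): the
 * delayed-free machinery above is driven by the remove paths later in this
 * file, roughly as
 *
 *	struct spglist free;
 *
 *	SLIST_INIT(&free);
 *	(... pmap_unuse_pt() adds now-idle page table pages to "free" ...)
 *	pmap_invalidate_page(pmap, va);		(TLB shootdown first)
 *	pmap_free_zero_pages(&free);		(then release the pages)
 */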
1777 */ 1778static inline boolean_t 1779pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1780{ 1781 1782 --m->wire_count; 1783 if (m->wire_count == 0) { 1784 _pmap_unwire_ptp(pmap, m, free); 1785 return (TRUE); 1786 } else 1787 return (FALSE); 1788} 1789 1790static void 1791_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1792{ 1793 vm_offset_t pteva; 1794 1795 /* 1796 * unmap the page table page 1797 */ 1798 pmap->pm_pdir[m->pindex] = 0; 1799 --pmap->pm_stats.resident_count; 1800 1801 /* 1802 * Do an invltlb to make the invalidated mapping 1803 * take effect immediately. 1804 */ 1805 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1806 pmap_invalidate_page(pmap, pteva); 1807 1808 /* 1809 * Put page on a list so that it is released after 1810 * *ALL* TLB shootdown is done 1811 */ 1812 pmap_add_delayed_free_list(m, free, TRUE); 1813} 1814 1815/* 1816 * After removing a page table entry, this routine is used to 1817 * conditionally free the page, and manage the hold/wire counts. 1818 */ 1819static int 1820pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) 1821{ 1822 pd_entry_t ptepde; 1823 vm_page_t mpte; 1824 1825 if (va >= VM_MAXUSER_ADDRESS) 1826 return (0); 1827 ptepde = *pmap_pde(pmap, va); 1828 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1829 return (pmap_unwire_ptp(pmap, mpte, free)); 1830} 1831 1832/* 1833 * Initialize the pmap for the swapper process. 1834 */ 1835void 1836pmap_pinit0(pmap_t pmap) 1837{ 1838 1839 PMAP_LOCK_INIT(pmap); 1840 /* 1841 * Since the page table directory is shared with the kernel pmap, 1842 * which is already included in the list "allpmaps", this pmap does 1843 * not need to be inserted into that list. 1844 */ 1845 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1846#if defined(PAE) || defined(PAE_TABLES) 1847 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1848#endif 1849 pmap->pm_root.rt_root = 0; 1850 CPU_ZERO(&pmap->pm_active); 1851 PCPU_SET(curpmap, pmap); 1852 TAILQ_INIT(&pmap->pm_pvchunk); 1853 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1854} 1855 1856/* 1857 * Initialize a preallocated and zeroed pmap structure, 1858 * such as one in a vmspace structure. 1859 */ 1860int 1861pmap_pinit(pmap_t pmap) 1862{ 1863 vm_page_t m, ptdpg[NPGPTD]; 1864 vm_paddr_t pa; 1865 int i; 1866 1867 /* 1868 * No need to allocate page table space yet but we do need a valid 1869 * page directory table. 
/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
int
pmap_pinit(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	vm_paddr_t pa;
	int i;

	/*
	 * No need to allocate page table space yet, but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pdir == NULL) {
		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
		if (pmap->pm_pdir == NULL)
			return (0);
#if defined(PAE) || defined(PAE_TABLES)
		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
		KASSERT(((vm_offset_t)pmap->pm_pdpt &
		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
		    ("pmap_pinit: pdpt misaligned"));
		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL << 30),
		    ("pmap_pinit: pdpt above 4g"));
#endif
		pmap->pm_root.rt_root = 0;
	}
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_pinit: pmap has reserved page table page(s)"));

	/*
	 * Allocate the page directory page(s).
	 */
	for (i = 0; i < NPGPTD;) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
		if (m == NULL)
			VM_WAIT;
		else {
			ptdpg[i++] = m;
		}
	}

	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);

	for (i = 0; i < NPGPTD; i++)
		if ((ptdpg[i]->flags & PG_ZERO) == 0)
			pagezero(pmap->pm_pdir + (i * NPDEPG));

	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	/* Copy the kernel page table directory entries. */
	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
	mtx_unlock_spin(&allpmaps_lock);

	/* Install the self-referential address mapping entry(s). */
	for (i = 0; i < NPGPTD; i++) {
		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
#if defined(PAE) || defined(PAE_TABLES)
		pmap->pm_pdpt[i] = pa | PG_V;
#endif
	}

	CPU_ZERO(&pmap->pm_active);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

	return (1);
}

/*
 * This routine is called if the page table page is not
 * mapped correctly.
 */
static vm_page_t
_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
{
	vm_paddr_t ptepa;
	vm_page_t m;

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
			PMAP_UNLOCK(pmap);
			rw_wunlock(&pvh_global_lock);
			VM_WAIT;
			rw_wlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the page table page into the process address space, if
	 * it isn't already there.
	 */
	pmap->pm_stats.resident_count++;

	ptepa = VM_PAGE_TO_PHYS(m);
	pmap->pm_pdir[ptepindex] =
	    (pd_entry_t)(ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);

	return (m);
}
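/*
 * Editor's note -- an illustrative sketch, not part of the original
 * source: the unlock/relock sequence above encodes the lock order used
 * throughout this file, i.e. the pvh global lock is always acquired
 * before a pmap lock:
 *
 *	rw_wlock(&pvh_global_lock);
 *	PMAP_LOCK(pmap);
 *	...
 *	PMAP_UNLOCK(pmap);
 *	rw_wunlock(&pvh_global_lock);
 */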
static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
{
	u_int ptepindex;
	pd_entry_t ptepa;
	vm_page_t m;

	/*
	 * Calculate the page table page index.
	 */
	ptepindex = va >> PDRSHIFT;
retry:
	/*
	 * Get the page directory entry.
	 */
	ptepa = pmap->pm_pdir[ptepindex];

	/*
	 * This supports switching from a 4MB page to a
	 * normal 4K page.
	 */
	if (ptepa & PG_PS) {
		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
		ptepa = pmap->pm_pdir[ptepindex];
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (ptepa) {
		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has
		 * been deallocated.
		 */
		m = _pmap_allocpte(pmap, ptepindex, flags);
		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
			goto retry;
	}
	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	int i;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_release: pmap has reserved page table page(s)"));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("releasing active pmap %p", pmap));

	mtx_lock_spin(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	for (i = 0; i < NPGPTD; i++)
		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
		    PG_FRAME);

	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
	    sizeof(*pmap->pm_pdir));

	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);

	for (i = 0; i < NPGPTD; i++) {
		m = ptdpg[i];
#if defined(PAE) || defined(PAE_TABLES)
		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
		    ("pmap_release: got wrong ptd page"));
#endif
		m->wire_count--;
		vm_page_free_zero(m);
	}
	atomic_subtract_int(&vm_cnt.v_wire_count, NPGPTD);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;

	return (sysctl_handle_long(oidp, &ksize, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD,
    0, 0, kvm_size, "IU", "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return (sysctl_handle_long(oidp, &kfree, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD,
    0, 0, kvm_free, "IU", "Amount of KVM free");
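/*
 * Editor's note -- an illustrative usage example, not part of the
 * original source: both handlers above are read-only and can be
 * queried from userland, e.g.
 *
 *	$ sysctl vm.kvm_size vm.kvm_free
 *
 * (The reported values depend on the kernel configuration; any sample
 * output would be hypothetical.)
 */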
/*
 * Grow the number of kernel page table entries, if needed.
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t ptppaddr;
	vm_page_t nkpg;
	pd_entry_t newpdir;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	addr = roundup2(addr, NBPDR);
	if (addr - 1 >= kernel_map->max_offset)
		addr = kernel_map->max_offset;
	while (kernel_vm_end < addr) {
		if (pdir_pde(PTD, kernel_vm_end)) {
			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
				kernel_vm_end = kernel_map->max_offset;
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");

		nkpt++;

		if ((nkpg->flags & PG_ZERO) == 0)
			pmap_zero_page(nkpg);
		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
		newpdir = (pd_entry_t)(ptppaddr | PG_V | PG_RW | PG_A | PG_M);
		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;

		pmap_kenter_pde(kernel_vm_end, newpdir);
		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
			kernel_vm_end = kernel_map->max_offset;
			break;
		}
	}
}

/***************************************************
 * Page management routines.
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 11);
CTASSERT(_NPCPV == 336);

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define	PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */

static const uint32_t pc_freemask[_NPCM] = {
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE10
};

SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
    "Current number of pv entries");

#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
    "Current number of pv entry chunk frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
    "Number of times a chunk page allocation was attempted but failed");

static long pv_entry_frees, pv_entry_allocs;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
    "Current number of pv entry allocs");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif
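/*
 * Editor's note -- an illustrative worked example, not part of the
 * original source: a chunk holds _NPCPV == 336 pv entries tracked by
 * _NPCM == 11 32-bit bitmap words (10 * 32 + 16 = 336), which is why
 * the last word's free mask is only 0x0000ffff.  Entry number idx
 * lives at word idx / 32, bit idx % 32; for example, pv entry 100
 * maps to pc_map[3], bit 4.
 */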
/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 */
static vm_page_t
pmap_pv_reclaim(pmap_t locked_pmap)
{
	struct pch newtail;
	struct pv_chunk *pc;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint32_t inuse;
	int bit, field, freed;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	TAILQ_INIT(&newtail);
	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
	    SLIST_EMPTY(&free))) {
		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
		if (pmap != pc->pc_pmap) {
			if (pmap != NULL) {
				pmap_invalidate_all(pmap);
				if (pmap != locked_pmap)
					PMAP_UNLOCK(pmap);
			}
			pmap = pc->pc_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap)
				PMAP_LOCK(pmap);
			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
				pmap = NULL;
				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
				continue;
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = bsfl(inuse);
				pv = &pc->pc_pventry[field * 32 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va);
				if ((*pde & PG_PS) != 0)
					continue;
				pte = pmap_pte(pmap, va);
				tpte = *pte;
				if ((tpte & PG_W) == 0)
					tpte = pte_load_clear(pte);
				pmap_pte_release(pte);
				if ((tpte & PG_W) != 0)
					continue;
				KASSERT(tpte != 0,
				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
				    pmap, va));
				if ((tpte & PG_G) != 0)
					pmap_invalidate_page(pmap, va);
				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
					vm_page_dirty(m);
				if ((tpte & PG_A) != 0)
					vm_page_aflag_set(m, PGA_REFERENCED);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, &free);
				freed++;
			}
		}
		if (freed == 0) {
			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
			continue;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap->pm_stats.resident_count -= freed;
		PV_STAT(pv_entry_frees += freed);
		PV_STAT(pv_entry_spare += freed);
		pv_entry_count -= freed;
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		for (field = 0; field < _NPCM; field++)
			if (pc->pc_map[field] != pc_freemask[field]) {
				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
				    pc_list);
				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);

				/*
				 * One freed pv entry in locked_pmap is
				 * sufficient.
				 */
				if (pmap == locked_pmap)
					goto out;
				break;
			}
		if (field == _NPCM) {
			PV_STAT(pv_entry_spare -= _NPCPV);
			PV_STAT(pc_chunk_count--);
			PV_STAT(pc_chunk_frees++);
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
			pmap_qremove((vm_offset_t)pc, 1);
			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
			break;
		}
	}
out:
	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
	if (pmap != NULL) {
		pmap_invalidate_all(pmap);
		if (pmap != locked_pmap)
			PMAP_UNLOCK(pmap);
	}
	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->wire_count = 1;
	}
	pmap_free_zero_pages(&free);
	return (m_pc);
}

/*
 * Free the pv_entry back to the free list.
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_frees++);
	PV_STAT(pv_entry_spare++);
	pv_entry_count--;
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 32;
	bit = idx % 32;
	pc->pc_map[field] |= 1ul << bit;
	for (idx = 0; idx < _NPCM; idx++)
		if (pc->pc_map[idx] != pc_freemask[idx]) {
			/*
			 * 98% of the time, pc is already at the head of the
			 * list.  If it isn't already, move it to the head.
			 */
			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
			    pc)) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			return;
		}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	PV_STAT(pv_entry_spare -= _NPCPV);
	PV_STAT(pc_chunk_count--);
	PV_STAT(pc_chunk_frees++);
	/* The entire chunk is free; return it. */
	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
	pmap_qremove((vm_offset_t)pc, 1);
	vm_page_unwire(m, PQ_NONE);
	vm_page_free(m);
	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
}
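/*
 * Editor's note -- an illustrative sketch, not part of the original
 * source: because each pv chunk occupies exactly one page, the owning
 * chunk is recovered from a pv entry's address alone, with no back
 * pointer stored per entry.  For a hypothetical entry inside the
 * chunk page mapped at KVA 0xc1234000:
 *
 *	pc = pv_to_chunk(pv);		(masks the page offset: 0xc1234000)
 *	idx = pv - &pc->pc_pventry[0];
 *	pc->pc_map[idx / 32] |= 1ul << (idx % 32);
 */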
/*
 * Get a new pv_entry, allocating a block from the system
 * when needed.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, boolean_t try)
{
	static const struct timeval printinterval = { 60, 0 };
	static struct timeval lastprint;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_allocs++);
	pv_entry_count++;
	if (pv_entry_count > pv_entry_high_water)
		if (ratecheck(&lastprint, &printinterval))
			printf("Approaching the limit on PV entries, consider "
			    "increasing either the vm.pmap.shpgperproc or the "
			    "vm.pmap.pv_entry_max tunable.\n");
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = bsfl(pc->pc_map[field]);
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 32 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to the tail. */
			for (field = 0; field < _NPCM; field++)
				if (pc->pc_map[field] != 0) {
					PV_STAT(pv_entry_spare--);
					return (pv);	/* not full, return */
				}
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
			PV_STAT(pv_entry_spare--);
			return (pv);
		}
	}
	/*
	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
	 * global lock.  If "pv_vafree" is currently non-empty, it will
	 * remain non-empty until pmap_ptelist_alloc() completes.
	 */
	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
		if (try) {
			pv_entry_count--;
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = pmap_pv_reclaim(pmap);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(pc_chunk_count++);
	PV_STAT(pc_chunk_allocs++);
	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
	pmap_qenter((vm_offset_t)pc, &m, 1);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
	for (field = 1; field < _NPCM; field++)
		pc->pc_map[field] = pc_freemask[field];
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(pv_entry_spare += _NPCPV - 1);
	return (pv);
}

static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			break;
		}
	}
	return (pv);
}

static void
pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));

	/*
	 * Transfer the 4mpage's pv entry for this mapping to the first
	 * page's pv list.
	 */
	pvh = pa_to_pvh(pa);
	va = trunc_4mpage(va);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	/* Instantiate the remaining NPTEPG - 1 pv entries. */
	va_last = va + NBPDR - PAGE_SIZE;
	do {
		m++;
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("pmap_pv_demote_pde: page %p is not managed", m));
		va += PAGE_SIZE;
		pmap_insert_entry(pmap, va, m);
	} while (va < va_last);
}

#if VM_NRESERVLEVEL > 0
static void
pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));

	/*
	 * Transfer the first page's pv entry for this mapping to the
	 * 4mpage's pv list.  Aside from avoiding the cost of a call
	 * to get_pv_entry(), a transfer avoids the possibility that
	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
	 * removes one of the mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = trunc_4mpage(va);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	/* Free the remaining NPTEPG - 1 pv entries. */
	va_last = va + NBPDR - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
	} while (va < va_last);
}
#endif /* VM_NRESERVLEVEL > 0 */

static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

static void
pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
{
	struct md_page *pvh;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	pmap_pvh_free(&m->md, pmap, va);
	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		if (TAILQ_EMPTY(&pvh->pv_list))
			vm_page_aflag_clear(m, PGA_WRITEABLE);
	}
}

/*
 * Create a pv entry for the page at pa for
 * (pmap, va).
 */
static void
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pv = get_pv_entry(pmap, FALSE);
	pv->pv_va = va;
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
}

/*
 * Conditionally create a pv entry.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pv_entry_count < pv_entry_high_water &&
	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		return (TRUE);
	} else
		return (FALSE);
}
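/*
 * Editor's note -- an illustrative worked example, not part of the
 * original source: demoting one 4MB mapping turns a single pv entry
 * into NPTEPG == 1024 per-page entries (512 with PAE).  On a non-PAE
 * kernel, pmap_pv_demote_pde() therefore reuses the superpage's pv
 * entry for the first 4KB page and calls pmap_insert_entry() for the
 * remaining 1023 pages covering [va + PAGE_SIZE, va + NBPDR).
 */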
/*
 * Create the pv entries for each of the pages within a superpage.
 */
static boolean_t
pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	if (pv_entry_count < pv_entry_high_water &&
	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
		pv->pv_va = va;
		pvh = pa_to_pvh(pa);
		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Fills a page table page with mappings to consecutive physical pages.
 */
static void
pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
{
	pt_entry_t *pte;

	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
		*pte = newpte;
		newpte += PAGE_SIZE;
	}
}

/*
 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
 * 2- or 4MB page mapping is invalidated.
 */
static boolean_t
pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde, oldpde;
	pt_entry_t *firstpte, newpte;
	vm_paddr_t mptepa;
	vm_page_t mpte;
	struct spglist free;
	vm_offset_t sva;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpde = *pde;
	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
	    NULL) {
		KASSERT((oldpde & PG_W) == 0,
		    ("pmap_demote_pde: page table page for a wired mapping"
		    " is missing"));

		/*
		 * Invalidate the 2- or 4MB page mapping and return
		 * "failure" if the mapping was never accessed or the
		 * allocation of the new page table page fails.
		 */
		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
		    VM_ALLOC_WIRED)) == NULL) {
			SLIST_INIT(&free);
			sva = trunc_4mpage(va);
			pmap_remove_pde(pmap, pde, sva, &free);
			if ((oldpde & PG_G) == 0)
				pmap_invalidate_pde_page(pmap, sva, oldpde);
			pmap_free_zero_pages(&free);
			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
			    " in pmap %p", va, pmap);
			return (FALSE);
		}
		if (va < VM_MAXUSER_ADDRESS)
			pmap->pm_stats.resident_count++;
	}
	mptepa = VM_PAGE_TO_PHYS(mpte);

	/*
	 * If the page mapping is in the kernel's address space, then the
	 * KPTmap can provide access to the page table page.  Otherwise,
	 * temporarily map the page table page (mpte) into the kernel's
	 * address space at either PADDR1 or PADDR2.
	 */
	if (va >= KERNBASE)
		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
		if ((*PMAP1 & PG_FRAME) != mptepa) {
			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR1);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		firstpte = PADDR1;
	} else {
		mtx_lock(&PMAP2mutex);
		if ((*PMAP2 & PG_FRAME) != mptepa) {
			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
		}
		firstpte = PADDR2;
	}
	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
	KASSERT((oldpde & PG_A) != 0,
	    ("pmap_demote_pde: oldpde is missing PG_A"));
	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pde: oldpde is missing PG_M"));
	newpte = oldpde & ~PG_PS;
	if ((newpte & PG_PDE_PAT) != 0)
		newpte ^= PG_PDE_PAT | PG_PTE_PAT;

	/*
	 * If the page table page is new, initialize it.
	 */
	if (mpte->wire_count == 1) {
		mpte->wire_count = NPTEPG;
		pmap_fill_ptp(firstpte, newpte);
	}
	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
	    ("pmap_demote_pde: firstpte and newpte map different physical"
	    " addresses"));

	/*
	 * If the mapping has changed attributes, update the page table
	 * entries.
	 */
	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
		pmap_fill_ptp(firstpte, newpte);

	/*
	 * Demote the mapping.  This pmap is locked.  The old PDE has
	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
	 * set.  Thus, there is no danger of a race with another
	 * processor changing the setting of PG_A and/or PG_M between
	 * the read above and the store below.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, newpde);
	else if (pmap == kernel_pmap)
		pmap_kenter_pde(va, newpde);
	else
		pde_store(pde, newpde);
	if (firstpte == PADDR2)
		mtx_unlock(&PMAP2mutex);

	/*
	 * Invalidate the recursive mapping of the page table page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));

	/*
	 * Demote the pv entry.  This depends on the earlier demotion
	 * of the mapping.  Specifically, the (re)creation of a per-
	 * page pv entry might trigger the execution of pmap_collect(),
	 * which might reclaim a newly (re)created per-page pv entry
	 * and destroy the associated mapping.  In order to destroy
	 * the mapping, the PDE must have already changed from mapping
	 * the 2mpage to referencing the page table page.
	 */
	if ((oldpde & PG_MANAGED) != 0)
		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);

	pmap_pde_demotions++;
	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
	    " in pmap %p", va, pmap);
	return (TRUE);
}
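/*
 * Editor's note -- an illustrative worked example, not part of the
 * original source: the PAT selector occupies a different bit in each
 * paging level (bit 12, PG_PDE_PAT, in a 2/4MB PDE versus bit 7,
 * PG_PTE_PAT, in a 4KB PTE).  When PG_PDE_PAT is set, the statement
 *
 *	newpte ^= PG_PDE_PAT | PG_PTE_PAT;
 *
 * clears the PDE-position bit and sets the PTE-position bit in one
 * XOR, preserving the selected memory attribute across the demotion.
 */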
/*
 * Removes a 2- or 4MB page mapping from the kernel pmap.
 */
static void
pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde;
	vm_paddr_t mptepa;
	vm_page_t mpte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mpte = pmap_remove_pt_page(pmap, va);
	if (mpte == NULL)
		panic("pmap_remove_kernel_pde: Missing pt page.");

	mptepa = VM_PAGE_TO_PHYS(mpte);
	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;

	/*
	 * Initialize the page table page.
	 */
	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);

	/*
	 * Remove the mapping.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, newpde);
	else
		pmap_kenter_pde(va, newpde);

	/*
	 * Invalidate the recursive mapping of the page table page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
}

/*
 * pmap_remove_pde: do the things to unmap a superpage in a process
 */
static void
pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free)
{
	struct md_page *pvh;
	pd_entry_t oldpde;
	vm_offset_t eva, va;
	vm_page_t m, mpte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_remove_pde: sva is not 4mpage aligned"));
	oldpde = pte_load_clear(pdq);
	if (oldpde & PG_W)
		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;

	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
	 */
	if ((oldpde & PG_G) != 0)
		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);

	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
	if (oldpde & PG_MANAGED) {
		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
		pmap_pvh_free(pvh, pmap, sva);
		eva = sva + NBPDR;
		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
		    va < eva; va += PAGE_SIZE, m++) {
			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
				vm_page_dirty(m);
			if (oldpde & PG_A)
				vm_page_aflag_set(m, PGA_REFERENCED);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
		}
	}
	if (pmap == kernel_pmap) {
		pmap_remove_kernel_pde(pmap, pdq, sva);
	} else {
		mpte = pmap_remove_pt_page(pmap, sva);
		if (mpte != NULL) {
			pmap->pm_stats.resident_count--;
			KASSERT(mpte->wire_count == NPTEPG,
			    ("pmap_remove_pde: pte page wire count error"));
			mpte->wire_count = 0;
			pmap_add_delayed_free_list(mpte, free, FALSE);
		}
	}
}

/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
    struct spglist *free)
{
	pt_entry_t oldpte;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpte = pte_load_clear(ptq);
	KASSERT(oldpte != 0,
	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;
	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
	 */
	if (oldpte & PG_G)
		pmap_invalidate_page(kernel_pmap, va);
	pmap->pm_stats.resident_count -= 1;
	if (oldpte & PG_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		if (oldpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);
		pmap_remove_entry(pmap, m, va);
	}
	return (pmap_unuse_pt(pmap, va, free));
}

/*
 * Remove a single page from a process address space.
 */
static void
pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
{
	pt_entry_t *pte;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
		return;
	pmap_remove_pte(pmap, pte, va, free);
	pmap_invalidate_page(pmap, va);
}

/*
 * Remove the given range of addresses from the specified map.
 *
 * It is assumed that the start and end are properly
 * rounded to the page size.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	struct spglist free;
	int anyvalid;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	anyvalid = 0;
	SLIST_INIT(&free);

	rw_wlock(&pvh_global_lock);
	sched_pin();
	PMAP_LOCK(pmap);

	/*
	 * Special handling of removing one page.  This is a very
	 * common operation and it is easy to short-circuit some code.
	 */
	if ((sva + PAGE_SIZE == eva) &&
	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
		pmap_remove_page(pmap, sva, &free);
		goto out;
	}

	for (; sva < eva; sva = pdnxt) {
		u_int pdirindex;

		/*
		 * Calculate the index for the next page table.
		 */
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pdnxt < sva)
			pdnxt = eva;
		if (pmap->pm_stats.resident_count == 0)
			break;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings.  Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			/*
			 * Are we removing the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
				/*
				 * The TLB entry for a PG_G mapping is
				 * invalidated by pmap_remove_pde().
				 */
				if ((ptpaddr & PG_G) == 0)
					anyvalid = 1;
				pmap_remove_pde(pmap,
				    &pmap->pm_pdir[pdirindex], sva, &free);
				continue;
			} else if (!pmap_demote_pde(pmap,
			    &pmap->pm_pdir[pdirindex], sva)) {
				/* The large page mapping was destroyed. */
				continue;
			}
		}

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (pdnxt > eva)
			pdnxt = eva;

		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			if (*pte == 0)
				continue;

			/*
			 * The TLB entry for a PG_G mapping is invalidated
			 * by pmap_remove_pte().
			 */
			if ((*pte & PG_G) == 0)
				anyvalid = 1;
			if (pmap_remove_pte(pmap, pte, sva, &free))
				break;
		}
	}
out:
	sched_unpin();
	if (anyvalid)
		pmap_invalidate_all(pmap);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(&free);
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 */

void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	pt_entry_t *pte, tpte;
	pd_entry_t *pde;
	vm_offset_t va;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		(void)pmap_demote_pde(pmap, pde, va);
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pmap->pm_stats.resident_count--;
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
		    " a 4mpage in page %p's pv list", m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
		tpte = pte_load_clear(pte);
		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
		    pmap, pv->pv_va));
		if (tpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (tpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, &free);
		pmap_invalidate_page(pmap, pv->pv_va);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	pmap_free_zero_pages(&free);
}

/*
 * pmap_protect_pde: do the things to protect a 4mpage in a process
 */
static boolean_t
pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
{
	pd_entry_t newpde, oldpde;
	vm_offset_t eva, va;
	vm_page_t m;
	boolean_t anychanged;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_protect_pde: sva is not 4mpage aligned"));
	anychanged = FALSE;
retry:
	oldpde = newpde = *pde;
	if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
	    (PG_MANAGED | PG_M | PG_RW)) {
		eva = sva + NBPDR;
		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
		    va < eva; va += PAGE_SIZE, m++)
			vm_page_dirty(m);
	}
	if ((prot & VM_PROT_WRITE) == 0)
		newpde &= ~(PG_RW | PG_M);
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
#endif
	if (newpde != oldpde) {
		/*
		 * As an optimization to future operations on this
		 * PDE, clear PG_PROMOTED.
		 * The impending invalidation will remove any
		 * lingering 4KB page mappings from the TLB.
		 */
		if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED))
			goto retry;
		if ((oldpde & PG_G) != 0)
			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
		else
			anychanged = TRUE;
	}
	return (anychanged);
}

/*
 * Set the physical protection on the
 * specified range of this map as requested.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	boolean_t anychanged, pv_lists_locked;

	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
	if (prot == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
		return;
#else
	if (prot & VM_PROT_WRITE)
		return;
#endif

	if (pmap_is_current(pmap))
		pv_lists_locked = FALSE;
	else {
		pv_lists_locked = TRUE;
resume:
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}
	anychanged = FALSE;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		pt_entry_t obits, pbits;
		u_int pdirindex;

		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pdnxt < sva)
			pdnxt = eva;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings.  Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			/*
			 * Are we protecting the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
				/*
				 * The TLB entry for a PG_G mapping is
				 * invalidated by pmap_protect_pde().
				 */
				if (pmap_protect_pde(pmap,
				    &pmap->pm_pdir[pdirindex], sva, prot))
					anychanged = TRUE;
				continue;
			} else {
				if (!pv_lists_locked) {
					pv_lists_locked = TRUE;
					if (!rw_try_wlock(&pvh_global_lock)) {
						if (anychanged)
							pmap_invalidate_all(
							    pmap);
						PMAP_UNLOCK(pmap);
						goto resume;
					}
					sched_pin();
				}
				if (!pmap_demote_pde(pmap,
				    &pmap->pm_pdir[pdirindex], sva)) {
					/*
					 * The large page mapping was
					 * destroyed.
					 */
					continue;
				}
			}
		}

		if (pdnxt > eva)
			pdnxt = eva;

		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			vm_page_t m;

retry:
			/*
			 * Regardless of whether a pte is 32 or 64 bits in
			 * size, PG_RW, PG_A, and PG_M are among the least
			 * significant 32 bits.
			 */
			obits = pbits = *pte;
			if ((pbits & PG_V) == 0)
				continue;

			if ((prot & VM_PROT_WRITE) == 0) {
				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
				    (PG_MANAGED | PG_M | PG_RW)) {
					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
					vm_page_dirty(m);
				}
				pbits &= ~(PG_RW | PG_M);
			}
#if defined(PAE) || defined(PAE_TABLES)
			if ((prot & VM_PROT_EXECUTE) == 0)
				pbits |= pg_nx;
#endif

			if (pbits != obits) {
#if defined(PAE) || defined(PAE_TABLES)
				if (!atomic_cmpset_64(pte, obits, pbits))
					goto retry;
#else
				if (!atomic_cmpset_int((u_int *)pte, obits,
				    pbits))
					goto retry;
#endif
				if (obits & PG_G)
					pmap_invalidate_page(pmap, sva);
				else
					anychanged = TRUE;
			}
		}
	}
	if (anychanged)
		pmap_invalidate_all(pmap);
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}
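/*
 * Editor's note -- an illustrative sketch, not part of the original
 * source: the snapshot/compare-and-set/retry idiom above is how this
 * file updates a live PTE without losing PG_A or PG_M updates made
 * concurrently by the MMU on another processor:
 *
 *	do {
 *		obits = *pte;			(snapshot)
 *		pbits = obits & ~(PG_RW | PG_M);
 *	} while (!atomic_cmpset_int((u_int *)pte, obits, pbits));
 *
 * (Non-PAE form shown; with PAE the 64-bit atomic_cmpset_64() is used.)
 */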
#if VM_NRESERVLEVEL > 0
/*
 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
 * For promotion to occur, two conditions must be met: (1) the 4KB page
 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
 * mappings must have identical characteristics.
 *
 * Managed (PG_MANAGED) mappings within the kernel address space are not
 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
 * pmap.
 */
static void
pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde;
	pt_entry_t *firstpte, oldpte, pa, *pte;
	vm_offset_t oldpteva;
	vm_page_t mpte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
	 * either invalid, unused, or does not map the first 4KB physical page
	 * within a 2- or 4MB page.
	 */
	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
setpde:
	newpde = *firstpte;
	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
		    " in pmap %p", va, pmap);
		return;
	}
	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
		    " in pmap %p", va, pmap);
		return;
	}
	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
		/*
		 * When PG_M is already clear, PG_RW can be cleared without
		 * a TLB invalidation.
		 */
		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
		    ~PG_RW))
			goto setpde;
		newpde &= ~PG_RW;
	}

	/*
	 * Examine each of the other PTEs in the specified PTP.  Abort if this
	 * PTE maps an unexpected 4KB physical page or does not have identical
	 * characteristics to the first PTE.
	 */
	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
setpte:
		oldpte = *pte;
		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
			pmap_pde_p_failures++;
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
			    " in pmap %p", va, pmap);
			return;
		}
		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
			/*
			 * When PG_M is already clear, PG_RW can be cleared
			 * without a TLB invalidation.
			 */
			if (!atomic_cmpset_int((u_int *)pte, oldpte,
			    oldpte & ~PG_RW))
				goto setpte;
			oldpte &= ~PG_RW;
			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
			    (va & ~PDRMASK);
			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
			    " in pmap %p", oldpteva, pmap);
		}
		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
			pmap_pde_p_failures++;
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
			    " in pmap %p", va, pmap);
			return;
		}
		pa -= PAGE_SIZE;
	}

	/*
	 * Save the page table page in its current state until the PDE
	 * mapping the superpage is demoted by pmap_demote_pde() or
	 * destroyed by pmap_remove_pde().
	 */
	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
	KASSERT(mpte >= vm_page_array &&
	    mpte < &vm_page_array[vm_page_array_size],
	    ("pmap_promote_pde: page table page is out of range"));
	KASSERT(mpte->pindex == va >> PDRSHIFT,
	    ("pmap_promote_pde: page table page's pindex is wrong"));
	if (pmap_insert_pt_page(pmap, mpte)) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP,
		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
		    pmap);
		return;
	}

	/*
	 * Promote the pv entries.
	 */
	if ((newpde & PG_MANAGED) != 0)
		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);

	/*
	 * Propagate the PAT index to its proper position.
	 */
	if ((newpde & PG_PTE_PAT) != 0)
		newpde ^= PG_PDE_PAT | PG_PTE_PAT;

	/*
	 * Map the superpage.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
	else if (pmap == kernel_pmap)
		pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde);
	else
		pde_store(pde, PG_PROMOTED | PG_PS | newpde);

	pmap_pde_promotions++;
	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
	    " in pmap %p", va, pmap);
}
#endif /* VM_NRESERVLEVEL > 0 */

/*
 * Insert the given physical page (p) at
 * the specified virtual address (v) in the
 * target physical map with the protection requested.
 *
 * If specified, the page will be wired down, meaning
 * that the related pte can not be reclaimed.
 *
 * NB:  This is the only routine which MAY NOT lazy-evaluate
 * or lose information.  That is, this routine must actually
 * insert this page into the given map NOW.
 */
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    u_int flags, int8_t psind)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	pt_entry_t newpte, origpte;
	pv_entry_t pv;
	vm_paddr_t opa, pa;
	vm_page_t mpte, om;
	boolean_t invlva, wired;

	va = trunc_page(va);
	mpte = NULL;
	wired = (flags & PMAP_ENTER_WIRED) != 0;

	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
	    va));
	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
		VM_OBJECT_ASSERT_LOCKED(m->object);

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	sched_pin();

	pde = pmap_pde(pmap, va);
	if (va < VM_MAXUSER_ADDRESS) {
		/*
		 * va is for UVA.
		 * In the case that a page table page is not resident,
		 * we are creating it here.  pmap_allocpte() handles
		 * demotion.
		 */
		mpte = pmap_allocpte(pmap, va, flags);
		if (mpte == NULL) {
			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
			    ("pmap_allocpte failed with sleep allowed"));
			sched_unpin();
			rw_wunlock(&pvh_global_lock);
			PMAP_UNLOCK(pmap);
			return (KERN_RESOURCE_SHORTAGE);
		}
	} else {
		/*
		 * va is for KVA, so pmap_demote_pde() will never fail
		 * to install a page table page.  PG_V is also
		 * asserted by pmap_demote_pde().
		 */
		KASSERT(pde != NULL && (*pde & PG_V) != 0,
		    ("KVA %#x invalid pde pdir %#jx", va,
		    (uintmax_t)pmap->pm_pdir[PTDPTDI]));
		if ((*pde & PG_PS) != 0)
			pmap_demote_pde(pmap, pde, va);
	}
	pte = pmap_pte_quick(pmap, va);

	/*
	 * The page directory table entry is not valid, which should not
	 * happen.  We should have either allocated the page table page
	 * or demoted the existing mapping above.
	 */
	if (pte == NULL) {
		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
		    (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
	}

	pa = VM_PAGE_TO_PHYS(m);
	om = NULL;
	origpte = *pte;
	opa = origpte & PG_FRAME;

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (origpte && (opa == pa)) {
		/*
		 * Wiring change, just update stats.  We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them.  Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if (wired && ((origpte & PG_W) == 0))
			pmap->pm_stats.wired_count++;
		else if (!wired && (origpte & PG_W))
			pmap->pm_stats.wired_count--;

		/*
		 * Remove the extra pte reference.
		 */
		if (mpte)
			mpte->wire_count--;

		if (origpte & PG_MANAGED) {
			om = m;
			pa |= PG_MANAGED;
		}
		goto validate;
	}

	pv = NULL;

	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	if (opa) {
		if (origpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (origpte & PG_MANAGED) {
			om = PHYS_TO_VM_PAGE(opa);
			pv = pmap_pvh_remove(&om->md, pmap, va);
		}
		if (mpte != NULL) {
			mpte->wire_count--;
			KASSERT(mpte->wire_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			    " va: 0x%x", va));
		}
	} else
		pmap->pm_stats.resident_count++;

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
		    ("pmap_enter: managed mapping within the clean submap"));
		if (pv == NULL)
			pv = get_pv_entry(pmap, FALSE);
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		pa |= PG_MANAGED;
	} else if (pv != NULL)
		free_pv_entry(pmap, pv);

	/*
	 * Increment counters.
	 */
	if (wired)
		pmap->pm_stats.wired_count++;

validate:
	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
	if ((prot & VM_PROT_WRITE) != 0) {
		newpte |= PG_RW;
		if ((newpte & PG_MANAGED) != 0)
			vm_page_aflag_set(m, PGA_WRITEABLE);
	}
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpte |= pg_nx;
#endif
	if (wired)
		newpte |= PG_W;
	if (va < VM_MAXUSER_ADDRESS)
		newpte |= PG_U;
	if (pmap == kernel_pmap)
		newpte |= pgeflag;

	/*
	 * If the mapping or permission bits are different, we need
	 * to update the pte.
	 */
	if ((origpte & ~(PG_M | PG_A)) != newpte) {
		newpte |= PG_A;
		if ((flags & VM_PROT_WRITE) != 0)
			newpte |= PG_M;
		if (origpte & PG_V) {
			invlva = FALSE;
			origpte = pte_load_store(pte, newpte);
			if (origpte & PG_A) {
				if (origpte & PG_MANAGED)
					vm_page_aflag_set(om, PGA_REFERENCED);
				if (opa != VM_PAGE_TO_PHYS(m))
					invlva = TRUE;
#if defined(PAE) || defined(PAE_TABLES)
				if ((origpte & PG_NX) == 0 &&
				    (newpte & PG_NX) != 0)
					invlva = TRUE;
#endif
			}
			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
				if ((origpte & PG_MANAGED) != 0)
					vm_page_dirty(om);
				if ((prot & VM_PROT_WRITE) == 0)
					invlva = TRUE;
			}
			if ((origpte & PG_MANAGED) != 0 &&
			    TAILQ_EMPTY(&om->md.pv_list) &&
			    ((om->flags & PG_FICTITIOUS) != 0 ||
			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
				vm_page_aflag_clear(om, PGA_WRITEABLE);
			if (invlva)
				pmap_invalidate_page(pmap, va);
		} else
			pte_store(pte, newpte);
	}

#if VM_NRESERVLEVEL > 0
	/*
	 * If both the page table page and the reservation are fully
	 * populated, then attempt promotion.
	 */
	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0)
		pmap_promote_pde(pmap, pde, va);
#endif

	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	return (KERN_SUCCESS);
}
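/*
 * Editor's note -- an illustrative sketch, not part of the original
 * source: a caller that cannot sleep passes PMAP_ENTER_NOSLEEP and
 * must handle a resource shortage itself; the flag combination below
 * is hypothetical:
 *
 *	rv = pmap_enter(pmap, va, m, prot, prot | PMAP_ENTER_NOSLEEP, 0);
 *	if (rv == KERN_RESOURCE_SHORTAGE)
 *		... back off, release resources, and retry later ...
 */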
/*
 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
 * blocking, (2) a mapping already exists at the specified virtual address, or
 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
 */
static boolean_t
pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	pd_entry_t *pde, newpde;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pde = pmap_pde(pmap, va);
	if (*pde != 0) {
		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
	    PG_PS | PG_V;
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		newpde |= PG_MANAGED;

		/*
		 * Abort this mapping if its PV entry could not be created.
		 */
		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return (FALSE);
		}
	}
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
#endif
	if (va < VM_MAXUSER_ADDRESS)
		newpde |= PG_U;

	/*
	 * Increment counters.
	 */
	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;

	/*
	 * Map the superpage.  (This is not a promoted mapping; there will not
	 * be any lingering 4KB page mappings in the TLB.)
	 */
	pde_store(pde, newpde);

	pmap_pde_mappings++;
	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.  Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
 */
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	vm_offset_t va;
	vm_page_t m, mpte;
	vm_pindex_t diff, psize;

	VM_OBJECT_ASSERT_LOCKED(m_start->object);

	psize = atop(end - start);
	mpte = NULL;
	m = m_start;
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
		va = start + ptoa(diff);
		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
		    m->psind == 1 && pg_ps_enabled &&
		    pmap_enter_pde(pmap, va, m, prot))
			m = &m[NBPDR / PAGE_SIZE - 1];
		else
			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
			    mpte);
		m = TAILQ_NEXT(m, listq);
	}
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 * This code is *MUCH* faster than pmap_enter(), but it makes some
 * *MAJOR* assumptions:
 * 1. The current pmap and the target pmap exist.
 * 2. The mapping is not wired.
 * 3. Read access only is granted.
 * 4. No new page table pages are needed.
 */
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte)
{
	pt_entry_t *pte;
	vm_paddr_t pa;
	struct spglist free;

	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		u_int ptepindex;
		pd_entry_t ptepa;

		/*
		 * Calculate the page table page index.
		 */
		ptepindex = va >> PDRSHIFT;
		if (mpte && (mpte->pindex == ptepindex)) {
			mpte->wire_count++;
		} else {
			/*
			 * Get the page directory entry.
			 */
			ptepa = pmap->pm_pdir[ptepindex];

			/*
			 * If the page table page is mapped, we just increment
			 * the hold count, and activate it.
			 */
			if (ptepa) {
				if (ptepa & PG_PS)
					return (NULL);
				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
				mpte->wire_count++;
			} else {
				mpte = _pmap_allocpte(pmap, ptepindex,
				    PMAP_ENTER_NOSLEEP);
				if (mpte == NULL)
					return (mpte);
			}
		}
	} else {
		mpte = NULL;
	}

	/*
	 * This call to vtopte makes the assumption that we are
	 * entering the page into the current pmap.  In order to support
	 * quick entry into any pmap, one would likely use pmap_pte_quick.
	 * But that isn't as quick as vtopte.
	 */
	pte = vtopte(va);
	if (*pte) {
		if (mpte != NULL) {
			mpte->wire_count--;
			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, va, m)) {
		if (mpte != NULL) {
			SLIST_INIT(&free);
			if (pmap_unwire_ptp(pmap, mpte, &free)) {
				pmap_invalidate_page(pmap, va);
				pmap_free_zero_pages(&free);
			}

			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Increment counters.
	 */
	pmap->pm_stats.resident_count++;

	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		pa |= pg_nx;
#endif

	/*
	 * Now validate the mapping with RO protection.
	 */
	if ((m->oflags & VPO_UNMANAGED) != 0)
		pte_store(pte, pa | PG_V | PG_U);
	else
		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
	return (mpte);
}

/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	vm_offset_t va;

	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
	pmap_kenter(va, pa);
	invlpg(va);
	return ((void *)crashdumpmap);
}
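/*
 * Editor's note -- an illustrative sketch, not part of the original
 * source: a dump routine typically walks physical memory one page at
 * a time through this crashdumpmap window; the iteration below is
 * hypothetical:
 *
 *	for (i = 0; i < npages; i++) {
 *		va = pmap_kenter_temporary(pa + ptoa(i), 0);
 *		... write PAGE_SIZE bytes from va to the dump device ...
 *	}
 */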

/*
 * This code maps large physical mmap regions (device and scatter/gather
 * objects) into the processor address space.  Some shortcuts are taken:
 * only 2/4MB page mappings are created, and only when the region is
 * suitably aligned and physically contiguous.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{
	pd_entry_t *pde;
	vm_paddr_t pa, ptepa;
	vm_page_t p;
	int pat_mode;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
	    ("pmap_object_init_pt: non-device object"));
	if (pseflag &&
	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
		if (!vm_object_populate(object, pindex, pindex + atop(size)))
			return;
		p = vm_page_lookup(object, pindex);
		KASSERT(p->valid == VM_PAGE_BITS_ALL,
		    ("pmap_object_init_pt: invalid page %p", p));
		pat_mode = p->md.pat_mode;

		/*
		 * Abort the mapping if the first page is not physically
		 * aligned to a 2/4MB page boundary.
		 */
		ptepa = VM_PAGE_TO_PHYS(p);
		if (ptepa & (NBPDR - 1))
			return;

		/*
		 * Skip the first page.  Abort the mapping if the rest of
		 * the pages are not physically contiguous or have differing
		 * memory attributes.
		 */
		p = TAILQ_NEXT(p, listq);
		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
		    pa += PAGE_SIZE) {
			KASSERT(p->valid == VM_PAGE_BITS_ALL,
			    ("pmap_object_init_pt: invalid page %p", p));
			if (pa != VM_PAGE_TO_PHYS(p) ||
			    pat_mode != p->md.pat_mode)
				return;
			p = TAILQ_NEXT(p, listq);
		}

		/*
		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
		 * "size" is a multiple of 2/4M, adding the PAT setting to
		 * "pa" will not affect the termination of this loop.
		 */
		PMAP_LOCK(pmap);
		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
		    size; pa += NBPDR) {
			pde = pmap_pde(pmap, addr);
			if (*pde == 0) {
				pde_store(pde, pa | PG_PS | PG_M | PG_A |
				    PG_U | PG_RW | PG_V);
				pmap->pm_stats.resident_count += NBPDR /
				    PAGE_SIZE;
				pmap_pde_mappings++;
			}
			/* Else continue on if the PDE is already valid. */
			addr += NBPDR;
		}
		PMAP_UNLOCK(pmap);
	}
}

/*
 * Clear the wired attribute from the mappings for the specified range of
 * addresses in the given pmap.  Every valid mapping within that range
 * must have the wired attribute set.  In contrast, invalid mappings
 * cannot have the wired attribute set, so they are ignored.
 *
 * The wired attribute of the page table entry is not a hardware feature,
 * so there is no need to invalidate any TLB entries.
 */
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t pdnxt;
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t pv_lists_locked;

	if (pmap_is_current(pmap))
		pv_lists_locked = FALSE;
	else {
		pv_lists_locked = TRUE;
resume:
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pdnxt < sva)
			pdnxt = eva;
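		/*
		 * For example (non-PAE, NBPDR = 4MB): sva = 0x005f3000 yields
		 * pdnxt = (0x005f3000 + 0x400000) & ~0x3fffff = 0x00800000,
		 * the start of the next 4MB region.  The wrap test handles
		 * sva within the last region below 4GB, where the addition
		 * overflows to a smaller value.
		 */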
		pde = pmap_pde(pmap, sva);
		if ((*pde & PG_V) == 0)
			continue;
		if ((*pde & PG_PS) != 0) {
			if ((*pde & PG_W) == 0)
				panic("pmap_unwire: pde %#jx is missing PG_W",
				    (uintmax_t)*pde);

			/*
			 * Are we unwiring the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
				/*
				 * Regardless of whether a pde (or pte) is 32
				 * or 64 bits in size, PG_W is among the least
				 * significant 32 bits.
				 */
				atomic_clear_int((u_int *)pde, PG_W);
				pmap->pm_stats.wired_count -= NBPDR /
				    PAGE_SIZE;
				continue;
			} else {
				if (!pv_lists_locked) {
					pv_lists_locked = TRUE;
					if (!rw_try_wlock(&pvh_global_lock)) {
						PMAP_UNLOCK(pmap);
						/* Repeat sva. */
						goto resume;
					}
					sched_pin();
				}
				if (!pmap_demote_pde(pmap, pde, sva))
					panic("pmap_unwire: demotion failed");
			}
		}
		if (pdnxt > eva)
			pdnxt = eva;
		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			if ((*pte & PG_V) == 0)
				continue;
			if ((*pte & PG_W) == 0)
				panic("pmap_unwire: pte %#jx is missing PG_W",
				    (uintmax_t)*pte);

			/*
			 * PG_W must be cleared atomically.  Although the pmap
			 * lock synchronizes access to PG_W, another processor
			 * could be setting PG_M and/or PG_A concurrently.
			 *
			 * PG_W is among the least significant 32 bits.
			 */
			atomic_clear_int((u_int *)pte, PG_W);
			pmap->pm_stats.wired_count--;
		}
	}
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}

/*
 * Copy the range specified by src_addr/len
 * from the source map to the range dst_addr/len
 * in the destination map.
 *
 * This routine is only advisory and need not do anything.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	struct spglist free;
	vm_offset_t addr;
	vm_offset_t end_addr = src_addr + len;
	vm_offset_t pdnxt;

	if (dst_addr != src_addr)
		return;

	if (!pmap_is_current(src_pmap))
		return;

	rw_wlock(&pvh_global_lock);
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}
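	/*
	 * Note: the two pmap locks are always acquired in a fixed (address)
	 * order, so concurrent copies involving the same pair of pmaps
	 * cannot deadlock.
	 */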
	sched_pin();
	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
		pt_entry_t *src_pte, *dst_pte;
		vm_page_t dstmpte, srcmpte;
		pd_entry_t srcptepaddr;
		u_int ptepindex;

		KASSERT(addr < UPT_MIN_ADDRESS,
		    ("pmap_copy: invalid to pmap_copy page tables"));

		pdnxt = (addr + NBPDR) & ~PDRMASK;
		if (pdnxt < addr)
			pdnxt = end_addr;
		ptepindex = addr >> PDRSHIFT;

		srcptepaddr = src_pmap->pm_pdir[ptepindex];
		if (srcptepaddr == 0)
			continue;

		if (srcptepaddr & PG_PS) {
			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
				continue;
			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
			    ((srcptepaddr & PG_MANAGED) == 0 ||
			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
			    PG_PS_FRAME))) {
				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
				    ~PG_W;
				dst_pmap->pm_stats.resident_count +=
				    NBPDR / PAGE_SIZE;
				pmap_pde_mappings++;
			}
			continue;
		}

		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
		KASSERT(srcmpte->wire_count > 0,
		    ("pmap_copy: source page table page is unused"));

		if (pdnxt > end_addr)
			pdnxt = end_addr;

		src_pte = vtopte(addr);
		while (addr < pdnxt) {
			pt_entry_t ptetemp;
			ptetemp = *src_pte;
			/*
			 * We only copy mappings of managed pages.
			 */
			if ((ptetemp & PG_MANAGED) != 0) {
				dstmpte = pmap_allocpte(dst_pmap, addr,
				    PMAP_ENTER_NOSLEEP);
				if (dstmpte == NULL)
					goto out;
				dst_pte = pmap_pte_quick(dst_pmap, addr);
				if (*dst_pte == 0 &&
				    pmap_try_insert_pv_entry(dst_pmap, addr,
				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
					/*
					 * Clear the wired, modified, and
					 * accessed (referenced) bits
					 * during the copy.
					 */
					*dst_pte = ptetemp & ~(PG_W | PG_M |
					    PG_A);
					dst_pmap->pm_stats.resident_count++;
				} else {
					SLIST_INIT(&free);
					if (pmap_unwire_ptp(dst_pmap, dstmpte,
					    &free)) {
						pmap_invalidate_page(dst_pmap,
						    addr);
						pmap_free_zero_pages(&free);
					}
					goto out;
				}
				if (dstmpte->wire_count >= srcmpte->wire_count)
					break;
			}
			addr += PAGE_SIZE;
			src_pte++;
		}
	}
out:
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);
}

static __inline void
pagezero(void *page)
{
#if defined(I686_CPU)
	if (cpu_class == CPUCLASS_686) {
		if (cpu_feature & CPUID_SSE2)
			sse2_pagezero(page);
		else
			i686_pagezero(page);
	} else
#endif
		bzero(page, PAGE_SIZE);
}

/*
 * pmap_zero_page zeros the specified hardware page by mapping
 * the page into KVM and clearing its contents with pagezero().
 */
void
pmap_zero_page(vm_page_t m)
{
	pt_entry_t *cmap_pte2;
	struct pcpu *pc;

	sched_pin();
	pc = get_pcpu();
	cmap_pte2 = pc->pc_cmap_pte2;
	mtx_lock(&pc->pc_cmap_lock);
	if (*cmap_pte2)
		panic("pmap_zero_page: CMAP2 busy");
	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
	    pmap_cache_bits(m->md.pat_mode, 0);
	invlcaddr(pc->pc_cmap_addr2);
	pagezero(pc->pc_cmap_addr2);
	*cmap_pte2 = 0;

	/*
	 * Unpin the thread before releasing the lock.  Otherwise the thread
	 * could be rescheduled while still bound to the current CPU, only
	 * to unpin itself immediately upon resuming execution.
	 */
	sched_unpin();
	mtx_unlock(&pc->pc_cmap_lock);
}

/*
 * pmap_zero_page_area zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.
 *
 * off and size may not cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
	pt_entry_t *cmap_pte2;
	struct pcpu *pc;

	sched_pin();
	pc = get_pcpu();
	cmap_pte2 = pc->pc_cmap_pte2;
	mtx_lock(&pc->pc_cmap_lock);
	if (*cmap_pte2)
		panic("pmap_zero_page_area: CMAP2 busy");
	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
	    pmap_cache_bits(m->md.pat_mode, 0);
	invlcaddr(pc->pc_cmap_addr2);
	if (off == 0 && size == PAGE_SIZE)
		pagezero(pc->pc_cmap_addr2);
	else
		bzero(pc->pc_cmap_addr2 + off, size);
	*cmap_pte2 = 0;
	sched_unpin();
	mtx_unlock(&pc->pc_cmap_lock);
}
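
/*
 * For example, pmap_zero_page_area(m, 512, 1024) clears bytes 512 through
 * 1535 of the page; off + size must not exceed PAGE_SIZE.
 */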

/*
 * pmap_zero_page_idle zeros the specified hardware page by mapping
 * the page into KVM and clearing its contents with pagezero().  This
 * is intended to be called from the vm_pagezero process only and
 * outside of Giant.
 */
void
pmap_zero_page_idle(vm_page_t m)
{

	if (*CMAP3)
		panic("pmap_zero_page_idle: CMAP3 busy");
	sched_pin();
	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
	    pmap_cache_bits(m->md.pat_mode, 0);
	invlcaddr(CADDR3);
	pagezero(CADDR3);
	*CMAP3 = 0;
	sched_unpin();
}

/*
 * pmap_copy_page copies the specified (machine independent) page by
 * mapping the page into virtual memory and using bcopy to copy the
 * page, one machine dependent page at a time.
 */
void
pmap_copy_page(vm_page_t src, vm_page_t dst)
{
	pt_entry_t *cmap_pte1, *cmap_pte2;
	struct pcpu *pc;

	sched_pin();
	pc = get_pcpu();
	cmap_pte1 = pc->pc_cmap_pte1;
	cmap_pte2 = pc->pc_cmap_pte2;
	mtx_lock(&pc->pc_cmap_lock);
	if (*cmap_pte1)
		panic("pmap_copy_page: CMAP1 busy");
	if (*cmap_pte2)
		panic("pmap_copy_page: CMAP2 busy");
	*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
	    pmap_cache_bits(src->md.pat_mode, 0);
	invlcaddr(pc->pc_cmap_addr1);
	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
	    pmap_cache_bits(dst->md.pat_mode, 0);
	invlcaddr(pc->pc_cmap_addr2);
	bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE);
	*cmap_pte1 = 0;
	*cmap_pte2 = 0;
	sched_unpin();
	mtx_unlock(&pc->pc_cmap_lock);
}

int unmapped_buf_allowed = 1;

void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
    vm_offset_t b_offset, int xfersize)
{
	vm_page_t a_pg, b_pg;
	char *a_cp, *b_cp;
	vm_offset_t a_pg_offset, b_pg_offset;
	pt_entry_t *cmap_pte1, *cmap_pte2;
	struct pcpu *pc;
	int cnt;

	sched_pin();
	pc = get_pcpu();
	cmap_pte1 = pc->pc_cmap_pte1;
	cmap_pte2 = pc->pc_cmap_pte2;
	mtx_lock(&pc->pc_cmap_lock);
	if (*cmap_pte1 != 0)
		panic("pmap_copy_pages: CMAP1 busy");
	if (*cmap_pte2 != 0)
		panic("pmap_copy_pages: CMAP2 busy");
	while (xfersize > 0) {
		a_pg = ma[a_offset >> PAGE_SHIFT];
		a_pg_offset = a_offset & PAGE_MASK;
		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
		b_pg = mb[b_offset >> PAGE_SHIFT];
		b_pg_offset = b_offset & PAGE_MASK;
		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
		*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
		    pmap_cache_bits(a_pg->md.pat_mode, 0);
		invlcaddr(pc->pc_cmap_addr1);
		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
		invlcaddr(pc->pc_cmap_addr2);
		a_cp = pc->pc_cmap_addr1 + a_pg_offset;
		b_cp = pc->pc_cmap_addr2 + b_pg_offset;
		bcopy(a_cp, b_cp, cnt);
		a_offset += cnt;
		b_offset += cnt;
		xfersize -= cnt;
	}
	*cmap_pte1 = 0;
	*cmap_pte2 = 0;
	sched_unpin();
	mtx_unlock(&pc->pc_cmap_lock);
}
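
/*
 * For example, with xfersize = 4096, a_offset = 0x1800, and b_offset =
 * 0x2c00: the first iteration computes cnt = min(4096, 4096 - 0x800) = 2048,
 * then cnt = min(2048, 4096 - 0xc00) = 1024, so 1024 bytes are copied before
 * both offsets advance and the loop repeats until xfersize is exhausted.
 */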

/*
 * Returns TRUE if the pmap's pv_entry is one of the first 16 pv entries
 * linked to the page.  This count may be raised or lowered in the future;
 * for proper page aging it is only necessary that TRUE be returned for a
 * small subset of pmaps.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	int loops = 0;
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_page_exists_quick: page %p is not managed", m));
	rv = FALSE;
	rw_wlock(&pvh_global_lock);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		if (PV_PMAP(pv) == pmap) {
			rv = TRUE;
			break;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			if (PV_PMAP(pv) == pmap) {
				rv = TRUE;
				break;
			}
			loops++;
			if (loops >= 16)
				break;
		}
	}
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	int count;

	count = 0;
	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (count);
	rw_wlock(&pvh_global_lock);
	count = pmap_pvh_wired_mappings(&m->md, count);
	if ((m->flags & PG_FICTITIOUS) == 0) {
		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
		    count);
	}
	rw_wunlock(&pvh_global_lock);
	return (count);
}

/*
 * pmap_pvh_wired_mappings:
 *
 *	Return the updated number "count" of managed mappings that are wired.
 */
static int
pmap_pvh_wired_mappings(struct md_page *pvh, int count)
{
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		if ((*pte & PG_W) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	sched_unpin();
	return (count);
}

/*
 * Returns TRUE if the given page is mapped individually or as part of
 * a 4mpage.  Otherwise, returns FALSE.
 */
boolean_t
pmap_page_is_mapped(vm_page_t m)
{
	boolean_t rv;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (FALSE);
	rw_wlock(&pvh_global_lock);
	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Remove all pages from the specified address space; this aids process
 * exit speed.  This code is special-cased for the current process only,
 * but the more generic (and slightly slower) mode could be enabled.  It
 * is much faster than pmap_remove when running down an entire address
 * space.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pt_entry_t *pte, tpte;
	vm_page_t m, mpte, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct spglist free;
	int field, idx;
	int32_t bit;
	uint32_t inuse, bitmask;
	int allfree;

	if (pmap != PCPU_GET(curpmap)) {
		printf("warning: pmap_remove_pages called with non-current pmap\n");
		return;
	}
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	sched_pin();
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
		    pc->pc_pmap));
		allfree = 1;
		for (field = 0; field < _NPCM; field++) {
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = bsfl(inuse);
				bitmask = 1UL << bit;
				idx = field * 32 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;
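				/*
				 * For example, inuse = 0x28 in field 1:
				 * bsfl() returns 3, so idx = 1 * 32 + 3 = 35
				 * and this pass handles pc_pventry[35];
				 * clearing bit 3 leaves 0x20, so the next
				 * pass handles bit 5 (idx 37).
				 */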
				pte = pmap_pde(pmap, pv->pv_va);
				tpte = *pte;
				if ((tpte & PG_PS) == 0) {
					pte = vtopte(pv->pv_va);
					tpte = *pte & ~PG_PTE_PAT;
				}

				if (tpte == 0) {
					printf(
					    "TPTE at %p IS ZERO @ VA %08x\n",
					    pte, pv->pv_va);
					panic("bad pte");
				}

				/*
				 * We cannot remove wired pages from a
				 * process' mapping at this time.
				 */
				if (tpte & PG_W) {
					allfree = 0;
					continue;
				}

				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				KASSERT(m->phys_addr == (tpte & PG_FRAME),
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad tpte %#jx",
				    (uintmax_t)tpte));

				pte_clear(pte);

				/*
				 * Update the vm_page_t clean/reference bits.
				 */
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
					if ((tpte & PG_PS) != 0) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
					} else
						vm_page_dirty(m);
				}

				/* Mark free */
				PV_STAT(pv_entry_frees++);
				PV_STAT(pv_entry_spare++);
				pv_entry_count--;
				pc->pc_map[field] |= bitmask;
				if ((tpte & PG_PS) != 0) {
					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							if (TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
					if (mpte != NULL) {
						pmap->pm_stats.resident_count--;
						KASSERT(mpte->wire_count == NPTEPG,
						    ("pmap_remove_pages: pte page wire count error"));
						mpte->wire_count = 0;
						pmap_add_delayed_free_list(mpte, &free, FALSE);
					}
				} else {
					pmap->pm_stats.resident_count--;
					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
					if (TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m, PGA_WRITEABLE);
					}
					pmap_unuse_pt(pmap, pv->pv_va, &free);
				}
			}
		}
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	sched_unpin();
	pmap_invalidate_all(pmap);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(&free);
}

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_modified: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
	 * is clear, no PTEs can have PG_M set.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return (FALSE);
	rw_wlock(&pvh_global_lock);
	rv = pmap_is_modified_pvh(&m->md) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Returns TRUE if any of the given mappings were used to modify
 * physical memory.  Otherwise, returns FALSE.  Both page and 2/4mpage
 * mappings are supported.
 */
static boolean_t
pmap_is_modified_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	rv = FALSE;
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	sched_unpin();
	return (rv);
}
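
/*
 * A mapping is considered to have dirtied the page only when it is writable
 * and the CPU has set the modified bit, i.e. (*pte & (PG_M | PG_RW)) ==
 * (PG_M | PG_RW).  A minimal sketch of a typical consumer, assuming a
 * locked, managed page "m":
 */
#if 0
	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
		vm_page_dirty(m);
#endif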

/*
 * pmap_is_prefaultable:
 *
 *	Return whether or not the specified virtual address is eligible
 *	for prefault.
 */
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t rv;

	rv = FALSE;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr);
	if (*pde != 0 && (*pde & PG_PS) == 0) {
		pte = vtopte(addr);
		rv = *pte == 0;
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * pmap_is_referenced:
 *
 *	Return whether or not the specified physical page was referenced
 *	in any physical maps.
 */
boolean_t
pmap_is_referenced(vm_page_t m)
{
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	rw_wlock(&pvh_global_lock);
	rv = pmap_is_referenced_pvh(&m->md) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Returns TRUE if any of the given mappings were referenced and FALSE
 * otherwise.  Both page and 4mpage mappings are supported.
 */
static boolean_t
pmap_is_referenced_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	rv = FALSE;
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	sched_unpin();
	return (rv);
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t next_pv, pv;
	pmap_t pmap;
	pd_entry_t *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * set by another thread while the object is locked.  Thus,
	 * if PGA_WRITEABLE is clear, no page table entries need updating.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return;
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		if ((*pde & PG_RW) != 0)
			(void)pmap_demote_pde(pmap, pde, va);
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
		    " a 4mpage in page %p's pv list", m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
retry:
		oldpte = *pte;
		if ((oldpte & PG_RW) != 0) {
			/*
			 * Regardless of whether a pte is 32 or 64 bits
			 * in size, PG_RW and PG_M are among the least
			 * significant 32 bits.
			 */
			if (!atomic_cmpset_int((u_int *)pte, oldpte,
			    oldpte & ~(PG_RW | PG_M)))
				goto retry;
			if ((oldpte & PG_M) != 0)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
}

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_paddr_t pa;
	int rtval = 0;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	pa = VM_PAGE_TO_PHYS(m);
	pvh = pa_to_pvh(pa);
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0 ||
	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
			/*
			 * Although "*pde" is mapping a 2/4MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((*pde & PG_A) != 0) {
			/*
			 * Since this reference bit is shared by either 1024
			 * or 512 4KB pages, it should not be cleared every
			 * time it is tested.  Apply a simple "hash" function
			 * on the physical page number, the virtual superpage
			 * number, and the pmap address to select one 4KB page
			 * out of the 1024 or 512 on which testing the
			 * reference bit will result in clearing that bit.
			 * This function is designed to avoid the selection of
			 * the same 4KB page for every 2- or 4MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
			    (*pde & PG_W) == 0) {
				atomic_clear_int((u_int *)pde, PG_A);
				pmap_invalidate_page(pmap, pv->pv_va);
			}
			rtval++;
		}
		PMAP_UNLOCK(pmap);
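		/*
		 * For example, with NPTEPG == 1024 (non-PAE), the hash above
		 * selects one (pa, va, pmap) combination out of every 1024,
		 * so on average a given superpage mapping has its PG_A bit
		 * cleared once per 1024 calls, and different superpages
		 * clear on different 4KB offsets.
		 */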
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
		}
		if (rtval >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0,
		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
		    m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		if ((*pte & PG_A) != 0) {
			atomic_clear_int((u_int *)pte, PG_A);
			pmap_invalidate_page(pmap, pv->pv_va);
			rtval++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
	    PMAP_TS_REFERENCED_MAX);
out:
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	return (rtval);
}

/*
 * Apply the given advice to the specified range of addresses within the
 * given pmap.  Depending on the advice, clear the referenced and/or
 * modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte;
	vm_offset_t va, pdnxt;
	vm_page_t m;
	boolean_t anychanged, pv_lists_locked;

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;
	if (pmap_is_current(pmap))
		pv_lists_locked = FALSE;
	else {
		pv_lists_locked = TRUE;
resume:
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}
	anychanged = FALSE;
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pdnxt < sva)
			pdnxt = eva;
		pde = pmap_pde(pmap, sva);
		oldpde = *pde;
		if ((oldpde & PG_V) == 0)
			continue;
		else if ((oldpde & PG_PS) != 0) {
			if ((oldpde & PG_MANAGED) == 0)
				continue;
			if (!pv_lists_locked) {
				pv_lists_locked = TRUE;
				if (!rw_try_wlock(&pvh_global_lock)) {
					if (anychanged)
						pmap_invalidate_all(pmap);
					PMAP_UNLOCK(pmap);
					goto resume;
				}
				sched_pin();
			}
			if (!pmap_demote_pde(pmap, pde, sva)) {
				/*
				 * The large page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Since the underlying page
			 * table page is fully populated, this removal never
			 * frees a page table page.
			 */
			if ((oldpde & PG_W) == 0) {
				pte = pmap_pte_quick(pmap, sva);
				KASSERT((*pte & PG_V) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_pte(pmap, pte, sva, NULL);
				anychanged = TRUE;
			}
		}
		if (pdnxt > eva)
			pdnxt = eva;
		va = pdnxt;
		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
				goto maybe_invlrng;
			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
					vm_page_dirty(m);
				}
				atomic_clear_int((u_int *)pte, PG_M | PG_A);
			} else if ((*pte & PG_A) != 0)
				atomic_clear_int((u_int *)pte, PG_A);
			else
				goto maybe_invlrng;
			if ((*pte & PG_G) != 0) {
				if (va == pdnxt)
					va = sva;
			} else
				anychanged = TRUE;
			continue;
maybe_invlrng:
			if (va != pdnxt) {
				pmap_invalidate_range(pmap, va, sva);
				va = pdnxt;
			}
		}
		if (va != pdnxt)
			pmap_invalidate_range(pmap, va, sva);
	}
	if (anychanged)
		pmap_invalidate_all(pmap);
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t next_pv, pv;
	pmap_t pmap;
	pd_entry_t oldpde, *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	KASSERT(!vm_page_xbusied(m),
	    ("pmap_clear_modify: page %p is exclusive busied", m));

	/*
	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
	 * If the object containing the page is locked and the page is not
	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
	 */
	if ((m->aflags & PGA_WRITEABLE) == 0)
		return;
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_RW) != 0) {
			if (pmap_demote_pde(pmap, pde, va)) {
				if ((oldpde & PG_W) == 0) {
					/*
					 * Write protect the mapping to a
					 * single page so that a subsequent
					 * write access may repromote.
					 */
					va += VM_PAGE_TO_PHYS(m) - (oldpde &
					    PG_PS_FRAME);
					pte = pmap_pte_quick(pmap, va);
					oldpte = *pte;
					if ((oldpte & PG_V) != 0) {
						/*
						 * Regardless of whether a pte
						 * is 32 or 64 bits in size,
						 * PG_RW and PG_M are among
						 * the least significant 32
						 * bits.
						 */
						while (!atomic_cmpset_int(
						    (u_int *)pte, oldpte,
						    oldpte & ~(PG_M | PG_RW)))
							oldpte = *pte;
						vm_page_dirty(m);
						pmap_invalidate_page(pmap, va);
					}
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
		    " a 4mpage in page %p's pv list", m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
			/*
			 * Regardless of whether a pte is 32 or 64 bits
			 * in size, PG_M is among the least significant
			 * 32 bits.
			 */
			atomic_clear_int((u_int *)pte, PG_M);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
}

/*
 * Miscellaneous support routines follow
 */

/* Adjust the cache mode for a 4KB page mapped via a PTE. */
static __inline void
pmap_pte_attr(pt_entry_t *pte, int cache_bits)
{
	u_int opte, npte;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PTE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opte = *(u_int *)pte;
		npte = opte & ~PG_PTE_CACHE;
		npte |= cache_bits;
	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
}

/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
static __inline void
pmap_pde_attr(pd_entry_t *pde, int cache_bits)
{
	u_int opde, npde;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PDE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opde = *(u_int *)pde;
		npde = opde & ~PG_PDE_CACHE;
		npde |= cache_bits;
	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
}

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space.  Return a pointer to where it is mapped.  This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 */
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	vm_size_t tmpsize;
	int i;

	offset = pa & PAGE_MASK;
	size = round_page(offset + size);
	pa = pa & PG_FRAME;

	if (pa < KERNLOAD && pa + size <= KERNLOAD)
		va = KERNBASE + pa;
	else if (!pmap_initialized) {
		va = 0;
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (ppim->va == 0) {
				ppim->pa = pa;
				ppim->sz = size;
				ppim->mode = mode;
				ppim->va = virtual_avail;
				virtual_avail += size;
				va = ppim->va;
				break;
			}
		}
		if (va == 0)
			panic("%s: too many preinit mappings", __func__);
	} else {
		/*
		 * If we have a preinit mapping, re-use it.
		 */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (ppim->pa == pa && ppim->sz == size &&
			    ppim->mode == mode)
				return ((void *)(ppim->va + offset));
		}
		va = kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);
	}
	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
	pmap_invalidate_cache_range(va, va + size, FALSE);
	return ((void *)(va + offset));
}

void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
}

void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
}
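
/*
 * A minimal usage sketch, assuming a hypothetical driver that has resolved
 * its register window's physical base "pa" and length "size"; "regs" is a
 * stand-in for wherever the driver keeps the pointer:
 */
#if 0
	void *regs;

	regs = pmap_mapdev(pa, size);		/* uncacheable mapping */
	/* ... device register accesses ... */
	pmap_unmapdev((vm_offset_t)regs, size);	/* on detach */
#endif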

void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t offset;
	int i;

	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
		return;
	offset = va & PAGE_MASK;
	size = round_page(offset + size);
	va = trunc_page(va);
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va && ppim->sz == size) {
			if (pmap_initialized)
				return;
			ppim->pa = 0;
			ppim->va = 0;
			ppim->sz = 0;
			ppim->mode = 0;
			if (va + size == virtual_avail)
				virtual_avail = va;
			return;
		}
	}
	if (pmap_initialized)
		kva_free(va, size);
}

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	m->md.pat_mode = ma;
	if ((m->flags & PG_FICTITIOUS) != 0)
		return;

	/*
	 * If "m" is a normal page, flush it from the cache.
	 * See pmap_invalidate_cache_range().
	 *
	 * First, try to find an existing mapping of the page by sf
	 * buffer.  sf_buf_invalidate_cache() modifies mapping and
	 * flushes the cache.
	 */
	if (sf_buf_invalidate_cache(m))
		return;

	/*
	 * If page is not mapped by sf buffer, but CPU does not
	 * support self snoop, map the page transient and do
	 * invalidation.  In the worst case, whole cache is flushed by
	 * pmap_invalidate_cache_range().
	 */
	if ((cpu_feature & CPUID_SS) == 0)
		pmap_flush_page(m);
}

static void
pmap_flush_page(vm_page_t m)
{
	pt_entry_t *cmap_pte2;
	struct pcpu *pc;
	vm_offset_t sva, eva;
	bool useclflushopt;

	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
	if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) {
		sched_pin();
		pc = get_pcpu();
		cmap_pte2 = pc->pc_cmap_pte2;
		mtx_lock(&pc->pc_cmap_lock);
		if (*cmap_pte2)
			panic("pmap_flush_page: CMAP2 busy");
		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
		invlcaddr(pc->pc_cmap_addr2);
		sva = (vm_offset_t)pc->pc_cmap_addr2;
		eva = sva + PAGE_SIZE;

		/*
		 * Use mfence or sfence despite the ordering implied by
		 * mtx_{un,}lock() because clflush on non-Intel CPUs
		 * and clflushopt are not guaranteed to be ordered by
		 * any other instruction.
		 */
		if (useclflushopt)
			sfence();
		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
		for (; sva < eva; sva += cpu_clflush_line_size) {
			if (useclflushopt)
				clflushopt(sva);
			else
				clflush(sva);
		}
		if (useclflushopt)
			sfence();
		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
		*cmap_pte2 = 0;
		sched_unpin();
		mtx_unlock(&pc->pc_cmap_lock);
	} else
		pmap_invalidate_cache();
}

/*
 * Changes the specified virtual address range's memory type to that given by
 * the parameter "mode".  The specified virtual address range must be
 * completely contained within the kernel map.
 *
 * Returns zero if the change completed successfully, and either EINVAL or
 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 * of the virtual address range was not mapped, and ENOMEM is returned if
 * there was insufficient memory available to complete the change.
 */
int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
	vm_offset_t base, offset, tmpva;
	pd_entry_t *pde;
	pt_entry_t *pte;
	int cache_bits_pte, cache_bits_pde;
	boolean_t changed;

	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	/*
	 * Only supported on kernel virtual addresses above the recursive map.
	 */
	if (base < VM_MIN_KERNEL_ADDRESS)
		return (EINVAL);

	cache_bits_pde = pmap_cache_bits(mode, 1);
	cache_bits_pte = pmap_cache_bits(mode, 0);
	changed = FALSE;

	/*
	 * Pages that aren't mapped aren't supported.  Also break down
	 * 2/4MB pages into 4KB pages if required.
	 */
	PMAP_LOCK(kernel_pmap);
	for (tmpva = base; tmpva < base + size; ) {
		pde = pmap_pde(kernel_pmap, tmpva);
		if (*pde == 0) {
			PMAP_UNLOCK(kernel_pmap);
			return (EINVAL);
		}
		if (*pde & PG_PS) {
			/*
			 * If the current 2/4MB page already has
			 * the required memory type, then we need not
			 * demote this page.  Just increment tmpva to
			 * the next 2/4MB page frame.
			 */
			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
				tmpva = trunc_4mpage(tmpva) + NBPDR;
				continue;
			}

			/*
			 * If the current offset aligns with a 2/4MB
			 * page frame and there is at least 2/4MB left
			 * within the range, then we need not break
			 * down this page into 4KB pages.
			 */
			if ((tmpva & PDRMASK) == 0 &&
			    tmpva + PDRMASK < base + size) {
				tmpva += NBPDR;
				continue;
			}
			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
				PMAP_UNLOCK(kernel_pmap);
				return (ENOMEM);
			}
		}
		pte = vtopte(tmpva);
		if (*pte == 0) {
			PMAP_UNLOCK(kernel_pmap);
			return (EINVAL);
		}
		tmpva += PAGE_SIZE;
	}
	PMAP_UNLOCK(kernel_pmap);

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode if required.
	 */
	for (tmpva = base; tmpva < base + size; ) {
		pde = pmap_pde(kernel_pmap, tmpva);
		if (*pde & PG_PS) {
			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
				pmap_pde_attr(pde, cache_bits_pde);
				changed = TRUE;
			}
			tmpva = trunc_4mpage(tmpva) + NBPDR;
		} else {
			pte = vtopte(tmpva);
			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
				pmap_pte_attr(pte, cache_bits_pte);
				changed = TRUE;
			}
			tmpva += PAGE_SIZE;
		}
	}

	/*
	 * Flush CPU caches to make sure any data isn't cached that
	 * shouldn't be, etc.
	 */
	if (changed) {
		pmap_invalidate_range(kernel_pmap, base, tmpva);
		pmap_invalidate_cache_range(base, tmpva, FALSE);
	}
	return (0);
}
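
/*
 * A minimal sketch of a typical use, assuming a hypothetical frame buffer
 * that has already been mapped at "fb_va" for "fb_size" bytes; both names
 * are stand-ins:
 */
#if 0
	int error;

	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
	if (error != 0)
		printf("fb: could not enable write-combining: %d\n", error);
#endif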

/*
 * perform the pmap work for mincore
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
	pd_entry_t *pdep;
	pt_entry_t *ptep, pte;
	vm_paddr_t pa;
	int val;

	PMAP_LOCK(pmap);
retry:
	pdep = pmap_pde(pmap, addr);
	if (*pdep != 0) {
		if (*pdep & PG_PS) {
			pte = *pdep;
			/* Compute the physical address of the 4KB page. */
			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
			    PG_FRAME;
			val = MINCORE_SUPER;
		} else {
			ptep = pmap_pte(pmap, addr);
			pte = *ptep;
			pmap_pte_release(ptep);
			pa = pte & PG_FRAME;
			val = 0;
		}
	} else {
		pte = 0;
		pa = 0;
		val = 0;
	}
	if ((pte & PG_V) != 0) {
		val |= MINCORE_INCORE;
		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((pte & PG_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
			goto retry;
	} else
		PA_UNLOCK_COND(*locked_pa);
	PMAP_UNLOCK(pmap);
	return (val);
}

void
pmap_activate(struct thread *td)
{
	pmap_t pmap, oldpmap;
	u_int cpuid;
	u_int32_t cr3;

	critical_enter();
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	oldpmap = PCPU_GET(curpmap);
	cpuid = PCPU_GET(cpuid);
#if defined(SMP)
	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_CLR(cpuid, &oldpmap->pm_active);
	CPU_SET(cpuid, &pmap->pm_active);
#endif
#if defined(PAE) || defined(PAE_TABLES)
	cr3 = vtophys(pmap->pm_pdpt);
#else
	cr3 = vtophys(pmap->pm_pdir);
#endif
	/*
	 * pmap_activate is for the current thread on the current cpu
	 */
	td->td_pcb->pcb_cr3 = cr3;
	load_cr3(cr3);
	PCPU_SET(curpmap, pmap);
	critical_exit();
}

void
pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < NBPDR)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & PDRMASK;
	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
	    (*addr & PDRMASK) == superpage_offset)
		return;
	if ((*addr & PDRMASK) < superpage_offset)
		*addr = (*addr & ~PDRMASK) + superpage_offset;
	else
		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}

vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
	vm_offset_t qaddr;
	pt_entry_t *pte;

	critical_enter();
	qaddr = PCPU_GET(qmap_addr);
	pte = vtopte(qaddr);

	KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
	*pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
	    pmap_cache_bits(pmap_page_get_memattr(m), 0);
	invlpg(qaddr);

	return (qaddr);
}

void
pmap_quick_remove_page(vm_offset_t addr)
{
	vm_offset_t qaddr;
	pt_entry_t *pte;

	qaddr = PCPU_GET(qmap_addr);
	pte = vtopte(qaddr);

	KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
	KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));

	*pte = 0;
	critical_exit();
}

#if defined(PMAP_DEBUG)
int
pmap_pid_dump(int pid)
{
	pmap_t pmap;
	struct proc *p;
	int npte = 0;
	int index;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_pid != pid)
			continue;

		if (p->p_vmspace) {
			int i, j;

			index = 0;
			pmap = vmspace_pmap(p->p_vmspace);
			for (i = 0; i < NPDEPTD; i++) {
				pd_entry_t *pde;
				pt_entry_t *pte;
				vm_offset_t base = i << PDRSHIFT;

				pde = &pmap->pm_pdir[i];
				if (pde && pmap_pde_v(pde)) {
					for (j = 0; j < NPTEPG; j++) {
						vm_offset_t va = base + (j << PAGE_SHIFT);
						if (va >= (vm_offset_t)VM_MIN_KERNEL_ADDRESS) {
							if (index) {
								index = 0;
								printf("\n");
							}
							sx_sunlock(&allproc_lock);
							return (npte);
						}
						pte = pmap_pte(pmap, va);
						if (pte && pmap_pte_v(pte)) {
							pt_entry_t pa;
							vm_page_t m;

							pa = *pte;
							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
							printf("va: 0x%x, pt: 0x%jx, h: %d, w: %d, f: 0x%x",
							    va, (uintmax_t)pa, m->hold_count, m->wire_count, m->flags);
							npte++;
							index++;
							if (index >= 2) {
								index = 0;
								printf("\n");
							} else {
								printf(" ");
							}
						}
					}
				}
			}
		}
	}
	sx_sunlock(&allproc_lock);
	return (npte);
}
#endif
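
/*
 * A minimal usage sketch for the quick-map pair above: the window is
 * per-CPU and the calls bracket a critical section, so the caller must not
 * sleep between them.  "m" is an arbitrary vm_page_t.
 */
#if 0
	vm_offset_t qva;

	qva = pmap_quick_enter_page(m);
	bzero((void *)qva, PAGE_SIZE);
	pmap_quick_remove_page(qva);
#endif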