pmap.c revision 287945
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 */ 45/*- 46 * Copyright (c) 2003 Networks Associates Technology, Inc. 47 * All rights reserved. 48 * 49 * This software was developed for the FreeBSD Project by Jake Burkholder, 50 * Safeport Network Services, and Network Associates Laboratories, the 51 * Security Research Division of Network Associates, Inc. under 52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 53 * CHATS research program. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 67 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 74 * SUCH DAMAGE. 75 */ 76 77#include <sys/cdefs.h> 78__FBSDID("$FreeBSD: stable/10/sys/i386/i386/pmap.c 287945 2015-09-17 23:31:44Z rstone $"); 79 80/* 81 * Manages physical address maps. 82 * 83 * Since the information managed by this module is 84 * also stored by the logical address mapping module, 85 * this module may throw away valid virtual-to-physical 86 * mappings at almost any time. However, invalidations 87 * of virtual-to-physical mappings must be done as 88 * requested. 89 * 90 * In order to cope with hardware architectures which 91 * make virtual-to-physical map invalidates expensive, 92 * this module may delay invalidate or reduced protection 93 * operations until such time as they are actually 94 * necessary. This module is given full information as 95 * to which processors are currently using which maps, 96 * and to when physical maps must be made correct. 97 */ 98 99#include "opt_apic.h" 100#include "opt_cpu.h" 101#include "opt_pmap.h" 102#include "opt_smp.h" 103#include "opt_xbox.h" 104 105#include <sys/param.h> 106#include <sys/systm.h> 107#include <sys/kernel.h> 108#include <sys/ktr.h> 109#include <sys/lock.h> 110#include <sys/malloc.h> 111#include <sys/mman.h> 112#include <sys/msgbuf.h> 113#include <sys/mutex.h> 114#include <sys/proc.h> 115#include <sys/rwlock.h> 116#include <sys/sf_buf.h> 117#include <sys/sx.h> 118#include <sys/vmmeter.h> 119#include <sys/sched.h> 120#include <sys/sysctl.h> 121#ifdef SMP 122#include <sys/smp.h> 123#else 124#include <sys/cpuset.h> 125#endif 126 127#include <vm/vm.h> 128#include <vm/vm_param.h> 129#include <vm/vm_kern.h> 130#include <vm/vm_page.h> 131#include <vm/vm_map.h> 132#include <vm/vm_object.h> 133#include <vm/vm_extern.h> 134#include <vm/vm_pageout.h> 135#include <vm/vm_pager.h> 136#include <vm/vm_phys.h> 137#include <vm/vm_radix.h> 138#include <vm/vm_reserv.h> 139#include <vm/uma.h> 140 141#ifdef DEV_APIC 142#include <sys/bus.h> 143#include <machine/intr_machdep.h> 144#include <machine/apicvar.h> 145#endif 146#include <machine/cpu.h> 147#include <machine/cputypes.h> 148#include <machine/md_var.h> 149#include <machine/pcb.h> 150#include <machine/specialreg.h> 151#ifdef SMP 152#include <machine/smp.h> 153#endif 154 155#ifdef XBOX 156#include <machine/xbox.h> 157#endif 158 159#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 160#define CPU_ENABLE_SSE 161#endif 162 163#ifndef PMAP_SHPGPERPROC 164#define PMAP_SHPGPERPROC 200 165#endif 166 167#if !defined(DIAGNOSTIC) 168#ifdef __GNUC_GNU_INLINE__ 169#define PMAP_INLINE __attribute__((__gnu_inline__)) inline 170#else 171#define PMAP_INLINE extern inline 172#endif 173#else 174#define PMAP_INLINE 175#endif 176 177#ifdef PV_STATS 178#define PV_STAT(x) do { x ; } while (0) 179#else 180#define PV_STAT(x) do { } while (0) 181#endif 182 183#define pa_index(pa) ((pa) >> PDRSHIFT) 184#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 185 186/* 187 * Get PDEs and PTEs for user/kernel address space 188 */ 189#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 
190#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 191 192#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 193#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) 194#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 195#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 196#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 197 198#define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 199 atomic_clear_int((u_int *)(pte), PG_W)) 200#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 201 202struct pmap kernel_pmap_store; 203LIST_HEAD(pmaplist, pmap); 204static struct pmaplist allpmaps; 205static struct mtx allpmaps_lock; 206 207vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 208vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 209int pgeflag = 0; /* PG_G or-in */ 210int pseflag = 0; /* PG_PS or-in */ 211 212static int nkpt = NKPT; 213vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; 214extern u_int32_t KERNend; 215extern u_int32_t KPTphys; 216 217#if defined(PAE) || defined(PAE_TABLES) 218pt_entry_t pg_nx; 219static uma_zone_t pdptzone; 220#endif 221 222static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 223 224static int pat_works = 1; 225SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 226 "Is page attribute table fully functional?"); 227 228static int pg_ps_enabled = 1; 229SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, 230 "Are large page mappings enabled?"); 231 232#define PAT_INDEX_SIZE 8 233static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 234 235/* 236 * pmap_mapdev support pre initialization (i.e. console) 237 */ 238#define PMAP_PREINIT_MAPPING_COUNT 8 239static struct pmap_preinit_mapping { 240 vm_paddr_t pa; 241 vm_offset_t va; 242 vm_size_t sz; 243 int mode; 244} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 245static int pmap_initialized; 246 247static struct rwlock_padalign pvh_global_lock; 248 249/* 250 * Data for the pv entry allocation mechanism 251 */ 252static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 253static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 254static struct md_page *pv_table; 255static int shpgperproc = PMAP_SHPGPERPROC; 256 257struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 258int pv_maxchunks; /* How many chunks we have KVA for */ 259vm_offset_t pv_vafree; /* freelist stored in the PTE */ 260 261/* 262 * All those kernel PT submaps that BSD is so fond of 263 */ 264struct sysmaps { 265 struct mtx lock; 266 pt_entry_t *CMAP1; 267 pt_entry_t *CMAP2; 268 caddr_t CADDR1; 269 caddr_t CADDR2; 270}; 271static struct sysmaps sysmaps_pcpu[MAXCPU]; 272pt_entry_t *CMAP3; 273static pd_entry_t *KPTD; 274caddr_t ptvmmap = 0; 275caddr_t CADDR3; 276struct msgbuf *msgbufp = 0; 277 278/* 279 * Crashdump maps. 
280 */ 281static caddr_t crashdumpmap; 282 283static pt_entry_t *PMAP1 = 0, *PMAP2; 284static pt_entry_t *PADDR1 = 0, *PADDR2; 285#ifdef SMP 286static int PMAP1cpu; 287static int PMAP1changedcpu; 288SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 289 &PMAP1changedcpu, 0, 290 "Number of times pmap_pte_quick changed CPU with same PMAP1"); 291#endif 292static int PMAP1changed; 293SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 294 &PMAP1changed, 0, 295 "Number of times pmap_pte_quick changed PMAP1"); 296static int PMAP1unchanged; 297SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 298 &PMAP1unchanged, 0, 299 "Number of times pmap_pte_quick didn't change PMAP1"); 300static struct mtx PMAP2mutex; 301 302static void free_pv_chunk(struct pv_chunk *pc); 303static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 304static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); 305static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 306static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 307static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 308static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 309static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 310 vm_offset_t va); 311static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); 312 313static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 314static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, 315 vm_prot_t prot); 316static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 317 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 318static void pmap_flush_page(vm_page_t m); 319static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 320static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 321static boolean_t pmap_is_modified_pvh(struct md_page *pvh); 322static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); 323static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 324static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); 325static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); 326static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); 327static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 328static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 329 vm_prot_t prot); 330static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); 331static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 332 struct spglist *free); 333static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 334 struct spglist *free); 335static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); 336static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, 337 struct spglist *free); 338static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, 339 vm_offset_t va); 340static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 341static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 342 vm_page_t m); 343static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 344 pd_entry_t newpde); 345static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); 346 347static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); 348 349static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); 350static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct 
spglist *free); 351static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 352static void pmap_pte_release(pt_entry_t *pte); 353static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); 354#if defined(PAE) || defined(PAE_TABLES) 355static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, 356 int wait); 357#endif 358static void pmap_set_pg(void); 359 360static __inline void pagezero(void *page); 361 362CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 363CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 364 365/* 366 * If you get an error here, then you set KVA_PAGES wrong! See the 367 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be 368 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. 369 */ 370CTASSERT(KERNBASE % (1 << 24) == 0); 371 372/* 373 * Bootstrap the system enough to run with virtual memory. 374 * 375 * On the i386 this is called after mapping has already been enabled 376 * and just syncs the pmap module with what has already been done. 377 * [We can't call it easily with mapping off since the kernel is not 378 * mapped with PA == VA, hence we would have to relocate every address 379 * from the linked base (virtual) address "KERNBASE" to the actual 380 * (physical) address starting relative to 0] 381 */ 382void 383pmap_bootstrap(vm_paddr_t firstaddr) 384{ 385 vm_offset_t va; 386 pt_entry_t *pte, *unused; 387 struct sysmaps *sysmaps; 388 int i; 389 390 /* 391 * Add a physical memory segment (vm_phys_seg) corresponding to the 392 * preallocated kernel page table pages so that vm_page structures 393 * representing these pages will be created. The vm_page structures 394 * are required for promotion of the corresponding kernel virtual 395 * addresses to superpage mappings. 396 */ 397 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 398 399 /* 400 * Initialize the first available kernel virtual address. However, 401 * using "firstaddr" may waste a few pages of the kernel virtual 402 * address space, because locore may not have mapped every physical 403 * page that it allocated. Preferably, locore would provide a first 404 * unused virtual address in addition to "firstaddr". 405 */ 406 virtual_avail = (vm_offset_t) KERNBASE + firstaddr; 407 408 virtual_end = VM_MAX_KERNEL_ADDRESS; 409 410 /* 411 * Initialize the kernel pmap (which is statically allocated). 412 */ 413 PMAP_LOCK_INIT(kernel_pmap); 414 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); 415#if defined(PAE) || defined(PAE_TABLES) 416 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); 417#endif 418 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 419 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 420 421 /* 422 * Initialize the global pv list lock. 423 */ 424 rw_init(&pvh_global_lock, "pmap pv global"); 425 426 LIST_INIT(&allpmaps); 427 428 /* 429 * Request a spin mutex so that changes to allpmaps cannot be 430 * preempted by smp_rendezvous_cpus(). Otherwise, 431 * pmap_update_pde_kernel() could access allpmaps while it is 432 * being changed. 433 */ 434 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 435 mtx_lock_spin(&allpmaps_lock); 436 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 437 mtx_unlock_spin(&allpmaps_lock); 438 439 /* 440 * Reserve some special page table entries/VA space for temporary 441 * mapping of pages. 
442 */ 443#define SYSMAP(c, p, v, n) \ 444 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 445 446 va = virtual_avail; 447 pte = vtopte(va); 448 449 /* 450 * CMAP1/CMAP2 are used for zeroing and copying pages. 451 * CMAP3 is used for the idle process page zeroing. 452 */ 453 for (i = 0; i < MAXCPU; i++) { 454 sysmaps = &sysmaps_pcpu[i]; 455 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); 456 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) 457 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) 458 } 459 SYSMAP(caddr_t, CMAP3, CADDR3, 1) 460 461 /* 462 * Crashdump maps. 463 */ 464 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 465 466 /* 467 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 468 */ 469 SYSMAP(caddr_t, unused, ptvmmap, 1) 470 471 /* 472 * msgbufp is used to map the system message buffer. 473 */ 474 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) 475 476 /* 477 * KPTmap is used by pmap_kextract(). 478 * 479 * KPTmap is first initialized by locore. However, that initial 480 * KPTmap can only support NKPT page table pages. Here, a larger 481 * KPTmap is created that can support KVA_PAGES page table pages. 482 */ 483 SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) 484 485 for (i = 0; i < NKPT; i++) 486 KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; 487 488 /* 489 * Adjust the start of the KPTD and KPTmap so that the implementation 490 * of pmap_kextract() and pmap_growkernel() can be made simpler. 491 */ 492 KPTD -= KPTDI; 493 KPTmap -= i386_btop(KPTDI << PDRSHIFT); 494 495 /* 496 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), 497 * respectively. 498 */ 499 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) 500 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) 501 502 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 503 504 virtual_avail = va; 505 506 /* 507 * Leave in place an identity mapping (virt == phys) for the low 1 MB 508 * physical memory region that is used by the ACPI wakeup code. This 509 * mapping must not have PG_G set. 510 */ 511#ifdef XBOX 512 /* FIXME: This is gross, but needed for the XBOX. Since we are in such 513 * an early stadium, we cannot yet neatly map video memory ... :-( 514 * Better fixes are very welcome! */ 515 if (!arch_i386_is_xbox) 516#endif 517 for (i = 1; i < NKPT; i++) 518 PTD[i] = 0; 519 520 /* Initialize the PAT MSR if present. */ 521 pmap_init_pat(); 522 523 /* Turn on PG_G on kernel page(s) */ 524 pmap_set_pg(); 525} 526 527/* 528 * Setup the PAT MSR. 529 */ 530void 531pmap_init_pat(void) 532{ 533 int pat_table[PAT_INDEX_SIZE]; 534 uint64_t pat_msr; 535 u_long cr0, cr4; 536 int i; 537 538 /* Set default PAT index table. */ 539 for (i = 0; i < PAT_INDEX_SIZE; i++) 540 pat_table[i] = -1; 541 pat_table[PAT_WRITE_BACK] = 0; 542 pat_table[PAT_WRITE_THROUGH] = 1; 543 pat_table[PAT_UNCACHEABLE] = 3; 544 pat_table[PAT_WRITE_COMBINING] = 3; 545 pat_table[PAT_WRITE_PROTECTED] = 3; 546 pat_table[PAT_UNCACHED] = 3; 547 548 /* Bail if this CPU doesn't implement PAT. */ 549 if ((cpu_feature & CPUID_PAT) == 0) { 550 for (i = 0; i < PAT_INDEX_SIZE; i++) 551 pat_index[i] = pat_table[i]; 552 pat_works = 0; 553 return; 554 } 555 556 /* 557 * Due to some Intel errata, we can only safely use the lower 4 558 * PAT entries. 
559 * 560 * Intel Pentium III Processor Specification Update 561 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 562 * or Mode C Paging) 563 * 564 * Intel Pentium IV Processor Specification Update 565 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 566 */ 567 if (cpu_vendor_id == CPU_VENDOR_INTEL && 568 !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) 569 pat_works = 0; 570 571 /* Initialize default PAT entries. */ 572 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 573 PAT_VALUE(1, PAT_WRITE_THROUGH) | 574 PAT_VALUE(2, PAT_UNCACHED) | 575 PAT_VALUE(3, PAT_UNCACHEABLE) | 576 PAT_VALUE(4, PAT_WRITE_BACK) | 577 PAT_VALUE(5, PAT_WRITE_THROUGH) | 578 PAT_VALUE(6, PAT_UNCACHED) | 579 PAT_VALUE(7, PAT_UNCACHEABLE); 580 581 if (pat_works) { 582 /* 583 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 584 * Program 5 and 6 as WP and WC. 585 * Leave 4 and 7 as WB and UC. 586 */ 587 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 588 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 589 PAT_VALUE(6, PAT_WRITE_COMBINING); 590 pat_table[PAT_UNCACHED] = 2; 591 pat_table[PAT_WRITE_PROTECTED] = 5; 592 pat_table[PAT_WRITE_COMBINING] = 6; 593 } else { 594 /* 595 * Just replace PAT Index 2 with WC instead of UC-. 596 */ 597 pat_msr &= ~PAT_MASK(2); 598 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 599 pat_table[PAT_WRITE_COMBINING] = 2; 600 } 601 602 /* Disable PGE. */ 603 cr4 = rcr4(); 604 load_cr4(cr4 & ~CR4_PGE); 605 606 /* Disable caches (CD = 1, NW = 0). */ 607 cr0 = rcr0(); 608 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 609 610 /* Flushes caches and TLBs. */ 611 wbinvd(); 612 invltlb(); 613 614 /* Update PAT and index table. */ 615 wrmsr(MSR_PAT, pat_msr); 616 for (i = 0; i < PAT_INDEX_SIZE; i++) 617 pat_index[i] = pat_table[i]; 618 619 /* Flush caches and TLBs again. */ 620 wbinvd(); 621 invltlb(); 622 623 /* Restore caches and PGE. */ 624 load_cr0(cr0); 625 load_cr4(cr4); 626} 627 628/* 629 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 630 */ 631static void 632pmap_set_pg(void) 633{ 634 pt_entry_t *pte; 635 vm_offset_t va, endva; 636 637 if (pgeflag == 0) 638 return; 639 640 endva = KERNBASE + KERNend; 641 642 if (pseflag) { 643 va = KERNBASE + KERNLOAD; 644 while (va < endva) { 645 pdir_pde(PTD, va) |= pgeflag; 646 invltlb(); /* Play it safe, invltlb() every time */ 647 va += NBPDR; 648 } 649 } else { 650 va = (vm_offset_t)btext; 651 while (va < endva) { 652 pte = vtopte(va); 653 if (*pte) 654 *pte |= pgeflag; 655 invltlb(); /* Play it safe, invltlb() every time */ 656 va += PAGE_SIZE; 657 } 658 } 659} 660 661/* 662 * Initialize a vm_page's machine-dependent fields. 663 */ 664void 665pmap_page_init(vm_page_t m) 666{ 667 668 TAILQ_INIT(&m->md.pv_list); 669 m->md.pat_mode = PAT_WRITE_BACK; 670} 671 672#if defined(PAE) || defined(PAE_TABLES) 673static void * 674pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) 675{ 676 677 /* Inform UMA that this allocator uses kernel_map/object. */ 678 *flags = UMA_SLAB_KERNEL; 679 return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 680 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); 681} 682#endif 683 684/* 685 * Abuse the pte nodes for unmapped kva to thread a kva freelist through. 686 * Requirements: 687 * - Must deal with pages in order to ensure that none of the PG_* bits 688 * are ever set, PG_V in particular. 689 * - Assumes we can write to ptes without pte_store() atomic ops, even 690 * on PAE systems. This should be ok. 
 * - Assumes nothing will ever test these addresses for 0 to indicate
 *   no mapping instead of correctly checking PG_V.
 * - Assumes a vm_offset_t will fit in a pte (true for i386).
 * Because PG_V is never set, there can be no mappings to invalidate.
 */
static vm_offset_t
pmap_ptelist_alloc(vm_offset_t *head)
{
	pt_entry_t *pte;
	vm_offset_t va;

	va = *head;
	if (va == 0)
		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
	pte = vtopte(va);
	*head = *pte;
	if (*head & PG_V)
		panic("pmap_ptelist_alloc: va with PG_V set!");
	*pte = 0;
	return (va);
}

static void
pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
{
	pt_entry_t *pte;

	if (va & PG_V)
		panic("pmap_ptelist_free: freeing va with PG_V set!");
	pte = vtopte(va);
	*pte = *head;		/* virtual! PG_V is 0 though */
	*head = va;
}

static void
pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
{
	int i;
	vm_offset_t va;

	*head = 0;
	for (i = npages - 1; i >= 0; i--) {
		va = (vm_offset_t)base + i * PAGE_SIZE;
		pmap_ptelist_free(head, va);
	}
}

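/*
 * Illustrative sketch of the same technique used by pmap_ptelist_*() above:
 * the freelist's "next" links are stored in the free slots themselves, so no
 * separate list nodes are needed.  This is a minimal user-space analogue with
 * hypothetical names, kept out of the kernel build.
 */
#if 0
#include <stddef.h>

static size_t
slot_freelist_init(size_t *slots, size_t nslots)
{
	size_t head, i;

	/* Thread every slot onto the freelist; 0 terminates the list. */
	head = 0;
	for (i = nslots; i > 0; i--) {
		slots[i - 1] = head;	/* the free slot stores the next link */
		head = i;		/* indices are 1-based so 0 means "empty" */
	}
	return (head);
}

static size_t
slot_freelist_alloc(size_t *slots, size_t *head)
{
	size_t idx;

	if (*head == 0)
		return (0);		/* exhausted */
	idx = *head;
	*head = slots[idx - 1];		/* the next link lives in the slot itself */
	return (idx);
}

static void
slot_freelist_free(size_t *slots, size_t *head, size_t idx)
{

	slots[idx - 1] = *head;
	*head = idx;
}
#endif
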
/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t mpte;
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < NKPT; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = i + KPTDI;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_max = roundup(pv_entry_max, _NPCPV);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings supported and enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pseflag == 0)
		pg_ps_enabled = 0;
	else if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Calculate the size of the pv head table for superpages.
	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
	 */
	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
	    PAGE_SIZE) / NBPDR + 1;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
	if (pv_chunkbase == NULL)
		panic("pmap_init: not enough kvm for pv chunks");
	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
#if defined(PAE) || defined(PAE_TABLES)
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif

	pmap_initialized = 1;
	if (!bootverbose)
		return;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
	}
}


SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
	"Max number of PV entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
	"Page share factor per proc");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2/4MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2/4MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2/4MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2/4MB page promotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	/* The PAT bit is different for PTE's and PDE's. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	/* Map the caching mode to a PAT index. */
	pat_idx = pat_index[mode];

	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
	cache_bits = 0;
	if (pat_idx & 0x4)
		cache_bits |= pat_flag;
	if (pat_idx & 0x2)
		cache_bits |= PG_NC_PCD;
	if (pat_idx & 0x1)
		cache_bits |= PG_NC_PWT;
	return (cache_bits);
}

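/*
 * Worked example for pmap_cache_bits() above: with the PAT layout programmed
 * by pmap_init_pat() when pat_works is set, PAT_WRITE_COMBINING maps to PAT
 * index 6 (binary 110), so the function returns PG_PTE_PAT | PG_NC_PCD for a
 * PTE and PG_PDE_PAT | PG_NC_PCD for a PDE.  When only the lower four PAT
 * entries are usable, the same mode maps to index 2 (binary 010) and only
 * PG_NC_PCD is set.
 */
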
/*
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
	pd_entry_t *pde;
	pmap_t pmap;
	boolean_t PTD_updated;

	PTD_updated = FALSE;
	mtx_lock_spin(&allpmaps_lock);
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
		    PG_FRAME))
			PTD_updated = TRUE;
		pde = pmap_pde(pmap, va);
		pde_store(pde, newpde);
	}
	mtx_unlock_spin(&allpmaps_lock);
	KASSERT(PTD_updated,
	    ("pmap_kenter_pde: current page table is not in allpmaps"));
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
	u_long cr4;

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);
		/*
		 * Although preemption at this point could be detrimental to
		 * performance, it would not lead to an error.  PG_G is simply
		 * ignored if CR4.PGE is clear.  Moreover, in case this block
		 * is re-entered, the load_cr4() either above or below will
		 * modify CR4.PGE flushing the TLB.
		 */
		load_cr4(cr4 | CR4_PGE);
	}
}
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invlpg(va);
		smp_invlpg(va);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invlpg(va);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg(other_cpus, va);
	}
	sched_unpin();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t other_cpus;
	vm_offset_t addr;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg_range(other_cpus, sva, eva);
	}
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invltlb();
		smp_invltlb();
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invltlb();
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invltlb(other_cpus);
	}
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

static void
pmap_update_pde_kernel(void *arg)
{
	struct pde_action *act = arg;
	pd_entry_t *pde;
	pmap_t pmap;

	if (act->store == PCPU_GET(cpuid)) {

		/*
		 * Elsewhere, this operation requires allpmaps_lock for
		 * synchronization.  Here, it does not because it is being
		 * performed in the context of an all_cpus rendezvous.
		 */
		LIST_FOREACH(pmap, &allpmaps, pm_list) {
			pde = pmap_pde(pmap, act->va);
			pde_store(pde, act->newpde);
		}
	}
}

static void
pmap_update_pde_user(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap)
		active = all_cpus;
	else
		active = pmap->pm_active;
	if (CPU_OVERLAP(&active, &other_cpus)) {
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
		    pmap_update_pde_kernel : pmap_update_pde_user,
		    pmap_update_pde_teardown, &act);
	} else {
		if (pmap == kernel_pmap)
			pmap_kenter_pde(va, newpde);
		else
			pde_store(pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invltlb();
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	if (pmap == kernel_pmap)
		pmap_kenter_pde(va, newpde);
	else
		pde_store(pde, newpde);
	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		pmap_update_pde_invalidate(va, newpde);
}
#endif /* !SMP */
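/*
 * Illustrative usage sketch (hypothetical helper, kept out of the build):
 * per the N.B. comment at the top of the SMP block, the invalidation
 * primitives above must be called only after the page table update itself is
 * globally visible.  A typical caller therefore stores the new PTE first and
 * invalidates afterwards.  PV tracking, accounting, and the locking/pinning
 * requirements of pmap_pte_quick() are elided here.
 */
#if 0
static void
pmap_example_remap_page(pmap_t pmap, vm_offset_t va, vm_paddr_t newpa)
{
	pt_entry_t *pte;

	pte = pmap_pte_quick(pmap, va);		/* locate the 4KB PTE */
	pte_store(pte, newpa | PG_RW | PG_V);	/* update the mapping first */
	pmap_invalidate_page(pmap, va);		/* then flush stale TLB entries */
}
#endif
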
#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
{

	if (force) {
		sva &= ~(vm_offset_t)cpu_clflush_line_size;
	} else {
		KASSERT((sva & PAGE_MASK) == 0,
		    ("pmap_invalidate_cache_range: sva not page-aligned"));
		KASSERT((eva & PAGE_MASK) == 0,
		    ("pmap_invalidate_cache_range: eva not page-aligned"));
	}

	if ((cpu_feature & CPUID_SS) != 0 && !force)
		; /* If "Self Snoop" is supported and allowed, do nothing. */
	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {

#ifdef DEV_APIC
		/*
		 * XXX: Some CPUs fault, hang, or trash the local APIC
		 * registers if we use CLFLUSH on the local APIC
		 * range.  The local APIC is always uncached, so we
		 * don't need to flush for that range anyway.
		 */
		if (pmap_kextract(sva) == lapic_paddr)
			return;
#endif
		/*
		 * Otherwise, do per-cache line flush.  Use the mfence
		 * instruction to ensure that previous stores are
		 * included in the write-back.  The processor
		 * propagates flush to other processors in the cache
		 * coherence domain.
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
	} else {

		/*
		 * No targeted cache flush methods are supported by CPU,
		 * or the supplied range is bigger than 2MB.
		 * Globally invalidate cache.
		 */
		pmap_invalidate_cache();
	}
}

void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
	int i;

	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
	    (cpu_feature & CPUID_CLFSH) == 0) {
		pmap_invalidate_cache();
	} else {
		for (i = 0; i < count; i++)
			pmap_flush_page(pages[i]);
	}
}

/*
 * Are we current address space or kernel?  N.B. We return FALSE when
 * a pmap's page table is in use because a kernel thread is borrowing
 * it.  The borrowed page table can change spontaneously, making any
 * dependence on its continued use subject to a race condition.
 */
static __inline int
pmap_is_current(pmap_t pmap)
{

	return (pmap == kernel_pmap ||
	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
}

/*
 * If the given pmap is not the current or kernel pmap, the returned pte must
 * be released by passing it to pmap_pte_release().
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_lock(&PMAP2mutex);
		newpf = *pde & PG_FRAME;
		if ((*PMAP2 & PG_FRAME) != newpf) {
			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
		}
		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (NULL);
}

/*
 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 * being NULL.
 */
static __inline void
pmap_pte_release(pt_entry_t *pte)
{

	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
		mtx_unlock(&PMAP2mutex);
}

/*
 * NB: The sequence of updating a page table followed by accesses to the
 * corresponding pages is subject to the situation described in the "AMD64
 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
 * right after modifying the PTE bits is crucial.
 */
static __inline void
invlcaddr(void *caddr)
{

	invlpg((u_int)caddr);
}

/*
 * Super fast pmap_pte routine best used when scanning
 * the pv lists.  This eliminates many coarse-grained
 * invltlb calls.  Note that many of the pv list
 * scans are across different pmaps.  It is very wasteful
 * to do an entire invltlb for checking a single mapping.
 *
 * If the given pmap is not the current pmap, pvh_global_lock
 * must be held and curthread pinned to a CPU.
 */
static pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel?
*/ 1364 if (pmap_is_current(pmap)) 1365 return (vtopte(va)); 1366 rw_assert(&pvh_global_lock, RA_WLOCKED); 1367 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1368 newpf = *pde & PG_FRAME; 1369 if ((*PMAP1 & PG_FRAME) != newpf) { 1370 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 1371#ifdef SMP 1372 PMAP1cpu = PCPU_GET(cpuid); 1373#endif 1374 invlcaddr(PADDR1); 1375 PMAP1changed++; 1376 } else 1377#ifdef SMP 1378 if (PMAP1cpu != PCPU_GET(cpuid)) { 1379 PMAP1cpu = PCPU_GET(cpuid); 1380 invlcaddr(PADDR1); 1381 PMAP1changedcpu++; 1382 } else 1383#endif 1384 PMAP1unchanged++; 1385 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 1386 } 1387 return (0); 1388} 1389 1390/* 1391 * Routine: pmap_extract 1392 * Function: 1393 * Extract the physical page address associated 1394 * with the given map/virtual_address pair. 1395 */ 1396vm_paddr_t 1397pmap_extract(pmap_t pmap, vm_offset_t va) 1398{ 1399 vm_paddr_t rtval; 1400 pt_entry_t *pte; 1401 pd_entry_t pde; 1402 1403 rtval = 0; 1404 PMAP_LOCK(pmap); 1405 pde = pmap->pm_pdir[va >> PDRSHIFT]; 1406 if (pde != 0) { 1407 if ((pde & PG_PS) != 0) 1408 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 1409 else { 1410 pte = pmap_pte(pmap, va); 1411 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 1412 pmap_pte_release(pte); 1413 } 1414 } 1415 PMAP_UNLOCK(pmap); 1416 return (rtval); 1417} 1418 1419/* 1420 * Routine: pmap_extract_and_hold 1421 * Function: 1422 * Atomically extract and hold the physical page 1423 * with the given pmap and virtual address pair 1424 * if that mapping permits the given protection. 1425 */ 1426vm_page_t 1427pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1428{ 1429 pd_entry_t pde; 1430 pt_entry_t pte, *ptep; 1431 vm_page_t m; 1432 vm_paddr_t pa; 1433 1434 pa = 0; 1435 m = NULL; 1436 PMAP_LOCK(pmap); 1437retry: 1438 pde = *pmap_pde(pmap, va); 1439 if (pde != 0) { 1440 if (pde & PG_PS) { 1441 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1442 if (vm_page_pa_tryrelock(pmap, (pde & 1443 PG_PS_FRAME) | (va & PDRMASK), &pa)) 1444 goto retry; 1445 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 1446 (va & PDRMASK)); 1447 vm_page_hold(m); 1448 } 1449 } else { 1450 ptep = pmap_pte(pmap, va); 1451 pte = *ptep; 1452 pmap_pte_release(ptep); 1453 if (pte != 0 && 1454 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1455 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 1456 &pa)) 1457 goto retry; 1458 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1459 vm_page_hold(m); 1460 } 1461 } 1462 } 1463 PA_UNLOCK_COND(pa); 1464 PMAP_UNLOCK(pmap); 1465 return (m); 1466} 1467 1468/*************************************************** 1469 * Low level mapping routines..... 1470 ***************************************************/ 1471 1472/* 1473 * Add a wired page to the kva. 1474 * Note: not SMP coherent. 1475 * 1476 * This function may be used before pmap_bootstrap() is called. 1477 */ 1478PMAP_INLINE void 1479pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1480{ 1481 pt_entry_t *pte; 1482 1483 pte = vtopte(va); 1484 pte_store(pte, pa | PG_RW | PG_V | pgeflag); 1485} 1486 1487static __inline void 1488pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1489{ 1490 pt_entry_t *pte; 1491 1492 pte = vtopte(va); 1493 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); 1494} 1495 1496/* 1497 * Remove a page from the kernel pagetables. 1498 * Note: not SMP coherent. 1499 * 1500 * This function may be used before pmap_bootstrap() is called. 
1501 */ 1502PMAP_INLINE void 1503pmap_kremove(vm_offset_t va) 1504{ 1505 pt_entry_t *pte; 1506 1507 pte = vtopte(va); 1508 pte_clear(pte); 1509} 1510 1511/* 1512 * Used to map a range of physical addresses into kernel 1513 * virtual address space. 1514 * 1515 * The value passed in '*virt' is a suggested virtual address for 1516 * the mapping. Architectures which can support a direct-mapped 1517 * physical to virtual region can return the appropriate address 1518 * within that region, leaving '*virt' unchanged. Other 1519 * architectures should map the pages starting at '*virt' and 1520 * update '*virt' with the first usable address after the mapped 1521 * region. 1522 */ 1523vm_offset_t 1524pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1525{ 1526 vm_offset_t va, sva; 1527 vm_paddr_t superpage_offset; 1528 pd_entry_t newpde; 1529 1530 va = *virt; 1531 /* 1532 * Does the physical address range's size and alignment permit at 1533 * least one superpage mapping to be created? 1534 */ 1535 superpage_offset = start & PDRMASK; 1536 if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { 1537 /* 1538 * Increase the starting virtual address so that its alignment 1539 * does not preclude the use of superpage mappings. 1540 */ 1541 if ((va & PDRMASK) < superpage_offset) 1542 va = (va & ~PDRMASK) + superpage_offset; 1543 else if ((va & PDRMASK) > superpage_offset) 1544 va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; 1545 } 1546 sva = va; 1547 while (start < end) { 1548 if ((start & PDRMASK) == 0 && end - start >= NBPDR && 1549 pseflag) { 1550 KASSERT((va & PDRMASK) == 0, 1551 ("pmap_map: misaligned va %#x", va)); 1552 newpde = start | PG_PS | pgeflag | PG_RW | PG_V; 1553 pmap_kenter_pde(va, newpde); 1554 va += NBPDR; 1555 start += NBPDR; 1556 } else { 1557 pmap_kenter(va, start); 1558 va += PAGE_SIZE; 1559 start += PAGE_SIZE; 1560 } 1561 } 1562 pmap_invalidate_range(kernel_pmap, sva, va); 1563 *virt = va; 1564 return (sva); 1565} 1566 1567 1568/* 1569 * Add a list of wired pages to the kva 1570 * this routine is only used for temporary 1571 * kernel mappings that do not need to have 1572 * page modification or references recorded. 1573 * Note that old mappings are simply written 1574 * over. The page *must* be wired. 1575 * Note: SMP coherent. Uses a ranged shootdown IPI. 1576 */ 1577void 1578pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1579{ 1580 pt_entry_t *endpte, oldpte, pa, *pte; 1581 vm_page_t m; 1582 1583 oldpte = 0; 1584 pte = vtopte(sva); 1585 endpte = pte + count; 1586 while (pte < endpte) { 1587 m = *ma++; 1588 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 1589 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1590 oldpte |= *pte; 1591 pte_store(pte, pa | pgeflag | PG_RW | PG_V); 1592 } 1593 pte++; 1594 } 1595 if (__predict_false((oldpte & PG_V) != 0)) 1596 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1597 PAGE_SIZE); 1598} 1599 1600/* 1601 * This routine tears out page mappings from the 1602 * kernel -- it is meant only for temporary mappings. 1603 * Note: SMP coherent. Uses a ranged shootdown IPI. 1604 */ 1605void 1606pmap_qremove(vm_offset_t sva, int count) 1607{ 1608 vm_offset_t va; 1609 1610 va = sva; 1611 while (count-- > 0) { 1612 pmap_kremove(va); 1613 va += PAGE_SIZE; 1614 } 1615 pmap_invalidate_range(kernel_pmap, sva, va); 1616} 1617 1618/*************************************************** 1619 * Page table page management routines..... 
1620 ***************************************************/ 1621static __inline void 1622pmap_free_zero_pages(struct spglist *free) 1623{ 1624 vm_page_t m; 1625 1626 while ((m = SLIST_FIRST(free)) != NULL) { 1627 SLIST_REMOVE_HEAD(free, plinks.s.ss); 1628 /* Preserve the page's PG_ZERO setting. */ 1629 vm_page_free_toq(m); 1630 } 1631} 1632 1633/* 1634 * Schedule the specified unused page table page to be freed. Specifically, 1635 * add the page to the specified list of pages that will be released to the 1636 * physical memory manager after the TLB has been updated. 1637 */ 1638static __inline void 1639pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1640 boolean_t set_PG_ZERO) 1641{ 1642 1643 if (set_PG_ZERO) 1644 m->flags |= PG_ZERO; 1645 else 1646 m->flags &= ~PG_ZERO; 1647 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1648} 1649 1650/* 1651 * Inserts the specified page table page into the specified pmap's collection 1652 * of idle page table pages. Each of a pmap's page table pages is responsible 1653 * for mapping a distinct range of virtual addresses. The pmap's collection is 1654 * ordered by this virtual address range. 1655 */ 1656static __inline int 1657pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 1658{ 1659 1660 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1661 return (vm_radix_insert(&pmap->pm_root, mpte)); 1662} 1663 1664/* 1665 * Looks for a page table page mapping the specified virtual address in the 1666 * specified pmap's collection of idle page table pages. Returns NULL if there 1667 * is no page table page corresponding to the specified virtual address. 1668 */ 1669static __inline vm_page_t 1670pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 1671{ 1672 1673 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1674 return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT)); 1675} 1676 1677/* 1678 * Removes the specified page table page from the specified pmap's collection 1679 * of idle page table pages. The specified page table page must be a member of 1680 * the pmap's collection. 1681 */ 1682static __inline void 1683pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 1684{ 1685 1686 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1687 vm_radix_remove(&pmap->pm_root, mpte->pindex); 1688} 1689 1690/* 1691 * Decrements a page table page's wire count, which is used to record the 1692 * number of valid page table entries within the page. If the wire count 1693 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1694 * page table page was unmapped and FALSE otherwise. 1695 */ 1696static inline boolean_t 1697pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1698{ 1699 1700 --m->wire_count; 1701 if (m->wire_count == 0) { 1702 _pmap_unwire_ptp(pmap, m, free); 1703 return (TRUE); 1704 } else 1705 return (FALSE); 1706} 1707 1708static void 1709_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1710{ 1711 vm_offset_t pteva; 1712 1713 /* 1714 * unmap the page table page 1715 */ 1716 pmap->pm_pdir[m->pindex] = 0; 1717 --pmap->pm_stats.resident_count; 1718 1719 /* 1720 * This is a release store so that the ordinary store unmapping 1721 * the page table page is globally performed before TLB shoot- 1722 * down is begun. 1723 */ 1724 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 1725 1726 /* 1727 * Do an invltlb to make the invalidated mapping 1728 * take effect immediately. 
1729 */ 1730 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1731 pmap_invalidate_page(pmap, pteva); 1732 1733 /* 1734 * Put page on a list so that it is released after 1735 * *ALL* TLB shootdown is done 1736 */ 1737 pmap_add_delayed_free_list(m, free, TRUE); 1738} 1739 1740/* 1741 * After removing a page table entry, this routine is used to 1742 * conditionally free the page, and manage the hold/wire counts. 1743 */ 1744static int 1745pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) 1746{ 1747 pd_entry_t ptepde; 1748 vm_page_t mpte; 1749 1750 if (va >= VM_MAXUSER_ADDRESS) 1751 return (0); 1752 ptepde = *pmap_pde(pmap, va); 1753 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1754 return (pmap_unwire_ptp(pmap, mpte, free)); 1755} 1756 1757/* 1758 * Initialize the pmap for the swapper process. 1759 */ 1760void 1761pmap_pinit0(pmap_t pmap) 1762{ 1763 1764 PMAP_LOCK_INIT(pmap); 1765 /* 1766 * Since the page table directory is shared with the kernel pmap, 1767 * which is already included in the list "allpmaps", this pmap does 1768 * not need to be inserted into that list. 1769 */ 1770 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1771#if defined(PAE) || defined(PAE_TABLES) 1772 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1773#endif 1774 pmap->pm_root.rt_root = 0; 1775 CPU_ZERO(&pmap->pm_active); 1776 PCPU_SET(curpmap, pmap); 1777 TAILQ_INIT(&pmap->pm_pvchunk); 1778 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1779} 1780 1781/* 1782 * Initialize a preallocated and zeroed pmap structure, 1783 * such as one in a vmspace structure. 1784 */ 1785int 1786pmap_pinit(pmap_t pmap) 1787{ 1788 vm_page_t m, ptdpg[NPGPTD]; 1789 vm_paddr_t pa; 1790 int i; 1791 1792 /* 1793 * No need to allocate page table space yet but we do need a valid 1794 * page directory table. 1795 */ 1796 if (pmap->pm_pdir == NULL) { 1797 pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); 1798 if (pmap->pm_pdir == NULL) 1799 return (0); 1800#if defined(PAE) || defined(PAE_TABLES) 1801 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1802 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1803 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1804 ("pmap_pinit: pdpt misaligned")); 1805 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1806 ("pmap_pinit: pdpt above 4g")); 1807#endif 1808 pmap->pm_root.rt_root = 0; 1809 } 1810 KASSERT(vm_radix_is_empty(&pmap->pm_root), 1811 ("pmap_pinit: pmap has reserved page table page(s)")); 1812 1813 /* 1814 * allocate the page directory page(s) 1815 */ 1816 for (i = 0; i < NPGPTD;) { 1817 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1818 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1819 if (m == NULL) 1820 VM_WAIT; 1821 else { 1822 ptdpg[i++] = m; 1823 } 1824 } 1825 1826 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); 1827 1828 for (i = 0; i < NPGPTD; i++) 1829 if ((ptdpg[i]->flags & PG_ZERO) == 0) 1830 pagezero(pmap->pm_pdir + (i * NPDEPG)); 1831 1832 mtx_lock_spin(&allpmaps_lock); 1833 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1834 /* Copy the kernel page table directory entries. 
*/ 1835 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); 1836 mtx_unlock_spin(&allpmaps_lock); 1837 1838 /* install self-referential address mapping entry(s) */ 1839 for (i = 0; i < NPGPTD; i++) { 1840 pa = VM_PAGE_TO_PHYS(ptdpg[i]); 1841 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; 1842#if defined(PAE) || defined(PAE_TABLES) 1843 pmap->pm_pdpt[i] = pa | PG_V; 1844#endif 1845 } 1846 1847 CPU_ZERO(&pmap->pm_active); 1848 TAILQ_INIT(&pmap->pm_pvchunk); 1849 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1850 1851 return (1); 1852} 1853 1854/* 1855 * this routine is called if the page table page is not 1856 * mapped correctly. 1857 */ 1858static vm_page_t 1859_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) 1860{ 1861 vm_paddr_t ptepa; 1862 vm_page_t m; 1863 1864 /* 1865 * Allocate a page table page. 1866 */ 1867 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1868 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1869 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 1870 PMAP_UNLOCK(pmap); 1871 rw_wunlock(&pvh_global_lock); 1872 VM_WAIT; 1873 rw_wlock(&pvh_global_lock); 1874 PMAP_LOCK(pmap); 1875 } 1876 1877 /* 1878 * Indicate the need to retry. While waiting, the page table 1879 * page may have been allocated. 1880 */ 1881 return (NULL); 1882 } 1883 if ((m->flags & PG_ZERO) == 0) 1884 pmap_zero_page(m); 1885 1886 /* 1887 * Map the pagetable page into the process address space, if 1888 * it isn't already there. 1889 */ 1890 1891 pmap->pm_stats.resident_count++; 1892 1893 ptepa = VM_PAGE_TO_PHYS(m); 1894 pmap->pm_pdir[ptepindex] = 1895 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 1896 1897 return (m); 1898} 1899 1900static vm_page_t 1901pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) 1902{ 1903 u_int ptepindex; 1904 pd_entry_t ptepa; 1905 vm_page_t m; 1906 1907 /* 1908 * Calculate pagetable page index 1909 */ 1910 ptepindex = va >> PDRSHIFT; 1911retry: 1912 /* 1913 * Get the page directory entry 1914 */ 1915 ptepa = pmap->pm_pdir[ptepindex]; 1916 1917 /* 1918 * This supports switching from a 4MB page to a 1919 * normal 4K page. 1920 */ 1921 if (ptepa & PG_PS) { 1922 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 1923 ptepa = pmap->pm_pdir[ptepindex]; 1924 } 1925 1926 /* 1927 * If the page table page is mapped, we just increment the 1928 * hold count, and activate it. 1929 */ 1930 if (ptepa) { 1931 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 1932 m->wire_count++; 1933 } else { 1934 /* 1935 * Here if the pte page isn't mapped, or if it has 1936 * been deallocated. 1937 */ 1938 m = _pmap_allocpte(pmap, ptepindex, flags); 1939 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 1940 goto retry; 1941 } 1942 return (m); 1943} 1944 1945 1946/*************************************************** 1947* Pmap allocation/deallocation routines. 1948 ***************************************************/ 1949 1950#ifdef SMP 1951/* 1952 * Deal with a SMP shootdown of other users of the pmap that we are 1953 * trying to dispose of. This can be a bit hairy. 
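 * A CPU may still be running with this pmap's page directory loaded in
 * %cr3 after the last thread using it has switched away (lazy context
 * switching).  Before the page directory pages can be freed, each such
 * CPU is forced, by IPI if necessary, back onto its own pcb_cr3.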
1954 */ 1955static cpuset_t *lazymask; 1956static u_int lazyptd; 1957static volatile u_int lazywait; 1958 1959void pmap_lazyfix_action(void); 1960 1961void 1962pmap_lazyfix_action(void) 1963{ 1964 1965#ifdef COUNT_IPIS 1966 (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; 1967#endif 1968 if (rcr3() == lazyptd) 1969 load_cr3(curpcb->pcb_cr3); 1970 CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask); 1971 atomic_store_rel_int(&lazywait, 1); 1972} 1973 1974static void 1975pmap_lazyfix_self(u_int cpuid) 1976{ 1977 1978 if (rcr3() == lazyptd) 1979 load_cr3(curpcb->pcb_cr3); 1980 CPU_CLR_ATOMIC(cpuid, lazymask); 1981} 1982 1983 1984static void 1985pmap_lazyfix(pmap_t pmap) 1986{ 1987 cpuset_t mymask, mask; 1988 u_int cpuid, spins; 1989 int lsb; 1990 1991 mask = pmap->pm_active; 1992 while (!CPU_EMPTY(&mask)) { 1993 spins = 50000000; 1994 1995 /* Find least significant set bit. */ 1996 lsb = CPU_FFS(&mask); 1997 MPASS(lsb != 0); 1998 lsb--; 1999 CPU_SETOF(lsb, &mask); 2000 mtx_lock_spin(&smp_ipi_mtx); 2001#if defined(PAE) || defined(PAE_TABLES) 2002 lazyptd = vtophys(pmap->pm_pdpt); 2003#else 2004 lazyptd = vtophys(pmap->pm_pdir); 2005#endif 2006 cpuid = PCPU_GET(cpuid); 2007 2008 /* Use a cpuset just for having an easy check. */ 2009 CPU_SETOF(cpuid, &mymask); 2010 if (!CPU_CMP(&mask, &mymask)) { 2011 lazymask = &pmap->pm_active; 2012 pmap_lazyfix_self(cpuid); 2013 } else { 2014 atomic_store_rel_int((u_int *)&lazymask, 2015 (u_int)&pmap->pm_active); 2016 atomic_store_rel_int(&lazywait, 0); 2017 ipi_selected(mask, IPI_LAZYPMAP); 2018 while (lazywait == 0) { 2019 ia32_pause(); 2020 if (--spins == 0) 2021 break; 2022 } 2023 } 2024 mtx_unlock_spin(&smp_ipi_mtx); 2025 if (spins == 0) 2026 printf("pmap_lazyfix: spun for 50000000\n"); 2027 mask = pmap->pm_active; 2028 } 2029} 2030 2031#else /* SMP */ 2032 2033/* 2034 * Cleaning up on uniprocessor is easy. For various reasons, we're 2035 * unlikely to have to even execute this code, including the fact 2036 * that the cleanup is deferred until the parent does a wait(2), which 2037 * means that another userland process has run. 2038 */ 2039static void 2040pmap_lazyfix(pmap_t pmap) 2041{ 2042 u_int cr3; 2043 2044 cr3 = vtophys(pmap->pm_pdir); 2045 if (cr3 == rcr3()) { 2046 load_cr3(curpcb->pcb_cr3); 2047 CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active); 2048 } 2049} 2050#endif /* SMP */ 2051 2052/* 2053 * Release any resources held by the given physical map. 2054 * Called when a pmap initialized by pmap_pinit is being released. 2055 * Should only be called if the map contains no valid mappings. 
2056 */ 2057void 2058pmap_release(pmap_t pmap) 2059{ 2060 vm_page_t m, ptdpg[NPGPTD]; 2061 int i; 2062 2063 KASSERT(pmap->pm_stats.resident_count == 0, 2064 ("pmap_release: pmap resident count %ld != 0", 2065 pmap->pm_stats.resident_count)); 2066 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2067 ("pmap_release: pmap has reserved page table page(s)")); 2068 2069 pmap_lazyfix(pmap); 2070 mtx_lock_spin(&allpmaps_lock); 2071 LIST_REMOVE(pmap, pm_list); 2072 mtx_unlock_spin(&allpmaps_lock); 2073 2074 for (i = 0; i < NPGPTD; i++) 2075 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & 2076 PG_FRAME); 2077 2078 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 2079 sizeof(*pmap->pm_pdir)); 2080 2081 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2082 2083 for (i = 0; i < NPGPTD; i++) { 2084 m = ptdpg[i]; 2085#if defined(PAE) || defined(PAE_TABLES) 2086 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2087 ("pmap_release: got wrong ptd page")); 2088#endif 2089 m->wire_count--; 2090 atomic_subtract_int(&cnt.v_wire_count, 1); 2091 vm_page_free_zero(m); 2092 } 2093} 2094 2095static int 2096kvm_size(SYSCTL_HANDLER_ARGS) 2097{ 2098 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 2099 2100 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2101} 2102SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2103 0, 0, kvm_size, "IU", "Size of KVM"); 2104 2105static int 2106kvm_free(SYSCTL_HANDLER_ARGS) 2107{ 2108 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2109 2110 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2111} 2112SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2113 0, 0, kvm_free, "IU", "Amount of KVM free"); 2114 2115/* 2116 * grow the number of kernel page table entries, if needed 2117 */ 2118void 2119pmap_growkernel(vm_offset_t addr) 2120{ 2121 vm_paddr_t ptppaddr; 2122 vm_page_t nkpg; 2123 pd_entry_t newpdir; 2124 2125 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2126 addr = roundup2(addr, NBPDR); 2127 if (addr - 1 >= kernel_map->max_offset) 2128 addr = kernel_map->max_offset; 2129 while (kernel_vm_end < addr) { 2130 if (pdir_pde(PTD, kernel_vm_end)) { 2131 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2132 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2133 kernel_vm_end = kernel_map->max_offset; 2134 break; 2135 } 2136 continue; 2137 } 2138 2139 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2140 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2141 VM_ALLOC_ZERO); 2142 if (nkpg == NULL) 2143 panic("pmap_growkernel: no memory to grow kernel"); 2144 2145 nkpt++; 2146 2147 if ((nkpg->flags & PG_ZERO) == 0) 2148 pmap_zero_page(nkpg); 2149 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2150 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2151 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; 2152 2153 pmap_kenter_pde(kernel_vm_end, newpdir); 2154 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2155 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2156 kernel_vm_end = kernel_map->max_offset; 2157 break; 2158 } 2159 } 2160} 2161 2162 2163/*************************************************** 2164 * page management routines. 
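 * PV entries are allocated in page-sized chunks.  Each chunk carries
 * an _NPCM-word bitmap of its free slots and provides _NPCPV entries;
 * the CTASSERTs below tie those constants to the chunk layout.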
2165 ***************************************************/ 2166 2167CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2168CTASSERT(_NPCM == 11); 2169CTASSERT(_NPCPV == 336); 2170 2171static __inline struct pv_chunk * 2172pv_to_chunk(pv_entry_t pv) 2173{ 2174 2175 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2176} 2177 2178#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2179 2180#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2181#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2182 2183static const uint32_t pc_freemask[_NPCM] = { 2184 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2185 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2186 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2187 PC_FREE0_9, PC_FREE10 2188}; 2189 2190SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2191 "Current number of pv entries"); 2192 2193#ifdef PV_STATS 2194static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2195 2196SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2197 "Current number of pv entry chunks"); 2198SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2199 "Current number of pv entry chunks allocated"); 2200SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2201 "Current number of pv entry chunks frees"); 2202SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2203 "Number of times tried to get a chunk page but failed."); 2204 2205static long pv_entry_frees, pv_entry_allocs; 2206static int pv_entry_spare; 2207 2208SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2209 "Current number of pv entry frees"); 2210SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2211 "Current number of pv entry allocs"); 2212SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2213 "Current number of spare pv entries"); 2214#endif 2215 2216/* 2217 * We are in a serious low memory condition. Resort to 2218 * drastic measures to free some pages so we can allocate 2219 * another pv entry chunk. 2220 */ 2221static vm_page_t 2222pmap_pv_reclaim(pmap_t locked_pmap) 2223{ 2224 struct pch newtail; 2225 struct pv_chunk *pc; 2226 struct md_page *pvh; 2227 pd_entry_t *pde; 2228 pmap_t pmap; 2229 pt_entry_t *pte, tpte; 2230 pv_entry_t pv; 2231 vm_offset_t va; 2232 vm_page_t m, m_pc; 2233 struct spglist free; 2234 uint32_t inuse; 2235 int bit, field, freed; 2236 2237 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2238 pmap = NULL; 2239 m_pc = NULL; 2240 SLIST_INIT(&free); 2241 TAILQ_INIT(&newtail); 2242 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2243 SLIST_EMPTY(&free))) { 2244 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2245 if (pmap != pc->pc_pmap) { 2246 if (pmap != NULL) { 2247 pmap_invalidate_all(pmap); 2248 if (pmap != locked_pmap) 2249 PMAP_UNLOCK(pmap); 2250 } 2251 pmap = pc->pc_pmap; 2252 /* Avoid deadlock and lock recursion. */ 2253 if (pmap > locked_pmap) 2254 PMAP_LOCK(pmap); 2255 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2256 pmap = NULL; 2257 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2258 continue; 2259 } 2260 } 2261 2262 /* 2263 * Destroy every non-wired, 4 KB page mapping in the chunk. 
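	 * A pv entry whose page directory entry is a 2/4MB superpage, or
	 * whose mapping is wired, is skipped.  Each mapping that is torn
	 * down marks its slot free in the chunk's bitmap; a chunk that
	 * becomes entirely free is unmapped and its page handed back to
	 * the caller.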
2264 */ 2265 freed = 0; 2266 for (field = 0; field < _NPCM; field++) { 2267 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2268 inuse != 0; inuse &= ~(1UL << bit)) { 2269 bit = bsfl(inuse); 2270 pv = &pc->pc_pventry[field * 32 + bit]; 2271 va = pv->pv_va; 2272 pde = pmap_pde(pmap, va); 2273 if ((*pde & PG_PS) != 0) 2274 continue; 2275 pte = pmap_pte(pmap, va); 2276 tpte = *pte; 2277 if ((tpte & PG_W) == 0) 2278 tpte = pte_load_clear(pte); 2279 pmap_pte_release(pte); 2280 if ((tpte & PG_W) != 0) 2281 continue; 2282 KASSERT(tpte != 0, 2283 ("pmap_pv_reclaim: pmap %p va %x zero pte", 2284 pmap, va)); 2285 if ((tpte & PG_G) != 0) 2286 pmap_invalidate_page(pmap, va); 2287 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2288 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2289 vm_page_dirty(m); 2290 if ((tpte & PG_A) != 0) 2291 vm_page_aflag_set(m, PGA_REFERENCED); 2292 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2293 if (TAILQ_EMPTY(&m->md.pv_list) && 2294 (m->flags & PG_FICTITIOUS) == 0) { 2295 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2296 if (TAILQ_EMPTY(&pvh->pv_list)) { 2297 vm_page_aflag_clear(m, 2298 PGA_WRITEABLE); 2299 } 2300 } 2301 pc->pc_map[field] |= 1UL << bit; 2302 pmap_unuse_pt(pmap, va, &free); 2303 freed++; 2304 } 2305 } 2306 if (freed == 0) { 2307 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2308 continue; 2309 } 2310 /* Every freed mapping is for a 4 KB page. */ 2311 pmap->pm_stats.resident_count -= freed; 2312 PV_STAT(pv_entry_frees += freed); 2313 PV_STAT(pv_entry_spare += freed); 2314 pv_entry_count -= freed; 2315 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2316 for (field = 0; field < _NPCM; field++) 2317 if (pc->pc_map[field] != pc_freemask[field]) { 2318 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2319 pc_list); 2320 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2321 2322 /* 2323 * One freed pv entry in locked_pmap is 2324 * sufficient. 2325 */ 2326 if (pmap == locked_pmap) 2327 goto out; 2328 break; 2329 } 2330 if (field == _NPCM) { 2331 PV_STAT(pv_entry_spare -= _NPCPV); 2332 PV_STAT(pc_chunk_count--); 2333 PV_STAT(pc_chunk_frees++); 2334 /* Entire chunk is free; return it. */ 2335 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2336 pmap_qremove((vm_offset_t)pc, 1); 2337 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2338 break; 2339 } 2340 } 2341out: 2342 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2343 if (pmap != NULL) { 2344 pmap_invalidate_all(pmap); 2345 if (pmap != locked_pmap) 2346 PMAP_UNLOCK(pmap); 2347 } 2348 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2349 m_pc = SLIST_FIRST(&free); 2350 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2351 /* Recycle a freed page table page. */ 2352 m_pc->wire_count = 1; 2353 atomic_add_int(&cnt.v_wire_count, 1); 2354 } 2355 pmap_free_zero_pages(&free); 2356 return (m_pc); 2357} 2358 2359/* 2360 * free the pv_entry back to the free list 2361 */ 2362static void 2363free_pv_entry(pmap_t pmap, pv_entry_t pv) 2364{ 2365 struct pv_chunk *pc; 2366 int idx, field, bit; 2367 2368 rw_assert(&pvh_global_lock, RA_WLOCKED); 2369 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2370 PV_STAT(pv_entry_frees++); 2371 PV_STAT(pv_entry_spare++); 2372 pv_entry_count--; 2373 pc = pv_to_chunk(pv); 2374 idx = pv - &pc->pc_pventry[0]; 2375 field = idx / 32; 2376 bit = idx % 32; 2377 pc->pc_map[field] |= 1ul << bit; 2378 for (idx = 0; idx < _NPCM; idx++) 2379 if (pc->pc_map[idx] != pc_freemask[idx]) { 2380 /* 2381 * 98% of the time, pc is already at the head of the 2382 * list. If it isn't already, move it to the head. 
2383 */ 2384 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2385 pc)) { 2386 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2387 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2388 pc_list); 2389 } 2390 return; 2391 } 2392 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2393 free_pv_chunk(pc); 2394} 2395 2396static void 2397free_pv_chunk(struct pv_chunk *pc) 2398{ 2399 vm_page_t m; 2400 2401 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2402 PV_STAT(pv_entry_spare -= _NPCPV); 2403 PV_STAT(pc_chunk_count--); 2404 PV_STAT(pc_chunk_frees++); 2405 /* entire chunk is free, return it */ 2406 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2407 pmap_qremove((vm_offset_t)pc, 1); 2408 vm_page_unwire(m, 0); 2409 vm_page_free(m); 2410 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2411} 2412 2413/* 2414 * get a new pv_entry, allocating a block from the system 2415 * when needed. 2416 */ 2417static pv_entry_t 2418get_pv_entry(pmap_t pmap, boolean_t try) 2419{ 2420 static const struct timeval printinterval = { 60, 0 }; 2421 static struct timeval lastprint; 2422 int bit, field; 2423 pv_entry_t pv; 2424 struct pv_chunk *pc; 2425 vm_page_t m; 2426 2427 rw_assert(&pvh_global_lock, RA_WLOCKED); 2428 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2429 PV_STAT(pv_entry_allocs++); 2430 pv_entry_count++; 2431 if (pv_entry_count > pv_entry_high_water) 2432 if (ratecheck(&lastprint, &printinterval)) 2433 printf("Approaching the limit on PV entries, consider " 2434 "increasing either the vm.pmap.shpgperproc or the " 2435 "vm.pmap.pv_entry_max tunable.\n"); 2436retry: 2437 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2438 if (pc != NULL) { 2439 for (field = 0; field < _NPCM; field++) { 2440 if (pc->pc_map[field]) { 2441 bit = bsfl(pc->pc_map[field]); 2442 break; 2443 } 2444 } 2445 if (field < _NPCM) { 2446 pv = &pc->pc_pventry[field * 32 + bit]; 2447 pc->pc_map[field] &= ~(1ul << bit); 2448 /* If this was the last item, move it to tail */ 2449 for (field = 0; field < _NPCM; field++) 2450 if (pc->pc_map[field] != 0) { 2451 PV_STAT(pv_entry_spare--); 2452 return (pv); /* not full, return */ 2453 } 2454 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2455 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2456 PV_STAT(pv_entry_spare--); 2457 return (pv); 2458 } 2459 } 2460 /* 2461 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2462 * global lock. If "pv_vafree" is currently non-empty, it will 2463 * remain non-empty until pmap_ptelist_alloc() completes. 
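	 * If either the KVA for a new chunk or a physical page cannot be
	 * obtained, the allocation fails immediately for "try" callers;
	 * otherwise pmap_pv_reclaim() is invoked to recover a page for
	 * the new chunk before retrying.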
2464 */ 2465 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2466 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2467 if (try) { 2468 pv_entry_count--; 2469 PV_STAT(pc_chunk_tryfail++); 2470 return (NULL); 2471 } 2472 m = pmap_pv_reclaim(pmap); 2473 if (m == NULL) 2474 goto retry; 2475 } 2476 PV_STAT(pc_chunk_count++); 2477 PV_STAT(pc_chunk_allocs++); 2478 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2479 pmap_qenter((vm_offset_t)pc, &m, 1); 2480 pc->pc_pmap = pmap; 2481 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2482 for (field = 1; field < _NPCM; field++) 2483 pc->pc_map[field] = pc_freemask[field]; 2484 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2485 pv = &pc->pc_pventry[0]; 2486 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2487 PV_STAT(pv_entry_spare += _NPCPV - 1); 2488 return (pv); 2489} 2490 2491static __inline pv_entry_t 2492pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2493{ 2494 pv_entry_t pv; 2495 2496 rw_assert(&pvh_global_lock, RA_WLOCKED); 2497 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2498 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2499 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2500 break; 2501 } 2502 } 2503 return (pv); 2504} 2505 2506static void 2507pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2508{ 2509 struct md_page *pvh; 2510 pv_entry_t pv; 2511 vm_offset_t va_last; 2512 vm_page_t m; 2513 2514 rw_assert(&pvh_global_lock, RA_WLOCKED); 2515 KASSERT((pa & PDRMASK) == 0, 2516 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2517 2518 /* 2519 * Transfer the 4mpage's pv entry for this mapping to the first 2520 * page's pv list. 2521 */ 2522 pvh = pa_to_pvh(pa); 2523 va = trunc_4mpage(va); 2524 pv = pmap_pvh_remove(pvh, pmap, va); 2525 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2526 m = PHYS_TO_VM_PAGE(pa); 2527 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2528 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2529 va_last = va + NBPDR - PAGE_SIZE; 2530 do { 2531 m++; 2532 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2533 ("pmap_pv_demote_pde: page %p is not managed", m)); 2534 va += PAGE_SIZE; 2535 pmap_insert_entry(pmap, va, m); 2536 } while (va < va_last); 2537} 2538 2539static void 2540pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2541{ 2542 struct md_page *pvh; 2543 pv_entry_t pv; 2544 vm_offset_t va_last; 2545 vm_page_t m; 2546 2547 rw_assert(&pvh_global_lock, RA_WLOCKED); 2548 KASSERT((pa & PDRMASK) == 0, 2549 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2550 2551 /* 2552 * Transfer the first page's pv entry for this mapping to the 2553 * 4mpage's pv list. Aside from avoiding the cost of a call 2554 * to get_pv_entry(), a transfer avoids the possibility that 2555 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2556 * removes one of the mappings that is being promoted. 2557 */ 2558 m = PHYS_TO_VM_PAGE(pa); 2559 va = trunc_4mpage(va); 2560 pv = pmap_pvh_remove(&m->md, pmap, va); 2561 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2562 pvh = pa_to_pvh(pa); 2563 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2564 /* Free the remaining NPTEPG - 1 pv entries. 
*/ 2565 va_last = va + NBPDR - PAGE_SIZE; 2566 do { 2567 m++; 2568 va += PAGE_SIZE; 2569 pmap_pvh_free(&m->md, pmap, va); 2570 } while (va < va_last); 2571} 2572 2573static void 2574pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2575{ 2576 pv_entry_t pv; 2577 2578 pv = pmap_pvh_remove(pvh, pmap, va); 2579 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2580 free_pv_entry(pmap, pv); 2581} 2582 2583static void 2584pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2585{ 2586 struct md_page *pvh; 2587 2588 rw_assert(&pvh_global_lock, RA_WLOCKED); 2589 pmap_pvh_free(&m->md, pmap, va); 2590 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2591 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2592 if (TAILQ_EMPTY(&pvh->pv_list)) 2593 vm_page_aflag_clear(m, PGA_WRITEABLE); 2594 } 2595} 2596 2597/* 2598 * Create a pv entry for page at pa for 2599 * (pmap, va). 2600 */ 2601static void 2602pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2603{ 2604 pv_entry_t pv; 2605 2606 rw_assert(&pvh_global_lock, RA_WLOCKED); 2607 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2608 pv = get_pv_entry(pmap, FALSE); 2609 pv->pv_va = va; 2610 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2611} 2612 2613/* 2614 * Conditionally create a pv entry. 2615 */ 2616static boolean_t 2617pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2618{ 2619 pv_entry_t pv; 2620 2621 rw_assert(&pvh_global_lock, RA_WLOCKED); 2622 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2623 if (pv_entry_count < pv_entry_high_water && 2624 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2625 pv->pv_va = va; 2626 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2627 return (TRUE); 2628 } else 2629 return (FALSE); 2630} 2631 2632/* 2633 * Create the pv entries for each of the pages within a superpage. 2634 */ 2635static boolean_t 2636pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2637{ 2638 struct md_page *pvh; 2639 pv_entry_t pv; 2640 2641 rw_assert(&pvh_global_lock, RA_WLOCKED); 2642 if (pv_entry_count < pv_entry_high_water && 2643 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2644 pv->pv_va = va; 2645 pvh = pa_to_pvh(pa); 2646 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2647 return (TRUE); 2648 } else 2649 return (FALSE); 2650} 2651 2652/* 2653 * Fills a page table page with mappings to consecutive physical pages. 2654 */ 2655static void 2656pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2657{ 2658 pt_entry_t *pte; 2659 2660 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2661 *pte = newpte; 2662 newpte += PAGE_SIZE; 2663 } 2664} 2665 2666/* 2667 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2668 * 2- or 4MB page mapping is invalidated. 
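 * Demotion replaces the single PDE with a reference to a page table
 * page whose PTEs reproduce the old superpage mapping one 4KB page at
 * a time.  It can fail only if the superpage was never accessed
 * (PG_A clear) or a page table page can be neither recovered nor
 * allocated; in that case the 2/4MB mapping is removed instead.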
2669 */ 2670static boolean_t 2671pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2672{ 2673 pd_entry_t newpde, oldpde; 2674 pt_entry_t *firstpte, newpte; 2675 vm_paddr_t mptepa; 2676 vm_page_t mpte; 2677 struct spglist free; 2678 2679 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2680 oldpde = *pde; 2681 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2682 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2683 if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != 2684 NULL) 2685 pmap_remove_pt_page(pmap, mpte); 2686 else { 2687 KASSERT((oldpde & PG_W) == 0, 2688 ("pmap_demote_pde: page table page for a wired mapping" 2689 " is missing")); 2690 2691 /* 2692 * Invalidate the 2- or 4MB page mapping and return 2693 * "failure" if the mapping was never accessed or the 2694 * allocation of the new page table page fails. 2695 */ 2696 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2697 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2698 VM_ALLOC_WIRED)) == NULL) { 2699 SLIST_INIT(&free); 2700 pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); 2701 pmap_invalidate_page(pmap, trunc_4mpage(va)); 2702 pmap_free_zero_pages(&free); 2703 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2704 " in pmap %p", va, pmap); 2705 return (FALSE); 2706 } 2707 if (va < VM_MAXUSER_ADDRESS) 2708 pmap->pm_stats.resident_count++; 2709 } 2710 mptepa = VM_PAGE_TO_PHYS(mpte); 2711 2712 /* 2713 * If the page mapping is in the kernel's address space, then the 2714 * KPTmap can provide access to the page table page. Otherwise, 2715 * temporarily map the page table page (mpte) into the kernel's 2716 * address space at either PADDR1 or PADDR2. 2717 */ 2718 if (va >= KERNBASE) 2719 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2720 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2721 if ((*PMAP1 & PG_FRAME) != mptepa) { 2722 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2723#ifdef SMP 2724 PMAP1cpu = PCPU_GET(cpuid); 2725#endif 2726 invlcaddr(PADDR1); 2727 PMAP1changed++; 2728 } else 2729#ifdef SMP 2730 if (PMAP1cpu != PCPU_GET(cpuid)) { 2731 PMAP1cpu = PCPU_GET(cpuid); 2732 invlcaddr(PADDR1); 2733 PMAP1changedcpu++; 2734 } else 2735#endif 2736 PMAP1unchanged++; 2737 firstpte = PADDR1; 2738 } else { 2739 mtx_lock(&PMAP2mutex); 2740 if ((*PMAP2 & PG_FRAME) != mptepa) { 2741 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2742 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 2743 } 2744 firstpte = PADDR2; 2745 } 2746 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2747 KASSERT((oldpde & PG_A) != 0, 2748 ("pmap_demote_pde: oldpde is missing PG_A")); 2749 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2750 ("pmap_demote_pde: oldpde is missing PG_M")); 2751 newpte = oldpde & ~PG_PS; 2752 if ((newpte & PG_PDE_PAT) != 0) 2753 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2754 2755 /* 2756 * If the page table page is new, initialize it. 2757 */ 2758 if (mpte->wire_count == 1) { 2759 mpte->wire_count = NPTEPG; 2760 pmap_fill_ptp(firstpte, newpte); 2761 } 2762 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2763 ("pmap_demote_pde: firstpte and newpte map different physical" 2764 " addresses")); 2765 2766 /* 2767 * If the mapping has changed attributes, update the page table 2768 * entries. 2769 */ 2770 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2771 pmap_fill_ptp(firstpte, newpte); 2772 2773 /* 2774 * Demote the mapping. This pmap is locked. The old PDE has 2775 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2776 * set. 
Thus, there is no danger of a race with another 2777 * processor changing the setting of PG_A and/or PG_M between 2778 * the read above and the store below. 2779 */ 2780 if (workaround_erratum383) 2781 pmap_update_pde(pmap, va, pde, newpde); 2782 else if (pmap == kernel_pmap) 2783 pmap_kenter_pde(va, newpde); 2784 else 2785 pde_store(pde, newpde); 2786 if (firstpte == PADDR2) 2787 mtx_unlock(&PMAP2mutex); 2788 2789 /* 2790 * Invalidate the recursive mapping of the page table page. 2791 */ 2792 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2793 2794 /* 2795 * Demote the pv entry. This depends on the earlier demotion 2796 * of the mapping. Specifically, the (re)creation of a per- 2797 * page pv entry might trigger the execution of pmap_collect(), 2798 * which might reclaim a newly (re)created per-page pv entry 2799 * and destroy the associated mapping. In order to destroy 2800 * the mapping, the PDE must have already changed from mapping 2801 * the 2mpage to referencing the page table page. 2802 */ 2803 if ((oldpde & PG_MANAGED) != 0) 2804 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2805 2806 pmap_pde_demotions++; 2807 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2808 " in pmap %p", va, pmap); 2809 return (TRUE); 2810} 2811 2812/* 2813 * Removes a 2- or 4MB page mapping from the kernel pmap. 2814 */ 2815static void 2816pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2817{ 2818 pd_entry_t newpde; 2819 vm_paddr_t mptepa; 2820 vm_page_t mpte; 2821 2822 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2823 mpte = pmap_lookup_pt_page(pmap, va); 2824 if (mpte == NULL) 2825 panic("pmap_remove_kernel_pde: Missing pt page."); 2826 2827 pmap_remove_pt_page(pmap, mpte); 2828 mptepa = VM_PAGE_TO_PHYS(mpte); 2829 newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; 2830 2831 /* 2832 * Initialize the page table page. 2833 */ 2834 pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); 2835 2836 /* 2837 * Remove the mapping. 2838 */ 2839 if (workaround_erratum383) 2840 pmap_update_pde(pmap, va, pde, newpde); 2841 else 2842 pmap_kenter_pde(va, newpde); 2843 2844 /* 2845 * Invalidate the recursive mapping of the page table page. 2846 */ 2847 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2848} 2849 2850/* 2851 * pmap_remove_pde: do the things to unmap a superpage in a process 2852 */ 2853static void 2854pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2855 struct spglist *free) 2856{ 2857 struct md_page *pvh; 2858 pd_entry_t oldpde; 2859 vm_offset_t eva, va; 2860 vm_page_t m, mpte; 2861 2862 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2863 KASSERT((sva & PDRMASK) == 0, 2864 ("pmap_remove_pde: sva is not 4mpage aligned")); 2865 oldpde = pte_load_clear(pdq); 2866 if (oldpde & PG_W) 2867 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2868 2869 /* 2870 * Machines that don't support invlpg, also don't support 2871 * PG_G. 
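	 * Because a PG_G mapping's TLB entry is not flushed by a %cr3
	 * reload, it is invalidated here eagerly; the guarantee above
	 * makes it safe to do so with invlpg via pmap_invalidate_page().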
2872 */ 2873 if (oldpde & PG_G) 2874 pmap_invalidate_page(kernel_pmap, sva); 2875 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2876 if (oldpde & PG_MANAGED) { 2877 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2878 pmap_pvh_free(pvh, pmap, sva); 2879 eva = sva + NBPDR; 2880 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2881 va < eva; va += PAGE_SIZE, m++) { 2882 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2883 vm_page_dirty(m); 2884 if (oldpde & PG_A) 2885 vm_page_aflag_set(m, PGA_REFERENCED); 2886 if (TAILQ_EMPTY(&m->md.pv_list) && 2887 TAILQ_EMPTY(&pvh->pv_list)) 2888 vm_page_aflag_clear(m, PGA_WRITEABLE); 2889 } 2890 } 2891 if (pmap == kernel_pmap) { 2892 pmap_remove_kernel_pde(pmap, pdq, sva); 2893 } else { 2894 mpte = pmap_lookup_pt_page(pmap, sva); 2895 if (mpte != NULL) { 2896 pmap_remove_pt_page(pmap, mpte); 2897 pmap->pm_stats.resident_count--; 2898 KASSERT(mpte->wire_count == NPTEPG, 2899 ("pmap_remove_pde: pte page wire count error")); 2900 mpte->wire_count = 0; 2901 pmap_add_delayed_free_list(mpte, free, FALSE); 2902 atomic_subtract_int(&cnt.v_wire_count, 1); 2903 } 2904 } 2905} 2906 2907/* 2908 * pmap_remove_pte: do the things to unmap a page in a process 2909 */ 2910static int 2911pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 2912 struct spglist *free) 2913{ 2914 pt_entry_t oldpte; 2915 vm_page_t m; 2916 2917 rw_assert(&pvh_global_lock, RA_WLOCKED); 2918 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2919 oldpte = pte_load_clear(ptq); 2920 KASSERT(oldpte != 0, 2921 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); 2922 if (oldpte & PG_W) 2923 pmap->pm_stats.wired_count -= 1; 2924 /* 2925 * Machines that don't support invlpg, also don't support 2926 * PG_G. 2927 */ 2928 if (oldpte & PG_G) 2929 pmap_invalidate_page(kernel_pmap, va); 2930 pmap->pm_stats.resident_count -= 1; 2931 if (oldpte & PG_MANAGED) { 2932 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2933 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2934 vm_page_dirty(m); 2935 if (oldpte & PG_A) 2936 vm_page_aflag_set(m, PGA_REFERENCED); 2937 pmap_remove_entry(pmap, m, va); 2938 } 2939 return (pmap_unuse_pt(pmap, va, free)); 2940} 2941 2942/* 2943 * Remove a single page from a process address space 2944 */ 2945static void 2946pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 2947{ 2948 pt_entry_t *pte; 2949 2950 rw_assert(&pvh_global_lock, RA_WLOCKED); 2951 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2952 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2953 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 2954 return; 2955 pmap_remove_pte(pmap, pte, va, free); 2956 pmap_invalidate_page(pmap, va); 2957} 2958 2959/* 2960 * Remove the given range of addresses from the specified map. 2961 * 2962 * It is assumed that the start and end are properly 2963 * rounded to the page size. 2964 */ 2965void 2966pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2967{ 2968 vm_offset_t pdnxt; 2969 pd_entry_t ptpaddr; 2970 pt_entry_t *pte; 2971 struct spglist free; 2972 int anyvalid; 2973 2974 /* 2975 * Perform an unsynchronized read. This is, however, safe. 2976 */ 2977 if (pmap->pm_stats.resident_count == 0) 2978 return; 2979 2980 anyvalid = 0; 2981 SLIST_INIT(&free); 2982 2983 rw_wlock(&pvh_global_lock); 2984 sched_pin(); 2985 PMAP_LOCK(pmap); 2986 2987 /* 2988 * special handling of removing one page. a very 2989 * common operation and easy to short circuit some 2990 * code. 
2991 */ 2992 if ((sva + PAGE_SIZE == eva) && 2993 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2994 pmap_remove_page(pmap, sva, &free); 2995 goto out; 2996 } 2997 2998 for (; sva < eva; sva = pdnxt) { 2999 u_int pdirindex; 3000 3001 /* 3002 * Calculate index for next page table. 3003 */ 3004 pdnxt = (sva + NBPDR) & ~PDRMASK; 3005 if (pdnxt < sva) 3006 pdnxt = eva; 3007 if (pmap->pm_stats.resident_count == 0) 3008 break; 3009 3010 pdirindex = sva >> PDRSHIFT; 3011 ptpaddr = pmap->pm_pdir[pdirindex]; 3012 3013 /* 3014 * Weed out invalid mappings. Note: we assume that the page 3015 * directory table is always allocated, and in kernel virtual. 3016 */ 3017 if (ptpaddr == 0) 3018 continue; 3019 3020 /* 3021 * Check for large page. 3022 */ 3023 if ((ptpaddr & PG_PS) != 0) { 3024 /* 3025 * Are we removing the entire large page? If not, 3026 * demote the mapping and fall through. 3027 */ 3028 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3029 /* 3030 * The TLB entry for a PG_G mapping is 3031 * invalidated by pmap_remove_pde(). 3032 */ 3033 if ((ptpaddr & PG_G) == 0) 3034 anyvalid = 1; 3035 pmap_remove_pde(pmap, 3036 &pmap->pm_pdir[pdirindex], sva, &free); 3037 continue; 3038 } else if (!pmap_demote_pde(pmap, 3039 &pmap->pm_pdir[pdirindex], sva)) { 3040 /* The large page mapping was destroyed. */ 3041 continue; 3042 } 3043 } 3044 3045 /* 3046 * Limit our scan to either the end of the va represented 3047 * by the current page table page, or to the end of the 3048 * range being removed. 3049 */ 3050 if (pdnxt > eva) 3051 pdnxt = eva; 3052 3053 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3054 sva += PAGE_SIZE) { 3055 if (*pte == 0) 3056 continue; 3057 3058 /* 3059 * The TLB entry for a PG_G mapping is invalidated 3060 * by pmap_remove_pte(). 3061 */ 3062 if ((*pte & PG_G) == 0) 3063 anyvalid = 1; 3064 if (pmap_remove_pte(pmap, pte, sva, &free)) 3065 break; 3066 } 3067 } 3068out: 3069 sched_unpin(); 3070 if (anyvalid) 3071 pmap_invalidate_all(pmap); 3072 rw_wunlock(&pvh_global_lock); 3073 PMAP_UNLOCK(pmap); 3074 pmap_free_zero_pages(&free); 3075} 3076 3077/* 3078 * Routine: pmap_remove_all 3079 * Function: 3080 * Removes this physical page from 3081 * all physical maps in which it resides. 3082 * Reflects back modify bits to the pager. 3083 * 3084 * Notes: 3085 * Original versions of this routine were very 3086 * inefficient because they iteratively called 3087 * pmap_remove (slow...) 
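 * The current implementation works in two passes: any 2/4MB mappings
 * found on the page's pv-head list are first demoted, and then each
 * remaining 4KB mapping is removed from its pmap.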
3088 */ 3089 3090void 3091pmap_remove_all(vm_page_t m) 3092{ 3093 struct md_page *pvh; 3094 pv_entry_t pv; 3095 pmap_t pmap; 3096 pt_entry_t *pte, tpte; 3097 pd_entry_t *pde; 3098 vm_offset_t va; 3099 struct spglist free; 3100 3101 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3102 ("pmap_remove_all: page %p is not managed", m)); 3103 SLIST_INIT(&free); 3104 rw_wlock(&pvh_global_lock); 3105 sched_pin(); 3106 if ((m->flags & PG_FICTITIOUS) != 0) 3107 goto small_mappings; 3108 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3109 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3110 va = pv->pv_va; 3111 pmap = PV_PMAP(pv); 3112 PMAP_LOCK(pmap); 3113 pde = pmap_pde(pmap, va); 3114 (void)pmap_demote_pde(pmap, pde, va); 3115 PMAP_UNLOCK(pmap); 3116 } 3117small_mappings: 3118 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3119 pmap = PV_PMAP(pv); 3120 PMAP_LOCK(pmap); 3121 pmap->pm_stats.resident_count--; 3122 pde = pmap_pde(pmap, pv->pv_va); 3123 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3124 " a 4mpage in page %p's pv list", m)); 3125 pte = pmap_pte_quick(pmap, pv->pv_va); 3126 tpte = pte_load_clear(pte); 3127 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3128 pmap, pv->pv_va)); 3129 if (tpte & PG_W) 3130 pmap->pm_stats.wired_count--; 3131 if (tpte & PG_A) 3132 vm_page_aflag_set(m, PGA_REFERENCED); 3133 3134 /* 3135 * Update the vm_page_t clean and reference bits. 3136 */ 3137 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3138 vm_page_dirty(m); 3139 pmap_unuse_pt(pmap, pv->pv_va, &free); 3140 pmap_invalidate_page(pmap, pv->pv_va); 3141 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3142 free_pv_entry(pmap, pv); 3143 PMAP_UNLOCK(pmap); 3144 } 3145 vm_page_aflag_clear(m, PGA_WRITEABLE); 3146 sched_unpin(); 3147 rw_wunlock(&pvh_global_lock); 3148 pmap_free_zero_pages(&free); 3149} 3150 3151/* 3152 * pmap_protect_pde: do the things to protect a 4mpage in a process 3153 */ 3154static boolean_t 3155pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3156{ 3157 pd_entry_t newpde, oldpde; 3158 vm_offset_t eva, va; 3159 vm_page_t m; 3160 boolean_t anychanged; 3161 3162 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3163 KASSERT((sva & PDRMASK) == 0, 3164 ("pmap_protect_pde: sva is not 4mpage aligned")); 3165 anychanged = FALSE; 3166retry: 3167 oldpde = newpde = *pde; 3168 if (oldpde & PG_MANAGED) { 3169 eva = sva + NBPDR; 3170 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3171 va < eva; va += PAGE_SIZE, m++) 3172 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3173 vm_page_dirty(m); 3174 } 3175 if ((prot & VM_PROT_WRITE) == 0) 3176 newpde &= ~(PG_RW | PG_M); 3177#if defined(PAE) || defined(PAE_TABLES) 3178 if ((prot & VM_PROT_EXECUTE) == 0) 3179 newpde |= pg_nx; 3180#endif 3181 if (newpde != oldpde) { 3182 if (!pde_cmpset(pde, oldpde, newpde)) 3183 goto retry; 3184 if (oldpde & PG_G) 3185 pmap_invalidate_page(pmap, sva); 3186 else 3187 anychanged = TRUE; 3188 } 3189 return (anychanged); 3190} 3191 3192/* 3193 * Set the physical protection on the 3194 * specified range of this map as requested. 
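 * A VM_PROT_NONE request is forwarded to pmap_remove().  A request
 * that leaves write permission in place (and execute permission, when
 * NX is available) cannot restrict anything and returns immediately.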
3195 */ 3196void 3197pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3198{ 3199 vm_offset_t pdnxt; 3200 pd_entry_t ptpaddr; 3201 pt_entry_t *pte; 3202 boolean_t anychanged, pv_lists_locked; 3203 3204 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3205 if (prot == VM_PROT_NONE) { 3206 pmap_remove(pmap, sva, eva); 3207 return; 3208 } 3209 3210#if defined(PAE) || defined(PAE_TABLES) 3211 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3212 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3213 return; 3214#else 3215 if (prot & VM_PROT_WRITE) 3216 return; 3217#endif 3218 3219 if (pmap_is_current(pmap)) 3220 pv_lists_locked = FALSE; 3221 else { 3222 pv_lists_locked = TRUE; 3223resume: 3224 rw_wlock(&pvh_global_lock); 3225 sched_pin(); 3226 } 3227 anychanged = FALSE; 3228 3229 PMAP_LOCK(pmap); 3230 for (; sva < eva; sva = pdnxt) { 3231 pt_entry_t obits, pbits; 3232 u_int pdirindex; 3233 3234 pdnxt = (sva + NBPDR) & ~PDRMASK; 3235 if (pdnxt < sva) 3236 pdnxt = eva; 3237 3238 pdirindex = sva >> PDRSHIFT; 3239 ptpaddr = pmap->pm_pdir[pdirindex]; 3240 3241 /* 3242 * Weed out invalid mappings. Note: we assume that the page 3243 * directory table is always allocated, and in kernel virtual. 3244 */ 3245 if (ptpaddr == 0) 3246 continue; 3247 3248 /* 3249 * Check for large page. 3250 */ 3251 if ((ptpaddr & PG_PS) != 0) { 3252 /* 3253 * Are we protecting the entire large page? If not, 3254 * demote the mapping and fall through. 3255 */ 3256 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3257 /* 3258 * The TLB entry for a PG_G mapping is 3259 * invalidated by pmap_protect_pde(). 3260 */ 3261 if (pmap_protect_pde(pmap, 3262 &pmap->pm_pdir[pdirindex], sva, prot)) 3263 anychanged = TRUE; 3264 continue; 3265 } else { 3266 if (!pv_lists_locked) { 3267 pv_lists_locked = TRUE; 3268 if (!rw_try_wlock(&pvh_global_lock)) { 3269 if (anychanged) 3270 pmap_invalidate_all( 3271 pmap); 3272 PMAP_UNLOCK(pmap); 3273 goto resume; 3274 } 3275 sched_pin(); 3276 } 3277 if (!pmap_demote_pde(pmap, 3278 &pmap->pm_pdir[pdirindex], sva)) { 3279 /* 3280 * The large page mapping was 3281 * destroyed. 3282 */ 3283 continue; 3284 } 3285 } 3286 } 3287 3288 if (pdnxt > eva) 3289 pdnxt = eva; 3290 3291 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3292 sva += PAGE_SIZE) { 3293 vm_page_t m; 3294 3295retry: 3296 /* 3297 * Regardless of whether a pte is 32 or 64 bits in 3298 * size, PG_RW, PG_A, and PG_M are among the least 3299 * significant 32 bits. 
3300 */ 3301 obits = pbits = *pte; 3302 if ((pbits & PG_V) == 0) 3303 continue; 3304 3305 if ((prot & VM_PROT_WRITE) == 0) { 3306 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3307 (PG_MANAGED | PG_M | PG_RW)) { 3308 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3309 vm_page_dirty(m); 3310 } 3311 pbits &= ~(PG_RW | PG_M); 3312 } 3313#if defined(PAE) || defined(PAE_TABLES) 3314 if ((prot & VM_PROT_EXECUTE) == 0) 3315 pbits |= pg_nx; 3316#endif 3317 3318 if (pbits != obits) { 3319#if defined(PAE) || defined(PAE_TABLES) 3320 if (!atomic_cmpset_64(pte, obits, pbits)) 3321 goto retry; 3322#else 3323 if (!atomic_cmpset_int((u_int *)pte, obits, 3324 pbits)) 3325 goto retry; 3326#endif 3327 if (obits & PG_G) 3328 pmap_invalidate_page(pmap, sva); 3329 else 3330 anychanged = TRUE; 3331 } 3332 } 3333 } 3334 if (anychanged) 3335 pmap_invalidate_all(pmap); 3336 if (pv_lists_locked) { 3337 sched_unpin(); 3338 rw_wunlock(&pvh_global_lock); 3339 } 3340 PMAP_UNLOCK(pmap); 3341} 3342 3343/* 3344 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3345 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3346 * For promotion to occur, two conditions must be met: (1) the 4KB page 3347 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3348 * mappings must have identical characteristics. 3349 * 3350 * Managed (PG_MANAGED) mappings within the kernel address space are not 3351 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3352 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3353 * pmap. 3354 */ 3355static void 3356pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3357{ 3358 pd_entry_t newpde; 3359 pt_entry_t *firstpte, oldpte, pa, *pte; 3360 vm_offset_t oldpteva; 3361 vm_page_t mpte; 3362 3363 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3364 3365 /* 3366 * Examine the first PTE in the specified PTP. Abort if this PTE is 3367 * either invalid, unused, or does not map the first 4KB physical page 3368 * within a 2- or 4MB page. 3369 */ 3370 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3371setpde: 3372 newpde = *firstpte; 3373 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3374 pmap_pde_p_failures++; 3375 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3376 " in pmap %p", va, pmap); 3377 return; 3378 } 3379 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3380 pmap_pde_p_failures++; 3381 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3382 " in pmap %p", va, pmap); 3383 return; 3384 } 3385 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3386 /* 3387 * When PG_M is already clear, PG_RW can be cleared without 3388 * a TLB invalidation. 3389 */ 3390 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3391 ~PG_RW)) 3392 goto setpde; 3393 newpde &= ~PG_RW; 3394 } 3395 3396 /* 3397 * Examine each of the other PTEs in the specified PTP. Abort if this 3398 * PTE maps an unexpected 4KB physical page or does not have identical 3399 * characteristics to the first PTE. 
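	 * The scan proceeds from the last PTE in the page toward the
	 * first; "pa" holds the physical address (and PG_A/PG_V bits)
	 * that each successive PTE is required to match and is
	 * decremented by PAGE_SIZE per step.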
3400 */ 3401 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3402 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3403setpte: 3404 oldpte = *pte; 3405 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3406 pmap_pde_p_failures++; 3407 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3408 " in pmap %p", va, pmap); 3409 return; 3410 } 3411 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3412 /* 3413 * When PG_M is already clear, PG_RW can be cleared 3414 * without a TLB invalidation. 3415 */ 3416 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3417 oldpte & ~PG_RW)) 3418 goto setpte; 3419 oldpte &= ~PG_RW; 3420 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3421 (va & ~PDRMASK); 3422 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3423 " in pmap %p", oldpteva, pmap); 3424 } 3425 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3426 pmap_pde_p_failures++; 3427 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3428 " in pmap %p", va, pmap); 3429 return; 3430 } 3431 pa -= PAGE_SIZE; 3432 } 3433 3434 /* 3435 * Save the page table page in its current state until the PDE 3436 * mapping the superpage is demoted by pmap_demote_pde() or 3437 * destroyed by pmap_remove_pde(). 3438 */ 3439 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3440 KASSERT(mpte >= vm_page_array && 3441 mpte < &vm_page_array[vm_page_array_size], 3442 ("pmap_promote_pde: page table page is out of range")); 3443 KASSERT(mpte->pindex == va >> PDRSHIFT, 3444 ("pmap_promote_pde: page table page's pindex is wrong")); 3445 if (pmap_insert_pt_page(pmap, mpte)) { 3446 pmap_pde_p_failures++; 3447 CTR2(KTR_PMAP, 3448 "pmap_promote_pde: failure for va %#x in pmap %p", va, 3449 pmap); 3450 return; 3451 } 3452 3453 /* 3454 * Promote the pv entries. 3455 */ 3456 if ((newpde & PG_MANAGED) != 0) 3457 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3458 3459 /* 3460 * Propagate the PAT index to its proper position. 3461 */ 3462 if ((newpde & PG_PTE_PAT) != 0) 3463 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3464 3465 /* 3466 * Map the superpage. 3467 */ 3468 if (workaround_erratum383) 3469 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3470 else if (pmap == kernel_pmap) 3471 pmap_kenter_pde(va, PG_PS | newpde); 3472 else 3473 pde_store(pde, PG_PS | newpde); 3474 3475 pmap_pde_promotions++; 3476 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3477 " in pmap %p", va, pmap); 3478} 3479 3480/* 3481 * Insert the given physical page (p) at 3482 * the specified virtual address (v) in the 3483 * target physical map with the protection requested. 3484 * 3485 * If specified, the page will be wired down, meaning 3486 * that the related pte can not be reclaimed. 3487 * 3488 * NB: This is the only routine which MAY NOT lazy-evaluate 3489 * or lose information. That is, this routine must actually 3490 * insert this page into the given map NOW. 
3491 */ 3492int 3493pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3494 u_int flags, int8_t psind) 3495{ 3496 pd_entry_t *pde; 3497 pt_entry_t *pte; 3498 pt_entry_t newpte, origpte; 3499 pv_entry_t pv; 3500 vm_paddr_t opa, pa; 3501 vm_page_t mpte, om; 3502 boolean_t invlva, wired; 3503 3504 va = trunc_page(va); 3505 mpte = NULL; 3506 wired = (flags & PMAP_ENTER_WIRED) != 0; 3507 3508 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3509 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3510 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", 3511 va)); 3512 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 3513 VM_OBJECT_ASSERT_LOCKED(m->object); 3514 3515 rw_wlock(&pvh_global_lock); 3516 PMAP_LOCK(pmap); 3517 sched_pin(); 3518 3519 /* 3520 * In the case that a page table page is not 3521 * resident, we are creating it here. 3522 */ 3523 if (va < VM_MAXUSER_ADDRESS) { 3524 mpte = pmap_allocpte(pmap, va, flags); 3525 if (mpte == NULL) { 3526 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3527 ("pmap_allocpte failed with sleep allowed")); 3528 sched_unpin(); 3529 rw_wunlock(&pvh_global_lock); 3530 PMAP_UNLOCK(pmap); 3531 return (KERN_RESOURCE_SHORTAGE); 3532 } 3533 } 3534 3535 pde = pmap_pde(pmap, va); 3536 if ((*pde & PG_PS) != 0) 3537 panic("pmap_enter: attempted pmap_enter on 4MB page"); 3538 pte = pmap_pte_quick(pmap, va); 3539 3540 /* 3541 * Page Directory table entry not valid, we need a new PT page 3542 */ 3543 if (pte == NULL) { 3544 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3545 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3546 } 3547 3548 pa = VM_PAGE_TO_PHYS(m); 3549 om = NULL; 3550 origpte = *pte; 3551 opa = origpte & PG_FRAME; 3552 3553 /* 3554 * Mapping has not changed, must be protection or wiring change. 3555 */ 3556 if (origpte && (opa == pa)) { 3557 /* 3558 * Wiring change, just update stats. We don't worry about 3559 * wiring PT pages as they remain resident as long as there 3560 * are valid mappings in them. Hence, if a user page is wired, 3561 * the PT page will be also. 3562 */ 3563 if (wired && ((origpte & PG_W) == 0)) 3564 pmap->pm_stats.wired_count++; 3565 else if (!wired && (origpte & PG_W)) 3566 pmap->pm_stats.wired_count--; 3567 3568 /* 3569 * Remove extra pte reference 3570 */ 3571 if (mpte) 3572 mpte->wire_count--; 3573 3574 if (origpte & PG_MANAGED) { 3575 om = m; 3576 pa |= PG_MANAGED; 3577 } 3578 goto validate; 3579 } 3580 3581 pv = NULL; 3582 3583 /* 3584 * Mapping has changed, invalidate old range and fall through to 3585 * handle validating new mapping. 3586 */ 3587 if (opa) { 3588 if (origpte & PG_W) 3589 pmap->pm_stats.wired_count--; 3590 if (origpte & PG_MANAGED) { 3591 om = PHYS_TO_VM_PAGE(opa); 3592 pv = pmap_pvh_remove(&om->md, pmap, va); 3593 } 3594 if (mpte != NULL) { 3595 mpte->wire_count--; 3596 KASSERT(mpte->wire_count > 0, 3597 ("pmap_enter: missing reference to page table page," 3598 " va: 0x%x", va)); 3599 } 3600 } else 3601 pmap->pm_stats.resident_count++; 3602 3603 /* 3604 * Enter on the PV list if part of our managed memory. 
3605 */ 3606 if ((m->oflags & VPO_UNMANAGED) == 0) { 3607 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3608 ("pmap_enter: managed mapping within the clean submap")); 3609 if (pv == NULL) 3610 pv = get_pv_entry(pmap, FALSE); 3611 pv->pv_va = va; 3612 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3613 pa |= PG_MANAGED; 3614 } else if (pv != NULL) 3615 free_pv_entry(pmap, pv); 3616 3617 /* 3618 * Increment counters 3619 */ 3620 if (wired) 3621 pmap->pm_stats.wired_count++; 3622 3623validate: 3624 /* 3625 * Now validate mapping with desired protection/wiring. 3626 */ 3627 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); 3628 if ((prot & VM_PROT_WRITE) != 0) { 3629 newpte |= PG_RW; 3630 if ((newpte & PG_MANAGED) != 0) 3631 vm_page_aflag_set(m, PGA_WRITEABLE); 3632 } 3633#if defined(PAE) || defined(PAE_TABLES) 3634 if ((prot & VM_PROT_EXECUTE) == 0) 3635 newpte |= pg_nx; 3636#endif 3637 if (wired) 3638 newpte |= PG_W; 3639 if (va < VM_MAXUSER_ADDRESS) 3640 newpte |= PG_U; 3641 if (pmap == kernel_pmap) 3642 newpte |= pgeflag; 3643 3644 /* 3645 * if the mapping or permission bits are different, we need 3646 * to update the pte. 3647 */ 3648 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3649 newpte |= PG_A; 3650 if ((flags & VM_PROT_WRITE) != 0) 3651 newpte |= PG_M; 3652 if (origpte & PG_V) { 3653 invlva = FALSE; 3654 origpte = pte_load_store(pte, newpte); 3655 if (origpte & PG_A) { 3656 if (origpte & PG_MANAGED) 3657 vm_page_aflag_set(om, PGA_REFERENCED); 3658 if (opa != VM_PAGE_TO_PHYS(m)) 3659 invlva = TRUE; 3660#if defined(PAE) || defined(PAE_TABLES) 3661 if ((origpte & PG_NX) == 0 && 3662 (newpte & PG_NX) != 0) 3663 invlva = TRUE; 3664#endif 3665 } 3666 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3667 if ((origpte & PG_MANAGED) != 0) 3668 vm_page_dirty(om); 3669 if ((prot & VM_PROT_WRITE) == 0) 3670 invlva = TRUE; 3671 } 3672 if ((origpte & PG_MANAGED) != 0 && 3673 TAILQ_EMPTY(&om->md.pv_list) && 3674 ((om->flags & PG_FICTITIOUS) != 0 || 3675 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3676 vm_page_aflag_clear(om, PGA_WRITEABLE); 3677 if (invlva) 3678 pmap_invalidate_page(pmap, va); 3679 } else 3680 pte_store(pte, newpte); 3681 } 3682 3683 /* 3684 * If both the page table page and the reservation are fully 3685 * populated, then attempt promotion. 3686 */ 3687 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3688 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3689 vm_reserv_level_iffullpop(m) == 0) 3690 pmap_promote_pde(pmap, pde, va); 3691 3692 sched_unpin(); 3693 rw_wunlock(&pvh_global_lock); 3694 PMAP_UNLOCK(pmap); 3695 return (KERN_SUCCESS); 3696} 3697 3698/* 3699 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and 3700 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 3701 * blocking, (2) a mapping already exists at the specified virtual address, or 3702 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
3703 */ 3704static boolean_t 3705pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3706{ 3707 pd_entry_t *pde, newpde; 3708 3709 rw_assert(&pvh_global_lock, RA_WLOCKED); 3710 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3711 pde = pmap_pde(pmap, va); 3712 if (*pde != 0) { 3713 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3714 " in pmap %p", va, pmap); 3715 return (FALSE); 3716 } 3717 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3718 PG_PS | PG_V; 3719 if ((m->oflags & VPO_UNMANAGED) == 0) { 3720 newpde |= PG_MANAGED; 3721 3722 /* 3723 * Abort this mapping if its PV entry could not be created. 3724 */ 3725 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3726 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3727 " in pmap %p", va, pmap); 3728 return (FALSE); 3729 } 3730 } 3731#if defined(PAE) || defined(PAE_TABLES) 3732 if ((prot & VM_PROT_EXECUTE) == 0) 3733 newpde |= pg_nx; 3734#endif 3735 if (va < VM_MAXUSER_ADDRESS) 3736 newpde |= PG_U; 3737 3738 /* 3739 * Increment counters. 3740 */ 3741 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3742 3743 /* 3744 * Map the superpage. 3745 */ 3746 pde_store(pde, newpde); 3747 3748 pmap_pde_mappings++; 3749 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3750 " in pmap %p", va, pmap); 3751 return (TRUE); 3752} 3753 3754/* 3755 * Maps a sequence of resident pages belonging to the same object. 3756 * The sequence begins with the given page m_start. This page is 3757 * mapped at the given virtual address start. Each subsequent page is 3758 * mapped at a virtual address that is offset from start by the same 3759 * amount as the page is offset from m_start within the object. The 3760 * last page in the sequence is the page with the largest offset from 3761 * m_start that can be mapped at a virtual address less than the given 3762 * virtual address end. Not every virtual page between start and end 3763 * is mapped; only those for which a resident page exists with the 3764 * corresponding offset from m_start are mapped. 3765 */ 3766void 3767pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3768 vm_page_t m_start, vm_prot_t prot) 3769{ 3770 vm_offset_t va; 3771 vm_page_t m, mpte; 3772 vm_pindex_t diff, psize; 3773 3774 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3775 3776 psize = atop(end - start); 3777 mpte = NULL; 3778 m = m_start; 3779 rw_wlock(&pvh_global_lock); 3780 PMAP_LOCK(pmap); 3781 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3782 va = start + ptoa(diff); 3783 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3784 m->psind == 1 && pg_ps_enabled && 3785 pmap_enter_pde(pmap, va, m, prot)) 3786 m = &m[NBPDR / PAGE_SIZE - 1]; 3787 else 3788 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3789 mpte); 3790 m = TAILQ_NEXT(m, listq); 3791 } 3792 rw_wunlock(&pvh_global_lock); 3793 PMAP_UNLOCK(pmap); 3794} 3795 3796/* 3797 * this code makes some *MAJOR* assumptions: 3798 * 1. Current pmap & pmap exists. 3799 * 2. Not wired. 3800 * 3. Read access. 3801 * 4. No page table pages. 3802 * but is *MUCH* faster than pmap_enter... 
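 * On any resource shortage (no page table page or no pv entry
 * obtainable without sleeping), or if a mapping already exists at the
 * address, the page is simply left unmapped; no error is reported to
 * the caller.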
3803 */ 3804 3805void 3806pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3807{ 3808 3809 rw_wlock(&pvh_global_lock); 3810 PMAP_LOCK(pmap); 3811 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3812 rw_wunlock(&pvh_global_lock); 3813 PMAP_UNLOCK(pmap); 3814} 3815 3816static vm_page_t 3817pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3818 vm_prot_t prot, vm_page_t mpte) 3819{ 3820 pt_entry_t *pte; 3821 vm_paddr_t pa; 3822 struct spglist free; 3823 3824 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3825 (m->oflags & VPO_UNMANAGED) != 0, 3826 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3827 rw_assert(&pvh_global_lock, RA_WLOCKED); 3828 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3829 3830 /* 3831 * In the case that a page table page is not 3832 * resident, we are creating it here. 3833 */ 3834 if (va < VM_MAXUSER_ADDRESS) { 3835 u_int ptepindex; 3836 pd_entry_t ptepa; 3837 3838 /* 3839 * Calculate pagetable page index 3840 */ 3841 ptepindex = va >> PDRSHIFT; 3842 if (mpte && (mpte->pindex == ptepindex)) { 3843 mpte->wire_count++; 3844 } else { 3845 /* 3846 * Get the page directory entry 3847 */ 3848 ptepa = pmap->pm_pdir[ptepindex]; 3849 3850 /* 3851 * If the page table page is mapped, we just increment 3852 * the hold count, and activate it. 3853 */ 3854 if (ptepa) { 3855 if (ptepa & PG_PS) 3856 return (NULL); 3857 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 3858 mpte->wire_count++; 3859 } else { 3860 mpte = _pmap_allocpte(pmap, ptepindex, 3861 PMAP_ENTER_NOSLEEP); 3862 if (mpte == NULL) 3863 return (mpte); 3864 } 3865 } 3866 } else { 3867 mpte = NULL; 3868 } 3869 3870 /* 3871 * This call to vtopte makes the assumption that we are 3872 * entering the page into the current pmap. In order to support 3873 * quick entry into any pmap, one would likely use pmap_pte_quick. 3874 * But that isn't as quick as vtopte. 3875 */ 3876 pte = vtopte(va); 3877 if (*pte) { 3878 if (mpte != NULL) { 3879 mpte->wire_count--; 3880 mpte = NULL; 3881 } 3882 return (mpte); 3883 } 3884 3885 /* 3886 * Enter on the PV list if part of our managed memory. 3887 */ 3888 if ((m->oflags & VPO_UNMANAGED) == 0 && 3889 !pmap_try_insert_pv_entry(pmap, va, m)) { 3890 if (mpte != NULL) { 3891 SLIST_INIT(&free); 3892 if (pmap_unwire_ptp(pmap, mpte, &free)) { 3893 pmap_invalidate_page(pmap, va); 3894 pmap_free_zero_pages(&free); 3895 } 3896 3897 mpte = NULL; 3898 } 3899 return (mpte); 3900 } 3901 3902 /* 3903 * Increment counters 3904 */ 3905 pmap->pm_stats.resident_count++; 3906 3907 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 3908#if defined(PAE) || defined(PAE_TABLES) 3909 if ((prot & VM_PROT_EXECUTE) == 0) 3910 pa |= pg_nx; 3911#endif 3912 3913 /* 3914 * Now validate mapping with RO protection 3915 */ 3916 if ((m->oflags & VPO_UNMANAGED) != 0) 3917 pte_store(pte, pa | PG_V | PG_U); 3918 else 3919 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3920 return (mpte); 3921} 3922 3923/* 3924 * Make a temporary mapping for a physical address. This is only intended 3925 * to be used for panic dumps. 3926 */ 3927void * 3928pmap_kenter_temporary(vm_paddr_t pa, int i) 3929{ 3930 vm_offset_t va; 3931 3932 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3933 pmap_kenter(va, pa); 3934 invlpg(va); 3935 return ((void *)crashdumpmap); 3936} 3937 3938/* 3939 * This code maps large physical mmap regions into the 3940 * processor address space. Note that some shortcuts 3941 * are taken, but the code works. 
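 *
 * Only device-backed objects (OBJT_DEVICE or OBJT_SG) are handled, and a
 * mapping is created only when the address and size are 2/4MB aligned
 * and the backing pages are physically contiguous with identical memory
 * attributes, so the region can be mapped entirely with superpage PDEs.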
3942 */ 3943void 3944pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3945 vm_pindex_t pindex, vm_size_t size) 3946{ 3947 pd_entry_t *pde; 3948 vm_paddr_t pa, ptepa; 3949 vm_page_t p; 3950 int pat_mode; 3951 3952 VM_OBJECT_ASSERT_WLOCKED(object); 3953 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3954 ("pmap_object_init_pt: non-device object")); 3955 if (pseflag && 3956 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3957 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3958 return; 3959 p = vm_page_lookup(object, pindex); 3960 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3961 ("pmap_object_init_pt: invalid page %p", p)); 3962 pat_mode = p->md.pat_mode; 3963 3964 /* 3965 * Abort the mapping if the first page is not physically 3966 * aligned to a 2/4MB page boundary. 3967 */ 3968 ptepa = VM_PAGE_TO_PHYS(p); 3969 if (ptepa & (NBPDR - 1)) 3970 return; 3971 3972 /* 3973 * Skip the first page. Abort the mapping if the rest of 3974 * the pages are not physically contiguous or have differing 3975 * memory attributes. 3976 */ 3977 p = TAILQ_NEXT(p, listq); 3978 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3979 pa += PAGE_SIZE) { 3980 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3981 ("pmap_object_init_pt: invalid page %p", p)); 3982 if (pa != VM_PAGE_TO_PHYS(p) || 3983 pat_mode != p->md.pat_mode) 3984 return; 3985 p = TAILQ_NEXT(p, listq); 3986 } 3987 3988 /* 3989 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 3990 * "size" is a multiple of 2/4M, adding the PAT setting to 3991 * "pa" will not affect the termination of this loop. 3992 */ 3993 PMAP_LOCK(pmap); 3994 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 3995 size; pa += NBPDR) { 3996 pde = pmap_pde(pmap, addr); 3997 if (*pde == 0) { 3998 pde_store(pde, pa | PG_PS | PG_M | PG_A | 3999 PG_U | PG_RW | PG_V); 4000 pmap->pm_stats.resident_count += NBPDR / 4001 PAGE_SIZE; 4002 pmap_pde_mappings++; 4003 } 4004 /* Else continue on if the PDE is already valid. */ 4005 addr += NBPDR; 4006 } 4007 PMAP_UNLOCK(pmap); 4008 } 4009} 4010 4011/* 4012 * Clear the wired attribute from the mappings for the specified range of 4013 * addresses in the given pmap. Every valid mapping within that range 4014 * must have the wired attribute set. In contrast, invalid mappings 4015 * cannot have the wired attribute set, so they are ignored. 4016 * 4017 * The wired attribute of the page table entry is not a hardware feature, 4018 * so there is no need to invalidate any TLB entries. 4019 */ 4020void 4021pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4022{ 4023 vm_offset_t pdnxt; 4024 pd_entry_t *pde; 4025 pt_entry_t *pte; 4026 boolean_t pv_lists_locked; 4027 4028 if (pmap_is_current(pmap)) 4029 pv_lists_locked = FALSE; 4030 else { 4031 pv_lists_locked = TRUE; 4032resume: 4033 rw_wlock(&pvh_global_lock); 4034 sched_pin(); 4035 } 4036 PMAP_LOCK(pmap); 4037 for (; sva < eva; sva = pdnxt) { 4038 pdnxt = (sva + NBPDR) & ~PDRMASK; 4039 if (pdnxt < sva) 4040 pdnxt = eva; 4041 pde = pmap_pde(pmap, sva); 4042 if ((*pde & PG_V) == 0) 4043 continue; 4044 if ((*pde & PG_PS) != 0) { 4045 if ((*pde & PG_W) == 0) 4046 panic("pmap_unwire: pde %#jx is missing PG_W", 4047 (uintmax_t)*pde); 4048 4049 /* 4050 * Are we unwiring the entire large page? If not, 4051 * demote the mapping and fall through. 4052 */ 4053 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 4054 /* 4055 * Regardless of whether a pde (or pte) is 32 4056 * or 64 bits in size, PG_W is among the least 4057 * significant 32 bits. 
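 *
 * An atomic 32-bit clear is therefore sufficient even when PAE
 * makes the entry 64 bits wide.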
4058 */ 4059 atomic_clear_int((u_int *)pde, PG_W); 4060 pmap->pm_stats.wired_count -= NBPDR / 4061 PAGE_SIZE; 4062 continue; 4063 } else { 4064 if (!pv_lists_locked) { 4065 pv_lists_locked = TRUE; 4066 if (!rw_try_wlock(&pvh_global_lock)) { 4067 PMAP_UNLOCK(pmap); 4068 /* Repeat sva. */ 4069 goto resume; 4070 } 4071 sched_pin(); 4072 } 4073 if (!pmap_demote_pde(pmap, pde, sva)) 4074 panic("pmap_unwire: demotion failed"); 4075 } 4076 } 4077 if (pdnxt > eva) 4078 pdnxt = eva; 4079 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 4080 sva += PAGE_SIZE) { 4081 if ((*pte & PG_V) == 0) 4082 continue; 4083 if ((*pte & PG_W) == 0) 4084 panic("pmap_unwire: pte %#jx is missing PG_W", 4085 (uintmax_t)*pte); 4086 4087 /* 4088 * PG_W must be cleared atomically. Although the pmap 4089 * lock synchronizes access to PG_W, another processor 4090 * could be setting PG_M and/or PG_A concurrently. 4091 * 4092 * PG_W is among the least significant 32 bits. 4093 */ 4094 atomic_clear_int((u_int *)pte, PG_W); 4095 pmap->pm_stats.wired_count--; 4096 } 4097 } 4098 if (pv_lists_locked) { 4099 sched_unpin(); 4100 rw_wunlock(&pvh_global_lock); 4101 } 4102 PMAP_UNLOCK(pmap); 4103} 4104 4105 4106/* 4107 * Copy the range specified by src_addr/len 4108 * from the source map to the range dst_addr/len 4109 * in the destination map. 4110 * 4111 * This routine is only advisory and need not do anything. 4112 */ 4113 4114void 4115pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4116 vm_offset_t src_addr) 4117{ 4118 struct spglist free; 4119 vm_offset_t addr; 4120 vm_offset_t end_addr = src_addr + len; 4121 vm_offset_t pdnxt; 4122 4123 if (dst_addr != src_addr) 4124 return; 4125 4126 if (!pmap_is_current(src_pmap)) 4127 return; 4128 4129 rw_wlock(&pvh_global_lock); 4130 if (dst_pmap < src_pmap) { 4131 PMAP_LOCK(dst_pmap); 4132 PMAP_LOCK(src_pmap); 4133 } else { 4134 PMAP_LOCK(src_pmap); 4135 PMAP_LOCK(dst_pmap); 4136 } 4137 sched_pin(); 4138 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4139 pt_entry_t *src_pte, *dst_pte; 4140 vm_page_t dstmpte, srcmpte; 4141 pd_entry_t srcptepaddr; 4142 u_int ptepindex; 4143 4144 KASSERT(addr < UPT_MIN_ADDRESS, 4145 ("pmap_copy: invalid to pmap_copy page tables")); 4146 4147 pdnxt = (addr + NBPDR) & ~PDRMASK; 4148 if (pdnxt < addr) 4149 pdnxt = end_addr; 4150 ptepindex = addr >> PDRSHIFT; 4151 4152 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4153 if (srcptepaddr == 0) 4154 continue; 4155 4156 if (srcptepaddr & PG_PS) { 4157 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4158 continue; 4159 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4160 ((srcptepaddr & PG_MANAGED) == 0 || 4161 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4162 PG_PS_FRAME))) { 4163 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4164 ~PG_W; 4165 dst_pmap->pm_stats.resident_count += 4166 NBPDR / PAGE_SIZE; 4167 } 4168 continue; 4169 } 4170 4171 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4172 KASSERT(srcmpte->wire_count > 0, 4173 ("pmap_copy: source page table page is unused")); 4174 4175 if (pdnxt > end_addr) 4176 pdnxt = end_addr; 4177 4178 src_pte = vtopte(addr); 4179 while (addr < pdnxt) { 4180 pt_entry_t ptetemp; 4181 ptetemp = *src_pte; 4182 /* 4183 * we only virtual copy managed pages 4184 */ 4185 if ((ptetemp & PG_MANAGED) != 0) { 4186 dstmpte = pmap_allocpte(dst_pmap, addr, 4187 PMAP_ENTER_NOSLEEP); 4188 if (dstmpte == NULL) 4189 goto out; 4190 dst_pte = pmap_pte_quick(dst_pmap, addr); 4191 if (*dst_pte == 0 && 4192 pmap_try_insert_pv_entry(dst_pmap, addr, 
4193 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4194 /* 4195 * Clear the wired, modified, and 4196 * accessed (referenced) bits 4197 * during the copy. 4198 */ 4199 *dst_pte = ptetemp & ~(PG_W | PG_M | 4200 PG_A); 4201 dst_pmap->pm_stats.resident_count++; 4202 } else { 4203 SLIST_INIT(&free); 4204 if (pmap_unwire_ptp(dst_pmap, dstmpte, 4205 &free)) { 4206 pmap_invalidate_page(dst_pmap, 4207 addr); 4208 pmap_free_zero_pages(&free); 4209 } 4210 goto out; 4211 } 4212 if (dstmpte->wire_count >= srcmpte->wire_count) 4213 break; 4214 } 4215 addr += PAGE_SIZE; 4216 src_pte++; 4217 } 4218 } 4219out: 4220 sched_unpin(); 4221 rw_wunlock(&pvh_global_lock); 4222 PMAP_UNLOCK(src_pmap); 4223 PMAP_UNLOCK(dst_pmap); 4224} 4225 4226static __inline void 4227pagezero(void *page) 4228{ 4229#if defined(I686_CPU) 4230 if (cpu_class == CPUCLASS_686) { 4231#if defined(CPU_ENABLE_SSE) 4232 if (cpu_feature & CPUID_SSE2) 4233 sse2_pagezero(page); 4234 else 4235#endif 4236 i686_pagezero(page); 4237 } else 4238#endif 4239 bzero(page, PAGE_SIZE); 4240} 4241 4242/* 4243 * pmap_zero_page zeros the specified hardware page by mapping 4244 * the page into KVM and using bzero to clear its contents. 4245 */ 4246void 4247pmap_zero_page(vm_page_t m) 4248{ 4249 struct sysmaps *sysmaps; 4250 4251 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4252 mtx_lock(&sysmaps->lock); 4253 if (*sysmaps->CMAP2) 4254 panic("pmap_zero_page: CMAP2 busy"); 4255 sched_pin(); 4256 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4257 pmap_cache_bits(m->md.pat_mode, 0); 4258 invlcaddr(sysmaps->CADDR2); 4259 pagezero(sysmaps->CADDR2); 4260 *sysmaps->CMAP2 = 0; 4261 sched_unpin(); 4262 mtx_unlock(&sysmaps->lock); 4263} 4264 4265/* 4266 * pmap_zero_page_area zeros the specified hardware page by mapping 4267 * the page into KVM and using bzero to clear its contents. 4268 * 4269 * off and size may not cover an area beyond a single hardware page. 4270 */ 4271void 4272pmap_zero_page_area(vm_page_t m, int off, int size) 4273{ 4274 struct sysmaps *sysmaps; 4275 4276 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4277 mtx_lock(&sysmaps->lock); 4278 if (*sysmaps->CMAP2) 4279 panic("pmap_zero_page_area: CMAP2 busy"); 4280 sched_pin(); 4281 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4282 pmap_cache_bits(m->md.pat_mode, 0); 4283 invlcaddr(sysmaps->CADDR2); 4284 if (off == 0 && size == PAGE_SIZE) 4285 pagezero(sysmaps->CADDR2); 4286 else 4287 bzero((char *)sysmaps->CADDR2 + off, size); 4288 *sysmaps->CMAP2 = 0; 4289 sched_unpin(); 4290 mtx_unlock(&sysmaps->lock); 4291} 4292 4293/* 4294 * pmap_zero_page_idle zeros the specified hardware page by mapping 4295 * the page into KVM and using bzero to clear its contents. This 4296 * is intended to be called from the vm_pagezero process only and 4297 * outside of Giant. 4298 */ 4299void 4300pmap_zero_page_idle(vm_page_t m) 4301{ 4302 4303 if (*CMAP3) 4304 panic("pmap_zero_page_idle: CMAP3 busy"); 4305 sched_pin(); 4306 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4307 pmap_cache_bits(m->md.pat_mode, 0); 4308 invlcaddr(CADDR3); 4309 pagezero(CADDR3); 4310 *CMAP3 = 0; 4311 sched_unpin(); 4312} 4313 4314/* 4315 * pmap_copy_page copies the specified (machine independent) 4316 * page by mapping the page into virtual memory and using 4317 * bcopy to copy the page, one machine dependent page at a 4318 * time. 
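 *
 * The source and destination are mapped through the per-CPU CMAP1 and
 * CMAP2 windows while the thread is pinned to its CPU, so only local
 * TLB invalidations are needed.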
4319 */ 4320void 4321pmap_copy_page(vm_page_t src, vm_page_t dst) 4322{ 4323 struct sysmaps *sysmaps; 4324 4325 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4326 mtx_lock(&sysmaps->lock); 4327 if (*sysmaps->CMAP1) 4328 panic("pmap_copy_page: CMAP1 busy"); 4329 if (*sysmaps->CMAP2) 4330 panic("pmap_copy_page: CMAP2 busy"); 4331 sched_pin(); 4332 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4333 pmap_cache_bits(src->md.pat_mode, 0); 4334 invlcaddr(sysmaps->CADDR1); 4335 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4336 pmap_cache_bits(dst->md.pat_mode, 0); 4337 invlcaddr(sysmaps->CADDR2); 4338 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); 4339 *sysmaps->CMAP1 = 0; 4340 *sysmaps->CMAP2 = 0; 4341 sched_unpin(); 4342 mtx_unlock(&sysmaps->lock); 4343} 4344 4345int unmapped_buf_allowed = 1; 4346 4347void 4348pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4349 vm_offset_t b_offset, int xfersize) 4350{ 4351 struct sysmaps *sysmaps; 4352 vm_page_t a_pg, b_pg; 4353 char *a_cp, *b_cp; 4354 vm_offset_t a_pg_offset, b_pg_offset; 4355 int cnt; 4356 4357 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4358 mtx_lock(&sysmaps->lock); 4359 if (*sysmaps->CMAP1 != 0) 4360 panic("pmap_copy_pages: CMAP1 busy"); 4361 if (*sysmaps->CMAP2 != 0) 4362 panic("pmap_copy_pages: CMAP2 busy"); 4363 sched_pin(); 4364 while (xfersize > 0) { 4365 a_pg = ma[a_offset >> PAGE_SHIFT]; 4366 a_pg_offset = a_offset & PAGE_MASK; 4367 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4368 b_pg = mb[b_offset >> PAGE_SHIFT]; 4369 b_pg_offset = b_offset & PAGE_MASK; 4370 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4371 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | 4372 pmap_cache_bits(a_pg->md.pat_mode, 0); 4373 invlcaddr(sysmaps->CADDR1); 4374 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | 4375 PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); 4376 invlcaddr(sysmaps->CADDR2); 4377 a_cp = sysmaps->CADDR1 + a_pg_offset; 4378 b_cp = sysmaps->CADDR2 + b_pg_offset; 4379 bcopy(a_cp, b_cp, cnt); 4380 a_offset += cnt; 4381 b_offset += cnt; 4382 xfersize -= cnt; 4383 } 4384 *sysmaps->CMAP1 = 0; 4385 *sysmaps->CMAP2 = 0; 4386 sched_unpin(); 4387 mtx_unlock(&sysmaps->lock); 4388} 4389 4390/* 4391 * Returns true if the pmap's pv is one of the first 4392 * 16 pvs linked to from this page. This count may 4393 * be changed upwards or downwards in the future; it 4394 * is only necessary that true be returned for a small 4395 * subset of pmaps for proper page aging. 4396 */ 4397boolean_t 4398pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4399{ 4400 struct md_page *pvh; 4401 pv_entry_t pv; 4402 int loops = 0; 4403 boolean_t rv; 4404 4405 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4406 ("pmap_page_exists_quick: page %p is not managed", m)); 4407 rv = FALSE; 4408 rw_wlock(&pvh_global_lock); 4409 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4410 if (PV_PMAP(pv) == pmap) { 4411 rv = TRUE; 4412 break; 4413 } 4414 loops++; 4415 if (loops >= 16) 4416 break; 4417 } 4418 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4419 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4420 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4421 if (PV_PMAP(pv) == pmap) { 4422 rv = TRUE; 4423 break; 4424 } 4425 loops++; 4426 if (loops >= 16) 4427 break; 4428 } 4429 } 4430 rw_wunlock(&pvh_global_lock); 4431 return (rv); 4432} 4433 4434/* 4435 * pmap_page_wired_mappings: 4436 * 4437 * Return the number of managed mappings to the given physical page 4438 * that are wired. 
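 *
 * Both 4KB mappings of the page and 2/4MB mappings of the superpage
 * containing it are counted.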
4439 */ 4440int 4441pmap_page_wired_mappings(vm_page_t m) 4442{ 4443 int count; 4444 4445 count = 0; 4446 if ((m->oflags & VPO_UNMANAGED) != 0) 4447 return (count); 4448 rw_wlock(&pvh_global_lock); 4449 count = pmap_pvh_wired_mappings(&m->md, count); 4450 if ((m->flags & PG_FICTITIOUS) == 0) { 4451 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4452 count); 4453 } 4454 rw_wunlock(&pvh_global_lock); 4455 return (count); 4456} 4457 4458/* 4459 * pmap_pvh_wired_mappings: 4460 * 4461 * Return the updated number "count" of managed mappings that are wired. 4462 */ 4463static int 4464pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4465{ 4466 pmap_t pmap; 4467 pt_entry_t *pte; 4468 pv_entry_t pv; 4469 4470 rw_assert(&pvh_global_lock, RA_WLOCKED); 4471 sched_pin(); 4472 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4473 pmap = PV_PMAP(pv); 4474 PMAP_LOCK(pmap); 4475 pte = pmap_pte_quick(pmap, pv->pv_va); 4476 if ((*pte & PG_W) != 0) 4477 count++; 4478 PMAP_UNLOCK(pmap); 4479 } 4480 sched_unpin(); 4481 return (count); 4482} 4483 4484/* 4485 * Returns TRUE if the given page is mapped individually or as part of 4486 * a 4mpage. Otherwise, returns FALSE. 4487 */ 4488boolean_t 4489pmap_page_is_mapped(vm_page_t m) 4490{ 4491 boolean_t rv; 4492 4493 if ((m->oflags & VPO_UNMANAGED) != 0) 4494 return (FALSE); 4495 rw_wlock(&pvh_global_lock); 4496 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4497 ((m->flags & PG_FICTITIOUS) == 0 && 4498 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4499 rw_wunlock(&pvh_global_lock); 4500 return (rv); 4501} 4502 4503/* 4504 * Remove all pages from specified address space 4505 * this aids process exit speeds. Also, this code 4506 * is special cased for current process only, but 4507 * can have the more generic (and slightly slower) 4508 * mode enabled. This is much faster than pmap_remove 4509 * in the case of running down an entire address space. 
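 *
 * Wired mappings are left in place, and the TLB is invalidated once
 * for the whole pmap at the end rather than once per removed mapping.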
4510 */ 4511void 4512pmap_remove_pages(pmap_t pmap) 4513{ 4514 pt_entry_t *pte, tpte; 4515 vm_page_t m, mpte, mt; 4516 pv_entry_t pv; 4517 struct md_page *pvh; 4518 struct pv_chunk *pc, *npc; 4519 struct spglist free; 4520 int field, idx; 4521 int32_t bit; 4522 uint32_t inuse, bitmask; 4523 int allfree; 4524 4525 if (pmap != PCPU_GET(curpmap)) { 4526 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4527 return; 4528 } 4529 SLIST_INIT(&free); 4530 rw_wlock(&pvh_global_lock); 4531 PMAP_LOCK(pmap); 4532 sched_pin(); 4533 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4534 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, 4535 pc->pc_pmap)); 4536 allfree = 1; 4537 for (field = 0; field < _NPCM; field++) { 4538 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4539 while (inuse != 0) { 4540 bit = bsfl(inuse); 4541 bitmask = 1UL << bit; 4542 idx = field * 32 + bit; 4543 pv = &pc->pc_pventry[idx]; 4544 inuse &= ~bitmask; 4545 4546 pte = pmap_pde(pmap, pv->pv_va); 4547 tpte = *pte; 4548 if ((tpte & PG_PS) == 0) { 4549 pte = vtopte(pv->pv_va); 4550 tpte = *pte & ~PG_PTE_PAT; 4551 } 4552 4553 if (tpte == 0) { 4554 printf( 4555 "TPTE at %p IS ZERO @ VA %08x\n", 4556 pte, pv->pv_va); 4557 panic("bad pte"); 4558 } 4559 4560/* 4561 * We cannot remove wired pages from a process' mapping at this time 4562 */ 4563 if (tpte & PG_W) { 4564 allfree = 0; 4565 continue; 4566 } 4567 4568 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4569 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4570 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4571 m, (uintmax_t)m->phys_addr, 4572 (uintmax_t)tpte)); 4573 4574 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4575 m < &vm_page_array[vm_page_array_size], 4576 ("pmap_remove_pages: bad tpte %#jx", 4577 (uintmax_t)tpte)); 4578 4579 pte_clear(pte); 4580 4581 /* 4582 * Update the vm_page_t clean/reference bits. 
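 * A mapping dirties the page only if it is both writeable and
 * modified, that is, PG_RW and PG_M are set together.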
4583 */ 4584 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4585 if ((tpte & PG_PS) != 0) { 4586 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4587 vm_page_dirty(mt); 4588 } else 4589 vm_page_dirty(m); 4590 } 4591 4592 /* Mark free */ 4593 PV_STAT(pv_entry_frees++); 4594 PV_STAT(pv_entry_spare++); 4595 pv_entry_count--; 4596 pc->pc_map[field] |= bitmask; 4597 if ((tpte & PG_PS) != 0) { 4598 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4599 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4600 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4601 if (TAILQ_EMPTY(&pvh->pv_list)) { 4602 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4603 if (TAILQ_EMPTY(&mt->md.pv_list)) 4604 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4605 } 4606 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 4607 if (mpte != NULL) { 4608 pmap_remove_pt_page(pmap, mpte); 4609 pmap->pm_stats.resident_count--; 4610 KASSERT(mpte->wire_count == NPTEPG, 4611 ("pmap_remove_pages: pte page wire count error")); 4612 mpte->wire_count = 0; 4613 pmap_add_delayed_free_list(mpte, &free, FALSE); 4614 atomic_subtract_int(&cnt.v_wire_count, 1); 4615 } 4616 } else { 4617 pmap->pm_stats.resident_count--; 4618 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4619 if (TAILQ_EMPTY(&m->md.pv_list) && 4620 (m->flags & PG_FICTITIOUS) == 0) { 4621 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4622 if (TAILQ_EMPTY(&pvh->pv_list)) 4623 vm_page_aflag_clear(m, PGA_WRITEABLE); 4624 } 4625 pmap_unuse_pt(pmap, pv->pv_va, &free); 4626 } 4627 } 4628 } 4629 if (allfree) { 4630 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4631 free_pv_chunk(pc); 4632 } 4633 } 4634 sched_unpin(); 4635 pmap_invalidate_all(pmap); 4636 rw_wunlock(&pvh_global_lock); 4637 PMAP_UNLOCK(pmap); 4638 pmap_free_zero_pages(&free); 4639} 4640 4641/* 4642 * pmap_is_modified: 4643 * 4644 * Return whether or not the specified physical page was modified 4645 * in any physical maps. 4646 */ 4647boolean_t 4648pmap_is_modified(vm_page_t m) 4649{ 4650 boolean_t rv; 4651 4652 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4653 ("pmap_is_modified: page %p is not managed", m)); 4654 4655 /* 4656 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 4657 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 4658 * is clear, no PTEs can have PG_M set. 4659 */ 4660 VM_OBJECT_ASSERT_WLOCKED(m->object); 4661 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 4662 return (FALSE); 4663 rw_wlock(&pvh_global_lock); 4664 rv = pmap_is_modified_pvh(&m->md) || 4665 ((m->flags & PG_FICTITIOUS) == 0 && 4666 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4667 rw_wunlock(&pvh_global_lock); 4668 return (rv); 4669} 4670 4671/* 4672 * Returns TRUE if any of the given mappings were used to modify 4673 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4674 * mappings are supported. 4675 */ 4676static boolean_t 4677pmap_is_modified_pvh(struct md_page *pvh) 4678{ 4679 pv_entry_t pv; 4680 pt_entry_t *pte; 4681 pmap_t pmap; 4682 boolean_t rv; 4683 4684 rw_assert(&pvh_global_lock, RA_WLOCKED); 4685 rv = FALSE; 4686 sched_pin(); 4687 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4688 pmap = PV_PMAP(pv); 4689 PMAP_LOCK(pmap); 4690 pte = pmap_pte_quick(pmap, pv->pv_va); 4691 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 4692 PMAP_UNLOCK(pmap); 4693 if (rv) 4694 break; 4695 } 4696 sched_unpin(); 4697 return (rv); 4698} 4699 4700/* 4701 * pmap_is_prefaultable: 4702 * 4703 * Return whether or not the specified virtual address is elgible 4704 * for prefault. 
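 *
 * An address is eligible when its page directory entry points to a
 * valid 4KB page table (not a 2/4MB superpage) and the corresponding
 * page table entry is still empty.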
4705 */ 4706boolean_t 4707pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4708{ 4709 pd_entry_t *pde; 4710 pt_entry_t *pte; 4711 boolean_t rv; 4712 4713 rv = FALSE; 4714 PMAP_LOCK(pmap); 4715 pde = pmap_pde(pmap, addr); 4716 if (*pde != 0 && (*pde & PG_PS) == 0) { 4717 pte = vtopte(addr); 4718 rv = *pte == 0; 4719 } 4720 PMAP_UNLOCK(pmap); 4721 return (rv); 4722} 4723 4724/* 4725 * pmap_is_referenced: 4726 * 4727 * Return whether or not the specified physical page was referenced 4728 * in any physical maps. 4729 */ 4730boolean_t 4731pmap_is_referenced(vm_page_t m) 4732{ 4733 boolean_t rv; 4734 4735 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4736 ("pmap_is_referenced: page %p is not managed", m)); 4737 rw_wlock(&pvh_global_lock); 4738 rv = pmap_is_referenced_pvh(&m->md) || 4739 ((m->flags & PG_FICTITIOUS) == 0 && 4740 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4741 rw_wunlock(&pvh_global_lock); 4742 return (rv); 4743} 4744 4745/* 4746 * Returns TRUE if any of the given mappings were referenced and FALSE 4747 * otherwise. Both page and 4mpage mappings are supported. 4748 */ 4749static boolean_t 4750pmap_is_referenced_pvh(struct md_page *pvh) 4751{ 4752 pv_entry_t pv; 4753 pt_entry_t *pte; 4754 pmap_t pmap; 4755 boolean_t rv; 4756 4757 rw_assert(&pvh_global_lock, RA_WLOCKED); 4758 rv = FALSE; 4759 sched_pin(); 4760 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4761 pmap = PV_PMAP(pv); 4762 PMAP_LOCK(pmap); 4763 pte = pmap_pte_quick(pmap, pv->pv_va); 4764 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 4765 PMAP_UNLOCK(pmap); 4766 if (rv) 4767 break; 4768 } 4769 sched_unpin(); 4770 return (rv); 4771} 4772 4773/* 4774 * Clear the write and modified bits in each of the given page's mappings. 4775 */ 4776void 4777pmap_remove_write(vm_page_t m) 4778{ 4779 struct md_page *pvh; 4780 pv_entry_t next_pv, pv; 4781 pmap_t pmap; 4782 pd_entry_t *pde; 4783 pt_entry_t oldpte, *pte; 4784 vm_offset_t va; 4785 4786 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4787 ("pmap_remove_write: page %p is not managed", m)); 4788 4789 /* 4790 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 4791 * set by another thread while the object is locked. Thus, 4792 * if PGA_WRITEABLE is clear, no page table entries need updating. 4793 */ 4794 VM_OBJECT_ASSERT_WLOCKED(m->object); 4795 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 4796 return; 4797 rw_wlock(&pvh_global_lock); 4798 sched_pin(); 4799 if ((m->flags & PG_FICTITIOUS) != 0) 4800 goto small_mappings; 4801 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4802 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4803 va = pv->pv_va; 4804 pmap = PV_PMAP(pv); 4805 PMAP_LOCK(pmap); 4806 pde = pmap_pde(pmap, va); 4807 if ((*pde & PG_RW) != 0) 4808 (void)pmap_demote_pde(pmap, pde, va); 4809 PMAP_UNLOCK(pmap); 4810 } 4811small_mappings: 4812 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4813 pmap = PV_PMAP(pv); 4814 PMAP_LOCK(pmap); 4815 pde = pmap_pde(pmap, pv->pv_va); 4816 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" 4817 " a 4mpage in page %p's pv list", m)); 4818 pte = pmap_pte_quick(pmap, pv->pv_va); 4819retry: 4820 oldpte = *pte; 4821 if ((oldpte & PG_RW) != 0) { 4822 /* 4823 * Regardless of whether a pte is 32 or 64 bits 4824 * in size, PG_RW and PG_M are among the least 4825 * significant 32 bits. 
4826 */ 4827 if (!atomic_cmpset_int((u_int *)pte, oldpte, 4828 oldpte & ~(PG_RW | PG_M))) 4829 goto retry; 4830 if ((oldpte & PG_M) != 0) 4831 vm_page_dirty(m); 4832 pmap_invalidate_page(pmap, pv->pv_va); 4833 } 4834 PMAP_UNLOCK(pmap); 4835 } 4836 vm_page_aflag_clear(m, PGA_WRITEABLE); 4837 sched_unpin(); 4838 rw_wunlock(&pvh_global_lock); 4839} 4840 4841#define PMAP_TS_REFERENCED_MAX 5 4842 4843/* 4844 * pmap_ts_referenced: 4845 * 4846 * Return a count of reference bits for a page, clearing those bits. 4847 * It is not necessary for every reference bit to be cleared, but it 4848 * is necessary that 0 only be returned when there are truly no 4849 * reference bits set. 4850 * 4851 * XXX: The exact number of bits to check and clear is a matter that 4852 * should be tested and standardized at some point in the future for 4853 * optimal aging of shared pages. 4854 */ 4855int 4856pmap_ts_referenced(vm_page_t m) 4857{ 4858 struct md_page *pvh; 4859 pv_entry_t pv, pvf; 4860 pmap_t pmap; 4861 pd_entry_t *pde; 4862 pt_entry_t *pte; 4863 vm_paddr_t pa; 4864 int rtval = 0; 4865 4866 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4867 ("pmap_ts_referenced: page %p is not managed", m)); 4868 pa = VM_PAGE_TO_PHYS(m); 4869 pvh = pa_to_pvh(pa); 4870 rw_wlock(&pvh_global_lock); 4871 sched_pin(); 4872 if ((m->flags & PG_FICTITIOUS) != 0 || 4873 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4874 goto small_mappings; 4875 pv = pvf; 4876 do { 4877 pmap = PV_PMAP(pv); 4878 PMAP_LOCK(pmap); 4879 pde = pmap_pde(pmap, pv->pv_va); 4880 if ((*pde & PG_A) != 0) { 4881 /* 4882 * Since this reference bit is shared by either 1024 4883 * or 512 4KB pages, it should not be cleared every 4884 * time it is tested. Apply a simple "hash" function 4885 * on the physical page number, the virtual superpage 4886 * number, and the pmap address to select one 4KB page 4887 * out of the 1024 or 512 on which testing the 4888 * reference bit will result in clearing that bit. 4889 * This function is designed to avoid the selection of 4890 * the same 4KB page for every 2- or 4MB page mapping. 4891 * 4892 * On demotion, a mapping that hasn't been referenced 4893 * is simply destroyed. To avoid the possibility of a 4894 * subsequent page fault on a demoted wired mapping, 4895 * always leave its reference bit set. Moreover, 4896 * since the superpage is wired, the current state of 4897 * its reference bit won't affect page replacement. 4898 */ 4899 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 4900 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 4901 (*pde & PG_W) == 0) { 4902 atomic_clear_int((u_int *)pde, PG_A); 4903 pmap_invalidate_page(pmap, pv->pv_va); 4904 } 4905 rtval++; 4906 } 4907 PMAP_UNLOCK(pmap); 4908 /* Rotate the PV list if it has more than one entry. 
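 * Rotation spreads successive calls across all of the page's mappings
 * instead of repeatedly sampling the same one.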
*/ 4909 if (TAILQ_NEXT(pv, pv_next) != NULL) { 4910 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4911 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4912 } 4913 if (rtval >= PMAP_TS_REFERENCED_MAX) 4914 goto out; 4915 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4916small_mappings: 4917 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4918 goto out; 4919 pv = pvf; 4920 do { 4921 pmap = PV_PMAP(pv); 4922 PMAP_LOCK(pmap); 4923 pde = pmap_pde(pmap, pv->pv_va); 4924 KASSERT((*pde & PG_PS) == 0, 4925 ("pmap_ts_referenced: found a 4mpage in page %p's pv list", 4926 m)); 4927 pte = pmap_pte_quick(pmap, pv->pv_va); 4928 if ((*pte & PG_A) != 0) { 4929 atomic_clear_int((u_int *)pte, PG_A); 4930 pmap_invalidate_page(pmap, pv->pv_va); 4931 rtval++; 4932 } 4933 PMAP_UNLOCK(pmap); 4934 /* Rotate the PV list if it has more than one entry. */ 4935 if (TAILQ_NEXT(pv, pv_next) != NULL) { 4936 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4937 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4938 } 4939 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 4940 PMAP_TS_REFERENCED_MAX); 4941out: 4942 sched_unpin(); 4943 rw_wunlock(&pvh_global_lock); 4944 return (rtval); 4945} 4946 4947/* 4948 * Apply the given advice to the specified range of addresses within the 4949 * given pmap. Depending on the advice, clear the referenced and/or 4950 * modified flags in each mapping and set the mapped page's dirty field. 4951 */ 4952void 4953pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4954{ 4955 pd_entry_t oldpde, *pde; 4956 pt_entry_t *pte; 4957 vm_offset_t pdnxt; 4958 vm_page_t m; 4959 boolean_t anychanged, pv_lists_locked; 4960 4961 if (advice != MADV_DONTNEED && advice != MADV_FREE) 4962 return; 4963 if (pmap_is_current(pmap)) 4964 pv_lists_locked = FALSE; 4965 else { 4966 pv_lists_locked = TRUE; 4967resume: 4968 rw_wlock(&pvh_global_lock); 4969 sched_pin(); 4970 } 4971 anychanged = FALSE; 4972 PMAP_LOCK(pmap); 4973 for (; sva < eva; sva = pdnxt) { 4974 pdnxt = (sva + NBPDR) & ~PDRMASK; 4975 if (pdnxt < sva) 4976 pdnxt = eva; 4977 pde = pmap_pde(pmap, sva); 4978 oldpde = *pde; 4979 if ((oldpde & PG_V) == 0) 4980 continue; 4981 else if ((oldpde & PG_PS) != 0) { 4982 if ((oldpde & PG_MANAGED) == 0) 4983 continue; 4984 if (!pv_lists_locked) { 4985 pv_lists_locked = TRUE; 4986 if (!rw_try_wlock(&pvh_global_lock)) { 4987 if (anychanged) 4988 pmap_invalidate_all(pmap); 4989 PMAP_UNLOCK(pmap); 4990 goto resume; 4991 } 4992 sched_pin(); 4993 } 4994 if (!pmap_demote_pde(pmap, pde, sva)) { 4995 /* 4996 * The large page mapping was destroyed. 4997 */ 4998 continue; 4999 } 5000 5001 /* 5002 * Unless the page mappings are wired, remove the 5003 * mapping to a single page so that a subsequent 5004 * access may repromote. Since the underlying page 5005 * table page is fully populated, this removal never 5006 * frees a page table page. 5007 */ 5008 if ((oldpde & PG_W) == 0) { 5009 pte = pmap_pte_quick(pmap, sva); 5010 KASSERT((*pte & PG_V) != 0, 5011 ("pmap_advise: invalid PTE")); 5012 pmap_remove_pte(pmap, pte, sva, NULL); 5013 anychanged = TRUE; 5014 } 5015 } 5016 if (pdnxt > eva) 5017 pdnxt = eva; 5018 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 5019 sva += PAGE_SIZE) { 5020 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | 5021 PG_V)) 5022 continue; 5023 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5024 if (advice == MADV_DONTNEED) { 5025 /* 5026 * Future calls to pmap_is_modified() 5027 * can be avoided by making the page 5028 * dirty now. 
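 *
 * For MADV_FREE, the modify bit is simply cleared without
 * dirtying the page.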
5029 */ 5030 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 5031 vm_page_dirty(m); 5032 } 5033 atomic_clear_int((u_int *)pte, PG_M | PG_A); 5034 } else if ((*pte & PG_A) != 0) 5035 atomic_clear_int((u_int *)pte, PG_A); 5036 else 5037 continue; 5038 if ((*pte & PG_G) != 0) 5039 pmap_invalidate_page(pmap, sva); 5040 else 5041 anychanged = TRUE; 5042 } 5043 } 5044 if (anychanged) 5045 pmap_invalidate_all(pmap); 5046 if (pv_lists_locked) { 5047 sched_unpin(); 5048 rw_wunlock(&pvh_global_lock); 5049 } 5050 PMAP_UNLOCK(pmap); 5051} 5052 5053/* 5054 * Clear the modify bits on the specified physical page. 5055 */ 5056void 5057pmap_clear_modify(vm_page_t m) 5058{ 5059 struct md_page *pvh; 5060 pv_entry_t next_pv, pv; 5061 pmap_t pmap; 5062 pd_entry_t oldpde, *pde; 5063 pt_entry_t oldpte, *pte; 5064 vm_offset_t va; 5065 5066 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5067 ("pmap_clear_modify: page %p is not managed", m)); 5068 VM_OBJECT_ASSERT_WLOCKED(m->object); 5069 KASSERT(!vm_page_xbusied(m), 5070 ("pmap_clear_modify: page %p is exclusive busied", m)); 5071 5072 /* 5073 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 5074 * If the object containing the page is locked and the page is not 5075 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 5076 */ 5077 if ((m->aflags & PGA_WRITEABLE) == 0) 5078 return; 5079 rw_wlock(&pvh_global_lock); 5080 sched_pin(); 5081 if ((m->flags & PG_FICTITIOUS) != 0) 5082 goto small_mappings; 5083 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5084 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5085 va = pv->pv_va; 5086 pmap = PV_PMAP(pv); 5087 PMAP_LOCK(pmap); 5088 pde = pmap_pde(pmap, va); 5089 oldpde = *pde; 5090 if ((oldpde & PG_RW) != 0) { 5091 if (pmap_demote_pde(pmap, pde, va)) { 5092 if ((oldpde & PG_W) == 0) { 5093 /* 5094 * Write protect the mapping to a 5095 * single page so that a subsequent 5096 * write access may repromote. 5097 */ 5098 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5099 PG_PS_FRAME); 5100 pte = pmap_pte_quick(pmap, va); 5101 oldpte = *pte; 5102 if ((oldpte & PG_V) != 0) { 5103 /* 5104 * Regardless of whether a pte is 32 or 64 bits 5105 * in size, PG_RW and PG_M are among the least 5106 * significant 32 bits. 5107 */ 5108 while (!atomic_cmpset_int((u_int *)pte, 5109 oldpte, 5110 oldpte & ~(PG_M | PG_RW))) 5111 oldpte = *pte; 5112 vm_page_dirty(m); 5113 pmap_invalidate_page(pmap, va); 5114 } 5115 } 5116 } 5117 } 5118 PMAP_UNLOCK(pmap); 5119 } 5120small_mappings: 5121 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5122 pmap = PV_PMAP(pv); 5123 PMAP_LOCK(pmap); 5124 pde = pmap_pde(pmap, pv->pv_va); 5125 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 5126 " a 4mpage in page %p's pv list", m)); 5127 pte = pmap_pte_quick(pmap, pv->pv_va); 5128 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5129 /* 5130 * Regardless of whether a pte is 32 or 64 bits 5131 * in size, PG_M is among the least significant 5132 * 32 bits. 5133 */ 5134 atomic_clear_int((u_int *)pte, PG_M); 5135 pmap_invalidate_page(pmap, pv->pv_va); 5136 } 5137 PMAP_UNLOCK(pmap); 5138 } 5139 sched_unpin(); 5140 rw_wunlock(&pvh_global_lock); 5141} 5142 5143/* 5144 * Miscellaneous support routines follow 5145 */ 5146 5147/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 5148static __inline void 5149pmap_pte_attr(pt_entry_t *pte, int cache_bits) 5150{ 5151 u_int opte, npte; 5152 5153 /* 5154 * The cache mode bits are all in the low 32-bits of the 5155 * PTE, so we can just spin on updating the low 32-bits. 
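 * The compare-and-set loop retries if the entry changes underneath
 * it, for example because another processor sets PG_A or PG_M.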
5156 */ 5157 do { 5158 opte = *(u_int *)pte; 5159 npte = opte & ~PG_PTE_CACHE; 5160 npte |= cache_bits; 5161 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 5162} 5163 5164/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 5165static __inline void 5166pmap_pde_attr(pd_entry_t *pde, int cache_bits) 5167{ 5168 u_int opde, npde; 5169 5170 /* 5171 * The cache mode bits are all in the low 32-bits of the 5172 * PDE, so we can just spin on updating the low 32-bits. 5173 */ 5174 do { 5175 opde = *(u_int *)pde; 5176 npde = opde & ~PG_PDE_CACHE; 5177 npde |= cache_bits; 5178 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 5179} 5180 5181/* 5182 * Map a set of physical memory pages into the kernel virtual 5183 * address space. Return a pointer to where it is mapped. This 5184 * routine is intended to be used for mapping device memory, 5185 * NOT real memory. 5186 */ 5187void * 5188pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5189{ 5190 struct pmap_preinit_mapping *ppim; 5191 vm_offset_t va, offset; 5192 vm_size_t tmpsize; 5193 int i; 5194 5195 offset = pa & PAGE_MASK; 5196 size = round_page(offset + size); 5197 pa = pa & PG_FRAME; 5198 5199 if (pa < KERNLOAD && pa + size <= KERNLOAD) 5200 va = KERNBASE + pa; 5201 else if (!pmap_initialized) { 5202 va = 0; 5203 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5204 ppim = pmap_preinit_mapping + i; 5205 if (ppim->va == 0) { 5206 ppim->pa = pa; 5207 ppim->sz = size; 5208 ppim->mode = mode; 5209 ppim->va = virtual_avail; 5210 virtual_avail += size; 5211 va = ppim->va; 5212 break; 5213 } 5214 } 5215 if (va == 0) 5216 panic("%s: too many preinit mappings", __func__); 5217 } else { 5218 /* 5219 * If we have a preinit mapping, re-use it. 5220 */ 5221 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5222 ppim = pmap_preinit_mapping + i; 5223 if (ppim->pa == pa && ppim->sz == size && 5224 ppim->mode == mode) 5225 return ((void *)(ppim->va + offset)); 5226 } 5227 va = kva_alloc(size); 5228 if (va == 0) 5229 panic("%s: Couldn't allocate KVA", __func__); 5230 } 5231 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5232 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5233 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 5234 pmap_invalidate_cache_range(va, va + size, FALSE); 5235 return ((void *)(va + offset)); 5236} 5237 5238void * 5239pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5240{ 5241 5242 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5243} 5244 5245void * 5246pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5247{ 5248 5249 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5250} 5251 5252void 5253pmap_unmapdev(vm_offset_t va, vm_size_t size) 5254{ 5255 struct pmap_preinit_mapping *ppim; 5256 vm_offset_t offset; 5257 int i; 5258 5259 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 5260 return; 5261 offset = va & PAGE_MASK; 5262 size = round_page(offset + size); 5263 va = trunc_page(va); 5264 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5265 ppim = pmap_preinit_mapping + i; 5266 if (ppim->va == va && ppim->sz == size) { 5267 if (pmap_initialized) 5268 return; 5269 ppim->pa = 0; 5270 ppim->va = 0; 5271 ppim->sz = 0; 5272 ppim->mode = 0; 5273 if (va + size == virtual_avail) 5274 virtual_avail = va; 5275 return; 5276 } 5277 } 5278 if (pmap_initialized) 5279 kva_free(va, size); 5280} 5281 5282/* 5283 * Sets the memory attribute for the specified page. 
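 *
 * For non-fictitious pages any cached data must also be flushed: an
 * existing sf_buf mapping of the page is used when one exists, and
 * otherwise the page is mapped transiently and flushed by
 * pmap_flush_page() unless the CPU supports self-snooping.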
5284 */ 5285void 5286pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5287{ 5288 5289 m->md.pat_mode = ma; 5290 if ((m->flags & PG_FICTITIOUS) != 0) 5291 return; 5292 5293 /* 5294 * If "m" is a normal page, flush it from the cache. 5295 * See pmap_invalidate_cache_range(). 5296 * 5297 * First, try to find an existing mapping of the page by sf 5298 * buffer. sf_buf_invalidate_cache() modifies mapping and 5299 * flushes the cache. 5300 */ 5301 if (sf_buf_invalidate_cache(m)) 5302 return; 5303 5304 /* 5305 * If page is not mapped by sf buffer, but CPU does not 5306 * support self snoop, map the page transient and do 5307 * invalidation. In the worst case, whole cache is flushed by 5308 * pmap_invalidate_cache_range(). 5309 */ 5310 if ((cpu_feature & CPUID_SS) == 0) 5311 pmap_flush_page(m); 5312} 5313 5314static void 5315pmap_flush_page(vm_page_t m) 5316{ 5317 struct sysmaps *sysmaps; 5318 vm_offset_t sva, eva; 5319 5320 if ((cpu_feature & CPUID_CLFSH) != 0) { 5321 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 5322 mtx_lock(&sysmaps->lock); 5323 if (*sysmaps->CMAP2) 5324 panic("pmap_flush_page: CMAP2 busy"); 5325 sched_pin(); 5326 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | 5327 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); 5328 invlcaddr(sysmaps->CADDR2); 5329 sva = (vm_offset_t)sysmaps->CADDR2; 5330 eva = sva + PAGE_SIZE; 5331 5332 /* 5333 * Use mfence despite the ordering implied by 5334 * mtx_{un,}lock() because clflush is not guaranteed 5335 * to be ordered by any other instruction. 5336 */ 5337 mfence(); 5338 for (; sva < eva; sva += cpu_clflush_line_size) 5339 clflush(sva); 5340 mfence(); 5341 *sysmaps->CMAP2 = 0; 5342 sched_unpin(); 5343 mtx_unlock(&sysmaps->lock); 5344 } else 5345 pmap_invalidate_cache(); 5346} 5347 5348/* 5349 * Changes the specified virtual address range's memory type to that given by 5350 * the parameter "mode". The specified virtual address range must be 5351 * completely contained within either the kernel map. 5352 * 5353 * Returns zero if the change completed successfully, and either EINVAL or 5354 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5355 * of the virtual address range was not mapped, and ENOMEM is returned if 5356 * there was insufficient memory available to complete the change. 5357 */ 5358int 5359pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5360{ 5361 vm_offset_t base, offset, tmpva; 5362 pd_entry_t *pde; 5363 pt_entry_t *pte; 5364 int cache_bits_pte, cache_bits_pde; 5365 boolean_t changed; 5366 5367 base = trunc_page(va); 5368 offset = va & PAGE_MASK; 5369 size = round_page(offset + size); 5370 5371 /* 5372 * Only supported on kernel virtual addresses above the recursive map. 5373 */ 5374 if (base < VM_MIN_KERNEL_ADDRESS) 5375 return (EINVAL); 5376 5377 cache_bits_pde = pmap_cache_bits(mode, 1); 5378 cache_bits_pte = pmap_cache_bits(mode, 0); 5379 changed = FALSE; 5380 5381 /* 5382 * Pages that aren't mapped aren't supported. Also break down 5383 * 2/4MB pages into 4KB pages if required. 5384 */ 5385 PMAP_LOCK(kernel_pmap); 5386 for (tmpva = base; tmpva < base + size; ) { 5387 pde = pmap_pde(kernel_pmap, tmpva); 5388 if (*pde == 0) { 5389 PMAP_UNLOCK(kernel_pmap); 5390 return (EINVAL); 5391 } 5392 if (*pde & PG_PS) { 5393 /* 5394 * If the current 2/4MB page already has 5395 * the required memory type, then we need not 5396 * demote this page. Just increment tmpva to 5397 * the next 2/4MB page frame. 
5398 */ 5399 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5400 tmpva = trunc_4mpage(tmpva) + NBPDR; 5401 continue; 5402 } 5403 5404 /* 5405 * If the current offset aligns with a 2/4MB 5406 * page frame and there is at least 2/4MB left 5407 * within the range, then we need not break 5408 * down this page into 4KB pages. 5409 */ 5410 if ((tmpva & PDRMASK) == 0 && 5411 tmpva + PDRMASK < base + size) { 5412 tmpva += NBPDR; 5413 continue; 5414 } 5415 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5416 PMAP_UNLOCK(kernel_pmap); 5417 return (ENOMEM); 5418 } 5419 } 5420 pte = vtopte(tmpva); 5421 if (*pte == 0) { 5422 PMAP_UNLOCK(kernel_pmap); 5423 return (EINVAL); 5424 } 5425 tmpva += PAGE_SIZE; 5426 } 5427 PMAP_UNLOCK(kernel_pmap); 5428 5429 /* 5430 * Ok, all the pages exist, so run through them updating their 5431 * cache mode if required. 5432 */ 5433 for (tmpva = base; tmpva < base + size; ) { 5434 pde = pmap_pde(kernel_pmap, tmpva); 5435 if (*pde & PG_PS) { 5436 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5437 pmap_pde_attr(pde, cache_bits_pde); 5438 changed = TRUE; 5439 } 5440 tmpva = trunc_4mpage(tmpva) + NBPDR; 5441 } else { 5442 pte = vtopte(tmpva); 5443 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5444 pmap_pte_attr(pte, cache_bits_pte); 5445 changed = TRUE; 5446 } 5447 tmpva += PAGE_SIZE; 5448 } 5449 } 5450 5451 /* 5452 * Flush CPU caches to make sure any data isn't cached that 5453 * shouldn't be, etc. 5454 */ 5455 if (changed) { 5456 pmap_invalidate_range(kernel_pmap, base, tmpva); 5457 pmap_invalidate_cache_range(base, tmpva, FALSE); 5458 } 5459 return (0); 5460} 5461 5462/* 5463 * perform the pmap work for mincore 5464 */ 5465int 5466pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5467{ 5468 pd_entry_t *pdep; 5469 pt_entry_t *ptep, pte; 5470 vm_paddr_t pa; 5471 int val; 5472 5473 PMAP_LOCK(pmap); 5474retry: 5475 pdep = pmap_pde(pmap, addr); 5476 if (*pdep != 0) { 5477 if (*pdep & PG_PS) { 5478 pte = *pdep; 5479 /* Compute the physical address of the 4KB page. */ 5480 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 5481 PG_FRAME; 5482 val = MINCORE_SUPER; 5483 } else { 5484 ptep = pmap_pte(pmap, addr); 5485 pte = *ptep; 5486 pmap_pte_release(ptep); 5487 pa = pte & PG_FRAME; 5488 val = 0; 5489 } 5490 } else { 5491 pte = 0; 5492 pa = 0; 5493 val = 0; 5494 } 5495 if ((pte & PG_V) != 0) { 5496 val |= MINCORE_INCORE; 5497 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5498 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5499 if ((pte & PG_A) != 0) 5500 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5501 } 5502 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5503 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5504 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5505 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
*/ 5506 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5507 goto retry; 5508 } else 5509 PA_UNLOCK_COND(*locked_pa); 5510 PMAP_UNLOCK(pmap); 5511 return (val); 5512} 5513 5514void 5515pmap_activate(struct thread *td) 5516{ 5517 pmap_t pmap, oldpmap; 5518 u_int cpuid; 5519 u_int32_t cr3; 5520 5521 critical_enter(); 5522 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5523 oldpmap = PCPU_GET(curpmap); 5524 cpuid = PCPU_GET(cpuid); 5525#if defined(SMP) 5526 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5527 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5528#else 5529 CPU_CLR(cpuid, &oldpmap->pm_active); 5530 CPU_SET(cpuid, &pmap->pm_active); 5531#endif 5532#if defined(PAE) || defined(PAE_TABLES) 5533 cr3 = vtophys(pmap->pm_pdpt); 5534#else 5535 cr3 = vtophys(pmap->pm_pdir); 5536#endif 5537 /* 5538 * pmap_activate is for the current thread on the current cpu 5539 */ 5540 td->td_pcb->pcb_cr3 = cr3; 5541 load_cr3(cr3); 5542 PCPU_SET(curpmap, pmap); 5543 critical_exit(); 5544} 5545 5546void 5547pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5548{ 5549} 5550 5551/* 5552 * Increase the starting virtual address of the given mapping if a 5553 * different alignment might result in more superpage mappings. 5554 */ 5555void 5556pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5557 vm_offset_t *addr, vm_size_t size) 5558{ 5559 vm_offset_t superpage_offset; 5560 5561 if (size < NBPDR) 5562 return; 5563 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5564 offset += ptoa(object->pg_color); 5565 superpage_offset = offset & PDRMASK; 5566 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5567 (*addr & PDRMASK) == superpage_offset) 5568 return; 5569 if ((*addr & PDRMASK) < superpage_offset) 5570 *addr = (*addr & ~PDRMASK) + superpage_offset; 5571 else 5572 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5573} 5574 5575 5576#if defined(PMAP_DEBUG) 5577pmap_pid_dump(int pid) 5578{ 5579 pmap_t pmap; 5580 struct proc *p; 5581 int npte = 0; 5582 int index; 5583 5584 sx_slock(&allproc_lock); 5585 FOREACH_PROC_IN_SYSTEM(p) { 5586 if (p->p_pid != pid) 5587 continue; 5588 5589 if (p->p_vmspace) { 5590 int i,j; 5591 index = 0; 5592 pmap = vmspace_pmap(p->p_vmspace); 5593 for (i = 0; i < NPDEPTD; i++) { 5594 pd_entry_t *pde; 5595 pt_entry_t *pte; 5596 vm_offset_t base = i << PDRSHIFT; 5597 5598 pde = &pmap->pm_pdir[i]; 5599 if (pde && pmap_pde_v(pde)) { 5600 for (j = 0; j < NPTEPG; j++) { 5601 vm_offset_t va = base + (j << PAGE_SHIFT); 5602 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 5603 if (index) { 5604 index = 0; 5605 printf("\n"); 5606 } 5607 sx_sunlock(&allproc_lock); 5608 return (npte); 5609 } 5610 pte = pmap_pte(pmap, va); 5611 if (pte && pmap_pte_v(pte)) { 5612 pt_entry_t pa; 5613 vm_page_t m; 5614 pa = *pte; 5615 m = PHYS_TO_VM_PAGE(pa & PG_FRAME); 5616 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 5617 va, pa, m->hold_count, m->wire_count, m->flags); 5618 npte++; 5619 index++; 5620 if (index >= 2) { 5621 index = 0; 5622 printf("\n"); 5623 } else { 5624 printf(" "); 5625 } 5626 } 5627 } 5628 } 5629 } 5630 } 5631 } 5632 sx_sunlock(&allproc_lock); 5633 return (npte); 5634} 5635#endif 5636 5637#if defined(DEBUG) 5638 5639static void pads(pmap_t pm); 5640void pmap_pvdump(vm_paddr_t pa); 5641 5642/* print address space of pmap*/ 5643static void 5644pads(pmap_t pm) 5645{ 5646 int i, j; 5647 vm_paddr_t va; 5648 pt_entry_t *ptep; 5649 5650 if (pm == kernel_pmap) 5651 return; 5652 for (i = 0; i < NPDEPTD; i++) 5653 if (pm->pm_pdir[i]) 5654 for (j = 0; j < NPTEPG; 
j++) { 5655 va = (i << PDRSHIFT) + (j << PAGE_SHIFT); 5656 if (pm == kernel_pmap && va < KERNBASE) 5657 continue; 5658 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) 5659 continue; 5660 ptep = pmap_pte(pm, va); 5661 if (pmap_pte_v(ptep)) 5662 printf("%x:%x ", va, *ptep); 5663 }; 5664 5665} 5666 5667void 5668pmap_pvdump(vm_paddr_t pa) 5669{ 5670 pv_entry_t pv; 5671 pmap_t pmap; 5672 vm_page_t m; 5673 5674 printf("pa %x", pa); 5675 m = PHYS_TO_VM_PAGE(pa); 5676 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5677 pmap = PV_PMAP(pv); 5678 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); 5679 pads(pmap); 5680 } 5681 printf(" "); 5682} 5683#endif 5684