/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/i386/i386/pmap.c 276546 2015-01-02 17:45:52Z alc $");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_pmap.h"
#include "opt_smp.h"
#include "opt_xbox.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#else
#include <sys/cpuset.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#ifdef DEV_APIC
#include <sys/bus.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#endif
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifdef XBOX
#include <machine/xbox.h>
#endif

#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
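
/*
 * Worked example (a sketch, assuming the non-PAE layout where PDRSHIFT is
 * 22): for va == 0xbfc01234, (vm_offset_t)va >> PDRSHIFT == 0x2ff, so
 * pmap_pde(pmap, va) yields &pmap->pm_pdir[0x2ff] and pdir_pde(PTD, va)
 * reads PTD[0x2ff] directly.
 */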

#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)

#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))

struct pmap kernel_pmap_store;
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
int pgeflag = 0;		/* PG_G or-in */
int pseflag = 0;		/* PG_PS or-in */

static int nkpt = NKPT;
vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
extern u_int32_t KERNend;
extern u_int32_t KPTphys;

#ifdef PAE
pt_entry_t pg_nx;
static uma_zone_t pdptzone;
#endif

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static struct rwlock_padalign pvh_global_lock;

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table;
static int shpgperproc = PMAP_SHPGPERPROC;

struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
int pv_maxchunks;			/* How many chunks we have KVA for */
vm_offset_t pv_vafree;			/* freelist stored in the PTE */

/*
 * All those kernel PT submaps that BSD is so fond of
 */
struct sysmaps {
	struct mtx lock;
	pt_entry_t *CMAP1;
	pt_entry_t *CMAP2;
	caddr_t CADDR1;
	caddr_t CADDR2;
};
static struct sysmaps sysmaps_pcpu[MAXCPU];
pt_entry_t *CMAP3;
static pd_entry_t *KPTD;
caddr_t ptvmmap = 0;
caddr_t CADDR3;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static pt_entry_t *PMAP1 = 0, *PMAP2;
static pt_entry_t *PADDR1 = 0, *PADDR2;
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
    &PMAP1changedcpu, 0,
    "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
    &PMAP1changed, 0,
    "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
    &PMAP1unchanged, 0,
    "Number of times pmap_pte_quick didn't change PMAP1");
static struct mtx PMAP2mutex;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_flush_page(vm_page_t m);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    struct spglist *free);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
    struct spglist *free);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
    vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
#ifdef PAE
static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
#endif
static void pmap_set_pg(void);

static __inline void pagezero(void *page);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * If you get an error here, then you set KVA_PAGES wrong!  See the
 * description of KVA_PAGES in sys/i386/include/pmap.h.  It must be a
 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
 */
CTASSERT(KERNBASE % (1 << 24) == 0);

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On the i386 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte, *unused;
	struct sysmaps *sysmaps;
	int i;

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	/*
	 * Initialize the first available kernel virtual address.  However,
	 * using "firstaddr" may waste a few pages of the kernel virtual
	 * address space, because locore may not have mapped every physical
	 * page that it allocated.  Preferably, locore would provide a first
	 * unused virtual address in addition to "firstaddr".
	 */
	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
#ifdef PAE
	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
#endif
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);

	/*
	 * Initialize the global pv list lock.
	 */
	rw_init(&pvh_global_lock, "pmap pv global");

	LIST_INIT(&allpmaps);

	/*
	 * Request a spin mutex so that changes to allpmaps cannot be
	 * preempted by smp_rendezvous_cpus().  Otherwise,
	 * pmap_update_pde_kernel() could access allpmaps while it is
	 * being changed.
	 */
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

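	/*
	 * For reference, a sketch of how one SYSMAP() use below expands;
	 * e.g., SYSMAP(caddr_t, CMAP3, CADDR3, 1) becomes
	 *
	 *	CADDR3 = (caddr_t)va; va += 1 * PAGE_SIZE; CMAP3 = pte; pte += 1;
	 *
	 * that is, each use hands out "n" pages of KVA and remembers the
	 * first PTE that maps them.
	 */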
	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 * CMAP3 is used for the idle process page zeroing.
	 */
	for (i = 0; i < MAXCPU; i++) {
		sysmaps = &sysmaps_pcpu[i];
		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
	}
	SYSMAP(caddr_t, CMAP3, CADDR3, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	/*
	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))

	/*
	 * KPTmap is used by pmap_kextract().
	 *
	 * KPTmap is first initialized by locore.  However, that initial
	 * KPTmap can only support NKPT page table pages.  Here, a larger
	 * KPTmap is created that can support KVA_PAGES page table pages.
	 */
	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)

	for (i = 0; i < NKPT; i++)
		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;

	/*
	 * Adjust the start of the KPTD and KPTmap so that the implementation
	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
	 */
	KPTD -= KPTDI;
	KPTmap -= i386_btop(KPTDI << PDRSHIFT);

	/*
	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
	 * respectively.
	 */
	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)

	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

	virtual_avail = va;

	/*
	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
	 * physical memory region that is used by the ACPI wakeup code.  This
	 * mapping must not have PG_G set.
	 */
#ifdef XBOX
	/* FIXME: This is gross, but needed for the XBOX.  Since we are at
	 * such an early stage, we cannot yet neatly map video memory ... :-(
	 * Better fixes are very welcome! */
	if (!arch_i386_is_xbox)
#endif
	for (i = 1; i < NKPT; i++)
		PTD[i] = 0;

	/* Initialize the PAT MSR if present. */
	pmap_init_pat();

	/* Turn on PG_G on kernel page(s) */
	pmap_set_pg();
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0) {
		for (i = 0; i < PAT_INDEX_SIZE; i++)
			pat_index[i] = pat_table[i];
		pat_works = 0;
		return;
	}

	/*
	 * Due to some Intel errata, we can only safely use the lower 4
	 * PAT entries.
	 *
	 *   Intel Pentium III Processor Specification Update
	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
	 * or Mode C Paging)
	 *
	 *   Intel Pentium IV Processor Specification Update
	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
		pat_works = 0;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

/*
 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 */
static void
pmap_set_pg(void)
{
	pt_entry_t *pte;
	vm_offset_t va, endva;

	if (pgeflag == 0)
		return;

	endva = KERNBASE + KERNend;

	if (pseflag) {
		va = KERNBASE + KERNLOAD;
		while (va < endva) {
			pdir_pde(PTD, va) |= pgeflag;
			invltlb();	/* Play it safe, invltlb() every time */
			va += NBPDR;
		}
	} else {
		va = (vm_offset_t)btext;
		while (va < endva) {
			pte = vtopte(va);
			if (*pte)
				*pte |= pgeflag;
			invltlb();	/* Play it safe, invltlb() every time */
			va += PAGE_SIZE;
		}
	}
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

#ifdef PAE
static void *
pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif

/*
 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
 * Requirements:
 *  - Must deal with pages in order to ensure that none of the PG_* bits
 *    are ever set, PG_V in particular.
 *  - Assumes we can write to ptes without pte_store() atomic ops, even
 *    on PAE systems.  This should be ok.
 *  - Assumes nothing will ever test these addresses for 0 to indicate
 *    no mapping instead of correctly checking PG_V.
 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
 * Because PG_V is never set, there can be no mappings to invalidate.
 */
static vm_offset_t
pmap_ptelist_alloc(vm_offset_t *head)
{
	pt_entry_t *pte;
	vm_offset_t va;

	va = *head;
	if (va == 0)
		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
	pte = vtopte(va);
	*head = *pte;
	if (*head & PG_V)
		panic("pmap_ptelist_alloc: va with PG_V set!");
	*pte = 0;
	return (va);
}

static void
pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
{
	pt_entry_t *pte;

	if (va & PG_V)
		panic("pmap_ptelist_free: freeing va with PG_V set!");
	pte = vtopte(va);
	*pte = *head;		/* virtual! PG_V is 0 though */
	*head = va;
}

static void
pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
{
	int i;
	vm_offset_t va;

	*head = 0;
	for (i = npages - 1; i >= 0; i--) {
		va = (vm_offset_t)base + i * PAGE_SIZE;
		pmap_ptelist_free(head, va);
	}
}
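
/*
 * Usage sketch (illustration only): pv chunk KVA is carved out of this
 * freelist and later returned to it, roughly
 *
 *	va = pmap_ptelist_alloc(&pv_vafree);	(claim one page of KVA)
 *	... map a pv chunk page at va ...
 *	pmap_ptelist_free(&pv_vafree, va);	(return the KVA)
 */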

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_page_t mpte;
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < NKPT; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = i + KPTDI;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_max = roundup(pv_entry_max, _NPCPV);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings supported and enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pseflag == 0)
		pg_ps_enabled = 0;
	else if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Calculate the size of the pv head table for superpages.
	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
	 */
	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
	    PAGE_SIZE) / NBPDR + 1;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
	if (pv_chunkbase == NULL)
		panic("pmap_init: not enough kvm for pv chunks");
	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
#ifdef PAE
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif
}


SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
	"Max number of PV entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
	"Page share factor per proc");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2/4MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2/4MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2/4MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2/4MB page promotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	/* The PAT bit is different for PTE's and PDE's. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	/* Map the caching mode to a PAT index. */
	pat_idx = pat_index[mode];

	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
	cache_bits = 0;
	if (pat_idx & 0x4)
		cache_bits |= pat_flag;
	if (pat_idx & 0x2)
		cache_bits |= PG_NC_PCD;
	if (pat_idx & 0x1)
		cache_bits |= PG_NC_PWT;
	return (cache_bits);
}
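
/*
 * Worked example (a sketch): with a fully functional PAT (pat_works != 0),
 * pat_index[PAT_WRITE_COMBINING] is 6 (binary 110), so pmap_cache_bits()
 * returns PG_PTE_PAT | PG_NC_PCD for a PTE, or PG_PDE_PAT | PG_NC_PCD for
 * a PDE.
 */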

/*
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
	pd_entry_t *pde;
	pmap_t pmap;
	boolean_t PTD_updated;

	PTD_updated = FALSE;
	mtx_lock_spin(&allpmaps_lock);
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
		    PG_FRAME))
			PTD_updated = TRUE;
		pde = pmap_pde(pmap, va);
		pde_store(pde, newpde);
	}
	mtx_unlock_spin(&allpmaps_lock);
	KASSERT(PTD_updated,
	    ("pmap_kenter_pde: current page table is not in allpmaps"));
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
	u_long cr4;

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);
		/*
		 * Although preemption at this point could be detrimental to
		 * performance, it would not lead to an error.  PG_G is simply
		 * ignored if CR4.PGE is clear.  Moreover, in case this block
		 * is re-entered, the load_cr4() either above or below will
		 * modify CR4.PGE flushing the TLB.
		 */
		load_cr4(cr4 | CR4_PGE);
	}
}
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invlpg(va);
		smp_invlpg(va);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invlpg(va);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg(other_cpus, va);
	}
	sched_unpin();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t other_cpus;
	vm_offset_t addr;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg_range(other_cpus, sva, eva);
	}
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invltlb();
		smp_invltlb();
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invltlb();
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invltlb(other_cpus);
	}
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

static void
pmap_update_pde_kernel(void *arg)
{
	struct pde_action *act = arg;
	pd_entry_t *pde;
	pmap_t pmap;

	if (act->store == PCPU_GET(cpuid)) {

		/*
		 * Elsewhere, this operation requires allpmaps_lock for
		 * synchronization.  Here, it does not because it is being
		 * performed in the context of an all_cpus rendezvous.
		 */
		LIST_FOREACH(pmap, &allpmaps, pm_list) {
			pde = pmap_pde(pmap, act->va);
			pde_store(pde, act->newpde);
		}
	}
}

static void
pmap_update_pde_user(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->va, act->newpde);
}
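
/*
 * Together, the handlers above implement one rendezvous-based PDE update
 * (a sketch of the sequence; the dispatch is in pmap_update_pde() below):
 * every CPU in act->invalidate enters the rendezvous, the one CPU named
 * by act->store writes the new PDE, and then each CPU flushes its own
 * TLB in the teardown, so no CPU can mix stale and fresh translations.
 */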
It prevents a 1093 * machine check exception for a TLB state that is improperly diagnosed as a 1094 * hardware error. 1095 */ 1096static void 1097pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1098{ 1099 struct pde_action act; 1100 cpuset_t active, other_cpus; 1101 u_int cpuid; 1102 1103 sched_pin(); 1104 cpuid = PCPU_GET(cpuid); 1105 other_cpus = all_cpus; 1106 CPU_CLR(cpuid, &other_cpus); 1107 if (pmap == kernel_pmap) 1108 active = all_cpus; 1109 else 1110 active = pmap->pm_active; 1111 if (CPU_OVERLAP(&active, &other_cpus)) { 1112 act.store = cpuid; 1113 act.invalidate = active; 1114 act.va = va; 1115 act.pde = pde; 1116 act.newpde = newpde; 1117 CPU_SET(cpuid, &active); 1118 smp_rendezvous_cpus(active, 1119 smp_no_rendevous_barrier, pmap == kernel_pmap ? 1120 pmap_update_pde_kernel : pmap_update_pde_user, 1121 pmap_update_pde_teardown, &act); 1122 } else { 1123 if (pmap == kernel_pmap) 1124 pmap_kenter_pde(va, newpde); 1125 else 1126 pde_store(pde, newpde); 1127 if (CPU_ISSET(cpuid, &active)) 1128 pmap_update_pde_invalidate(va, newpde); 1129 } 1130 sched_unpin(); 1131} 1132#else /* !SMP */ 1133/* 1134 * Normal, non-SMP, 486+ invalidation functions. 1135 * We inline these within pmap.c for speed. 1136 */ 1137PMAP_INLINE void 1138pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1139{ 1140 1141 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1142 invlpg(va); 1143} 1144 1145PMAP_INLINE void 1146pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1147{ 1148 vm_offset_t addr; 1149 1150 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1151 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1152 invlpg(addr); 1153} 1154 1155PMAP_INLINE void 1156pmap_invalidate_all(pmap_t pmap) 1157{ 1158 1159 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1160 invltlb(); 1161} 1162 1163PMAP_INLINE void 1164pmap_invalidate_cache(void) 1165{ 1166 1167 wbinvd(); 1168} 1169 1170static void 1171pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1172{ 1173 1174 if (pmap == kernel_pmap) 1175 pmap_kenter_pde(va, newpde); 1176 else 1177 pde_store(pde, newpde); 1178 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1179 pmap_update_pde_invalidate(va, newpde); 1180} 1181#endif /* !SMP */ 1182 1183#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1184 1185void 1186pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) 1187{ 1188 1189 if (force) { 1190 sva &= ~(vm_offset_t)cpu_clflush_line_size; 1191 } else { 1192 KASSERT((sva & PAGE_MASK) == 0, 1193 ("pmap_invalidate_cache_range: sva not page-aligned")); 1194 KASSERT((eva & PAGE_MASK) == 0, 1195 ("pmap_invalidate_cache_range: eva not page-aligned")); 1196 } 1197 1198 if ((cpu_feature & CPUID_SS) != 0 && !force) 1199 ; /* If "Self Snoop" is supported and allowed, do nothing. */ 1200 else if ((cpu_feature & CPUID_CLFSH) != 0 && 1201 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1202 1203#ifdef DEV_APIC 1204 /* 1205 * XXX: Some CPUs fault, hang, or trash the local APIC 1206 * registers if we use CLFLUSH on the local APIC 1207 * range. The local APIC is always uncached, so we 1208 * don't need to flush for that range anyway. 1209 */ 1210 if (pmap_kextract(sva) == lapic_paddr) 1211 return; 1212#endif 1213 /* 1214 * Otherwise, do per-cache line flush. Use the mfence 1215 * instruction to insure that previous stores are 1216 * included in the write-back. The processor 1217 * propagates flush to other processors in the cache 1218 * coherence domain. 
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
	} else {

		/*
		 * No targeted cache flush methods are supported by CPU,
		 * or the supplied range is bigger than 2MB.
		 * Globally invalidate cache.
		 */
		pmap_invalidate_cache();
	}
}

void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
	int i;

	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
	    (cpu_feature & CPUID_CLFSH) == 0) {
		pmap_invalidate_cache();
	} else {
		for (i = 0; i < count; i++)
			pmap_flush_page(pages[i]);
	}
}

/*
 * Are we current address space or kernel?  N.B. We return FALSE when
 * a pmap's page table is in use because a kernel thread is borrowing
 * it.  The borrowed page table can change spontaneously, making any
 * dependence on its continued use subject to a race condition.
 */
static __inline int
pmap_is_current(pmap_t pmap)
{

	return (pmap == kernel_pmap ||
	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
}

/*
 * If the given pmap is not the current or kernel pmap, the returned pte must
 * be released by passing it to pmap_pte_release().
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_lock(&PMAP2mutex);
		newpf = *pde & PG_FRAME;
		if ((*PMAP2 & PG_FRAME) != newpf) {
			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
		}
		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (NULL);
}

/*
 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 * being NULL.
 */
static __inline void
pmap_pte_release(pt_entry_t *pte)
{

	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
		mtx_unlock(&PMAP2mutex);
}
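
/*
 * Usage sketch (illustration only): reading a PTE from an arbitrary pmap.
 * The returned pointer may be the borrowed PMAP2 mapping, so it must be
 * released:
 *
 *	pte = pmap_pte(pmap, va);
 *	if (pte != NULL) {
 *		val = *pte;
 *		pmap_pte_release(pte);
 *	}
 */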

/*
 * NB:  The sequence of updating a page table followed by accesses to the
 * corresponding pages is subject to the situation described in the "AMD64
 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
 * right after modifying the PTE bits is crucial.
 */
static __inline void
invlcaddr(void *caddr)
{

	invlpg((u_int)caddr);
}

/*
 * Super fast pmap_pte routine best used when scanning
 * the pv lists.  This eliminates many coarse-grained
 * invltlb calls.  Note that many of the pv list
 * scans are across different pmaps.  It is very wasteful
 * to do an entire invltlb for checking a single mapping.
 *
 * If the given pmap is not the current pmap, pvh_global_lock
 * must be held and curthread pinned to a CPU.
 */
static pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		rw_assert(&pvh_global_lock, RA_WLOCKED);
		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
		newpf = *pde & PG_FRAME;
		if ((*PMAP1 & PG_FRAME) != newpf) {
			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR1);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde;

	rtval = 0;
	PMAP_LOCK(pmap);
	pde = pmap->pm_pdir[va >> PDRSHIFT];
	if (pde != 0) {
		if ((pde & PG_PS) != 0)
			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
		else {
			pte = pmap_pte(pmap, va);
			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
			pmap_pte_release(pte);
		}
	}
	PMAP_UNLOCK(pmap);
	return (rtval);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde;
	pt_entry_t pte, *ptep;
	vm_page_t m;
	vm_paddr_t pa;

	pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	pde = *pmap_pde(pmap, va);
	if (pde != 0) {
		if (pde & PG_PS) {
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
				if (vm_page_pa_tryrelock(pmap, (pde &
				    PG_PS_FRAME) | (va & PDRMASK), &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
				    (va & PDRMASK));
				vm_page_hold(m);
			}
		} else {
			ptep = pmap_pte(pmap, va);
			pte = *ptep;
			pmap_pte_release(ptep);
			if (pte != 0 &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
				    &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
				vm_page_hold(m);
			}
		}
	}
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
}

static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.  Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.  Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va, sva;
	vm_paddr_t superpage_offset;
	pd_entry_t newpde;

	va = *virt;
	/*
	 * Does the physical address range's size and alignment permit at
	 * least one superpage mapping to be created?
	 */
	superpage_offset = start & PDRMASK;
	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
		/*
		 * Increase the starting virtual address so that its alignment
		 * does not preclude the use of superpage mappings.
		 */
		if ((va & PDRMASK) < superpage_offset)
			va = (va & ~PDRMASK) + superpage_offset;
		else if ((va & PDRMASK) > superpage_offset)
			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
	}
	sva = va;
	while (start < end) {
		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
		    pseflag) {
			KASSERT((va & PDRMASK) == 0,
			    ("pmap_map: misaligned va %#x", va));
			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
			pmap_kenter_pde(va, newpde);
			va += NBPDR;
			start += NBPDR;
		} else {
			pmap_kenter(va, start);
			va += PAGE_SIZE;
			start += PAGE_SIZE;
		}
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
	*virt = va;
	return (sva);
}


/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *endpte, oldpte, pa, *pte;
	vm_page_t m;

	oldpte = 0;
	pte = vtopte(sva);
	endpte = pte + count;
	while (pte < endpte) {
		m = *ma++;
		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
			oldpte |= *pte;
			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
		}
		pte++;
	}
	if (__predict_false((oldpte & PG_V) != 0))
		pmap_invalidate_range(kernel_pmap, sva, sva + count *
		    PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}
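
/*
 * Usage sketch (illustration only): a temporary mapping of "count" wired
 * pages, torn down with a single ranged shootdown:
 *
 *	pmap_qenter(sva, ma, count);
 *	... access the pages through sva ...
 *	pmap_qremove(sva, count);
 */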

/***************************************************
 * Page table page management routines.....
 ***************************************************/
static __inline void
pmap_free_zero_pages(struct spglist *free)
{
	vm_page_t m;

	while ((m = SLIST_FIRST(free)) != NULL) {
		SLIST_REMOVE_HEAD(free, plinks.s.ss);
		/* Preserve the page's PG_ZERO setting. */
		vm_page_free_toq(m);
	}
}

/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_insert(&pmap->pm_root, mpte));
}

/*
 * Looks for a page table page mapping the specified virtual address in the
 * specified pmap's collection of idle page table pages.  Returns NULL if there
 * is no page table page corresponding to the specified virtual address.
 */
static __inline vm_page_t
pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT));
}

/*
 * Removes the specified page table page from the specified pmap's collection
 * of idle page table pages.  The specified page table page must be a member of
 * the pmap's collection.
 */
static __inline void
pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	vm_radix_remove(&pmap->pm_root, mpte->pindex);
}

/*
 * Decrements a page table page's wire count, which is used to record the
 * number of valid page table entries within the page.  If the wire count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
{

	--m->wire_count;
	if (m->wire_count == 0) {
		_pmap_unwire_ptp(pmap, m, free);
		return (TRUE);
	} else
		return (FALSE);
}
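
/*
 * Typical pattern (a sketch): callers accumulate unmapped page table
 * pages on a local list, shoot down the TLB, and only then free them:
 *
 *	struct spglist free;
 *
 *	SLIST_INIT(&free);
 *	... pmap_unwire_ptp(pmap, mpte, &free) ...
 *	... TLB invalidation ...
 *	pmap_free_zero_pages(&free);
 */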

static void
_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
{
	vm_offset_t pteva;

	/*
	 * unmap the page table page
	 */
	pmap->pm_pdir[m->pindex] = 0;
	--pmap->pm_stats.resident_count;

	/*
	 * This is a release store so that the ordinary store unmapping
	 * the page table page is globally performed before TLB shoot-
	 * down is begun.
	 */
	atomic_subtract_rel_int(&cnt.v_wire_count, 1);

	/*
	 * Do an invltlb to make the invalidated mapping
	 * take effect immediately.
	 */
	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
	pmap_invalidate_page(pmap, pteva);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
{
	pd_entry_t ptepde;
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	ptepde = *pmap_pde(pmap, va);
	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
	return (pmap_unwire_ptp(pmap, mpte, free));
}

/*
 * Initialize the pmap for the swapper process.
 */
void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	/*
	 * Since the page table directory is shared with the kernel pmap,
	 * which is already included in the list "allpmaps", this pmap does
	 * not need to be inserted into that list.
	 */
	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
#ifdef PAE
	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
#endif
	pmap->pm_root.rt_root = 0;
	CPU_ZERO(&pmap->pm_active);
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
int
pmap_pinit(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	vm_paddr_t pa;
	int i;

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pdir == NULL) {
		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
		if (pmap->pm_pdir == NULL)
			return (0);
#ifdef PAE
		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
		KASSERT(((vm_offset_t)pmap->pm_pdpt &
		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
		    ("pmap_pinit: pdpt misaligned"));
		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
		    ("pmap_pinit: pdpt above 4g"));
#endif
		pmap->pm_root.rt_root = 0;
	}
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_pinit: pmap has reserved page table page(s)"));

	/*
	 * allocate the page directory page(s)
	 */
	for (i = 0; i < NPGPTD;) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
		if (m == NULL)
			VM_WAIT;
		else {
			ptdpg[i++] = m;
		}
	}

	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);

	for (i = 0; i < NPGPTD; i++)
		if ((ptdpg[i]->flags & PG_ZERO) == 0)
			pagezero(pmap->pm_pdir + (i * NPDEPG));

	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	/* Copy the kernel page table directory entries. */
	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
	mtx_unlock_spin(&allpmaps_lock);

	/* install self-referential address mapping entry(s) */
	for (i = 0; i < NPGPTD; i++) {
		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
#ifdef PAE
		pmap->pm_pdpt[i] = pa | PG_V;
#endif
	}

	CPU_ZERO(&pmap->pm_active);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

	return (1);
}

/*
 * this routine is called if the page table page is not
 * mapped correctly.
 */
static vm_page_t
_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
{
	vm_paddr_t ptepa;
	vm_page_t m;

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
			PMAP_UNLOCK(pmap);
			rw_wunlock(&pvh_global_lock);
			VM_WAIT;
			rw_wlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	pmap->pm_stats.resident_count++;

	ptepa = VM_PAGE_TO_PHYS(m);
	pmap->pm_pdir[ptepindex] =
	    (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);

	return (m);
}

static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
{
	u_int ptepindex;
	pd_entry_t ptepa;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = va >> PDRSHIFT;
retry:
	/*
	 * Get the page directory entry
	 */
	ptepa = pmap->pm_pdir[ptepindex];

	/*
	 * This supports switching from a 4MB page to a
	 * normal 4K page.
	 */
	if (ptepa & PG_PS) {
		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
		ptepa = pmap->pm_pdir[ptepindex];
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (ptepa) {
		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has
		 * been deallocated.
		 */
		m = _pmap_allocpte(pmap, ptepindex, flags);
		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
			goto retry;
	}
	return (m);
}
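
/*
 * Contract sketch (illustration only): pmap_allocpte() returns NULL only
 * when the caller passed PMAP_ENTER_NOSLEEP and memory was short; a
 * sleeping caller loops internally via the "retry" label above.  A
 * hypothetical caller might react with
 *
 *	if ((mpte = pmap_allocpte(pmap, va, flags)) == NULL &&
 *	    (flags & PMAP_ENTER_NOSLEEP) != 0)
 *		return (KERN_RESOURCE_SHORTAGE);
 */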
1929 */ 1930static cpuset_t *lazymask; 1931static u_int lazyptd; 1932static volatile u_int lazywait; 1933 1934void pmap_lazyfix_action(void); 1935 1936void 1937pmap_lazyfix_action(void) 1938{ 1939 1940#ifdef COUNT_IPIS 1941 (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; 1942#endif 1943 if (rcr3() == lazyptd) 1944 load_cr3(curpcb->pcb_cr3); 1945 CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask); 1946 atomic_store_rel_int(&lazywait, 1); 1947} 1948 1949static void 1950pmap_lazyfix_self(u_int cpuid) 1951{ 1952 1953 if (rcr3() == lazyptd) 1954 load_cr3(curpcb->pcb_cr3); 1955 CPU_CLR_ATOMIC(cpuid, lazymask); 1956} 1957 1958 1959static void 1960pmap_lazyfix(pmap_t pmap) 1961{ 1962 cpuset_t mymask, mask; 1963 u_int cpuid, spins; 1964 int lsb; 1965 1966 mask = pmap->pm_active; 1967 while (!CPU_EMPTY(&mask)) { 1968 spins = 50000000; 1969 1970 /* Find least significant set bit. */ 1971 lsb = CPU_FFS(&mask); 1972 MPASS(lsb != 0); 1973 lsb--; 1974 CPU_SETOF(lsb, &mask); 1975 mtx_lock_spin(&smp_ipi_mtx); 1976#ifdef PAE 1977 lazyptd = vtophys(pmap->pm_pdpt); 1978#else 1979 lazyptd = vtophys(pmap->pm_pdir); 1980#endif 1981 cpuid = PCPU_GET(cpuid); 1982 1983 /* Use a cpuset just for having an easy check. */ 1984 CPU_SETOF(cpuid, &mymask); 1985 if (!CPU_CMP(&mask, &mymask)) { 1986 lazymask = &pmap->pm_active; 1987 pmap_lazyfix_self(cpuid); 1988 } else { 1989 atomic_store_rel_int((u_int *)&lazymask, 1990 (u_int)&pmap->pm_active); 1991 atomic_store_rel_int(&lazywait, 0); 1992 ipi_selected(mask, IPI_LAZYPMAP); 1993 while (lazywait == 0) { 1994 ia32_pause(); 1995 if (--spins == 0) 1996 break; 1997 } 1998 } 1999 mtx_unlock_spin(&smp_ipi_mtx); 2000 if (spins == 0) 2001 printf("pmap_lazyfix: spun for 50000000\n"); 2002 mask = pmap->pm_active; 2003 } 2004} 2005 2006#else /* SMP */ 2007 2008/* 2009 * Cleaning up on uniprocessor is easy. For various reasons, we're 2010 * unlikely to have to even execute this code, including the fact 2011 * that the cleanup is deferred until the parent does a wait(2), which 2012 * means that another userland process has run. 2013 */ 2014static void 2015pmap_lazyfix(pmap_t pmap) 2016{ 2017 u_int cr3; 2018 2019 cr3 = vtophys(pmap->pm_pdir); 2020 if (cr3 == rcr3()) { 2021 load_cr3(curpcb->pcb_cr3); 2022 CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active); 2023 } 2024} 2025#endif /* SMP */ 2026 2027/* 2028 * Release any resources held by the given physical map. 2029 * Called when a pmap initialized by pmap_pinit is being released. 2030 * Should only be called if the map contains no valid mappings. 
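 * (Both preconditions are checked by the KASSERTs below: the resident
 * count must be zero and the pmap must hold no page table pages.)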
2031 */ 2032void 2033pmap_release(pmap_t pmap) 2034{ 2035 vm_page_t m, ptdpg[NPGPTD]; 2036 int i; 2037 2038 KASSERT(pmap->pm_stats.resident_count == 0, 2039 ("pmap_release: pmap resident count %ld != 0", 2040 pmap->pm_stats.resident_count)); 2041 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2042 ("pmap_release: pmap has reserved page table page(s)")); 2043 2044 pmap_lazyfix(pmap); 2045 mtx_lock_spin(&allpmaps_lock); 2046 LIST_REMOVE(pmap, pm_list); 2047 mtx_unlock_spin(&allpmaps_lock); 2048 2049 for (i = 0; i < NPGPTD; i++) 2050 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & 2051 PG_FRAME); 2052 2053 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 2054 sizeof(*pmap->pm_pdir)); 2055 2056 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2057 2058 for (i = 0; i < NPGPTD; i++) { 2059 m = ptdpg[i]; 2060#ifdef PAE 2061 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2062 ("pmap_release: got wrong ptd page")); 2063#endif 2064 m->wire_count--; 2065 atomic_subtract_int(&cnt.v_wire_count, 1); 2066 vm_page_free_zero(m); 2067 } 2068} 2069 2070static int 2071kvm_size(SYSCTL_HANDLER_ARGS) 2072{ 2073 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 2074 2075 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2076} 2077SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2078 0, 0, kvm_size, "IU", "Size of KVM"); 2079 2080static int 2081kvm_free(SYSCTL_HANDLER_ARGS) 2082{ 2083 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2084 2085 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2086} 2087SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2088 0, 0, kvm_free, "IU", "Amount of KVM free"); 2089 2090/* 2091 * grow the number of kernel page table entries, if needed 2092 */ 2093void 2094pmap_growkernel(vm_offset_t addr) 2095{ 2096 vm_paddr_t ptppaddr; 2097 vm_page_t nkpg; 2098 pd_entry_t newpdir; 2099 2100 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2101 addr = roundup2(addr, NBPDR); 2102 if (addr - 1 >= kernel_map->max_offset) 2103 addr = kernel_map->max_offset; 2104 while (kernel_vm_end < addr) { 2105 if (pdir_pde(PTD, kernel_vm_end)) { 2106 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2107 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2108 kernel_vm_end = kernel_map->max_offset; 2109 break; 2110 } 2111 continue; 2112 } 2113 2114 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2115 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2116 VM_ALLOC_ZERO); 2117 if (nkpg == NULL) 2118 panic("pmap_growkernel: no memory to grow kernel"); 2119 2120 nkpt++; 2121 2122 if ((nkpg->flags & PG_ZERO) == 0) 2123 pmap_zero_page(nkpg); 2124 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2125 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2126 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; 2127 2128 pmap_kenter_pde(kernel_vm_end, newpdir); 2129 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2130 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2131 kernel_vm_end = kernel_map->max_offset; 2132 break; 2133 } 2134 } 2135} 2136 2137 2138/*************************************************** 2139 * page management routines. 
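 * pv entries are allocated in page-sized chunks (struct pv_chunk).
 * Each chunk holds _NPCPV (336) entries whose allocation state is
 * tracked by the _NPCM (11) word free bitmap pc_map; an entry's slot
 * is recovered from its address, as in free_pv_entry() below:
 *	idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32;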
2140 ***************************************************/ 2141 2142CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2143CTASSERT(_NPCM == 11); 2144CTASSERT(_NPCPV == 336); 2145 2146static __inline struct pv_chunk * 2147pv_to_chunk(pv_entry_t pv) 2148{ 2149 2150 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2151} 2152 2153#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2154 2155#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2156#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2157 2158static const uint32_t pc_freemask[_NPCM] = { 2159 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2160 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2161 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2162 PC_FREE0_9, PC_FREE10 2163}; 2164 2165SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2166 "Current number of pv entries"); 2167 2168#ifdef PV_STATS 2169static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2170 2171SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2172 "Current number of pv entry chunks"); 2173SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2174 "Current number of pv entry chunks allocated"); 2175SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2176 "Current number of pv entry chunks frees"); 2177SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2178 "Number of times tried to get a chunk page but failed."); 2179 2180static long pv_entry_frees, pv_entry_allocs; 2181static int pv_entry_spare; 2182 2183SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2184 "Current number of pv entry frees"); 2185SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2186 "Current number of pv entry allocs"); 2187SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2188 "Current number of spare pv entries"); 2189#endif 2190 2191/* 2192 * We are in a serious low memory condition. Resort to 2193 * drastic measures to free some pages so we can allocate 2194 * another pv entry chunk. 2195 */ 2196static vm_page_t 2197pmap_pv_reclaim(pmap_t locked_pmap) 2198{ 2199 struct pch newtail; 2200 struct pv_chunk *pc; 2201 struct md_page *pvh; 2202 pd_entry_t *pde; 2203 pmap_t pmap; 2204 pt_entry_t *pte, tpte; 2205 pv_entry_t pv; 2206 vm_offset_t va; 2207 vm_page_t m, m_pc; 2208 struct spglist free; 2209 uint32_t inuse; 2210 int bit, field, freed; 2211 2212 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2213 pmap = NULL; 2214 m_pc = NULL; 2215 SLIST_INIT(&free); 2216 TAILQ_INIT(&newtail); 2217 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2218 SLIST_EMPTY(&free))) { 2219 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2220 if (pmap != pc->pc_pmap) { 2221 if (pmap != NULL) { 2222 pmap_invalidate_all(pmap); 2223 if (pmap != locked_pmap) 2224 PMAP_UNLOCK(pmap); 2225 } 2226 pmap = pc->pc_pmap; 2227 /* Avoid deadlock and lock recursion. */ 2228 if (pmap > locked_pmap) 2229 PMAP_LOCK(pmap); 2230 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2231 pmap = NULL; 2232 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2233 continue; 2234 } 2235 } 2236 2237 /* 2238 * Destroy every non-wired, 4 KB page mapping in the chunk. 
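 * In-use entries are enumerated by inverting the pc_map free bitmap
 * and walking its set bits with bsfl(); wired mappings are left
 * intact and skipped.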
2239 */ 2240 freed = 0; 2241 for (field = 0; field < _NPCM; field++) { 2242 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2243 inuse != 0; inuse &= ~(1UL << bit)) { 2244 bit = bsfl(inuse); 2245 pv = &pc->pc_pventry[field * 32 + bit]; 2246 va = pv->pv_va; 2247 pde = pmap_pde(pmap, va); 2248 if ((*pde & PG_PS) != 0) 2249 continue; 2250 pte = pmap_pte(pmap, va); 2251 tpte = *pte; 2252 if ((tpte & PG_W) == 0) 2253 tpte = pte_load_clear(pte); 2254 pmap_pte_release(pte); 2255 if ((tpte & PG_W) != 0) 2256 continue; 2257 KASSERT(tpte != 0, 2258 ("pmap_pv_reclaim: pmap %p va %x zero pte", 2259 pmap, va)); 2260 if ((tpte & PG_G) != 0) 2261 pmap_invalidate_page(pmap, va); 2262 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2263 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2264 vm_page_dirty(m); 2265 if ((tpte & PG_A) != 0) 2266 vm_page_aflag_set(m, PGA_REFERENCED); 2267 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2268 if (TAILQ_EMPTY(&m->md.pv_list) && 2269 (m->flags & PG_FICTITIOUS) == 0) { 2270 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2271 if (TAILQ_EMPTY(&pvh->pv_list)) { 2272 vm_page_aflag_clear(m, 2273 PGA_WRITEABLE); 2274 } 2275 } 2276 pc->pc_map[field] |= 1UL << bit; 2277 pmap_unuse_pt(pmap, va, &free); 2278 freed++; 2279 } 2280 } 2281 if (freed == 0) { 2282 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2283 continue; 2284 } 2285 /* Every freed mapping is for a 4 KB page. */ 2286 pmap->pm_stats.resident_count -= freed; 2287 PV_STAT(pv_entry_frees += freed); 2288 PV_STAT(pv_entry_spare += freed); 2289 pv_entry_count -= freed; 2290 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2291 for (field = 0; field < _NPCM; field++) 2292 if (pc->pc_map[field] != pc_freemask[field]) { 2293 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2294 pc_list); 2295 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2296 2297 /* 2298 * One freed pv entry in locked_pmap is 2299 * sufficient. 2300 */ 2301 if (pmap == locked_pmap) 2302 goto out; 2303 break; 2304 } 2305 if (field == _NPCM) { 2306 PV_STAT(pv_entry_spare -= _NPCPV); 2307 PV_STAT(pc_chunk_count--); 2308 PV_STAT(pc_chunk_frees++); 2309 /* Entire chunk is free; return it. */ 2310 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2311 pmap_qremove((vm_offset_t)pc, 1); 2312 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2313 break; 2314 } 2315 } 2316out: 2317 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2318 if (pmap != NULL) { 2319 pmap_invalidate_all(pmap); 2320 if (pmap != locked_pmap) 2321 PMAP_UNLOCK(pmap); 2322 } 2323 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2324 m_pc = SLIST_FIRST(&free); 2325 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2326 /* Recycle a freed page table page. */ 2327 m_pc->wire_count = 1; 2328 atomic_add_int(&cnt.v_wire_count, 1); 2329 } 2330 pmap_free_zero_pages(&free); 2331 return (m_pc); 2332} 2333 2334/* 2335 * free the pv_entry back to the free list 2336 */ 2337static void 2338free_pv_entry(pmap_t pmap, pv_entry_t pv) 2339{ 2340 struct pv_chunk *pc; 2341 int idx, field, bit; 2342 2343 rw_assert(&pvh_global_lock, RA_WLOCKED); 2344 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2345 PV_STAT(pv_entry_frees++); 2346 PV_STAT(pv_entry_spare++); 2347 pv_entry_count--; 2348 pc = pv_to_chunk(pv); 2349 idx = pv - &pc->pc_pventry[0]; 2350 field = idx / 32; 2351 bit = idx % 32; 2352 pc->pc_map[field] |= 1ul << bit; 2353 for (idx = 0; idx < _NPCM; idx++) 2354 if (pc->pc_map[idx] != pc_freemask[idx]) { 2355 /* 2356 * 98% of the time, pc is already at the head of the 2357 * list. If it isn't already, move it to the head. 
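 * Keeping chunks that still contain free entries at the head lets
 * get_pv_entry() satisfy the next allocation from the first chunk
 * on the list.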
2358 */ 2359 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2360 pc)) { 2361 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2362 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2363 pc_list); 2364 } 2365 return; 2366 } 2367 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2368 free_pv_chunk(pc); 2369} 2370 2371static void 2372free_pv_chunk(struct pv_chunk *pc) 2373{ 2374 vm_page_t m; 2375 2376 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2377 PV_STAT(pv_entry_spare -= _NPCPV); 2378 PV_STAT(pc_chunk_count--); 2379 PV_STAT(pc_chunk_frees++); 2380 /* entire chunk is free, return it */ 2381 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2382 pmap_qremove((vm_offset_t)pc, 1); 2383 vm_page_unwire(m, 0); 2384 vm_page_free(m); 2385 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2386} 2387 2388/* 2389 * get a new pv_entry, allocating a block from the system 2390 * when needed. 2391 */ 2392static pv_entry_t 2393get_pv_entry(pmap_t pmap, boolean_t try) 2394{ 2395 static const struct timeval printinterval = { 60, 0 }; 2396 static struct timeval lastprint; 2397 int bit, field; 2398 pv_entry_t pv; 2399 struct pv_chunk *pc; 2400 vm_page_t m; 2401 2402 rw_assert(&pvh_global_lock, RA_WLOCKED); 2403 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2404 PV_STAT(pv_entry_allocs++); 2405 pv_entry_count++; 2406 if (pv_entry_count > pv_entry_high_water) 2407 if (ratecheck(&lastprint, &printinterval)) 2408 printf("Approaching the limit on PV entries, consider " 2409 "increasing either the vm.pmap.shpgperproc or the " 2410 "vm.pmap.pv_entry_max tunable.\n"); 2411retry: 2412 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2413 if (pc != NULL) { 2414 for (field = 0; field < _NPCM; field++) { 2415 if (pc->pc_map[field]) { 2416 bit = bsfl(pc->pc_map[field]); 2417 break; 2418 } 2419 } 2420 if (field < _NPCM) { 2421 pv = &pc->pc_pventry[field * 32 + bit]; 2422 pc->pc_map[field] &= ~(1ul << bit); 2423 /* If this was the last item, move it to tail */ 2424 for (field = 0; field < _NPCM; field++) 2425 if (pc->pc_map[field] != 0) { 2426 PV_STAT(pv_entry_spare--); 2427 return (pv); /* not full, return */ 2428 } 2429 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2430 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2431 PV_STAT(pv_entry_spare--); 2432 return (pv); 2433 } 2434 } 2435 /* 2436 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2437 * global lock. If "pv_vafree" is currently non-empty, it will 2438 * remain non-empty until pmap_ptelist_alloc() completes. 
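 * Hence a single check of "pv_vafree" before the page allocation
 * below suffices; the list cannot be emptied by another thread while
 * this function holds the lock.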
2439 */ 2440 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2441 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2442 if (try) { 2443 pv_entry_count--; 2444 PV_STAT(pc_chunk_tryfail++); 2445 return (NULL); 2446 } 2447 m = pmap_pv_reclaim(pmap); 2448 if (m == NULL) 2449 goto retry; 2450 } 2451 PV_STAT(pc_chunk_count++); 2452 PV_STAT(pc_chunk_allocs++); 2453 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2454 pmap_qenter((vm_offset_t)pc, &m, 1); 2455 pc->pc_pmap = pmap; 2456 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2457 for (field = 1; field < _NPCM; field++) 2458 pc->pc_map[field] = pc_freemask[field]; 2459 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2460 pv = &pc->pc_pventry[0]; 2461 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2462 PV_STAT(pv_entry_spare += _NPCPV - 1); 2463 return (pv); 2464} 2465 2466static __inline pv_entry_t 2467pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2468{ 2469 pv_entry_t pv; 2470 2471 rw_assert(&pvh_global_lock, RA_WLOCKED); 2472 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2473 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2474 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2475 break; 2476 } 2477 } 2478 return (pv); 2479} 2480 2481static void 2482pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2483{ 2484 struct md_page *pvh; 2485 pv_entry_t pv; 2486 vm_offset_t va_last; 2487 vm_page_t m; 2488 2489 rw_assert(&pvh_global_lock, RA_WLOCKED); 2490 KASSERT((pa & PDRMASK) == 0, 2491 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2492 2493 /* 2494 * Transfer the 4mpage's pv entry for this mapping to the first 2495 * page's pv list. 2496 */ 2497 pvh = pa_to_pvh(pa); 2498 va = trunc_4mpage(va); 2499 pv = pmap_pvh_remove(pvh, pmap, va); 2500 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2501 m = PHYS_TO_VM_PAGE(pa); 2502 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2503 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2504 va_last = va + NBPDR - PAGE_SIZE; 2505 do { 2506 m++; 2507 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2508 ("pmap_pv_demote_pde: page %p is not managed", m)); 2509 va += PAGE_SIZE; 2510 pmap_insert_entry(pmap, va, m); 2511 } while (va < va_last); 2512} 2513 2514static void 2515pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2516{ 2517 struct md_page *pvh; 2518 pv_entry_t pv; 2519 vm_offset_t va_last; 2520 vm_page_t m; 2521 2522 rw_assert(&pvh_global_lock, RA_WLOCKED); 2523 KASSERT((pa & PDRMASK) == 0, 2524 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2525 2526 /* 2527 * Transfer the first page's pv entry for this mapping to the 2528 * 4mpage's pv list. Aside from avoiding the cost of a call 2529 * to get_pv_entry(), a transfer avoids the possibility that 2530 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2531 * removes one of the mappings that is being promoted. 2532 */ 2533 m = PHYS_TO_VM_PAGE(pa); 2534 va = trunc_4mpage(va); 2535 pv = pmap_pvh_remove(&m->md, pmap, va); 2536 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2537 pvh = pa_to_pvh(pa); 2538 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2539 /* Free the remaining NPTEPG - 1 pv entries. 
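 * (one for each remaining 4 KB page within the superpage; the first
 * page's entry was transferred above)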
*/ 2540 va_last = va + NBPDR - PAGE_SIZE; 2541 do { 2542 m++; 2543 va += PAGE_SIZE; 2544 pmap_pvh_free(&m->md, pmap, va); 2545 } while (va < va_last); 2546} 2547 2548static void 2549pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2550{ 2551 pv_entry_t pv; 2552 2553 pv = pmap_pvh_remove(pvh, pmap, va); 2554 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2555 free_pv_entry(pmap, pv); 2556} 2557 2558static void 2559pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2560{ 2561 struct md_page *pvh; 2562 2563 rw_assert(&pvh_global_lock, RA_WLOCKED); 2564 pmap_pvh_free(&m->md, pmap, va); 2565 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2566 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2567 if (TAILQ_EMPTY(&pvh->pv_list)) 2568 vm_page_aflag_clear(m, PGA_WRITEABLE); 2569 } 2570} 2571 2572/* 2573 * Create a pv entry for page at pa for 2574 * (pmap, va). 2575 */ 2576static void 2577pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2578{ 2579 pv_entry_t pv; 2580 2581 rw_assert(&pvh_global_lock, RA_WLOCKED); 2582 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2583 pv = get_pv_entry(pmap, FALSE); 2584 pv->pv_va = va; 2585 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2586} 2587 2588/* 2589 * Conditionally create a pv entry. 2590 */ 2591static boolean_t 2592pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2593{ 2594 pv_entry_t pv; 2595 2596 rw_assert(&pvh_global_lock, RA_WLOCKED); 2597 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2598 if (pv_entry_count < pv_entry_high_water && 2599 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2600 pv->pv_va = va; 2601 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2602 return (TRUE); 2603 } else 2604 return (FALSE); 2605} 2606 2607/* 2608 * Create the pv entries for each of the pages within a superpage. 2609 */ 2610static boolean_t 2611pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2612{ 2613 struct md_page *pvh; 2614 pv_entry_t pv; 2615 2616 rw_assert(&pvh_global_lock, RA_WLOCKED); 2617 if (pv_entry_count < pv_entry_high_water && 2618 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2619 pv->pv_va = va; 2620 pvh = pa_to_pvh(pa); 2621 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2622 return (TRUE); 2623 } else 2624 return (FALSE); 2625} 2626 2627/* 2628 * Fills a page table page with mappings to consecutive physical pages. 2629 */ 2630static void 2631pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2632{ 2633 pt_entry_t *pte; 2634 2635 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2636 *pte = newpte; 2637 newpte += PAGE_SIZE; 2638 } 2639} 2640 2641/* 2642 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2643 * 2- or 4MB page mapping is invalidated. 
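 * Either way, the superpage mapping is gone on return: on success it
 * has been replaced by 4 KB mappings, and on failure it has been
 * removed outright.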
2644 */ 2645static boolean_t 2646pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2647{ 2648 pd_entry_t newpde, oldpde; 2649 pt_entry_t *firstpte, newpte; 2650 vm_paddr_t mptepa; 2651 vm_page_t mpte; 2652 struct spglist free; 2653 2654 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2655 oldpde = *pde; 2656 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2657 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2658 if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != 2659 NULL) 2660 pmap_remove_pt_page(pmap, mpte); 2661 else { 2662 KASSERT((oldpde & PG_W) == 0, 2663 ("pmap_demote_pde: page table page for a wired mapping" 2664 " is missing")); 2665 2666 /* 2667 * Invalidate the 2- or 4MB page mapping and return 2668 * "failure" if the mapping was never accessed or the 2669 * allocation of the new page table page fails. 2670 */ 2671 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2672 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2673 VM_ALLOC_WIRED)) == NULL) { 2674 SLIST_INIT(&free); 2675 pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); 2676 pmap_invalidate_page(pmap, trunc_4mpage(va)); 2677 pmap_free_zero_pages(&free); 2678 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2679 " in pmap %p", va, pmap); 2680 return (FALSE); 2681 } 2682 if (va < VM_MAXUSER_ADDRESS) 2683 pmap->pm_stats.resident_count++; 2684 } 2685 mptepa = VM_PAGE_TO_PHYS(mpte); 2686 2687 /* 2688 * If the page mapping is in the kernel's address space, then the 2689 * KPTmap can provide access to the page table page. Otherwise, 2690 * temporarily map the page table page (mpte) into the kernel's 2691 * address space at either PADDR1 or PADDR2. 2692 */ 2693 if (va >= KERNBASE) 2694 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2695 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2696 if ((*PMAP1 & PG_FRAME) != mptepa) { 2697 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2698#ifdef SMP 2699 PMAP1cpu = PCPU_GET(cpuid); 2700#endif 2701 invlcaddr(PADDR1); 2702 PMAP1changed++; 2703 } else 2704#ifdef SMP 2705 if (PMAP1cpu != PCPU_GET(cpuid)) { 2706 PMAP1cpu = PCPU_GET(cpuid); 2707 invlcaddr(PADDR1); 2708 PMAP1changedcpu++; 2709 } else 2710#endif 2711 PMAP1unchanged++; 2712 firstpte = PADDR1; 2713 } else { 2714 mtx_lock(&PMAP2mutex); 2715 if ((*PMAP2 & PG_FRAME) != mptepa) { 2716 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2717 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 2718 } 2719 firstpte = PADDR2; 2720 } 2721 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2722 KASSERT((oldpde & PG_A) != 0, 2723 ("pmap_demote_pde: oldpde is missing PG_A")); 2724 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2725 ("pmap_demote_pde: oldpde is missing PG_M")); 2726 newpte = oldpde & ~PG_PS; 2727 if ((newpte & PG_PDE_PAT) != 0) 2728 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2729 2730 /* 2731 * If the page table page is new, initialize it. 2732 */ 2733 if (mpte->wire_count == 1) { 2734 mpte->wire_count = NPTEPG; 2735 pmap_fill_ptp(firstpte, newpte); 2736 } 2737 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2738 ("pmap_demote_pde: firstpte and newpte map different physical" 2739 " addresses")); 2740 2741 /* 2742 * If the mapping has changed attributes, update the page table 2743 * entries. 2744 */ 2745 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2746 pmap_fill_ptp(firstpte, newpte); 2747 2748 /* 2749 * Demote the mapping. This pmap is locked. The old PDE has 2750 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2751 * set. 
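 * (Both invariants are asserted above.)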
Thus, there is no danger of a race with another 2752 * processor changing the setting of PG_A and/or PG_M between 2753 * the read above and the store below. 2754 */ 2755 if (workaround_erratum383) 2756 pmap_update_pde(pmap, va, pde, newpde); 2757 else if (pmap == kernel_pmap) 2758 pmap_kenter_pde(va, newpde); 2759 else 2760 pde_store(pde, newpde); 2761 if (firstpte == PADDR2) 2762 mtx_unlock(&PMAP2mutex); 2763 2764 /* 2765 * Invalidate the recursive mapping of the page table page. 2766 */ 2767 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2768 2769 /* 2770 * Demote the pv entry. This depends on the earlier demotion 2771 * of the mapping. Specifically, the (re)creation of a per- 2772 * page pv entry might trigger the execution of pmap_collect(), 2773 * which might reclaim a newly (re)created per-page pv entry 2774 * and destroy the associated mapping. In order to destroy 2775 * the mapping, the PDE must have already changed from mapping 2776 * the 2mpage to referencing the page table page. 2777 */ 2778 if ((oldpde & PG_MANAGED) != 0) 2779 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2780 2781 pmap_pde_demotions++; 2782 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2783 " in pmap %p", va, pmap); 2784 return (TRUE); 2785} 2786 2787/* 2788 * Removes a 2- or 4MB page mapping from the kernel pmap. 2789 */ 2790static void 2791pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2792{ 2793 pd_entry_t newpde; 2794 vm_paddr_t mptepa; 2795 vm_page_t mpte; 2796 2797 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2798 mpte = pmap_lookup_pt_page(pmap, va); 2799 if (mpte == NULL) 2800 panic("pmap_remove_kernel_pde: Missing pt page."); 2801 2802 pmap_remove_pt_page(pmap, mpte); 2803 mptepa = VM_PAGE_TO_PHYS(mpte); 2804 newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; 2805 2806 /* 2807 * Initialize the page table page. 2808 */ 2809 pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); 2810 2811 /* 2812 * Remove the mapping. 2813 */ 2814 if (workaround_erratum383) 2815 pmap_update_pde(pmap, va, pde, newpde); 2816 else 2817 pmap_kenter_pde(va, newpde); 2818 2819 /* 2820 * Invalidate the recursive mapping of the page table page. 2821 */ 2822 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2823} 2824 2825/* 2826 * pmap_remove_pde: do the things to unmap a superpage in a process 2827 */ 2828static void 2829pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2830 struct spglist *free) 2831{ 2832 struct md_page *pvh; 2833 pd_entry_t oldpde; 2834 vm_offset_t eva, va; 2835 vm_page_t m, mpte; 2836 2837 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2838 KASSERT((sva & PDRMASK) == 0, 2839 ("pmap_remove_pde: sva is not 4mpage aligned")); 2840 oldpde = pte_load_clear(pdq); 2841 if (oldpde & PG_W) 2842 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2843 2844 /* 2845 * Machines that don't support invlpg, also don't support 2846 * PG_G. 
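 * A global mapping is not flushed by a %cr3 reload, so if PG_G is
 * set here, the TLB entry must be invalidated explicitly.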
2847 */ 2848 if (oldpde & PG_G) 2849 pmap_invalidate_page(kernel_pmap, sva); 2850 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2851 if (oldpde & PG_MANAGED) { 2852 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2853 pmap_pvh_free(pvh, pmap, sva); 2854 eva = sva + NBPDR; 2855 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2856 va < eva; va += PAGE_SIZE, m++) { 2857 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2858 vm_page_dirty(m); 2859 if (oldpde & PG_A) 2860 vm_page_aflag_set(m, PGA_REFERENCED); 2861 if (TAILQ_EMPTY(&m->md.pv_list) && 2862 TAILQ_EMPTY(&pvh->pv_list)) 2863 vm_page_aflag_clear(m, PGA_WRITEABLE); 2864 } 2865 } 2866 if (pmap == kernel_pmap) { 2867 pmap_remove_kernel_pde(pmap, pdq, sva); 2868 } else { 2869 mpte = pmap_lookup_pt_page(pmap, sva); 2870 if (mpte != NULL) { 2871 pmap_remove_pt_page(pmap, mpte); 2872 pmap->pm_stats.resident_count--; 2873 KASSERT(mpte->wire_count == NPTEPG, 2874 ("pmap_remove_pde: pte page wire count error")); 2875 mpte->wire_count = 0; 2876 pmap_add_delayed_free_list(mpte, free, FALSE); 2877 atomic_subtract_int(&cnt.v_wire_count, 1); 2878 } 2879 } 2880} 2881 2882/* 2883 * pmap_remove_pte: do the things to unmap a page in a process 2884 */ 2885static int 2886pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 2887 struct spglist *free) 2888{ 2889 pt_entry_t oldpte; 2890 vm_page_t m; 2891 2892 rw_assert(&pvh_global_lock, RA_WLOCKED); 2893 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2894 oldpte = pte_load_clear(ptq); 2895 KASSERT(oldpte != 0, 2896 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); 2897 if (oldpte & PG_W) 2898 pmap->pm_stats.wired_count -= 1; 2899 /* 2900 * Machines that don't support invlpg also don't support 2901 * PG_G. 2902 */ 2903 if (oldpte & PG_G) 2904 pmap_invalidate_page(kernel_pmap, va); 2905 pmap->pm_stats.resident_count -= 1; 2906 if (oldpte & PG_MANAGED) { 2907 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2908 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2909 vm_page_dirty(m); 2910 if (oldpte & PG_A) 2911 vm_page_aflag_set(m, PGA_REFERENCED); 2912 pmap_remove_entry(pmap, m, va); 2913 } 2914 return (pmap_unuse_pt(pmap, va, free)); 2915} 2916 2917/* 2918 * Remove a single page from a process address space. 2919 */ 2920static void 2921pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 2922{ 2923 pt_entry_t *pte; 2924 2925 rw_assert(&pvh_global_lock, RA_WLOCKED); 2926 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2927 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2928 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 2929 return; 2930 pmap_remove_pte(pmap, pte, va, free); 2931 pmap_invalidate_page(pmap, va); 2932} 2933 2934/* 2935 * Remove the given range of addresses from the specified map. 2936 * 2937 * It is assumed that the start and end are properly 2938 * rounded to the page size. 2939 */ 2940void 2941pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2942{ 2943 vm_offset_t pdnxt; 2944 pd_entry_t ptpaddr; 2945 pt_entry_t *pte; 2946 struct spglist free; 2947 int anyvalid; 2948 2949 /* 2950 * Perform an unsynchronized read. This is, however, safe. 2951 */ 2952 if (pmap->pm_stats.resident_count == 0) 2953 return; 2954 2955 anyvalid = 0; 2956 SLIST_INIT(&free); 2957 2958 rw_wlock(&pvh_global_lock); 2959 sched_pin(); 2960 PMAP_LOCK(pmap); 2961 2962 /* 2963 * Special handling for removing a single page: this is a very 2964 * common operation, and it is easy to short-circuit some 2965 * code.
2966 */ 2967 if ((sva + PAGE_SIZE == eva) && 2968 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2969 pmap_remove_page(pmap, sva, &free); 2970 goto out; 2971 } 2972 2973 for (; sva < eva; sva = pdnxt) { 2974 u_int pdirindex; 2975 2976 /* 2977 * Calculate index for next page table. 2978 */ 2979 pdnxt = (sva + NBPDR) & ~PDRMASK; 2980 if (pdnxt < sva) 2981 pdnxt = eva; 2982 if (pmap->pm_stats.resident_count == 0) 2983 break; 2984 2985 pdirindex = sva >> PDRSHIFT; 2986 ptpaddr = pmap->pm_pdir[pdirindex]; 2987 2988 /* 2989 * Weed out invalid mappings. Note: we assume that the page 2990 * directory table is always allocated, and in kernel virtual. 2991 */ 2992 if (ptpaddr == 0) 2993 continue; 2994 2995 /* 2996 * Check for large page. 2997 */ 2998 if ((ptpaddr & PG_PS) != 0) { 2999 /* 3000 * Are we removing the entire large page? If not, 3001 * demote the mapping and fall through. 3002 */ 3003 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3004 /* 3005 * The TLB entry for a PG_G mapping is 3006 * invalidated by pmap_remove_pde(). 3007 */ 3008 if ((ptpaddr & PG_G) == 0) 3009 anyvalid = 1; 3010 pmap_remove_pde(pmap, 3011 &pmap->pm_pdir[pdirindex], sva, &free); 3012 continue; 3013 } else if (!pmap_demote_pde(pmap, 3014 &pmap->pm_pdir[pdirindex], sva)) { 3015 /* The large page mapping was destroyed. */ 3016 continue; 3017 } 3018 } 3019 3020 /* 3021 * Limit our scan to either the end of the va represented 3022 * by the current page table page, or to the end of the 3023 * range being removed. 3024 */ 3025 if (pdnxt > eva) 3026 pdnxt = eva; 3027 3028 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3029 sva += PAGE_SIZE) { 3030 if (*pte == 0) 3031 continue; 3032 3033 /* 3034 * The TLB entry for a PG_G mapping is invalidated 3035 * by pmap_remove_pte(). 3036 */ 3037 if ((*pte & PG_G) == 0) 3038 anyvalid = 1; 3039 if (pmap_remove_pte(pmap, pte, sva, &free)) 3040 break; 3041 } 3042 } 3043out: 3044 sched_unpin(); 3045 if (anyvalid) 3046 pmap_invalidate_all(pmap); 3047 rw_wunlock(&pvh_global_lock); 3048 PMAP_UNLOCK(pmap); 3049 pmap_free_zero_pages(&free); 3050} 3051 3052/* 3053 * Routine: pmap_remove_all 3054 * Function: 3055 * Removes this physical page from 3056 * all physical maps in which it resides. 3057 * Reflects back modify bits to the pager. 3058 * 3059 * Notes: 3060 * Original versions of this routine were very 3061 * inefficient because they iteratively called 3062 * pmap_remove (slow...) 
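 * The current version instead walks the page's pv lists directly,
 * first demoting any superpage mappings so that only 4 KB mappings
 * remain to be removed.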
3063 */ 3064 3065void 3066pmap_remove_all(vm_page_t m) 3067{ 3068 struct md_page *pvh; 3069 pv_entry_t pv; 3070 pmap_t pmap; 3071 pt_entry_t *pte, tpte; 3072 pd_entry_t *pde; 3073 vm_offset_t va; 3074 struct spglist free; 3075 3076 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3077 ("pmap_remove_all: page %p is not managed", m)); 3078 SLIST_INIT(&free); 3079 rw_wlock(&pvh_global_lock); 3080 sched_pin(); 3081 if ((m->flags & PG_FICTITIOUS) != 0) 3082 goto small_mappings; 3083 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3084 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3085 va = pv->pv_va; 3086 pmap = PV_PMAP(pv); 3087 PMAP_LOCK(pmap); 3088 pde = pmap_pde(pmap, va); 3089 (void)pmap_demote_pde(pmap, pde, va); 3090 PMAP_UNLOCK(pmap); 3091 } 3092small_mappings: 3093 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3094 pmap = PV_PMAP(pv); 3095 PMAP_LOCK(pmap); 3096 pmap->pm_stats.resident_count--; 3097 pde = pmap_pde(pmap, pv->pv_va); 3098 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3099 " a 4mpage in page %p's pv list", m)); 3100 pte = pmap_pte_quick(pmap, pv->pv_va); 3101 tpte = pte_load_clear(pte); 3102 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3103 pmap, pv->pv_va)); 3104 if (tpte & PG_W) 3105 pmap->pm_stats.wired_count--; 3106 if (tpte & PG_A) 3107 vm_page_aflag_set(m, PGA_REFERENCED); 3108 3109 /* 3110 * Update the vm_page_t clean and reference bits. 3111 */ 3112 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3113 vm_page_dirty(m); 3114 pmap_unuse_pt(pmap, pv->pv_va, &free); 3115 pmap_invalidate_page(pmap, pv->pv_va); 3116 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3117 free_pv_entry(pmap, pv); 3118 PMAP_UNLOCK(pmap); 3119 } 3120 vm_page_aflag_clear(m, PGA_WRITEABLE); 3121 sched_unpin(); 3122 rw_wunlock(&pvh_global_lock); 3123 pmap_free_zero_pages(&free); 3124} 3125 3126/* 3127 * pmap_protect_pde: do the things to protect a 4mpage in a process 3128 */ 3129static boolean_t 3130pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3131{ 3132 pd_entry_t newpde, oldpde; 3133 vm_offset_t eva, va; 3134 vm_page_t m; 3135 boolean_t anychanged; 3136 3137 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3138 KASSERT((sva & PDRMASK) == 0, 3139 ("pmap_protect_pde: sva is not 4mpage aligned")); 3140 anychanged = FALSE; 3141retry: 3142 oldpde = newpde = *pde; 3143 if (oldpde & PG_MANAGED) { 3144 eva = sva + NBPDR; 3145 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3146 va < eva; va += PAGE_SIZE, m++) 3147 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3148 vm_page_dirty(m); 3149 } 3150 if ((prot & VM_PROT_WRITE) == 0) 3151 newpde &= ~(PG_RW | PG_M); 3152#ifdef PAE 3153 if ((prot & VM_PROT_EXECUTE) == 0) 3154 newpde |= pg_nx; 3155#endif 3156 if (newpde != oldpde) { 3157 if (!pde_cmpset(pde, oldpde, newpde)) 3158 goto retry; 3159 if (oldpde & PG_G) 3160 pmap_invalidate_page(pmap, sva); 3161 else 3162 anychanged = TRUE; 3163 } 3164 return (anychanged); 3165} 3166 3167/* 3168 * Set the physical protection on the 3169 * specified range of this map as requested. 
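 * VM_PROT_NONE is handled by removing the range outright; otherwise
 * write permission is revoked by clearing PG_RW (and PG_M), and,
 * under PAE, execute permission by setting the no-execute bit.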
3170 */ 3171void 3172pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3173{ 3174 vm_offset_t pdnxt; 3175 pd_entry_t ptpaddr; 3176 pt_entry_t *pte; 3177 boolean_t anychanged, pv_lists_locked; 3178 3179 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3180 if (prot == VM_PROT_NONE) { 3181 pmap_remove(pmap, sva, eva); 3182 return; 3183 } 3184 3185#ifdef PAE 3186 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3187 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3188 return; 3189#else 3190 if (prot & VM_PROT_WRITE) 3191 return; 3192#endif 3193 3194 if (pmap_is_current(pmap)) 3195 pv_lists_locked = FALSE; 3196 else { 3197 pv_lists_locked = TRUE; 3198resume: 3199 rw_wlock(&pvh_global_lock); 3200 sched_pin(); 3201 } 3202 anychanged = FALSE; 3203 3204 PMAP_LOCK(pmap); 3205 for (; sva < eva; sva = pdnxt) { 3206 pt_entry_t obits, pbits; 3207 u_int pdirindex; 3208 3209 pdnxt = (sva + NBPDR) & ~PDRMASK; 3210 if (pdnxt < sva) 3211 pdnxt = eva; 3212 3213 pdirindex = sva >> PDRSHIFT; 3214 ptpaddr = pmap->pm_pdir[pdirindex]; 3215 3216 /* 3217 * Weed out invalid mappings. Note: we assume that the page 3218 * directory table is always allocated, and in kernel virtual. 3219 */ 3220 if (ptpaddr == 0) 3221 continue; 3222 3223 /* 3224 * Check for large page. 3225 */ 3226 if ((ptpaddr & PG_PS) != 0) { 3227 /* 3228 * Are we protecting the entire large page? If not, 3229 * demote the mapping and fall through. 3230 */ 3231 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3232 /* 3233 * The TLB entry for a PG_G mapping is 3234 * invalidated by pmap_protect_pde(). 3235 */ 3236 if (pmap_protect_pde(pmap, 3237 &pmap->pm_pdir[pdirindex], sva, prot)) 3238 anychanged = TRUE; 3239 continue; 3240 } else { 3241 if (!pv_lists_locked) { 3242 pv_lists_locked = TRUE; 3243 if (!rw_try_wlock(&pvh_global_lock)) { 3244 if (anychanged) 3245 pmap_invalidate_all( 3246 pmap); 3247 PMAP_UNLOCK(pmap); 3248 goto resume; 3249 } 3250 sched_pin(); 3251 } 3252 if (!pmap_demote_pde(pmap, 3253 &pmap->pm_pdir[pdirindex], sva)) { 3254 /* 3255 * The large page mapping was 3256 * destroyed. 3257 */ 3258 continue; 3259 } 3260 } 3261 } 3262 3263 if (pdnxt > eva) 3264 pdnxt = eva; 3265 3266 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3267 sva += PAGE_SIZE) { 3268 vm_page_t m; 3269 3270retry: 3271 /* 3272 * Regardless of whether a pte is 32 or 64 bits in 3273 * size, PG_RW, PG_A, and PG_M are among the least 3274 * significant 32 bits. 3275 */ 3276 obits = pbits = *pte; 3277 if ((pbits & PG_V) == 0) 3278 continue; 3279 3280 if ((prot & VM_PROT_WRITE) == 0) { 3281 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3282 (PG_MANAGED | PG_M | PG_RW)) { 3283 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3284 vm_page_dirty(m); 3285 } 3286 pbits &= ~(PG_RW | PG_M); 3287 } 3288#ifdef PAE 3289 if ((prot & VM_PROT_EXECUTE) == 0) 3290 pbits |= pg_nx; 3291#endif 3292 3293 if (pbits != obits) { 3294#ifdef PAE 3295 if (!atomic_cmpset_64(pte, obits, pbits)) 3296 goto retry; 3297#else 3298 if (!atomic_cmpset_int((u_int *)pte, obits, 3299 pbits)) 3300 goto retry; 3301#endif 3302 if (obits & PG_G) 3303 pmap_invalidate_page(pmap, sva); 3304 else 3305 anychanged = TRUE; 3306 } 3307 } 3308 } 3309 if (anychanged) 3310 pmap_invalidate_all(pmap); 3311 if (pv_lists_locked) { 3312 sched_unpin(); 3313 rw_wunlock(&pvh_global_lock); 3314 } 3315 PMAP_UNLOCK(pmap); 3316} 3317 3318/* 3319 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3320 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 
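 * (A PTP holds 512 PTEs under PAE and 1024 otherwise.)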
3321 * For promotion to occur, two conditions must be met: (1) the 4KB page 3322 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3323 * mappings must have identical characteristics. 3324 * 3325 * Managed (PG_MANAGED) mappings within the kernel address space are not 3326 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3327 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3328 * pmap. 3329 */ 3330static void 3331pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3332{ 3333 pd_entry_t newpde; 3334 pt_entry_t *firstpte, oldpte, pa, *pte; 3335 vm_offset_t oldpteva; 3336 vm_page_t mpte; 3337 3338 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3339 3340 /* 3341 * Examine the first PTE in the specified PTP. Abort if this PTE is 3342 * either invalid, unused, or does not map the first 4KB physical page 3343 * within a 2- or 4MB page. 3344 */ 3345 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3346setpde: 3347 newpde = *firstpte; 3348 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3349 pmap_pde_p_failures++; 3350 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3351 " in pmap %p", va, pmap); 3352 return; 3353 } 3354 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3355 pmap_pde_p_failures++; 3356 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3357 " in pmap %p", va, pmap); 3358 return; 3359 } 3360 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3361 /* 3362 * When PG_M is already clear, PG_RW can be cleared without 3363 * a TLB invalidation. 3364 */ 3365 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3366 ~PG_RW)) 3367 goto setpde; 3368 newpde &= ~PG_RW; 3369 } 3370 3371 /* 3372 * Examine each of the other PTEs in the specified PTP. Abort if this 3373 * PTE maps an unexpected 4KB physical page or does not have identical 3374 * characteristics to the first PTE. 3375 */ 3376 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3377 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3378setpte: 3379 oldpte = *pte; 3380 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3381 pmap_pde_p_failures++; 3382 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3383 " in pmap %p", va, pmap); 3384 return; 3385 } 3386 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3387 /* 3388 * When PG_M is already clear, PG_RW can be cleared 3389 * without a TLB invalidation. 3390 */ 3391 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3392 oldpte & ~PG_RW)) 3393 goto setpte; 3394 oldpte &= ~PG_RW; 3395 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3396 (va & ~PDRMASK); 3397 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3398 " in pmap %p", oldpteva, pmap); 3399 } 3400 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3401 pmap_pde_p_failures++; 3402 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3403 " in pmap %p", va, pmap); 3404 return; 3405 } 3406 pa -= PAGE_SIZE; 3407 } 3408 3409 /* 3410 * Save the page table page in its current state until the PDE 3411 * mapping the superpage is demoted by pmap_demote_pde() or 3412 * destroyed by pmap_remove_pde(). 
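 * The page is stashed in the pmap's radix tree of page table pages
 * (pm_root) rather than freed, so that its still-valid PTEs can be
 * reused if the superpage is later demoted.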
3413 */ 3414 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3415 KASSERT(mpte >= vm_page_array && 3416 mpte < &vm_page_array[vm_page_array_size], 3417 ("pmap_promote_pde: page table page is out of range")); 3418 KASSERT(mpte->pindex == va >> PDRSHIFT, 3419 ("pmap_promote_pde: page table page's pindex is wrong")); 3420 if (pmap_insert_pt_page(pmap, mpte)) { 3421 pmap_pde_p_failures++; 3422 CTR2(KTR_PMAP, 3423 "pmap_promote_pde: failure for va %#x in pmap %p", va, 3424 pmap); 3425 return; 3426 } 3427 3428 /* 3429 * Promote the pv entries. 3430 */ 3431 if ((newpde & PG_MANAGED) != 0) 3432 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3433 3434 /* 3435 * Propagate the PAT index to its proper position. 3436 */ 3437 if ((newpde & PG_PTE_PAT) != 0) 3438 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3439 3440 /* 3441 * Map the superpage. 3442 */ 3443 if (workaround_erratum383) 3444 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3445 else if (pmap == kernel_pmap) 3446 pmap_kenter_pde(va, PG_PS | newpde); 3447 else 3448 pde_store(pde, PG_PS | newpde); 3449 3450 pmap_pde_promotions++; 3451 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3452 " in pmap %p", va, pmap); 3453} 3454 3455/* 3456 * Insert the given physical page (p) at 3457 * the specified virtual address (v) in the 3458 * target physical map with the protection requested. 3459 * 3460 * If specified, the page will be wired down, meaning 3461 * that the related pte can not be reclaimed. 3462 * 3463 * NB: This is the only routine which MAY NOT lazy-evaluate 3464 * or lose information. That is, this routine must actually 3465 * insert this page into the given map NOW. 3466 */ 3467int 3468pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3469 u_int flags, int8_t psind) 3470{ 3471 pd_entry_t *pde; 3472 pt_entry_t *pte; 3473 pt_entry_t newpte, origpte; 3474 pv_entry_t pv; 3475 vm_paddr_t opa, pa; 3476 vm_page_t mpte, om; 3477 boolean_t invlva, wired; 3478 3479 va = trunc_page(va); 3480 mpte = NULL; 3481 wired = (flags & PMAP_ENTER_WIRED) != 0; 3482 3483 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3484 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3485 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", 3486 va)); 3487 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 3488 VM_OBJECT_ASSERT_LOCKED(m->object); 3489 3490 rw_wlock(&pvh_global_lock); 3491 PMAP_LOCK(pmap); 3492 sched_pin(); 3493 3494 /* 3495 * In the case that a page table page is not 3496 * resident, we are creating it here. 3497 */ 3498 if (va < VM_MAXUSER_ADDRESS) { 3499 mpte = pmap_allocpte(pmap, va, flags); 3500 if (mpte == NULL) { 3501 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3502 ("pmap_allocpte failed with sleep allowed")); 3503 sched_unpin(); 3504 rw_wunlock(&pvh_global_lock); 3505 PMAP_UNLOCK(pmap); 3506 return (KERN_RESOURCE_SHORTAGE); 3507 } 3508 } 3509 3510 pde = pmap_pde(pmap, va); 3511 if ((*pde & PG_PS) != 0) 3512 panic("pmap_enter: attempted pmap_enter on 4MB page"); 3513 pte = pmap_pte_quick(pmap, va); 3514 3515 /* 3516 * Page Directory table entry not valid, we need a new PT page 3517 */ 3518 if (pte == NULL) { 3519 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3520 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3521 } 3522 3523 pa = VM_PAGE_TO_PHYS(m); 3524 om = NULL; 3525 origpte = *pte; 3526 opa = origpte & PG_FRAME; 3527 3528 /* 3529 * Mapping has not changed, must be protection or wiring change. 3530 */ 3531 if (origpte && (opa == pa)) { 3532 /* 3533 * Wiring change, just update stats. 
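 * (PG_W is purely software state, so no TLB invalidation is needed.)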
We don't worry about 3534 * wiring PT pages as they remain resident as long as there 3535 * are valid mappings in them. Hence, if a user page is wired, 3536 * the PT page will be also. 3537 */ 3538 if (wired && ((origpte & PG_W) == 0)) 3539 pmap->pm_stats.wired_count++; 3540 else if (!wired && (origpte & PG_W)) 3541 pmap->pm_stats.wired_count--; 3542 3543 /* 3544 * Remove extra pte reference 3545 */ 3546 if (mpte) 3547 mpte->wire_count--; 3548 3549 if (origpte & PG_MANAGED) { 3550 om = m; 3551 pa |= PG_MANAGED; 3552 } 3553 goto validate; 3554 } 3555 3556 pv = NULL; 3557 3558 /* 3559 * Mapping has changed, invalidate old range and fall through to 3560 * handle validating new mapping. 3561 */ 3562 if (opa) { 3563 if (origpte & PG_W) 3564 pmap->pm_stats.wired_count--; 3565 if (origpte & PG_MANAGED) { 3566 om = PHYS_TO_VM_PAGE(opa); 3567 pv = pmap_pvh_remove(&om->md, pmap, va); 3568 } 3569 if (mpte != NULL) { 3570 mpte->wire_count--; 3571 KASSERT(mpte->wire_count > 0, 3572 ("pmap_enter: missing reference to page table page," 3573 " va: 0x%x", va)); 3574 } 3575 } else 3576 pmap->pm_stats.resident_count++; 3577 3578 /* 3579 * Enter on the PV list if part of our managed memory. 3580 */ 3581 if ((m->oflags & VPO_UNMANAGED) == 0) { 3582 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3583 ("pmap_enter: managed mapping within the clean submap")); 3584 if (pv == NULL) 3585 pv = get_pv_entry(pmap, FALSE); 3586 pv->pv_va = va; 3587 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3588 pa |= PG_MANAGED; 3589 } else if (pv != NULL) 3590 free_pv_entry(pmap, pv); 3591 3592 /* 3593 * Increment counters 3594 */ 3595 if (wired) 3596 pmap->pm_stats.wired_count++; 3597 3598validate: 3599 /* 3600 * Now validate mapping with desired protection/wiring. 3601 */ 3602 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); 3603 if ((prot & VM_PROT_WRITE) != 0) { 3604 newpte |= PG_RW; 3605 if ((newpte & PG_MANAGED) != 0) 3606 vm_page_aflag_set(m, PGA_WRITEABLE); 3607 } 3608#ifdef PAE 3609 if ((prot & VM_PROT_EXECUTE) == 0) 3610 newpte |= pg_nx; 3611#endif 3612 if (wired) 3613 newpte |= PG_W; 3614 if (va < VM_MAXUSER_ADDRESS) 3615 newpte |= PG_U; 3616 if (pmap == kernel_pmap) 3617 newpte |= pgeflag; 3618 3619 /* 3620 * if the mapping or permission bits are different, we need 3621 * to update the pte. 3622 */ 3623 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3624 newpte |= PG_A; 3625 if ((flags & VM_PROT_WRITE) != 0) 3626 newpte |= PG_M; 3627 if (origpte & PG_V) { 3628 invlva = FALSE; 3629 origpte = pte_load_store(pte, newpte); 3630 if (origpte & PG_A) { 3631 if (origpte & PG_MANAGED) 3632 vm_page_aflag_set(om, PGA_REFERENCED); 3633 if (opa != VM_PAGE_TO_PHYS(m)) 3634 invlva = TRUE; 3635#ifdef PAE 3636 if ((origpte & PG_NX) == 0 && 3637 (newpte & PG_NX) != 0) 3638 invlva = TRUE; 3639#endif 3640 } 3641 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3642 if ((origpte & PG_MANAGED) != 0) 3643 vm_page_dirty(om); 3644 if ((prot & VM_PROT_WRITE) == 0) 3645 invlva = TRUE; 3646 } 3647 if ((origpte & PG_MANAGED) != 0 && 3648 TAILQ_EMPTY(&om->md.pv_list) && 3649 ((om->flags & PG_FICTITIOUS) != 0 || 3650 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3651 vm_page_aflag_clear(om, PGA_WRITEABLE); 3652 if (invlva) 3653 pmap_invalidate_page(pmap, va); 3654 } else 3655 pte_store(pte, newpte); 3656 } 3657 3658 /* 3659 * If both the page table page and the reservation are fully 3660 * populated, then attempt promotion. 
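 * (mpte is NULL for kernel addresses; for user addresses, a wire
 * count of NPTEPG means that every PTE in the page table page is
 * valid.)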
3661 */ 3662 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3663 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3664 vm_reserv_level_iffullpop(m) == 0) 3665 pmap_promote_pde(pmap, pde, va); 3666 3667 sched_unpin(); 3668 rw_wunlock(&pvh_global_lock); 3669 PMAP_UNLOCK(pmap); 3670 return (KERN_SUCCESS); 3671} 3672 3673/* 3674 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and 3675 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 3676 * blocking, (2) a mapping already exists at the specified virtual address, or 3677 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 3678 */ 3679static boolean_t 3680pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3681{ 3682 pd_entry_t *pde, newpde; 3683 3684 rw_assert(&pvh_global_lock, RA_WLOCKED); 3685 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3686 pde = pmap_pde(pmap, va); 3687 if (*pde != 0) { 3688 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3689 " in pmap %p", va, pmap); 3690 return (FALSE); 3691 } 3692 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3693 PG_PS | PG_V; 3694 if ((m->oflags & VPO_UNMANAGED) == 0) { 3695 newpde |= PG_MANAGED; 3696 3697 /* 3698 * Abort this mapping if its PV entry could not be created. 3699 */ 3700 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3701 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3702 " in pmap %p", va, pmap); 3703 return (FALSE); 3704 } 3705 } 3706#ifdef PAE 3707 if ((prot & VM_PROT_EXECUTE) == 0) 3708 newpde |= pg_nx; 3709#endif 3710 if (va < VM_MAXUSER_ADDRESS) 3711 newpde |= PG_U; 3712 3713 /* 3714 * Increment counters. 3715 */ 3716 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3717 3718 /* 3719 * Map the superpage. 3720 */ 3721 pde_store(pde, newpde); 3722 3723 pmap_pde_mappings++; 3724 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3725 " in pmap %p", va, pmap); 3726 return (TRUE); 3727} 3728 3729/* 3730 * Maps a sequence of resident pages belonging to the same object. 3731 * The sequence begins with the given page m_start. This page is 3732 * mapped at the given virtual address start. Each subsequent page is 3733 * mapped at a virtual address that is offset from start by the same 3734 * amount as the page is offset from m_start within the object. The 3735 * last page in the sequence is the page with the largest offset from 3736 * m_start that can be mapped at a virtual address less than the given 3737 * virtual address end. Not every virtual page between start and end 3738 * is mapped; only those for which a resident page exists with the 3739 * corresponding offset from m_start are mapped. 
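 * Aligned, fully populated 2/4MB runs are mapped with a single
 * pmap_enter_pde() call when "pg_ps_enabled"; all other pages are
 * entered individually via pmap_enter_quick_locked().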
3740 */ 3741void 3742pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3743 vm_page_t m_start, vm_prot_t prot) 3744{ 3745 vm_offset_t va; 3746 vm_page_t m, mpte; 3747 vm_pindex_t diff, psize; 3748 3749 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3750 3751 psize = atop(end - start); 3752 mpte = NULL; 3753 m = m_start; 3754 rw_wlock(&pvh_global_lock); 3755 PMAP_LOCK(pmap); 3756 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3757 va = start + ptoa(diff); 3758 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3759 m->psind == 1 && pg_ps_enabled && 3760 pmap_enter_pde(pmap, va, m, prot)) 3761 m = &m[NBPDR / PAGE_SIZE - 1]; 3762 else 3763 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3764 mpte); 3765 m = TAILQ_NEXT(m, listq); 3766 } 3767 rw_wunlock(&pvh_global_lock); 3768 PMAP_UNLOCK(pmap); 3769} 3770 3771/* 3772 * this code makes some *MAJOR* assumptions: 3773 * 1. Current pmap & pmap exists. 3774 * 2. Not wired. 3775 * 3. Read access. 3776 * 4. No page table pages. 3777 * but is *MUCH* faster than pmap_enter... 3778 */ 3779 3780void 3781pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3782{ 3783 3784 rw_wlock(&pvh_global_lock); 3785 PMAP_LOCK(pmap); 3786 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3787 rw_wunlock(&pvh_global_lock); 3788 PMAP_UNLOCK(pmap); 3789} 3790 3791static vm_page_t 3792pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3793 vm_prot_t prot, vm_page_t mpte) 3794{ 3795 pt_entry_t *pte; 3796 vm_paddr_t pa; 3797 struct spglist free; 3798 3799 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3800 (m->oflags & VPO_UNMANAGED) != 0, 3801 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3802 rw_assert(&pvh_global_lock, RA_WLOCKED); 3803 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3804 3805 /* 3806 * In the case that a page table page is not 3807 * resident, we are creating it here. 3808 */ 3809 if (va < VM_MAXUSER_ADDRESS) { 3810 u_int ptepindex; 3811 pd_entry_t ptepa; 3812 3813 /* 3814 * Calculate pagetable page index 3815 */ 3816 ptepindex = va >> PDRSHIFT; 3817 if (mpte && (mpte->pindex == ptepindex)) { 3818 mpte->wire_count++; 3819 } else { 3820 /* 3821 * Get the page directory entry 3822 */ 3823 ptepa = pmap->pm_pdir[ptepindex]; 3824 3825 /* 3826 * If the page table page is mapped, we just increment 3827 * the hold count, and activate it. 3828 */ 3829 if (ptepa) { 3830 if (ptepa & PG_PS) 3831 return (NULL); 3832 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 3833 mpte->wire_count++; 3834 } else { 3835 mpte = _pmap_allocpte(pmap, ptepindex, 3836 PMAP_ENTER_NOSLEEP); 3837 if (mpte == NULL) 3838 return (mpte); 3839 } 3840 } 3841 } else { 3842 mpte = NULL; 3843 } 3844 3845 /* 3846 * This call to vtopte makes the assumption that we are 3847 * entering the page into the current pmap. In order to support 3848 * quick entry into any pmap, one would likely use pmap_pte_quick. 3849 * But that isn't as quick as vtopte. 3850 */ 3851 pte = vtopte(va); 3852 if (*pte) { 3853 if (mpte != NULL) { 3854 mpte->wire_count--; 3855 mpte = NULL; 3856 } 3857 return (mpte); 3858 } 3859 3860 /* 3861 * Enter on the PV list if part of our managed memory. 
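 * pmap_try_insert_pv_entry() does not sleep; if no pv entry can be
 * allocated, the mapping is simply not entered and the reference on
 * the page table page is dropped below.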
3862 */ 3863 if ((m->oflags & VPO_UNMANAGED) == 0 && 3864 !pmap_try_insert_pv_entry(pmap, va, m)) { 3865 if (mpte != NULL) { 3866 SLIST_INIT(&free); 3867 if (pmap_unwire_ptp(pmap, mpte, &free)) { 3868 pmap_invalidate_page(pmap, va); 3869 pmap_free_zero_pages(&free); 3870 } 3871 3872 mpte = NULL; 3873 } 3874 return (mpte); 3875 } 3876 3877 /* 3878 * Increment counters 3879 */ 3880 pmap->pm_stats.resident_count++; 3881 3882 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 3883#ifdef PAE 3884 if ((prot & VM_PROT_EXECUTE) == 0) 3885 pa |= pg_nx; 3886#endif 3887 3888 /* 3889 * Now validate mapping with RO protection 3890 */ 3891 if ((m->oflags & VPO_UNMANAGED) != 0) 3892 pte_store(pte, pa | PG_V | PG_U); 3893 else 3894 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3895 return (mpte); 3896} 3897 3898/* 3899 * Make a temporary mapping for a physical address. This is only intended 3900 * to be used for panic dumps. 3901 */ 3902void * 3903pmap_kenter_temporary(vm_paddr_t pa, int i) 3904{ 3905 vm_offset_t va; 3906 3907 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3908 pmap_kenter(va, pa); 3909 invlpg(va); 3910 return ((void *)crashdumpmap); 3911} 3912 3913/* 3914 * This code maps large physical mmap regions into the 3915 * processor address space. Note that some shortcuts 3916 * are taken, but the code works. 3917 */ 3918void 3919pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3920 vm_pindex_t pindex, vm_size_t size) 3921{ 3922 pd_entry_t *pde; 3923 vm_paddr_t pa, ptepa; 3924 vm_page_t p; 3925 int pat_mode; 3926 3927 VM_OBJECT_ASSERT_WLOCKED(object); 3928 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3929 ("pmap_object_init_pt: non-device object")); 3930 if (pseflag && 3931 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3932 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3933 return; 3934 p = vm_page_lookup(object, pindex); 3935 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3936 ("pmap_object_init_pt: invalid page %p", p)); 3937 pat_mode = p->md.pat_mode; 3938 3939 /* 3940 * Abort the mapping if the first page is not physically 3941 * aligned to a 2/4MB page boundary. 3942 */ 3943 ptepa = VM_PAGE_TO_PHYS(p); 3944 if (ptepa & (NBPDR - 1)) 3945 return; 3946 3947 /* 3948 * Skip the first page. Abort the mapping if the rest of 3949 * the pages are not physically contiguous or have differing 3950 * memory attributes. 3951 */ 3952 p = TAILQ_NEXT(p, listq); 3953 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3954 pa += PAGE_SIZE) { 3955 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3956 ("pmap_object_init_pt: invalid page %p", p)); 3957 if (pa != VM_PAGE_TO_PHYS(p) || 3958 pat_mode != p->md.pat_mode) 3959 return; 3960 p = TAILQ_NEXT(p, listq); 3961 } 3962 3963 /* 3964 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 3965 * "size" is a multiple of 2/4M, adding the PAT setting to 3966 * "pa" will not affect the termination of this loop. 3967 */ 3968 PMAP_LOCK(pmap); 3969 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 3970 size; pa += NBPDR) { 3971 pde = pmap_pde(pmap, addr); 3972 if (*pde == 0) { 3973 pde_store(pde, pa | PG_PS | PG_M | PG_A | 3974 PG_U | PG_RW | PG_V); 3975 pmap->pm_stats.resident_count += NBPDR / 3976 PAGE_SIZE; 3977 pmap_pde_mappings++; 3978 } 3979 /* Else continue on if the PDE is already valid. */ 3980 addr += NBPDR; 3981 } 3982 PMAP_UNLOCK(pmap); 3983 } 3984} 3985 3986/* 3987 * Clear the wired attribute from the mappings for the specified range of 3988 * addresses in the given pmap. 
Every valid mapping within that range 3989 * must have the wired attribute set. In contrast, invalid mappings 3990 * cannot have the wired attribute set, so they are ignored. 3991 * 3992 * The wired attribute of the page table entry is not a hardware feature, 3993 * so there is no need to invalidate any TLB entries. 3994 */ 3995void 3996pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3997{ 3998 vm_offset_t pdnxt; 3999 pd_entry_t *pde; 4000 pt_entry_t *pte; 4001 boolean_t pv_lists_locked; 4002 4003 if (pmap_is_current(pmap)) 4004 pv_lists_locked = FALSE; 4005 else { 4006 pv_lists_locked = TRUE; 4007resume: 4008 rw_wlock(&pvh_global_lock); 4009 sched_pin(); 4010 } 4011 PMAP_LOCK(pmap); 4012 for (; sva < eva; sva = pdnxt) { 4013 pdnxt = (sva + NBPDR) & ~PDRMASK; 4014 if (pdnxt < sva) 4015 pdnxt = eva; 4016 pde = pmap_pde(pmap, sva); 4017 if ((*pde & PG_V) == 0) 4018 continue; 4019 if ((*pde & PG_PS) != 0) { 4020 if ((*pde & PG_W) == 0) 4021 panic("pmap_unwire: pde %#jx is missing PG_W", 4022 (uintmax_t)*pde); 4023 4024 /* 4025 * Are we unwiring the entire large page? If not, 4026 * demote the mapping and fall through. 4027 */ 4028 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 4029 /* 4030 * Regardless of whether a pde (or pte) is 32 4031 * or 64 bits in size, PG_W is among the least 4032 * significant 32 bits. 4033 */ 4034 atomic_clear_int((u_int *)pde, PG_W); 4035 pmap->pm_stats.wired_count -= NBPDR / 4036 PAGE_SIZE; 4037 continue; 4038 } else { 4039 if (!pv_lists_locked) { 4040 pv_lists_locked = TRUE; 4041 if (!rw_try_wlock(&pvh_global_lock)) { 4042 PMAP_UNLOCK(pmap); 4043 /* Repeat sva. */ 4044 goto resume; 4045 } 4046 sched_pin(); 4047 } 4048 if (!pmap_demote_pde(pmap, pde, sva)) 4049 panic("pmap_unwire: demotion failed"); 4050 } 4051 } 4052 if (pdnxt > eva) 4053 pdnxt = eva; 4054 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 4055 sva += PAGE_SIZE) { 4056 if ((*pte & PG_V) == 0) 4057 continue; 4058 if ((*pte & PG_W) == 0) 4059 panic("pmap_unwire: pte %#jx is missing PG_W", 4060 (uintmax_t)*pte); 4061 4062 /* 4063 * PG_W must be cleared atomically. Although the pmap 4064 * lock synchronizes access to PG_W, another processor 4065 * could be setting PG_M and/or PG_A concurrently. 4066 * 4067 * PG_W is among the least significant 32 bits. 4068 */ 4069 atomic_clear_int((u_int *)pte, PG_W); 4070 pmap->pm_stats.wired_count--; 4071 } 4072 } 4073 if (pv_lists_locked) { 4074 sched_unpin(); 4075 rw_wunlock(&pvh_global_lock); 4076 } 4077 PMAP_UNLOCK(pmap); 4078} 4079 4080 4081/* 4082 * Copy the range specified by src_addr/len 4083 * from the source map to the range dst_addr/len 4084 * in the destination map. 4085 * 4086 * This routine is only advisory and need not do anything. 
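 *
 * For context, a sketch of the caller side (the vm_map layer; the
 * field names here are illustrative rather than exact): when fork(2)
 * copies a map entry, it may call pmap_copy() purely as an
 * optimization, so a no-op implementation costs only extra soft
 * faults in the child:
 *
 *	pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
 *	    dst_entry->end - dst_entry->start, src_entry->start);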
4087 */ 4088 4089void 4090pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4091 vm_offset_t src_addr) 4092{ 4093 struct spglist free; 4094 vm_offset_t addr; 4095 vm_offset_t end_addr = src_addr + len; 4096 vm_offset_t pdnxt; 4097 4098 if (dst_addr != src_addr) 4099 return; 4100 4101 if (!pmap_is_current(src_pmap)) 4102 return; 4103 4104 rw_wlock(&pvh_global_lock); 4105 if (dst_pmap < src_pmap) { 4106 PMAP_LOCK(dst_pmap); 4107 PMAP_LOCK(src_pmap); 4108 } else { 4109 PMAP_LOCK(src_pmap); 4110 PMAP_LOCK(dst_pmap); 4111 } 4112 sched_pin(); 4113 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4114 pt_entry_t *src_pte, *dst_pte; 4115 vm_page_t dstmpte, srcmpte; 4116 pd_entry_t srcptepaddr; 4117 u_int ptepindex; 4118 4119 KASSERT(addr < UPT_MIN_ADDRESS, 4120 ("pmap_copy: invalid to pmap_copy page tables")); 4121 4122 pdnxt = (addr + NBPDR) & ~PDRMASK; 4123 if (pdnxt < addr) 4124 pdnxt = end_addr; 4125 ptepindex = addr >> PDRSHIFT; 4126 4127 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4128 if (srcptepaddr == 0) 4129 continue; 4130 4131 if (srcptepaddr & PG_PS) { 4132 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4133 continue; 4134 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4135 ((srcptepaddr & PG_MANAGED) == 0 || 4136 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4137 PG_PS_FRAME))) { 4138 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4139 ~PG_W; 4140 dst_pmap->pm_stats.resident_count += 4141 NBPDR / PAGE_SIZE; 4142 } 4143 continue; 4144 } 4145 4146 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4147 KASSERT(srcmpte->wire_count > 0, 4148 ("pmap_copy: source page table page is unused")); 4149 4150 if (pdnxt > end_addr) 4151 pdnxt = end_addr; 4152 4153 src_pte = vtopte(addr); 4154 while (addr < pdnxt) { 4155 pt_entry_t ptetemp; 4156 ptetemp = *src_pte; 4157 /* 4158 * we only virtual copy managed pages 4159 */ 4160 if ((ptetemp & PG_MANAGED) != 0) { 4161 dstmpte = pmap_allocpte(dst_pmap, addr, 4162 PMAP_ENTER_NOSLEEP); 4163 if (dstmpte == NULL) 4164 goto out; 4165 dst_pte = pmap_pte_quick(dst_pmap, addr); 4166 if (*dst_pte == 0 && 4167 pmap_try_insert_pv_entry(dst_pmap, addr, 4168 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4169 /* 4170 * Clear the wired, modified, and 4171 * accessed (referenced) bits 4172 * during the copy. 4173 */ 4174 *dst_pte = ptetemp & ~(PG_W | PG_M | 4175 PG_A); 4176 dst_pmap->pm_stats.resident_count++; 4177 } else { 4178 SLIST_INIT(&free); 4179 if (pmap_unwire_ptp(dst_pmap, dstmpte, 4180 &free)) { 4181 pmap_invalidate_page(dst_pmap, 4182 addr); 4183 pmap_free_zero_pages(&free); 4184 } 4185 goto out; 4186 } 4187 if (dstmpte->wire_count >= srcmpte->wire_count) 4188 break; 4189 } 4190 addr += PAGE_SIZE; 4191 src_pte++; 4192 } 4193 } 4194out: 4195 sched_unpin(); 4196 rw_wunlock(&pvh_global_lock); 4197 PMAP_UNLOCK(src_pmap); 4198 PMAP_UNLOCK(dst_pmap); 4199} 4200 4201static __inline void 4202pagezero(void *page) 4203{ 4204#if defined(I686_CPU) 4205 if (cpu_class == CPUCLASS_686) { 4206#if defined(CPU_ENABLE_SSE) 4207 if (cpu_feature & CPUID_SSE2) 4208 sse2_pagezero(page); 4209 else 4210#endif 4211 i686_pagezero(page); 4212 } else 4213#endif 4214 bzero(page, PAGE_SIZE); 4215} 4216 4217/* 4218 * pmap_zero_page zeros the specified hardware page by mapping 4219 * the page into KVM and using bzero to clear its contents. 
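 *
 * In outline, the per-CPU "CMAP/CADDR" temporary-mapping pattern used
 * below (a condensed sketch of the code that follows; "pa" and
 * "cache_bits" stand in for the real expressions):
 *
 *	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
 *	mtx_lock(&sysmaps->lock);	// serialize use of this window
 *	sched_pin();			// stay on this CPU: the window's
 *					// TLB entry is CPU-local
 *	*sysmaps->CMAP2 = PG_V | PG_RW | pa | PG_A | PG_M | cache_bits;
 *	invlcaddr(sysmaps->CADDR2);	// flush any stale translation
 *	... operate on the page through CADDR2 ...
 *	*sysmaps->CMAP2 = 0;		// tear the mapping down again
 *	sched_unpin();
 *	mtx_unlock(&sysmaps->lock);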
4220 */ 4221void 4222pmap_zero_page(vm_page_t m) 4223{ 4224 struct sysmaps *sysmaps; 4225 4226 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4227 mtx_lock(&sysmaps->lock); 4228 if (*sysmaps->CMAP2) 4229 panic("pmap_zero_page: CMAP2 busy"); 4230 sched_pin(); 4231 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4232 pmap_cache_bits(m->md.pat_mode, 0); 4233 invlcaddr(sysmaps->CADDR2); 4234 pagezero(sysmaps->CADDR2); 4235 *sysmaps->CMAP2 = 0; 4236 sched_unpin(); 4237 mtx_unlock(&sysmaps->lock); 4238} 4239 4240/* 4241 * pmap_zero_page_area zeros the specified hardware page by mapping 4242 * the page into KVM and using bzero to clear its contents. 4243 * 4244 * off and size may not cover an area beyond a single hardware page. 4245 */ 4246void 4247pmap_zero_page_area(vm_page_t m, int off, int size) 4248{ 4249 struct sysmaps *sysmaps; 4250 4251 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4252 mtx_lock(&sysmaps->lock); 4253 if (*sysmaps->CMAP2) 4254 panic("pmap_zero_page_area: CMAP2 busy"); 4255 sched_pin(); 4256 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4257 pmap_cache_bits(m->md.pat_mode, 0); 4258 invlcaddr(sysmaps->CADDR2); 4259 if (off == 0 && size == PAGE_SIZE) 4260 pagezero(sysmaps->CADDR2); 4261 else 4262 bzero((char *)sysmaps->CADDR2 + off, size); 4263 *sysmaps->CMAP2 = 0; 4264 sched_unpin(); 4265 mtx_unlock(&sysmaps->lock); 4266} 4267 4268/* 4269 * pmap_zero_page_idle zeros the specified hardware page by mapping 4270 * the page into KVM and using bzero to clear its contents. This 4271 * is intended to be called from the vm_pagezero process only and 4272 * outside of Giant. 4273 */ 4274void 4275pmap_zero_page_idle(vm_page_t m) 4276{ 4277 4278 if (*CMAP3) 4279 panic("pmap_zero_page_idle: CMAP3 busy"); 4280 sched_pin(); 4281 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4282 pmap_cache_bits(m->md.pat_mode, 0); 4283 invlcaddr(CADDR3); 4284 pagezero(CADDR3); 4285 *CMAP3 = 0; 4286 sched_unpin(); 4287} 4288 4289/* 4290 * pmap_copy_page copies the specified (machine independent) 4291 * page by mapping the page into virtual memory and using 4292 * bcopy to copy the page, one machine dependent page at a 4293 * time. 
4294 */ 4295void 4296pmap_copy_page(vm_page_t src, vm_page_t dst) 4297{ 4298 struct sysmaps *sysmaps; 4299 4300 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4301 mtx_lock(&sysmaps->lock); 4302 if (*sysmaps->CMAP1) 4303 panic("pmap_copy_page: CMAP1 busy"); 4304 if (*sysmaps->CMAP2) 4305 panic("pmap_copy_page: CMAP2 busy"); 4306 sched_pin(); 4307 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4308 pmap_cache_bits(src->md.pat_mode, 0); 4309 invlcaddr(sysmaps->CADDR1); 4310 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4311 pmap_cache_bits(dst->md.pat_mode, 0); 4312 invlcaddr(sysmaps->CADDR2); 4313 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); 4314 *sysmaps->CMAP1 = 0; 4315 *sysmaps->CMAP2 = 0; 4316 sched_unpin(); 4317 mtx_unlock(&sysmaps->lock); 4318} 4319 4320int unmapped_buf_allowed = 1; 4321 4322void 4323pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4324 vm_offset_t b_offset, int xfersize) 4325{ 4326 struct sysmaps *sysmaps; 4327 vm_page_t a_pg, b_pg; 4328 char *a_cp, *b_cp; 4329 vm_offset_t a_pg_offset, b_pg_offset; 4330 int cnt; 4331 4332 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4333 mtx_lock(&sysmaps->lock); 4334 if (*sysmaps->CMAP1 != 0) 4335 panic("pmap_copy_pages: CMAP1 busy"); 4336 if (*sysmaps->CMAP2 != 0) 4337 panic("pmap_copy_pages: CMAP2 busy"); 4338 sched_pin(); 4339 while (xfersize > 0) { 4340 a_pg = ma[a_offset >> PAGE_SHIFT]; 4341 a_pg_offset = a_offset & PAGE_MASK; 4342 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4343 b_pg = mb[b_offset >> PAGE_SHIFT]; 4344 b_pg_offset = b_offset & PAGE_MASK; 4345 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4346 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | 4347 pmap_cache_bits(a_pg->md.pat_mode, 0); 4348 invlcaddr(sysmaps->CADDR1); 4349 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | 4350 PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); 4351 invlcaddr(sysmaps->CADDR2); 4352 a_cp = sysmaps->CADDR1 + a_pg_offset; 4353 b_cp = sysmaps->CADDR2 + b_pg_offset; 4354 bcopy(a_cp, b_cp, cnt); 4355 a_offset += cnt; 4356 b_offset += cnt; 4357 xfersize -= cnt; 4358 } 4359 *sysmaps->CMAP1 = 0; 4360 *sysmaps->CMAP2 = 0; 4361 sched_unpin(); 4362 mtx_unlock(&sysmaps->lock); 4363} 4364 4365/* 4366 * Returns true if the pmap's pv is one of the first 4367 * 16 pvs linked to from this page. This count may 4368 * be changed upwards or downwards in the future; it 4369 * is only necessary that true be returned for a small 4370 * subset of pmaps for proper page aging. 4371 */ 4372boolean_t 4373pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4374{ 4375 struct md_page *pvh; 4376 pv_entry_t pv; 4377 int loops = 0; 4378 boolean_t rv; 4379 4380 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4381 ("pmap_page_exists_quick: page %p is not managed", m)); 4382 rv = FALSE; 4383 rw_wlock(&pvh_global_lock); 4384 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4385 if (PV_PMAP(pv) == pmap) { 4386 rv = TRUE; 4387 break; 4388 } 4389 loops++; 4390 if (loops >= 16) 4391 break; 4392 } 4393 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4394 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4395 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4396 if (PV_PMAP(pv) == pmap) { 4397 rv = TRUE; 4398 break; 4399 } 4400 loops++; 4401 if (loops >= 16) 4402 break; 4403 } 4404 } 4405 rw_wunlock(&pvh_global_lock); 4406 return (rv); 4407} 4408 4409/* 4410 * pmap_page_wired_mappings: 4411 * 4412 * Return the number of managed mappings to the given physical page 4413 * that are wired. 
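 *
 * A sketch of the typical consumer (simplified, not copied from the
 * pageout code): a page with wired mappings must not be reclaimed,
 * so a scan bails out early on it:
 *
 *	if (m->hold_count != 0 || pmap_page_wired_mappings(m) != 0)
 *		continue;	// some pmap holds this page wired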
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	int count;

	count = 0;
	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (count);
	rw_wlock(&pvh_global_lock);
	count = pmap_pvh_wired_mappings(&m->md, count);
	if ((m->flags & PG_FICTITIOUS) == 0) {
		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
		    count);
	}
	rw_wunlock(&pvh_global_lock);
	return (count);
}

/*
 * pmap_pvh_wired_mappings:
 *
 *	Return the updated number "count" of managed mappings that are wired.
 */
static int
pmap_pvh_wired_mappings(struct md_page *pvh, int count)
{
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		if ((*pte & PG_W) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	sched_unpin();
	return (count);
}

/*
 * Returns TRUE if the given page is mapped individually or as part of
 * a 4mpage.  Otherwise, returns FALSE.
 */
boolean_t
pmap_page_is_mapped(vm_page_t m)
{
	boolean_t rv;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (FALSE);
	rw_wlock(&pvh_global_lock);
	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Remove all pages from the specified address space; this aids process
 * exit speed.  This code is special-cased for the current process
 * only, but can have the more generic (and slightly slower) mode
 * enabled.  It is much faster than pmap_remove when running down an
 * entire address space.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pt_entry_t *pte, tpte;
	vm_page_t m, mpte, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct spglist free;
	int field, idx;
	int32_t bit;
	uint32_t inuse, bitmask;
	int allfree;

	if (pmap != PCPU_GET(curpmap)) {
		printf("warning: pmap_remove_pages called with non-current pmap\n");
		return;
	}
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	sched_pin();
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
		    pc->pc_pmap));
		allfree = 1;
		for (field = 0; field < _NPCM; field++) {
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = bsfl(inuse);
				bitmask = 1UL << bit;
				idx = field * 32 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pte = pmap_pde(pmap, pv->pv_va);
				tpte = *pte;
				if ((tpte & PG_PS) == 0) {
					pte = vtopte(pv->pv_va);
					tpte = *pte & ~PG_PTE_PAT;
				}

				if (tpte == 0) {
					printf(
					    "TPTE at %p IS ZERO @ VA %08x\n",
					    pte, pv->pv_va);
					panic("bad pte");
				}

				/*
				 * We cannot remove wired pages from a
				 * process' mapping at this time.
				 */
				if (tpte & PG_W) {
					allfree = 0;
					continue;
				}

				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				KASSERT(m->phys_addr == (tpte & PG_FRAME),
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad tpte %#jx",
				    (uintmax_t)tpte));

				pte_clear(pte);

				/*
				 * Update the vm_page_t clean/reference bits.
				 */
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
					if ((tpte & PG_PS) != 0) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
					} else
						vm_page_dirty(m);
				}

				/* Mark free */
				PV_STAT(pv_entry_frees++);
				PV_STAT(pv_entry_spare++);
				pv_entry_count--;
				pc->pc_map[field] |= bitmask;
				if ((tpte & PG_PS) != 0) {
					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							if (TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
					if (mpte != NULL) {
						pmap_remove_pt_page(pmap, mpte);
						pmap->pm_stats.resident_count--;
						KASSERT(mpte->wire_count == NPTEPG,
						    ("pmap_remove_pages: pte page wire count error"));
						mpte->wire_count = 0;
						pmap_add_delayed_free_list(mpte, &free, FALSE);
						atomic_subtract_int(&cnt.v_wire_count, 1);
					}
				} else {
					pmap->pm_stats.resident_count--;
					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
					if (TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m, PGA_WRITEABLE);
					}
					pmap_unuse_pt(pmap, pv->pv_va, &free);
				}
			}
		}
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	sched_unpin();
	pmap_invalidate_all(pmap);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(&free);
}

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_modified: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
	 * is clear, no PTEs can have PG_M set.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return (FALSE);
	rw_wlock(&pvh_global_lock);
	rv = pmap_is_modified_pvh(&m->md) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Returns TRUE if any of the given mappings were used to modify
 * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
 * mappings are supported.
 */
static boolean_t
pmap_is_modified_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	rv = FALSE;
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	sched_unpin();
	return (rv);
}

/*
 * pmap_is_prefaultable:
 *
 *	Return whether or not the specified virtual address is eligible
 *	for prefault.
 */
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t rv;

	rv = FALSE;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr);
	if (*pde != 0 && (*pde & PG_PS) == 0) {
		pte = vtopte(addr);
		rv = *pte == 0;
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * pmap_is_referenced:
 *
 *	Return whether or not the specified physical page was referenced
 *	in any physical maps.
 */
boolean_t
pmap_is_referenced(vm_page_t m)
{
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	rw_wlock(&pvh_global_lock);
	rv = pmap_is_referenced_pvh(&m->md) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Returns TRUE if any of the given mappings were referenced and FALSE
 * otherwise.  Both page and 4mpage mappings are supported.
 */
static boolean_t
pmap_is_referenced_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	rv = FALSE;
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	sched_unpin();
	return (rv);
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t next_pv, pv;
	pmap_t pmap;
	pd_entry_t *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * set by another thread while the object is locked.  Thus,
	 * if PGA_WRITEABLE is clear, no page table entries need updating.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return;
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		if ((*pde & PG_RW) != 0)
			(void)pmap_demote_pde(pmap, pde, va);
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
		    " a 4mpage in page %p's pv list", m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
retry:
		oldpte = *pte;
		if ((oldpte & PG_RW) != 0) {
			/*
			 * Regardless of whether a pte is 32 or 64 bits
			 * in size, PG_RW and PG_M are among the least
			 * significant 32 bits.
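			 *
			 * Concretely (a worked example of the retry that
			 * follows): between reading oldpte and the cmpset,
			 * another CPU's MMU may set PG_M in this same pte
			 * through a still-valid TLB entry.  The cmpset then
			 * fails, oldpte is reloaded, and the loop retries,
			 * so that PG_M update is never lost.  A condensed,
			 * equivalent-in-effect sketch of the goto form below:
			 *
			 *	do {
			 *		oldpte = *pte;
			 *	} while ((oldpte & PG_RW) != 0 &&
			 *	    !atomic_cmpset_int((u_int *)pte, oldpte,
			 *	    oldpte & ~(PG_RW | PG_M)));
			 *
			 * The goto form simply preserves oldpte for the
			 * PG_M test that follows.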
4801 */ 4802 if (!atomic_cmpset_int((u_int *)pte, oldpte, 4803 oldpte & ~(PG_RW | PG_M))) 4804 goto retry; 4805 if ((oldpte & PG_M) != 0) 4806 vm_page_dirty(m); 4807 pmap_invalidate_page(pmap, pv->pv_va); 4808 } 4809 PMAP_UNLOCK(pmap); 4810 } 4811 vm_page_aflag_clear(m, PGA_WRITEABLE); 4812 sched_unpin(); 4813 rw_wunlock(&pvh_global_lock); 4814} 4815 4816#define PMAP_TS_REFERENCED_MAX 5 4817 4818/* 4819 * pmap_ts_referenced: 4820 * 4821 * Return a count of reference bits for a page, clearing those bits. 4822 * It is not necessary for every reference bit to be cleared, but it 4823 * is necessary that 0 only be returned when there are truly no 4824 * reference bits set. 4825 * 4826 * XXX: The exact number of bits to check and clear is a matter that 4827 * should be tested and standardized at some point in the future for 4828 * optimal aging of shared pages. 4829 */ 4830int 4831pmap_ts_referenced(vm_page_t m) 4832{ 4833 struct md_page *pvh; 4834 pv_entry_t pv, pvf; 4835 pmap_t pmap; 4836 pd_entry_t *pde; 4837 pt_entry_t *pte; 4838 vm_paddr_t pa; 4839 int rtval = 0; 4840 4841 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4842 ("pmap_ts_referenced: page %p is not managed", m)); 4843 pa = VM_PAGE_TO_PHYS(m); 4844 pvh = pa_to_pvh(pa); 4845 rw_wlock(&pvh_global_lock); 4846 sched_pin(); 4847 if ((m->flags & PG_FICTITIOUS) != 0 || 4848 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4849 goto small_mappings; 4850 pv = pvf; 4851 do { 4852 pmap = PV_PMAP(pv); 4853 PMAP_LOCK(pmap); 4854 pde = pmap_pde(pmap, pv->pv_va); 4855 if ((*pde & PG_A) != 0) { 4856 /* 4857 * Since this reference bit is shared by either 1024 4858 * or 512 4KB pages, it should not be cleared every 4859 * time it is tested. Apply a simple "hash" function 4860 * on the physical page number, the virtual superpage 4861 * number, and the pmap address to select one 4KB page 4862 * out of the 1024 or 512 on which testing the 4863 * reference bit will result in clearing that bit. 4864 * This function is designed to avoid the selection of 4865 * the same 4KB page for every 2- or 4MB page mapping. 4866 * 4867 * On demotion, a mapping that hasn't been referenced 4868 * is simply destroyed. To avoid the possibility of a 4869 * subsequent page fault on a demoted wired mapping, 4870 * always leave its reference bit set. Moreover, 4871 * since the superpage is wired, the current state of 4872 * its reference bit won't affect page replacement. 4873 */ 4874 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 4875 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 4876 (*pde & PG_W) == 0) { 4877 atomic_clear_int((u_int *)pde, PG_A); 4878 pmap_invalidate_page(pmap, pv->pv_va); 4879 } 4880 rtval++; 4881 } 4882 PMAP_UNLOCK(pmap); 4883 /* Rotate the PV list if it has more than one entry. 
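		 *
		 * A worked illustration: with mappings [A, B, C] and
		 * PMAP_TS_REFERENCED_MAX stopping the scan after A and B,
		 * rotation leaves the list as [C, A, B], so the next call
		 * starts with the mapping that was not examined this time.
		 * Without the rotation, early termination would starve the
		 * tail of the list.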
*/ 4884 if (TAILQ_NEXT(pv, pv_next) != NULL) { 4885 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4886 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4887 } 4888 if (rtval >= PMAP_TS_REFERENCED_MAX) 4889 goto out; 4890 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4891small_mappings: 4892 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4893 goto out; 4894 pv = pvf; 4895 do { 4896 pmap = PV_PMAP(pv); 4897 PMAP_LOCK(pmap); 4898 pde = pmap_pde(pmap, pv->pv_va); 4899 KASSERT((*pde & PG_PS) == 0, 4900 ("pmap_ts_referenced: found a 4mpage in page %p's pv list", 4901 m)); 4902 pte = pmap_pte_quick(pmap, pv->pv_va); 4903 if ((*pte & PG_A) != 0) { 4904 atomic_clear_int((u_int *)pte, PG_A); 4905 pmap_invalidate_page(pmap, pv->pv_va); 4906 rtval++; 4907 } 4908 PMAP_UNLOCK(pmap); 4909 /* Rotate the PV list if it has more than one entry. */ 4910 if (TAILQ_NEXT(pv, pv_next) != NULL) { 4911 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4912 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4913 } 4914 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 4915 PMAP_TS_REFERENCED_MAX); 4916out: 4917 sched_unpin(); 4918 rw_wunlock(&pvh_global_lock); 4919 return (rtval); 4920} 4921 4922/* 4923 * Apply the given advice to the specified range of addresses within the 4924 * given pmap. Depending on the advice, clear the referenced and/or 4925 * modified flags in each mapping and set the mapped page's dirty field. 4926 */ 4927void 4928pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4929{ 4930 pd_entry_t oldpde, *pde; 4931 pt_entry_t *pte; 4932 vm_offset_t pdnxt; 4933 vm_page_t m; 4934 boolean_t anychanged, pv_lists_locked; 4935 4936 if (advice != MADV_DONTNEED && advice != MADV_FREE) 4937 return; 4938 if (pmap_is_current(pmap)) 4939 pv_lists_locked = FALSE; 4940 else { 4941 pv_lists_locked = TRUE; 4942resume: 4943 rw_wlock(&pvh_global_lock); 4944 sched_pin(); 4945 } 4946 anychanged = FALSE; 4947 PMAP_LOCK(pmap); 4948 for (; sva < eva; sva = pdnxt) { 4949 pdnxt = (sva + NBPDR) & ~PDRMASK; 4950 if (pdnxt < sva) 4951 pdnxt = eva; 4952 pde = pmap_pde(pmap, sva); 4953 oldpde = *pde; 4954 if ((oldpde & PG_V) == 0) 4955 continue; 4956 else if ((oldpde & PG_PS) != 0) { 4957 if ((oldpde & PG_MANAGED) == 0) 4958 continue; 4959 if (!pv_lists_locked) { 4960 pv_lists_locked = TRUE; 4961 if (!rw_try_wlock(&pvh_global_lock)) { 4962 if (anychanged) 4963 pmap_invalidate_all(pmap); 4964 PMAP_UNLOCK(pmap); 4965 goto resume; 4966 } 4967 sched_pin(); 4968 } 4969 if (!pmap_demote_pde(pmap, pde, sva)) { 4970 /* 4971 * The large page mapping was destroyed. 4972 */ 4973 continue; 4974 } 4975 4976 /* 4977 * Unless the page mappings are wired, remove the 4978 * mapping to a single page so that a subsequent 4979 * access may repromote. Since the underlying page 4980 * table page is fully populated, this removal never 4981 * frees a page table page. 4982 */ 4983 if ((oldpde & PG_W) == 0) { 4984 pte = pmap_pte_quick(pmap, sva); 4985 KASSERT((*pte & PG_V) != 0, 4986 ("pmap_advise: invalid PTE")); 4987 pmap_remove_pte(pmap, pte, sva, NULL); 4988 anychanged = TRUE; 4989 } 4990 } 4991 if (pdnxt > eva) 4992 pdnxt = eva; 4993 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 4994 sva += PAGE_SIZE) { 4995 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | 4996 PG_V)) 4997 continue; 4998 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4999 if (advice == MADV_DONTNEED) { 5000 /* 5001 * Future calls to pmap_is_modified() 5002 * can be avoided by making the page 5003 * dirty now. 
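			 *
			 * For reference, the userland operation that
			 * reaches this branch (a minimal sketch; "len"
			 * is a placeholder):
			 *
			 *	char *p = mmap(NULL, len,
			 *	    PROT_READ | PROT_WRITE,
			 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
			 *	memset(p, 0xff, len);	// dirties: PG_M set
			 *	madvise(p, len, MADV_DONTNEED);
			 *				// => pmap_advise()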
5004 */ 5005 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 5006 vm_page_dirty(m); 5007 } 5008 atomic_clear_int((u_int *)pte, PG_M | PG_A); 5009 } else if ((*pte & PG_A) != 0) 5010 atomic_clear_int((u_int *)pte, PG_A); 5011 else 5012 continue; 5013 if ((*pte & PG_G) != 0) 5014 pmap_invalidate_page(pmap, sva); 5015 else 5016 anychanged = TRUE; 5017 } 5018 } 5019 if (anychanged) 5020 pmap_invalidate_all(pmap); 5021 if (pv_lists_locked) { 5022 sched_unpin(); 5023 rw_wunlock(&pvh_global_lock); 5024 } 5025 PMAP_UNLOCK(pmap); 5026} 5027 5028/* 5029 * Clear the modify bits on the specified physical page. 5030 */ 5031void 5032pmap_clear_modify(vm_page_t m) 5033{ 5034 struct md_page *pvh; 5035 pv_entry_t next_pv, pv; 5036 pmap_t pmap; 5037 pd_entry_t oldpde, *pde; 5038 pt_entry_t oldpte, *pte; 5039 vm_offset_t va; 5040 5041 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5042 ("pmap_clear_modify: page %p is not managed", m)); 5043 VM_OBJECT_ASSERT_WLOCKED(m->object); 5044 KASSERT(!vm_page_xbusied(m), 5045 ("pmap_clear_modify: page %p is exclusive busied", m)); 5046 5047 /* 5048 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 5049 * If the object containing the page is locked and the page is not 5050 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 5051 */ 5052 if ((m->aflags & PGA_WRITEABLE) == 0) 5053 return; 5054 rw_wlock(&pvh_global_lock); 5055 sched_pin(); 5056 if ((m->flags & PG_FICTITIOUS) != 0) 5057 goto small_mappings; 5058 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5059 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5060 va = pv->pv_va; 5061 pmap = PV_PMAP(pv); 5062 PMAP_LOCK(pmap); 5063 pde = pmap_pde(pmap, va); 5064 oldpde = *pde; 5065 if ((oldpde & PG_RW) != 0) { 5066 if (pmap_demote_pde(pmap, pde, va)) { 5067 if ((oldpde & PG_W) == 0) { 5068 /* 5069 * Write protect the mapping to a 5070 * single page so that a subsequent 5071 * write access may repromote. 5072 */ 5073 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5074 PG_PS_FRAME); 5075 pte = pmap_pte_quick(pmap, va); 5076 oldpte = *pte; 5077 if ((oldpte & PG_V) != 0) { 5078 /* 5079 * Regardless of whether a pte is 32 or 64 bits 5080 * in size, PG_RW and PG_M are among the least 5081 * significant 32 bits. 5082 */ 5083 while (!atomic_cmpset_int((u_int *)pte, 5084 oldpte, 5085 oldpte & ~(PG_M | PG_RW))) 5086 oldpte = *pte; 5087 vm_page_dirty(m); 5088 pmap_invalidate_page(pmap, va); 5089 } 5090 } 5091 } 5092 } 5093 PMAP_UNLOCK(pmap); 5094 } 5095small_mappings: 5096 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5097 pmap = PV_PMAP(pv); 5098 PMAP_LOCK(pmap); 5099 pde = pmap_pde(pmap, pv->pv_va); 5100 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 5101 " a 4mpage in page %p's pv list", m)); 5102 pte = pmap_pte_quick(pmap, pv->pv_va); 5103 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5104 /* 5105 * Regardless of whether a pte is 32 or 64 bits 5106 * in size, PG_M is among the least significant 5107 * 32 bits. 5108 */ 5109 atomic_clear_int((u_int *)pte, PG_M); 5110 pmap_invalidate_page(pmap, pv->pv_va); 5111 } 5112 PMAP_UNLOCK(pmap); 5113 } 5114 sched_unpin(); 5115 rw_wunlock(&pvh_global_lock); 5116} 5117 5118/* 5119 * Miscellaneous support routines follow 5120 */ 5121 5122/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 5123static __inline void 5124pmap_pte_attr(pt_entry_t *pte, int cache_bits) 5125{ 5126 u_int opte, npte; 5127 5128 /* 5129 * The cache mode bits are all in the low 32-bits of the 5130 * PTE, so we can just spin on updating the low 32-bits. 
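 *
 * Assuming the standard i386 bit layout (PG_NC_PWT = 0x008, PG_NC_PCD =
 * 0x010, PG_PTE_PAT = 0x080), all three cache-control bits live in bits
 * 3, 4, and 7, so a u_int view of even a 64-bit PAE pte is safe here.
 * A worked example, retyping a write-back pte as uncacheable (PCD | PWT):
 *
 *	opte = 0x12345063;		// PAT, PCD, PWT all clear (WB)
 *	npte = (opte & ~PG_PTE_CACHE) | 0x018;
 *	// npte == 0x1234507b; only the cache-control bits changed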
5131 */ 5132 do { 5133 opte = *(u_int *)pte; 5134 npte = opte & ~PG_PTE_CACHE; 5135 npte |= cache_bits; 5136 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 5137} 5138 5139/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 5140static __inline void 5141pmap_pde_attr(pd_entry_t *pde, int cache_bits) 5142{ 5143 u_int opde, npde; 5144 5145 /* 5146 * The cache mode bits are all in the low 32-bits of the 5147 * PDE, so we can just spin on updating the low 32-bits. 5148 */ 5149 do { 5150 opde = *(u_int *)pde; 5151 npde = opde & ~PG_PDE_CACHE; 5152 npde |= cache_bits; 5153 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 5154} 5155 5156/* 5157 * Map a set of physical memory pages into the kernel virtual 5158 * address space. Return a pointer to where it is mapped. This 5159 * routine is intended to be used for mapping device memory, 5160 * NOT real memory. 5161 */ 5162void * 5163pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5164{ 5165 vm_offset_t va, offset; 5166 vm_size_t tmpsize; 5167 5168 offset = pa & PAGE_MASK; 5169 size = round_page(offset + size); 5170 pa = pa & PG_FRAME; 5171 5172 if (pa < KERNLOAD && pa + size <= KERNLOAD) 5173 va = KERNBASE + pa; 5174 else 5175 va = kva_alloc(size); 5176 if (!va) 5177 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5178 5179 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5180 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5181 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 5182 pmap_invalidate_cache_range(va, va + size, FALSE); 5183 return ((void *)(va + offset)); 5184} 5185 5186void * 5187pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5188{ 5189 5190 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5191} 5192 5193void * 5194pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5195{ 5196 5197 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5198} 5199 5200void 5201pmap_unmapdev(vm_offset_t va, vm_size_t size) 5202{ 5203 vm_offset_t base, offset; 5204 5205 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 5206 return; 5207 base = trunc_page(va); 5208 offset = va & PAGE_MASK; 5209 size = round_page(offset + size); 5210 kva_free(base, size); 5211} 5212 5213/* 5214 * Sets the memory attribute for the specified page. 5215 */ 5216void 5217pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5218{ 5219 5220 m->md.pat_mode = ma; 5221 if ((m->flags & PG_FICTITIOUS) != 0) 5222 return; 5223 5224 /* 5225 * If "m" is a normal page, flush it from the cache. 5226 * See pmap_invalidate_cache_range(). 5227 * 5228 * First, try to find an existing mapping of the page by sf 5229 * buffer. sf_buf_invalidate_cache() modifies mapping and 5230 * flushes the cache. 5231 */ 5232 if (sf_buf_invalidate_cache(m)) 5233 return; 5234 5235 /* 5236 * If page is not mapped by sf buffer, but CPU does not 5237 * support self snoop, map the page transient and do 5238 * invalidation. In the worst case, whole cache is flushed by 5239 * pmap_invalidate_cache_range(). 
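 *
 * For comparison, device memory gets its attributes up front: a typical
 * driver maps, say, a framebuffer write-combining using pmap_mapdev_attr()
 * above (an illustrative fragment; "sc" and the resource details are
 * hypothetical):
 *
 *	sc->fb_base = pmap_mapdev_attr(rman_get_start(sc->fb_res),
 *	    rman_get_size(sc->fb_res), PAT_WRITE_COMBINING);
 *	...
 *	pmap_unmapdev((vm_offset_t)sc->fb_base,
 *	    rman_get_size(sc->fb_res));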
 */
	if ((cpu_feature & CPUID_SS) == 0)
		pmap_flush_page(m);
}

static void
pmap_flush_page(vm_page_t m)
{
	struct sysmaps *sysmaps;
	vm_offset_t sva, eva;

	if ((cpu_feature & CPUID_CLFSH) != 0) {
		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
		mtx_lock(&sysmaps->lock);
		if (*sysmaps->CMAP2)
			panic("pmap_flush_page: CMAP2 busy");
		sched_pin();
		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
		invlcaddr(sysmaps->CADDR2);
		sva = (vm_offset_t)sysmaps->CADDR2;
		eva = sva + PAGE_SIZE;

		/*
		 * Use mfence despite the ordering implied by
		 * mtx_{un,}lock() because clflush is not guaranteed
		 * to be ordered by any other instruction.
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
		*sysmaps->CMAP2 = 0;
		sched_unpin();
		mtx_unlock(&sysmaps->lock);
	} else
		pmap_invalidate_cache();
}

/*
 * Changes the specified virtual address range's memory type to that given by
 * the parameter "mode".  The specified virtual address range must be
 * completely contained within the kernel map.
 *
 * Returns zero if the change completed successfully, and either EINVAL or
 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 * of the virtual address range was not mapped, and ENOMEM is returned if
 * there was insufficient memory available to complete the change.
 */
int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
	vm_offset_t base, offset, tmpva;
	pd_entry_t *pde;
	pt_entry_t *pte;
	int cache_bits_pte, cache_bits_pde;
	boolean_t changed;

	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	/*
	 * Only supported on kernel virtual addresses above the recursive map.
	 */
	if (base < VM_MIN_KERNEL_ADDRESS)
		return (EINVAL);

	cache_bits_pde = pmap_cache_bits(mode, 1);
	cache_bits_pte = pmap_cache_bits(mode, 0);
	changed = FALSE;

	/*
	 * Pages that aren't mapped aren't supported.  Also break down
	 * 2/4MB pages into 4KB pages if required.
	 */
	PMAP_LOCK(kernel_pmap);
	for (tmpva = base; tmpva < base + size; ) {
		pde = pmap_pde(kernel_pmap, tmpva);
		if (*pde == 0) {
			PMAP_UNLOCK(kernel_pmap);
			return (EINVAL);
		}
		if (*pde & PG_PS) {
			/*
			 * If the current 2/4MB page already has
			 * the required memory type, then we need not
			 * demote this page.  Just increment tmpva to
			 * the next 2/4MB page frame.
			 */
			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
				tmpva = trunc_4mpage(tmpva) + NBPDR;
				continue;
			}

			/*
			 * If the current offset aligns with a 2/4MB
			 * page frame and there is at least 2/4MB left
			 * within the range, then we need not break
			 * down this page into 4KB pages.
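			 *
			 * A worked example with 4MB pages (NBPDR =
			 * 0x400000): for a request covering
			 * [0xc0800000, 0xc1000000), the pde for
			 * 0xc0800000 is 4MB-aligned and 4MB remain, so
			 * both superpages are retyped in place; but for
			 * [0xc0800000, 0xc0900000) only 1MB remains, so
			 * the 4MB page must first be demoted and only
			 * 256 4KB ptes are retyped.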
5340 */ 5341 if ((tmpva & PDRMASK) == 0 && 5342 tmpva + PDRMASK < base + size) { 5343 tmpva += NBPDR; 5344 continue; 5345 } 5346 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5347 PMAP_UNLOCK(kernel_pmap); 5348 return (ENOMEM); 5349 } 5350 } 5351 pte = vtopte(tmpva); 5352 if (*pte == 0) { 5353 PMAP_UNLOCK(kernel_pmap); 5354 return (EINVAL); 5355 } 5356 tmpva += PAGE_SIZE; 5357 } 5358 PMAP_UNLOCK(kernel_pmap); 5359 5360 /* 5361 * Ok, all the pages exist, so run through them updating their 5362 * cache mode if required. 5363 */ 5364 for (tmpva = base; tmpva < base + size; ) { 5365 pde = pmap_pde(kernel_pmap, tmpva); 5366 if (*pde & PG_PS) { 5367 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5368 pmap_pde_attr(pde, cache_bits_pde); 5369 changed = TRUE; 5370 } 5371 tmpva = trunc_4mpage(tmpva) + NBPDR; 5372 } else { 5373 pte = vtopte(tmpva); 5374 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5375 pmap_pte_attr(pte, cache_bits_pte); 5376 changed = TRUE; 5377 } 5378 tmpva += PAGE_SIZE; 5379 } 5380 } 5381 5382 /* 5383 * Flush CPU caches to make sure any data isn't cached that 5384 * shouldn't be, etc. 5385 */ 5386 if (changed) { 5387 pmap_invalidate_range(kernel_pmap, base, tmpva); 5388 pmap_invalidate_cache_range(base, tmpva, FALSE); 5389 } 5390 return (0); 5391} 5392 5393/* 5394 * perform the pmap work for mincore 5395 */ 5396int 5397pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5398{ 5399 pd_entry_t *pdep; 5400 pt_entry_t *ptep, pte; 5401 vm_paddr_t pa; 5402 int val; 5403 5404 PMAP_LOCK(pmap); 5405retry: 5406 pdep = pmap_pde(pmap, addr); 5407 if (*pdep != 0) { 5408 if (*pdep & PG_PS) { 5409 pte = *pdep; 5410 /* Compute the physical address of the 4KB page. */ 5411 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 5412 PG_FRAME; 5413 val = MINCORE_SUPER; 5414 } else { 5415 ptep = pmap_pte(pmap, addr); 5416 pte = *ptep; 5417 pmap_pte_release(ptep); 5418 pa = pte & PG_FRAME; 5419 val = 0; 5420 } 5421 } else { 5422 pte = 0; 5423 pa = 0; 5424 val = 0; 5425 } 5426 if ((pte & PG_V) != 0) { 5427 val |= MINCORE_INCORE; 5428 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5429 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5430 if ((pte & PG_A) != 0) 5431 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5432 } 5433 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5434 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5435 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5436 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
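	 *
	 * The userland view of this function, for reference (a minimal
	 * sketch; NPAGES is a placeholder, and <sys/mman.h> supplies
	 * mincore() and the MINCORE_* flags):
	 *
	 *	char vec[NPAGES];
	 *	if (mincore(addr, NPAGES * getpagesize(), vec) == 0 &&
	 *	    (vec[0] & MINCORE_SUPER) != 0)
	 *		printf("resident via a 2/4MB superpage mapping\n");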
*/ 5437 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5438 goto retry; 5439 } else 5440 PA_UNLOCK_COND(*locked_pa); 5441 PMAP_UNLOCK(pmap); 5442 return (val); 5443} 5444 5445void 5446pmap_activate(struct thread *td) 5447{ 5448 pmap_t pmap, oldpmap; 5449 u_int cpuid; 5450 u_int32_t cr3; 5451 5452 critical_enter(); 5453 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5454 oldpmap = PCPU_GET(curpmap); 5455 cpuid = PCPU_GET(cpuid); 5456#if defined(SMP) 5457 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5458 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5459#else 5460 CPU_CLR(cpuid, &oldpmap->pm_active); 5461 CPU_SET(cpuid, &pmap->pm_active); 5462#endif 5463#ifdef PAE 5464 cr3 = vtophys(pmap->pm_pdpt); 5465#else 5466 cr3 = vtophys(pmap->pm_pdir); 5467#endif 5468 /* 5469 * pmap_activate is for the current thread on the current cpu 5470 */ 5471 td->td_pcb->pcb_cr3 = cr3; 5472 load_cr3(cr3); 5473 PCPU_SET(curpmap, pmap); 5474 critical_exit(); 5475} 5476 5477void 5478pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5479{ 5480} 5481 5482/* 5483 * Increase the starting virtual address of the given mapping if a 5484 * different alignment might result in more superpage mappings. 5485 */ 5486void 5487pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5488 vm_offset_t *addr, vm_size_t size) 5489{ 5490 vm_offset_t superpage_offset; 5491 5492 if (size < NBPDR) 5493 return; 5494 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5495 offset += ptoa(object->pg_color); 5496 superpage_offset = offset & PDRMASK; 5497 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5498 (*addr & PDRMASK) == superpage_offset) 5499 return; 5500 if ((*addr & PDRMASK) < superpage_offset) 5501 *addr = (*addr & ~PDRMASK) + superpage_offset; 5502 else 5503 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5504} 5505 5506 5507#if defined(PMAP_DEBUG) 5508pmap_pid_dump(int pid) 5509{ 5510 pmap_t pmap; 5511 struct proc *p; 5512 int npte = 0; 5513 int index; 5514 5515 sx_slock(&allproc_lock); 5516 FOREACH_PROC_IN_SYSTEM(p) { 5517 if (p->p_pid != pid) 5518 continue; 5519 5520 if (p->p_vmspace) { 5521 int i,j; 5522 index = 0; 5523 pmap = vmspace_pmap(p->p_vmspace); 5524 for (i = 0; i < NPDEPTD; i++) { 5525 pd_entry_t *pde; 5526 pt_entry_t *pte; 5527 vm_offset_t base = i << PDRSHIFT; 5528 5529 pde = &pmap->pm_pdir[i]; 5530 if (pde && pmap_pde_v(pde)) { 5531 for (j = 0; j < NPTEPG; j++) { 5532 vm_offset_t va = base + (j << PAGE_SHIFT); 5533 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 5534 if (index) { 5535 index = 0; 5536 printf("\n"); 5537 } 5538 sx_sunlock(&allproc_lock); 5539 return (npte); 5540 } 5541 pte = pmap_pte(pmap, va); 5542 if (pte && pmap_pte_v(pte)) { 5543 pt_entry_t pa; 5544 vm_page_t m; 5545 pa = *pte; 5546 m = PHYS_TO_VM_PAGE(pa & PG_FRAME); 5547 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 5548 va, pa, m->hold_count, m->wire_count, m->flags); 5549 npte++; 5550 index++; 5551 if (index >= 2) { 5552 index = 0; 5553 printf("\n"); 5554 } else { 5555 printf(" "); 5556 } 5557 } 5558 } 5559 } 5560 } 5561 } 5562 } 5563 sx_sunlock(&allproc_lock); 5564 return (npte); 5565} 5566#endif 5567 5568#if defined(DEBUG) 5569 5570static void pads(pmap_t pm); 5571void pmap_pvdump(vm_paddr_t pa); 5572 5573/* print address space of pmap*/ 5574static void 5575pads(pmap_t pm) 5576{ 5577 int i, j; 5578 vm_paddr_t va; 5579 pt_entry_t *ptep; 5580 5581 if (pm == kernel_pmap) 5582 return; 5583 for (i = 0; i < NPDEPTD; i++) 5584 if (pm->pm_pdir[i]) 5585 for (j = 0; j < NPTEPG; j++) { 5586 va = (i << 
PDRSHIFT) + (j << PAGE_SHIFT); 5587 if (pm == kernel_pmap && va < KERNBASE) 5588 continue; 5589 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) 5590 continue; 5591 ptep = pmap_pte(pm, va); 5592 if (pmap_pte_v(ptep)) 5593 printf("%x:%x ", va, *ptep); 5594 }; 5595 5596} 5597 5598void 5599pmap_pvdump(vm_paddr_t pa) 5600{ 5601 pv_entry_t pv; 5602 pmap_t pmap; 5603 vm_page_t m; 5604 5605 printf("pa %x", pa); 5606 m = PHYS_TO_VM_PAGE(pa); 5607 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5608 pmap = PV_PMAP(pv); 5609 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); 5610 pads(pmap); 5611 } 5612 printf(" "); 5613} 5614#endif 5615
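/*
 * A worked example for pmap_align_superpage() above, assuming 4MB
 * superpages (NBPDR = 0x400000): mapping 16MB of an object at file
 * offset 0x00601000 yields superpage_offset = 0x201000.  A hint of
 * *addr = 0x20000000 has (*addr & PDRMASK) == 0, which is less than
 * superpage_offset, so the address is advanced to 0x20201000; the
 * mapping then agrees with the object's superpage coloring, and every
 * fully covered 4MB span remains promotable.
 */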