/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/i386/i386/pmap.c 287126 2015-08-25 14:39:40Z marcel $");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_pmap.h"
#include "opt_smp.h"
#include "opt_xbox.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#else
#include <sys/cpuset.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#ifdef DEV_APIC
#include <sys/bus.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#endif
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifdef XBOX
#include <machine/xbox.h>
#endif

#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE     extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)      do { x ; } while (0)
#else
#define PV_STAT(x)      do { } while (0)
#endif

#define pa_index(pa)    ((pa) >> PDRSHIFT)
#define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define pmap_pde(m, v)  (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define pdir_pde(m, v)  (m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)         ((*(int *)pte & PG_V) != 0)
#define pmap_pte_w(pte)         ((*(int *)pte & PG_W) != 0)
#define pmap_pte_m(pte)         ((*(int *)pte & PG_M) != 0)
#define pmap_pte_u(pte)         ((*(int *)pte & PG_A) != 0)
#define pmap_pte_v(pte)         ((*(int *)pte & PG_V) != 0)

#define pmap_pte_set_w(pte, v)  ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
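/*
 * For illustration (non-PAE constants assumed: PDRSHIFT == 22, so the
 * upper 10 bits of a virtual address select the page directory entry):
 *
 *      va = 0xc0412345
 *      va >> PDRSHIFT = 0x301  ->  pmap_pde(pmap, va) == &pm_pdir[0x301]
 *
 * The PTE-level macros above then test or set individual PG_* bits in
 * the entry that such a lookup returns.
 */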

struct pmap kernel_pmap_store;
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;

vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
int pgeflag = 0;                /* PG_G or-in */
int pseflag = 0;                /* PG_PS or-in */

static int nkpt = NKPT;
vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
extern u_int32_t KERNend;
extern u_int32_t KPTphys;

#if defined(PAE) || defined(PAE_TABLES)
pt_entry_t pg_nx;
static uma_zone_t pdptzone;
#endif

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

#define PAT_INDEX_SIZE  8
static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */

/*
 * pmap_mapdev support prior to full initialization (i.e., for the console).
 */
#define PMAP_PREINIT_MAPPING_COUNT      8
static struct pmap_preinit_mapping {
        vm_paddr_t      pa;
        vm_offset_t     va;
        vm_size_t       sz;
        int             mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

static struct rwlock_padalign pvh_global_lock;

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table;
static int shpgperproc = PMAP_SHPGPERPROC;

struct pv_chunk *pv_chunkbase;          /* KVA block for pv_chunks */
int pv_maxchunks;                       /* How many chunks we have KVA for */
vm_offset_t pv_vafree;                  /* freelist stored in the PTE */

/*
 * All those kernel PT submaps that BSD is so fond of
 */
struct sysmaps {
        struct mtx lock;
        pt_entry_t *CMAP1;
        pt_entry_t *CMAP2;
        caddr_t CADDR1;
        caddr_t CADDR2;
};
static struct sysmaps sysmaps_pcpu[MAXCPU];
pt_entry_t *CMAP3;
static pd_entry_t *KPTD;
caddr_t ptvmmap = 0;
caddr_t CADDR3;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static pt_entry_t *PMAP1 = 0, *PMAP2;
static pt_entry_t *PADDR1 = 0, *PADDR2;
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
    &PMAP1changedcpu, 0,
    "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
    &PMAP1changed, 0,
    "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
    &PMAP1unchanged, 0,
    "Number of times pmap_pte_quick didn't change PMAP1");
static struct mtx PMAP2mutex;

static void     free_pv_chunk(struct pv_chunk *pc);
static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
                    vm_offset_t va);
static int      pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_flush_page(vm_page_t m);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    struct spglist *free);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
    struct spglist *free);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
    vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
#if defined(PAE) || defined(PAE_TABLES)
static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
#endif
static void pmap_set_pg(void);

static __inline void pagezero(void *page);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * If you get an error here, then you set KVA_PAGES wrong! See the
 * description of KVA_PAGES in sys/i386/include/pmap.h.  It must be a
 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
 */
CTASSERT(KERNBASE % (1 << 24) == 0);

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On the i386 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t firstaddr)
{
        vm_offset_t va;
        pt_entry_t *pte, *unused;
        struct sysmaps *sysmaps;
        int i;

        /*
         * Add a physical memory segment (vm_phys_seg) corresponding to the
         * preallocated kernel page table pages so that vm_page structures
         * representing these pages will be created.  The vm_page structures
         * are required for promotion of the corresponding kernel virtual
         * addresses to superpage mappings.
         */
        vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

        /*
         * Initialize the first available kernel virtual address.  However,
         * using "firstaddr" may waste a few pages of the kernel virtual
         * address space, because locore may not have mapped every physical
         * page that it allocated.  Preferably, locore would provide a first
         * unused virtual address in addition to "firstaddr".
         */
        virtual_avail = (vm_offset_t) KERNBASE + firstaddr;

        virtual_end = VM_MAX_KERNEL_ADDRESS;

        /*
         * Initialize the kernel pmap (which is statically allocated).
         */
        PMAP_LOCK_INIT(kernel_pmap);
        kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
#if defined(PAE) || defined(PAE_TABLES)
        kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
#endif
        CPU_FILL(&kernel_pmap->pm_active);      /* don't allow deactivation */
        TAILQ_INIT(&kernel_pmap->pm_pvchunk);

        /*
         * Initialize the global pv list lock.
         */
        rw_init(&pvh_global_lock, "pmap pv global");

        LIST_INIT(&allpmaps);

        /*
         * Request a spin mutex so that changes to allpmaps cannot be
         * preempted by smp_rendezvous_cpus().  Otherwise,
         * pmap_update_pde_kernel() could access allpmaps while it is
         * being changed.
         */
        mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
        mtx_lock_spin(&allpmaps_lock);
        LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
        mtx_unlock_spin(&allpmaps_lock);

        /*
         * Reserve some special page table entries/VA space for temporary
         * mapping of pages.
         */
#define SYSMAP(c, p, v, n)      \
        v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

        va = virtual_avail;
        pte = vtopte(va);

        /*
         * CMAP1/CMAP2 are used for zeroing and copying pages.
         * CMAP3 is used for the idle process page zeroing.
         */
        for (i = 0; i < MAXCPU; i++) {
                sysmaps = &sysmaps_pcpu[i];
                mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
                SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
                SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
        }
        SYSMAP(caddr_t, CMAP3, CADDR3, 1)

        /*
         * Crashdump maps.
         */
        SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

        /*
         * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
         */
        SYSMAP(caddr_t, unused, ptvmmap, 1)

        /*
         * msgbufp is used to map the system message buffer.
         */
        SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))

        /*
         * KPTmap is used by pmap_kextract().
         *
         * KPTmap is first initialized by locore.  However, that initial
         * KPTmap can only support NKPT page table pages.  Here, a larger
         * KPTmap is created that can support KVA_PAGES page table pages.
         */
        SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)

        for (i = 0; i < NKPT; i++)
                KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;

        /*
         * Adjust the start of the KPTD and KPTmap so that the implementation
         * of pmap_kextract() and pmap_growkernel() can be made simpler.
         */
        KPTD -= KPTDI;
        KPTmap -= i386_btop(KPTDI << PDRSHIFT);

        /*
         * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
         * respectively.
         */
        SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
        SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)

        mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

        virtual_avail = va;

        /*
         * Leave in place an identity mapping (virt == phys) for the low 1 MB
         * physical memory region that is used by the ACPI wakeup code.  This
         * mapping must not have PG_G set.
         */
#ifdef XBOX
        /* FIXME: This is gross, but needed for the XBOX.  Since we are at such
         * an early stage, we cannot yet neatly map video memory ... :-(
         * Better fixes are very welcome! */
        if (!arch_i386_is_xbox)
#endif
                for (i = 1; i < NKPT; i++)
                        PTD[i] = 0;

        /* Initialize the PAT MSR if present. */
        pmap_init_pat();

        /* Turn on PG_G on kernel page(s) */
        pmap_set_pg();
}
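/*
 * For illustration, the SYSMAP() invocations above are plain textual
 * substitution; e.g.,
 *
 *      SYSMAP(caddr_t, CMAP3, CADDR3, 1)
 *
 * expands to
 *
 *      CADDR3 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP3 = pte; pte += (1);
 *
 * pairing one reserved kernel virtual page (CADDR3) with the page table
 * entry (CMAP3) that maps it.
 */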

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
        int pat_table[PAT_INDEX_SIZE];
        uint64_t pat_msr;
        u_long cr0, cr4;
        int i;

        /* Set default PAT index table. */
        for (i = 0; i < PAT_INDEX_SIZE; i++)
                pat_table[i] = -1;
        pat_table[PAT_WRITE_BACK] = 0;
        pat_table[PAT_WRITE_THROUGH] = 1;
        pat_table[PAT_UNCACHEABLE] = 3;
        pat_table[PAT_WRITE_COMBINING] = 3;
        pat_table[PAT_WRITE_PROTECTED] = 3;
        pat_table[PAT_UNCACHED] = 3;

        /* Bail if this CPU doesn't implement PAT. */
        if ((cpu_feature & CPUID_PAT) == 0) {
                for (i = 0; i < PAT_INDEX_SIZE; i++)
                        pat_index[i] = pat_table[i];
                pat_works = 0;
                return;
        }

        /*
         * Due to some Intel errata, we can only safely use the lower 4
         * PAT entries.
         *
         *   Intel Pentium III Processor Specification Update
         * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
         * or Mode C Paging)
         *
         *   Intel Pentium IV Processor Specification Update
         * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
         */
        if (cpu_vendor_id == CPU_VENDOR_INTEL &&
            !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
                pat_works = 0;

        /* Initialize default PAT entries. */
        pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
            PAT_VALUE(1, PAT_WRITE_THROUGH) |
            PAT_VALUE(2, PAT_UNCACHED) |
            PAT_VALUE(3, PAT_UNCACHEABLE) |
            PAT_VALUE(4, PAT_WRITE_BACK) |
            PAT_VALUE(5, PAT_WRITE_THROUGH) |
            PAT_VALUE(6, PAT_UNCACHED) |
            PAT_VALUE(7, PAT_UNCACHEABLE);

        if (pat_works) {
                /*
                 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
                 * Program 5 and 6 as WP and WC.
                 * Leave 4 and 7 as WB and UC.
                 */
                pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
                pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
                    PAT_VALUE(6, PAT_WRITE_COMBINING);
                pat_table[PAT_UNCACHED] = 2;
                pat_table[PAT_WRITE_PROTECTED] = 5;
                pat_table[PAT_WRITE_COMBINING] = 6;
        } else {
                /*
                 * Just replace PAT Index 2 with WC instead of UC-.
                 */
                pat_msr &= ~PAT_MASK(2);
                pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
                pat_table[PAT_WRITE_COMBINING] = 2;
        }

        /* Disable PGE. */
        cr4 = rcr4();
        load_cr4(cr4 & ~CR4_PGE);

        /* Disable caches (CD = 1, NW = 0). */
        cr0 = rcr0();
        load_cr0((cr0 & ~CR0_NW) | CR0_CD);

        /* Flushes caches and TLBs. */
        wbinvd();
        invltlb();

        /* Update PAT and index table. */
        wrmsr(MSR_PAT, pat_msr);
        for (i = 0; i < PAT_INDEX_SIZE; i++)
                pat_index[i] = pat_table[i];

        /* Flush caches and TLBs again. */
        wbinvd();
        invltlb();

        /* Restore caches and PGE. */
        load_cr0(cr0);
        load_cr4(cr4);
}
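/*
 * For illustration, on a CPU where pat_works != 0 the PAT MSR ends up
 * programmed as:
 *
 *      index:  0   1   2    3   4   5   6   7
 *      type:   WB  WT  UC-  UC  WB  WP  WC  UC
 *
 * and pat_index[] maps each VM memory attribute to one of those slots
 * (e.g., PAT_WRITE_COMBINING -> 6).  On CPUs affected by the errata
 * above, only entry 2 is repurposed (UC- becomes WC).
 */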

/*
 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 */
static void
pmap_set_pg(void)
{
        pt_entry_t *pte;
        vm_offset_t va, endva;

        if (pgeflag == 0)
                return;

        endva = KERNBASE + KERNend;

        if (pseflag) {
                va = KERNBASE + KERNLOAD;
                while (va < endva) {
                        pdir_pde(PTD, va) |= pgeflag;
                        invltlb();      /* Play it safe, invltlb() every time */
                        va += NBPDR;
                }
        } else {
                va = (vm_offset_t)btext;
                while (va < endva) {
                        pte = vtopte(va);
                        if (*pte)
                                *pte |= pgeflag;
                        invltlb();      /* Play it safe, invltlb() every time */
                        va += PAGE_SIZE;
                }
        }
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

        TAILQ_INIT(&m->md.pv_list);
        m->md.pat_mode = PAT_WRITE_BACK;
}

#if defined(PAE) || defined(PAE_TABLES)
static void *
pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{

        /* Inform UMA that this allocator uses kernel_map/object. */
        *flags = UMA_SLAB_KERNEL;
        return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
            0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif

/*
 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
 * Requirements:
 *  - Must deal with pages in order to ensure that none of the PG_* bits
 *    are ever set, PG_V in particular.
 *  - Assumes we can write to ptes without pte_store() atomic ops, even
 *    on PAE systems.  This should be ok.
 *  - Assumes nothing will ever test these addresses for 0 to indicate
 *    no mapping instead of correctly checking PG_V.
 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
 * Because PG_V is never set, there can be no mappings to invalidate.
 */
static vm_offset_t
pmap_ptelist_alloc(vm_offset_t *head)
{
        pt_entry_t *pte;
        vm_offset_t va;

        va = *head;
        if (va == 0)
                panic("pmap_ptelist_alloc: exhausted ptelist KVA");
        pte = vtopte(va);
        *head = *pte;
        if (*head & PG_V)
                panic("pmap_ptelist_alloc: va with PG_V set!");
        *pte = 0;
        return (va);
}

static void
pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
{
        pt_entry_t *pte;

        if (va & PG_V)
                panic("pmap_ptelist_free: freeing va with PG_V set!");
        pte = vtopte(va);
        *pte = *head;           /* virtual! PG_V is 0 though */
        *head = va;
}

static void
pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
{
        int i;
        vm_offset_t va;

        *head = 0;
        for (i = npages - 1; i >= 0; i--) {
                va = (vm_offset_t)base + i * PAGE_SIZE;
                pmap_ptelist_free(head, va);
        }
}
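/*
 * For illustration, after pmap_ptelist_init(&head, base, 3) the freelist
 * is threaded through the (invalid) PTEs themselves:
 *
 *      head = base                     (page 0 was freed last)
 *      *vtopte(base)                 = base + PAGE_SIZE
 *      *vtopte(base + PAGE_SIZE)     = base + 2 * PAGE_SIZE
 *      *vtopte(base + 2 * PAGE_SIZE) = 0       (end of list)
 *
 * pmap_ptelist_alloc() simply pops the first va and zeroes its pte.
 * Since PG_V is never set, these entries are never live mappings.
 */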


/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
        struct pmap_preinit_mapping *ppim;
        vm_page_t mpte;
        vm_size_t s;
        int i, pv_npg;

        /*
         * Initialize the vm page array entries for the kernel pmap's
         * page table pages.
         */
        for (i = 0; i < NKPT; i++) {
                mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
                KASSERT(mpte >= vm_page_array &&
                    mpte < &vm_page_array[vm_page_array_size],
                    ("pmap_init: page table page is out of range"));
                mpte->pindex = i + KPTDI;
                mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
        }

        /*
         * Initialize the address space (zone) for the pv entries.  Set a
         * high water mark so that the system can recover from excessive
         * numbers of pv entries.
         */
        TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
        pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
        TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
        pv_entry_max = roundup(pv_entry_max, _NPCPV);
        pv_entry_high_water = 9 * (pv_entry_max / 10);

        /*
         * If the kernel is running on a virtual machine, then it must assume
         * that MCA is enabled by the hypervisor.  Moreover, the kernel must
         * be prepared for the hypervisor changing the vendor and family that
         * are reported by CPUID.  Consequently, the workaround for AMD Family
         * 10h Erratum 383 is enabled if the processor's feature set does not
         * include at least one feature that is only supported by older Intel
         * or newer AMD processors.
         */
        if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
            (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
            CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
            AMDID2_FMA4)) == 0)
                workaround_erratum383 = 1;

        /*
         * Are large page mappings supported and enabled?
         */
        TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
        if (pseflag == 0)
                pg_ps_enabled = 0;
        else if (pg_ps_enabled) {
                KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
                    ("pmap_init: can't assign to pagesizes[1]"));
                pagesizes[1] = NBPDR;
        }

        /*
         * Calculate the size of the pv head table for superpages.
         * Handle the possibility that "vm_phys_segs[...].end" is zero.
         */
        pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
            PAGE_SIZE) / NBPDR + 1;

        /*
         * Allocate memory for the pv head table for superpages.
         */
        s = (vm_size_t)(pv_npg * sizeof(struct md_page));
        s = round_page(s);
        pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
            M_WAITOK | M_ZERO);
        for (i = 0; i < pv_npg; i++)
                TAILQ_INIT(&pv_table[i].pv_list);

        pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
        pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
        if (pv_chunkbase == NULL)
                panic("pmap_init: not enough kvm for pv chunks");
        pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
#if defined(PAE) || defined(PAE_TABLES)
        pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
            NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
            UMA_ZONE_VM | UMA_ZONE_NOFREE);
        uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif

        pmap_initialized = 1;
        if (!bootverbose)
                return;
        for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
                ppim = pmap_preinit_mapping + i;
                if (ppim->va == 0)
                        continue;
                printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
                    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
        }
}


SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
    "Max number of PV entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
    "Page share factor per proc");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2/4MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2/4MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2/4MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2/4MB page promotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(int mode, boolean_t is_pde)
{
        int cache_bits, pat_flag, pat_idx;

        if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
                panic("Unknown caching mode %d\n", mode);

        /* The PAT bit is different for PTE's and PDE's. */
        pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

        /* Map the caching mode to a PAT index. */
        pat_idx = pat_index[mode];

        /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
        cache_bits = 0;
        if (pat_idx & 0x4)
                cache_bits |= pat_flag;
        if (pat_idx & 0x2)
                cache_bits |= PG_NC_PCD;
        if (pat_idx & 0x1)
                cache_bits |= PG_NC_PWT;
        return (cache_bits);
}
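/*
 * For illustration, with the pat_works layout programmed by
 * pmap_init_pat() above, PAT_WRITE_COMBINING maps to pat_idx 6
 * (binary 110), so
 *
 *      pmap_cache_bits(PAT_WRITE_COMBINING, 0)
 *
 * returns PG_PTE_PAT | PG_NC_PCD: bit 2 selects the PAT flag and
 * bit 1 selects PCD, together encoding PAT index 6 in the PTE.
 */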

/*
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
        pd_entry_t *pde;
        pmap_t pmap;
        boolean_t PTD_updated;

        PTD_updated = FALSE;
        mtx_lock_spin(&allpmaps_lock);
        LIST_FOREACH(pmap, &allpmaps, pm_list) {
                if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
                    PG_FRAME))
                        PTD_updated = TRUE;
                pde = pmap_pde(pmap, va);
                pde_store(pde, newpde);
        }
        mtx_unlock_spin(&allpmaps_lock);
        KASSERT(PTD_updated,
            ("pmap_kenter_pde: current page table is not in allpmaps"));
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
        u_long cr4;

        if ((newpde & PG_PS) == 0)
                /* Demotion: flush a specific 2MB page mapping. */
                invlpg(va);
        else if ((newpde & PG_G) == 0)
                /*
                 * Promotion: flush every 4KB page mapping from the TLB
                 * because there are too many to flush individually.
                 */
                invltlb();
        else {
                /*
                 * Promotion: flush every 4KB page mapping from the TLB,
                 * including any global (PG_G) mappings.
                 */
                cr4 = rcr4();
                load_cr4(cr4 & ~CR4_PGE);
                /*
                 * Although preemption at this point could be detrimental to
                 * performance, it would not lead to an error.  PG_G is simply
                 * ignored if CR4.PGE is clear.  Moreover, in case this block
                 * is re-entered, the load_cr4() either above or below will
                 * modify CR4.PGE flushing the TLB.
                 */
                load_cr4(cr4 | CR4_PGE);
        }
}
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
        cpuset_t other_cpus;
        u_int cpuid;

        sched_pin();
        if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
                invlpg(va);
                smp_invlpg(va);
        } else {
                cpuid = PCPU_GET(cpuid);
                other_cpus = all_cpus;
                CPU_CLR(cpuid, &other_cpus);
                if (CPU_ISSET(cpuid, &pmap->pm_active))
                        invlpg(va);
                CPU_AND(&other_cpus, &pmap->pm_active);
                if (!CPU_EMPTY(&other_cpus))
                        smp_masked_invlpg(other_cpus, va);
        }
        sched_unpin();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        cpuset_t other_cpus;
        vm_offset_t addr;
        u_int cpuid;

        sched_pin();
        if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
                for (addr = sva; addr < eva; addr += PAGE_SIZE)
                        invlpg(addr);
                smp_invlpg_range(sva, eva);
        } else {
                cpuid = PCPU_GET(cpuid);
                other_cpus = all_cpus;
                CPU_CLR(cpuid, &other_cpus);
                if (CPU_ISSET(cpuid, &pmap->pm_active))
                        for (addr = sva; addr < eva; addr += PAGE_SIZE)
                                invlpg(addr);
                CPU_AND(&other_cpus, &pmap->pm_active);
                if (!CPU_EMPTY(&other_cpus))
                        smp_masked_invlpg_range(other_cpus, sva, eva);
        }
        sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
        cpuset_t other_cpus;
        u_int cpuid;

        sched_pin();
        if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
                invltlb();
                smp_invltlb();
        } else {
                cpuid = PCPU_GET(cpuid);
                other_cpus = all_cpus;
                CPU_CLR(cpuid, &other_cpus);
                if (CPU_ISSET(cpuid, &pmap->pm_active))
                        invltlb();
                CPU_AND(&other_cpus, &pmap->pm_active);
                if (!CPU_EMPTY(&other_cpus))
                        smp_masked_invltlb(other_cpus);
        }
        sched_unpin();
}
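/*
 * For illustration, the pattern used throughout this file (hypothetical
 * caller sketch) is: update the page table entry first, then shoot down
 * stale TLB entries, so that the invariants above hold:
 *
 *      pte_store(pte, VM_PAGE_TO_PHYS(m) | PG_V | PG_RW);
 *      pmap_invalidate_page(pmap, va);     (IPIs other CPUs as needed)
 */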

void
pmap_invalidate_cache(void)
{

        sched_pin();
        wbinvd();
        smp_cache_flush();
        sched_unpin();
}

struct pde_action {
        cpuset_t invalidate;    /* processors that invalidate their TLB */
        vm_offset_t va;
        pd_entry_t *pde;
        pd_entry_t newpde;
        u_int store;            /* processor that updates the PDE */
};

static void
pmap_update_pde_kernel(void *arg)
{
        struct pde_action *act = arg;
        pd_entry_t *pde;
        pmap_t pmap;

        if (act->store == PCPU_GET(cpuid)) {

                /*
                 * Elsewhere, this operation requires allpmaps_lock for
                 * synchronization.  Here, it does not because it is being
                 * performed in the context of an all_cpus rendezvous.
                 */
                LIST_FOREACH(pmap, &allpmaps, pm_list) {
                        pde = pmap_pde(pmap, act->va);
                        pde_store(pde, act->newpde);
                }
        }
}

static void
pmap_update_pde_user(void *arg)
{
        struct pde_action *act = arg;

        if (act->store == PCPU_GET(cpuid))
                pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
        struct pde_action *act = arg;

        if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
                pmap_update_pde_invalidate(act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
        struct pde_action act;
        cpuset_t active, other_cpus;
        u_int cpuid;

        sched_pin();
        cpuid = PCPU_GET(cpuid);
        other_cpus = all_cpus;
        CPU_CLR(cpuid, &other_cpus);
        if (pmap == kernel_pmap)
                active = all_cpus;
        else
                active = pmap->pm_active;
        if (CPU_OVERLAP(&active, &other_cpus)) {
                act.store = cpuid;
                act.invalidate = active;
                act.va = va;
                act.pde = pde;
                act.newpde = newpde;
                CPU_SET(cpuid, &active);
                smp_rendezvous_cpus(active,
                    smp_no_rendevous_barrier, pmap == kernel_pmap ?
                    pmap_update_pde_kernel : pmap_update_pde_user,
                    pmap_update_pde_teardown, &act);
        } else {
                if (pmap == kernel_pmap)
                        pmap_kenter_pde(va, newpde);
                else
                        pde_store(pde, newpde);
                if (CPU_ISSET(cpuid, &active))
                        pmap_update_pde_invalidate(va, newpde);
        }
        sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

        if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
                invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        vm_offset_t addr;

        if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
                for (addr = sva; addr < eva; addr += PAGE_SIZE)
                        invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

        if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
                invltlb();
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

        wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

        if (pmap == kernel_pmap)
                pmap_kenter_pde(va, newpde);
        else
                pde_store(pde, newpde);
        if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
                pmap_update_pde_invalidate(va, newpde);
}
#endif /* !SMP */

#define PMAP_CLFLUSH_THRESHOLD  (2 * 1024 * 1024)

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
{

        if (force) {
                sva &= ~(vm_offset_t)cpu_clflush_line_size;
        } else {
                KASSERT((sva & PAGE_MASK) == 0,
                    ("pmap_invalidate_cache_range: sva not page-aligned"));
                KASSERT((eva & PAGE_MASK) == 0,
                    ("pmap_invalidate_cache_range: eva not page-aligned"));
        }

        if ((cpu_feature & CPUID_SS) != 0 && !force)
                ; /* If "Self Snoop" is supported and allowed, do nothing. */
        else if ((cpu_feature & CPUID_CLFSH) != 0 &&
            eva - sva < PMAP_CLFLUSH_THRESHOLD) {

#ifdef DEV_APIC
                /*
                 * XXX: Some CPUs fault, hang, or trash the local APIC
                 * registers if we use CLFLUSH on the local APIC
                 * range.  The local APIC is always uncached, so we
                 * don't need to flush for that range anyway.
                 */
                if (pmap_kextract(sva) == lapic_paddr)
                        return;
#endif
                /*
                 * Otherwise, do per-cache line flush.  Use the mfence
                 * instruction to ensure that previous stores are
                 * included in the write-back.  The processor
                 * propagates flush to other processors in the cache
                 * coherence domain.
                 */
                mfence();
                for (; sva < eva; sva += cpu_clflush_line_size)
                        clflush(sva);
                mfence();
        } else {

                /*
                 * No targeted cache flush methods are supported by CPU,
                 * or the supplied range is bigger than 2MB.
                 * Globally invalidate cache.
                 */
                pmap_invalidate_cache();
        }
}

void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
        int i;

        if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
            (cpu_feature & CPUID_CLFSH) == 0) {
                pmap_invalidate_cache();
        } else {
                for (i = 0; i < count; i++)
                        pmap_flush_page(pages[i]);
        }
}

/*
 * Are we current address space or kernel?  N.B. We return FALSE when
 * a pmap's page table is in use because a kernel thread is borrowing
 * it.  The borrowed page table can change spontaneously, making any
 * dependence on its continued use subject to a race condition.
 */
static __inline int
pmap_is_current(pmap_t pmap)
{

        return (pmap == kernel_pmap ||
            (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
            (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
}

/*
 * If the given pmap is not the current or kernel pmap, the returned pte must
 * be released by passing it to pmap_pte_release().
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
        pd_entry_t newpf;
        pd_entry_t *pde;

        pde = pmap_pde(pmap, va);
        if (*pde & PG_PS)
                return (pde);
        if (*pde != 0) {
                /* are we current address space or kernel? */
                if (pmap_is_current(pmap))
                        return (vtopte(va));
                mtx_lock(&PMAP2mutex);
                newpf = *pde & PG_FRAME;
                if ((*PMAP2 & PG_FRAME) != newpf) {
                        *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
                        pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
                }
                return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
        }
        return (NULL);
}

/*
 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 * being NULL.
 */
static __inline void
pmap_pte_release(pt_entry_t *pte)
{

        if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
                mtx_unlock(&PMAP2mutex);
}
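/*
 * For illustration, the canonical pmap_pte()/pmap_pte_release() pairing
 * (used by pmap_extract() below) is:
 *
 *      pte = pmap_pte(pmap, va);
 *      if (pte != NULL) {
 *              pa = *pte & PG_FRAME;
 *              pmap_pte_release(pte);
 *      }
 *
 * The release is a no-op unless the lookup went through the shared
 * PMAP2/PADDR2 window, in which case it drops PMAP2mutex.
 */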

/*
 * NB:  The sequence of updating a page table followed by accesses to the
 * corresponding pages is subject to the situation described in the "AMD64
 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
 * right after modifying the PTE bits is crucial.
 */
static __inline void
invlcaddr(void *caddr)
{

        invlpg((u_int)caddr);
}

/*
 * Super fast pmap_pte routine best used when scanning
 * the pv lists.  This eliminates many coarse-grained
 * invltlb calls.  Note that many of the pv list
 * scans are across different pmaps.  It is very wasteful
 * to do an entire invltlb for checking a single mapping.
 *
 * If the given pmap is not the current pmap, pvh_global_lock
 * must be held and curthread pinned to a CPU.
 */
static pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
        pd_entry_t newpf;
        pd_entry_t *pde;

        pde = pmap_pde(pmap, va);
        if (*pde & PG_PS)
                return (pde);
        if (*pde != 0) {
                /* are we current address space or kernel? */
                if (pmap_is_current(pmap))
                        return (vtopte(va));
                rw_assert(&pvh_global_lock, RA_WLOCKED);
                KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
                newpf = *pde & PG_FRAME;
                if ((*PMAP1 & PG_FRAME) != newpf) {
                        *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
                        PMAP1cpu = PCPU_GET(cpuid);
#endif
                        invlcaddr(PADDR1);
                        PMAP1changed++;
                } else
#ifdef SMP
                if (PMAP1cpu != PCPU_GET(cpuid)) {
                        PMAP1cpu = PCPU_GET(cpuid);
                        invlcaddr(PADDR1);
                        PMAP1changedcpu++;
                } else
#endif
                        PMAP1unchanged++;
                return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
        }
        return (0);
}

/*
 * Routine:     pmap_extract
 * Function:
 *      Extract the physical page address associated
 *      with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
        vm_paddr_t rtval;
        pt_entry_t *pte;
        pd_entry_t pde;

        rtval = 0;
        PMAP_LOCK(pmap);
        pde = pmap->pm_pdir[va >> PDRSHIFT];
        if (pde != 0) {
                if ((pde & PG_PS) != 0)
                        rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
                else {
                        pte = pmap_pte(pmap, va);
                        rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
                        pmap_pte_release(pte);
                }
        }
        PMAP_UNLOCK(pmap);
        return (rtval);
}

/*
 * Routine:     pmap_extract_and_hold
 * Function:
 *      Atomically extract and hold the physical page
 *      with the given pmap and virtual address pair
 *      if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
        pd_entry_t pde;
        pt_entry_t pte, *ptep;
        vm_page_t m;
        vm_paddr_t pa;

        pa = 0;
        m = NULL;
        PMAP_LOCK(pmap);
retry:
        pde = *pmap_pde(pmap, va);
        if (pde != 0) {
                if (pde & PG_PS) {
                        if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
                                if (vm_page_pa_tryrelock(pmap, (pde &
                                    PG_PS_FRAME) | (va & PDRMASK), &pa))
                                        goto retry;
                                m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
                                    (va & PDRMASK));
                                vm_page_hold(m);
                        }
                } else {
                        ptep = pmap_pte(pmap, va);
                        pte = *ptep;
                        pmap_pte_release(ptep);
                        if (pte != 0 &&
                            ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
                                if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
                                    &pa))
                                        goto retry;
                                m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
                                vm_page_hold(m);
                        }
                }
        }
        PA_UNLOCK_COND(pa);
        PMAP_UNLOCK(pmap);
        return (m);
}
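/*
 * For illustration, a hypothetical caller that needs a stable page behind
 * a user address would do:
 *
 *      m = pmap_extract_and_hold(pmap, va, VM_PROT_WRITE);
 *      if (m != NULL) {
 *              ... use the page; the hold keeps it from being freed ...
 *              vm_page_lock(m);
 *              vm_page_unhold(m);
 *              vm_page_unlock(m);
 *      }
 *
 * NULL means there is no mapping at va or it lacks the requested
 * protection.
 */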

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
        pt_entry_t *pte;

        pte = vtopte(va);
        pte_store(pte, pa | PG_RW | PG_V | pgeflag);
}

static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
        pt_entry_t *pte;

        pte = vtopte(va);
        pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
        pt_entry_t *pte;

        pte = vtopte(va);
        pte_clear(pte);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.  Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.  Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
        vm_offset_t va, sva;
        vm_paddr_t superpage_offset;
        pd_entry_t newpde;

        va = *virt;
        /*
         * Does the physical address range's size and alignment permit at
         * least one superpage mapping to be created?
         */
        superpage_offset = start & PDRMASK;
        if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
                /*
                 * Increase the starting virtual address so that its alignment
                 * does not preclude the use of superpage mappings.
                 */
                if ((va & PDRMASK) < superpage_offset)
                        va = (va & ~PDRMASK) + superpage_offset;
                else if ((va & PDRMASK) > superpage_offset)
                        va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
        }
        sva = va;
        while (start < end) {
                if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
                    pseflag) {
                        KASSERT((va & PDRMASK) == 0,
                            ("pmap_map: misaligned va %#x", va));
                        newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
                        pmap_kenter_pde(va, newpde);
                        va += NBPDR;
                        start += NBPDR;
                } else {
                        pmap_kenter(va, start);
                        va += PAGE_SIZE;
                        start += PAGE_SIZE;
                }
        }
        pmap_invalidate_range(kernel_pmap, sva, va);
        *virt = va;
        return (sva);
}


/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
        pt_entry_t *endpte, oldpte, pa, *pte;
        vm_page_t m;

        oldpte = 0;
        pte = vtopte(sva);
        endpte = pte + count;
        while (pte < endpte) {
                m = *ma++;
                pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
                if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
                        oldpte |= *pte;
                        pte_store(pte, pa | pgeflag | PG_RW | PG_V);
                }
                pte++;
        }
        if (__predict_false((oldpte & PG_V) != 0))
                pmap_invalidate_range(kernel_pmap, sva, sva + count *
                    PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
        vm_offset_t va;

        va = sva;
        while (count-- > 0) {
                pmap_kremove(va);
                va += PAGE_SIZE;
        }
        pmap_invalidate_range(kernel_pmap, sva, va);
}
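/*
 * For illustration, a hypothetical temporary mapping of a single page
 * (the pattern used by sf_buf and the crashdump map) looks like:
 *
 *      pmap_qenter(kva, &m, 1);        kva now maps VM_PAGE_TO_PHYS(m)
 *      ... access the page through kva ...
 *      pmap_qremove(kva, 1);           tears the mapping back out
 *
 * where "kva" is a previously reserved kernel virtual page, e.g. from
 * kva_alloc(PAGE_SIZE).
 */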
1619 ***************************************************/ 1620static __inline void 1621pmap_free_zero_pages(struct spglist *free) 1622{ 1623 vm_page_t m; 1624 1625 while ((m = SLIST_FIRST(free)) != NULL) { 1626 SLIST_REMOVE_HEAD(free, plinks.s.ss); 1627 /* Preserve the page's PG_ZERO setting. */ 1628 vm_page_free_toq(m); 1629 } 1630} 1631 1632/* 1633 * Schedule the specified unused page table page to be freed. Specifically, 1634 * add the page to the specified list of pages that will be released to the 1635 * physical memory manager after the TLB has been updated. 1636 */ 1637static __inline void 1638pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1639 boolean_t set_PG_ZERO) 1640{ 1641 1642 if (set_PG_ZERO) 1643 m->flags |= PG_ZERO; 1644 else 1645 m->flags &= ~PG_ZERO; 1646 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1647} 1648 1649/* 1650 * Inserts the specified page table page into the specified pmap's collection 1651 * of idle page table pages. Each of a pmap's page table pages is responsible 1652 * for mapping a distinct range of virtual addresses. The pmap's collection is 1653 * ordered by this virtual address range. 1654 */ 1655static __inline int 1656pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 1657{ 1658 1659 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1660 return (vm_radix_insert(&pmap->pm_root, mpte)); 1661} 1662 1663/* 1664 * Looks for a page table page mapping the specified virtual address in the 1665 * specified pmap's collection of idle page table pages. Returns NULL if there 1666 * is no page table page corresponding to the specified virtual address. 1667 */ 1668static __inline vm_page_t 1669pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 1670{ 1671 1672 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1673 return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT)); 1674} 1675 1676/* 1677 * Removes the specified page table page from the specified pmap's collection 1678 * of idle page table pages. The specified page table page must be a member of 1679 * the pmap's collection. 1680 */ 1681static __inline void 1682pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 1683{ 1684 1685 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1686 vm_radix_remove(&pmap->pm_root, mpte->pindex); 1687} 1688 1689/* 1690 * Decrements a page table page's wire count, which is used to record the 1691 * number of valid page table entries within the page. If the wire count 1692 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1693 * page table page was unmapped and FALSE otherwise. 1694 */ 1695static inline boolean_t 1696pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1697{ 1698 1699 --m->wire_count; 1700 if (m->wire_count == 0) { 1701 _pmap_unwire_ptp(pmap, m, free); 1702 return (TRUE); 1703 } else 1704 return (FALSE); 1705} 1706 1707static void 1708_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1709{ 1710 vm_offset_t pteva; 1711 1712 /* 1713 * unmap the page table page 1714 */ 1715 pmap->pm_pdir[m->pindex] = 0; 1716 --pmap->pm_stats.resident_count; 1717 1718 /* 1719 * This is a release store so that the ordinary store unmapping 1720 * the page table page is globally performed before TLB shoot- 1721 * down is begun. 1722 */ 1723 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 1724 1725 /* 1726 * Do an invltlb to make the invalidated mapping 1727 * take effect immediately. 

	/*
	 * Do an invltlb to make the invalidated mapping
	 * take effect immediately.
	 */
	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
	pmap_invalidate_page(pmap, pteva);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
{
	pd_entry_t ptepde;
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	ptepde = *pmap_pde(pmap, va);
	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
	return (pmap_unwire_ptp(pmap, mpte, free));
}

/*
 * Initialize the pmap for the swapper process.
 */
void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	/*
	 * Since the page table directory is shared with the kernel pmap,
	 * which is already included in the list "allpmaps", this pmap does
	 * not need to be inserted into that list.
	 */
	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
#if defined(PAE) || defined(PAE_TABLES)
	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
#endif
	pmap->pm_root.rt_root = 0;
	CPU_ZERO(&pmap->pm_active);
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
int
pmap_pinit(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	vm_paddr_t pa;
	int i;

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pdir == NULL) {
		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
		if (pmap->pm_pdir == NULL)
			return (0);
#if defined(PAE) || defined(PAE_TABLES)
		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
		KASSERT(((vm_offset_t)pmap->pm_pdpt &
		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
		    ("pmap_pinit: pdpt misaligned"));
		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
		    ("pmap_pinit: pdpt above 4g"));
#endif
		pmap->pm_root.rt_root = 0;
	}
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_pinit: pmap has reserved page table page(s)"));

	/*
	 * allocate the page directory page(s)
	 */
	for (i = 0; i < NPGPTD;) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
		if (m == NULL)
			VM_WAIT;
		else {
			ptdpg[i++] = m;
		}
	}

	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);

	for (i = 0; i < NPGPTD; i++)
		if ((ptdpg[i]->flags & PG_ZERO) == 0)
			pagezero(pmap->pm_pdir + (i * NPDEPG));

	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	/* Copy the kernel page table directory entries. */
	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
	mtx_unlock_spin(&allpmaps_lock);

	/* install self-referential address mapping entry(s) */
	for (i = 0; i < NPGPTD; i++) {
		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
#if defined(PAE) || defined(PAE_TABLES)
		pmap->pm_pdpt[i] = pa | PG_V;
#endif
	}

	CPU_ZERO(&pmap->pm_active);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

	return (1);
}
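/*
 * Editorial note (sketch, not from the original source): the
 * "self-referential" entries installed above are what make vtopte()
 * work.  Each PDE slot PTDPTDI + i points back at one of the pmap's own
 * page directory pages, so the page tables themselves appear in the
 * pmap's address space as a contiguous virtual array (PTmap) and the
 * page directory appears as PTD.  Assuming the standard i386 layout,
 * the address arithmetic reduces to:
 *
 *	pt_entry_t *
 *	vtopte_sketch(vm_offset_t va)
 *	{
 *		return (PTmap + i386_btop(va));	// one PTE per 4 KB page
 *	}
 *
 * This is also why _pmap_unwire_ptp() can invalidate a page table
 * page's mapping at VM_MAXUSER_ADDRESS + i386_ptob(m->pindex): the
 * recursive window begins immediately above the user address space.
 */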
/*
 * this routine is called if the page table page is not
 * mapped correctly.
 */
static vm_page_t
_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
{
	vm_paddr_t ptepa;
	vm_page_t m;

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
			PMAP_UNLOCK(pmap);
			rw_wunlock(&pvh_global_lock);
			VM_WAIT;
			rw_wlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	pmap->pm_stats.resident_count++;

	ptepa = VM_PAGE_TO_PHYS(m);
	pmap->pm_pdir[ptepindex] =
	    (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);

	return (m);
}

static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
{
	u_int ptepindex;
	pd_entry_t ptepa;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = va >> PDRSHIFT;
retry:
	/*
	 * Get the page directory entry
	 */
	ptepa = pmap->pm_pdir[ptepindex];

	/*
	 * This supports switching from a 4MB page to a
	 * normal 4K page.
	 */
	if (ptepa & PG_PS) {
		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
		ptepa = pmap->pm_pdir[ptepindex];
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (ptepa) {
		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has
		 * been deallocated.
		 */
		m = _pmap_allocpte(pmap, ptepindex, flags);
		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
			goto retry;
	}
	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

#ifdef SMP
/*
 * Deal with a SMP shootdown of other users of the pmap that we are
 * trying to dispose of.  This can be a bit hairy.
 */
static cpuset_t *lazymask;
static u_int lazyptd;
static volatile u_int lazywait;

void pmap_lazyfix_action(void);

void
pmap_lazyfix_action(void)
{

#ifdef COUNT_IPIS
	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
#endif
	if (rcr3() == lazyptd)
		load_cr3(curpcb->pcb_cr3);
	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
	atomic_store_rel_int(&lazywait, 1);
}

static void
pmap_lazyfix_self(u_int cpuid)
{

	if (rcr3() == lazyptd)
		load_cr3(curpcb->pcb_cr3);
	CPU_CLR_ATOMIC(cpuid, lazymask);
}

static void
pmap_lazyfix(pmap_t pmap)
{
	cpuset_t mymask, mask;
	u_int cpuid, spins;
	int lsb;

	mask = pmap->pm_active;
	while (!CPU_EMPTY(&mask)) {
		spins = 50000000;

		/* Find least significant set bit. */
		lsb = CPU_FFS(&mask);
		MPASS(lsb != 0);
		lsb--;
		CPU_SETOF(lsb, &mask);
		mtx_lock_spin(&smp_ipi_mtx);
#if defined(PAE) || defined(PAE_TABLES)
		lazyptd = vtophys(pmap->pm_pdpt);
#else
		lazyptd = vtophys(pmap->pm_pdir);
#endif
		cpuid = PCPU_GET(cpuid);

		/* Use a cpuset just for having an easy check. */
		CPU_SETOF(cpuid, &mymask);
		if (!CPU_CMP(&mask, &mymask)) {
			lazymask = &pmap->pm_active;
			pmap_lazyfix_self(cpuid);
		} else {
			atomic_store_rel_int((u_int *)&lazymask,
			    (u_int)&pmap->pm_active);
			atomic_store_rel_int(&lazywait, 0);
			ipi_selected(mask, IPI_LAZYPMAP);
			while (lazywait == 0) {
				ia32_pause();
				if (--spins == 0)
					break;
			}
		}
		mtx_unlock_spin(&smp_ipi_mtx);
		if (spins == 0)
			printf("pmap_lazyfix: spun for 50000000\n");
		mask = pmap->pm_active;
	}
}
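/*
 * Editorial note (not from the original source): the handshake above
 * works roughly as follows.  For each CPU that still has this pmap
 * active, the initiator publishes the pmap's %cr3 value in 'lazyptd'
 * and a pointer to pm_active in 'lazymask', clears 'lazywait', and
 * sends an IPI_LAZYPMAP to that single CPU.  The target compares its
 * %cr3 against 'lazyptd'; if it is still running on the dying pmap it
 * reloads %cr3 from its pcb, then clears its bit in 'lazymask' and
 * sets 'lazywait' so the initiator can stop spinning.  The bounded
 * spin (50000000 iterations) keeps a lost IPI from hanging the caller
 * forever.
 */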
#else /* SMP */

/*
 * Cleaning up on uniprocessor is easy.  For various reasons, we're
 * unlikely to have to even execute this code, including the fact
 * that the cleanup is deferred until the parent does a wait(2), which
 * means that another userland process has run.
 */
static void
pmap_lazyfix(pmap_t pmap)
{
	u_int cr3;

	cr3 = vtophys(pmap->pm_pdir);
	if (cr3 == rcr3()) {
		load_cr3(curpcb->pcb_cr3);
		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
	}
}
#endif /* SMP */

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	int i;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_release: pmap has reserved page table page(s)"));

	pmap_lazyfix(pmap);
	mtx_lock_spin(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	for (i = 0; i < NPGPTD; i++)
		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
		    PG_FRAME);

	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
	    sizeof(*pmap->pm_pdir));

	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);

	for (i = 0; i < NPGPTD; i++) {
		m = ptdpg[i];
#if defined(PAE) || defined(PAE_TABLES)
		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
		    ("pmap_release: got wrong ptd page"));
#endif
		m->wire_count--;
		atomic_subtract_int(&cnt.v_wire_count, 1);
		vm_page_free_zero(m);
	}
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;

	return (sysctl_handle_long(oidp, &ksize, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "IU", "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return (sysctl_handle_long(oidp, &kfree, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "IU", "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t ptppaddr;
	vm_page_t nkpg;
	pd_entry_t newpdir;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	addr = roundup2(addr, NBPDR);
	if (addr - 1 >= kernel_map->max_offset)
		addr = kernel_map->max_offset;
	while (kernel_vm_end < addr) {
		if (pdir_pde(PTD, kernel_vm_end)) {
			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
				kernel_vm_end = kernel_map->max_offset;
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");

		nkpt++;

		if ((nkpg->flags & PG_ZERO) == 0)
			pmap_zero_page(nkpg);
		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;

		pmap_kenter_pde(kernel_vm_end, newpdir);
		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
			kernel_vm_end = kernel_map->max_offset;
			break;
		}
	}
}
2164 ***************************************************/ 2165 2166CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2167CTASSERT(_NPCM == 11); 2168CTASSERT(_NPCPV == 336); 2169 2170static __inline struct pv_chunk * 2171pv_to_chunk(pv_entry_t pv) 2172{ 2173 2174 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2175} 2176 2177#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2178 2179#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2180#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2181 2182static const uint32_t pc_freemask[_NPCM] = { 2183 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2184 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2185 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2186 PC_FREE0_9, PC_FREE10 2187}; 2188 2189SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2190 "Current number of pv entries"); 2191 2192#ifdef PV_STATS 2193static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2194 2195SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2196 "Current number of pv entry chunks"); 2197SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2198 "Current number of pv entry chunks allocated"); 2199SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2200 "Current number of pv entry chunks frees"); 2201SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2202 "Number of times tried to get a chunk page but failed."); 2203 2204static long pv_entry_frees, pv_entry_allocs; 2205static int pv_entry_spare; 2206 2207SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2208 "Current number of pv entry frees"); 2209SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2210 "Current number of pv entry allocs"); 2211SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2212 "Current number of spare pv entries"); 2213#endif 2214 2215/* 2216 * We are in a serious low memory condition. Resort to 2217 * drastic measures to free some pages so we can allocate 2218 * another pv entry chunk. 2219 */ 2220static vm_page_t 2221pmap_pv_reclaim(pmap_t locked_pmap) 2222{ 2223 struct pch newtail; 2224 struct pv_chunk *pc; 2225 struct md_page *pvh; 2226 pd_entry_t *pde; 2227 pmap_t pmap; 2228 pt_entry_t *pte, tpte; 2229 pv_entry_t pv; 2230 vm_offset_t va; 2231 vm_page_t m, m_pc; 2232 struct spglist free; 2233 uint32_t inuse; 2234 int bit, field, freed; 2235 2236 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2237 pmap = NULL; 2238 m_pc = NULL; 2239 SLIST_INIT(&free); 2240 TAILQ_INIT(&newtail); 2241 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2242 SLIST_EMPTY(&free))) { 2243 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2244 if (pmap != pc->pc_pmap) { 2245 if (pmap != NULL) { 2246 pmap_invalidate_all(pmap); 2247 if (pmap != locked_pmap) 2248 PMAP_UNLOCK(pmap); 2249 } 2250 pmap = pc->pc_pmap; 2251 /* Avoid deadlock and lock recursion. */ 2252 if (pmap > locked_pmap) 2253 PMAP_LOCK(pmap); 2254 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2255 pmap = NULL; 2256 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2257 continue; 2258 } 2259 } 2260 2261 /* 2262 * Destroy every non-wired, 4 KB page mapping in the chunk. 
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");

#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 */
static vm_page_t
pmap_pv_reclaim(pmap_t locked_pmap)
{
	struct pch newtail;
	struct pv_chunk *pc;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint32_t inuse;
	int bit, field, freed;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	TAILQ_INIT(&newtail);
	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
	    SLIST_EMPTY(&free))) {
		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
		if (pmap != pc->pc_pmap) {
			if (pmap != NULL) {
				pmap_invalidate_all(pmap);
				if (pmap != locked_pmap)
					PMAP_UNLOCK(pmap);
			}
			pmap = pc->pc_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap)
				PMAP_LOCK(pmap);
			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
				pmap = NULL;
				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
				continue;
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = bsfl(inuse);
				pv = &pc->pc_pventry[field * 32 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va);
				if ((*pde & PG_PS) != 0)
					continue;
				pte = pmap_pte(pmap, va);
				tpte = *pte;
				if ((tpte & PG_W) == 0)
					tpte = pte_load_clear(pte);
				pmap_pte_release(pte);
				if ((tpte & PG_W) != 0)
					continue;
				KASSERT(tpte != 0,
				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
				    pmap, va));
				if ((tpte & PG_G) != 0)
					pmap_invalidate_page(pmap, va);
				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
					vm_page_dirty(m);
				if ((tpte & PG_A) != 0)
					vm_page_aflag_set(m, PGA_REFERENCED);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, &free);
				freed++;
			}
		}
		if (freed == 0) {
			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
			continue;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap->pm_stats.resident_count -= freed;
		PV_STAT(pv_entry_frees += freed);
		PV_STAT(pv_entry_spare += freed);
		pv_entry_count -= freed;
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		for (field = 0; field < _NPCM; field++)
			if (pc->pc_map[field] != pc_freemask[field]) {
				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
				    pc_list);
				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);

				/*
				 * One freed pv entry in locked_pmap is
				 * sufficient.
				 */
				if (pmap == locked_pmap)
					goto out;
				break;
			}
		if (field == _NPCM) {
			PV_STAT(pv_entry_spare -= _NPCPV);
			PV_STAT(pc_chunk_count--);
			PV_STAT(pc_chunk_frees++);
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
			pmap_qremove((vm_offset_t)pc, 1);
			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
			break;
		}
	}
out:
	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
	if (pmap != NULL) {
		pmap_invalidate_all(pmap);
		if (pmap != locked_pmap)
			PMAP_UNLOCK(pmap);
	}
	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->wire_count = 1;
		atomic_add_int(&cnt.v_wire_count, 1);
	}
	pmap_free_zero_pages(&free);
	return (m_pc);
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_frees++);
	PV_STAT(pv_entry_spare++);
	pv_entry_count--;
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 32;
	bit = idx % 32;
	pc->pc_map[field] |= 1ul << bit;
	for (idx = 0; idx < _NPCM; idx++)
		if (pc->pc_map[idx] != pc_freemask[idx]) {
			/*
			 * 98% of the time, pc is already at the head of the
			 * list.  If it isn't already, move it to the head.
			 */
			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
			    pc)) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			return;
		}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}
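/*
 * Editorial note (not from the original source): a worked example of
 * the index arithmetic above.  Freeing the 200th pv entry of a chunk
 * gives idx == 200, so field == 200 / 32 == 6 and bit == 200 % 32 == 8,
 * and the free is recorded with pc->pc_map[6] |= 1ul << 8.
 * get_pv_entry() performs the inverse: it scans pc_map for a set bit,
 * locates it with bsfl(), and reconstructs the entry's address as
 * &pc->pc_pventry[field * 32 + bit].
 */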

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	PV_STAT(pv_entry_spare -= _NPCPV);
	PV_STAT(pc_chunk_count--);
	PV_STAT(pc_chunk_frees++);
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
	pmap_qremove((vm_offset_t)pc, 1);
	vm_page_unwire(m, 0);
	vm_page_free(m);
	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
}

/*
 * get a new pv_entry, allocating a block from the system
 * when needed.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, boolean_t try)
{
	static const struct timeval printinterval = { 60, 0 };
	static struct timeval lastprint;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_allocs++);
	pv_entry_count++;
	if (pv_entry_count > pv_entry_high_water)
		if (ratecheck(&lastprint, &printinterval))
			printf("Approaching the limit on PV entries, consider "
			    "increasing either the vm.pmap.shpgperproc or the "
			    "vm.pmap.pv_entry_max tunable.\n");
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = bsfl(pc->pc_map[field]);
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 32 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			for (field = 0; field < _NPCM; field++)
				if (pc->pc_map[field] != 0) {
					PV_STAT(pv_entry_spare--);
					return (pv);	/* not full, return */
				}
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
			PV_STAT(pv_entry_spare--);
			return (pv);
		}
	}
	/*
	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
	 * global lock.  If "pv_vafree" is currently non-empty, it will
	 * remain non-empty until pmap_ptelist_alloc() completes.
	 */
	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
		if (try) {
			pv_entry_count--;
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = pmap_pv_reclaim(pmap);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(pc_chunk_count++);
	PV_STAT(pc_chunk_allocs++);
	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
	pmap_qenter((vm_offset_t)pc, &m, 1);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
	for (field = 1; field < _NPCM; field++)
		pc->pc_map[field] = pc_freemask[field];
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(pv_entry_spare += _NPCPV - 1);
	return (pv);
}

static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			break;
		}
	}
	return (pv);
}

static void
pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));

	/*
	 * Transfer the 4mpage's pv entry for this mapping to the first
	 * page's pv list.
	 */
	pvh = pa_to_pvh(pa);
	va = trunc_4mpage(va);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	/* Instantiate the remaining NPTEPG - 1 pv entries. */
	va_last = va + NBPDR - PAGE_SIZE;
	do {
		m++;
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("pmap_pv_demote_pde: page %p is not managed", m));
		va += PAGE_SIZE;
		pmap_insert_entry(pmap, va, m);
	} while (va < va_last);
}
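/*
 * Editorial note (not from the original source): demoting a superpage
 * multiplies the pv bookkeeping.  With 4 MB superpages, NPTEPG == 1024,
 * so the loop above creates 1023 new pv entries (the 1024th is the
 * transferred original), touching several pv chunks.  This is why a
 * demotion under memory pressure can itself drive pmap_pv_reclaim()
 * through get_pv_entry().
 */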

static void
pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));

	/*
	 * Transfer the first page's pv entry for this mapping to the
	 * 4mpage's pv list.  Aside from avoiding the cost of a call
	 * to get_pv_entry(), a transfer avoids the possibility that
	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
	 * removes one of the mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = trunc_4mpage(va);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	/* Free the remaining NPTEPG - 1 pv entries. */
	va_last = va + NBPDR - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
	} while (va < va_last);
}

static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

static void
pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
{
	struct md_page *pvh;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	pmap_pvh_free(&m->md, pmap, va);
	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		if (TAILQ_EMPTY(&pvh->pv_list))
			vm_page_aflag_clear(m, PGA_WRITEABLE);
	}
}

/*
 * Create a pv entry for page at pa for
 * (pmap, va).
 */
static void
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pv = get_pv_entry(pmap, FALSE);
	pv->pv_va = va;
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
}

/*
 * Conditionally create a pv entry.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pv_entry_count < pv_entry_high_water &&
	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Create the pv entries for each of the pages within a superpage.
 */
static boolean_t
pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	if (pv_entry_count < pv_entry_high_water &&
	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
		pv->pv_va = va;
		pvh = pa_to_pvh(pa);
		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Fills a page table page with mappings to consecutive physical pages.
 */
static void
pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
{
	pt_entry_t *pte;

	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
		*pte = newpte;
		newpte += PAGE_SIZE;
	}
}
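/*
 * Editorial note (not from the original source): pmap_fill_ptp() relies
 * on the PTE's physical-address field occupying the high bits, so
 * adding PAGE_SIZE to a PTE advances the mapped frame by one page while
 * leaving the low flag bits intact.  For example, demoting a 4 MB
 * mapping whose template is (0x00800000 | PG_RW | PG_V) fills the page
 * table page with PTEs for 0x00800000, 0x00801000, ..., 0x00bff000,
 * each carrying the same protection bits.
 */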

/*
 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
 * 2- or 4MB page mapping is invalidated.
 */
static boolean_t
pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde, oldpde;
	pt_entry_t *firstpte, newpte;
	vm_paddr_t mptepa;
	vm_page_t mpte;
	struct spglist free;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpde = *pde;
	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
	    NULL)
		pmap_remove_pt_page(pmap, mpte);
	else {
		KASSERT((oldpde & PG_W) == 0,
		    ("pmap_demote_pde: page table page for a wired mapping"
		    " is missing"));

		/*
		 * Invalidate the 2- or 4MB page mapping and return
		 * "failure" if the mapping was never accessed or the
		 * allocation of the new page table page fails.
		 */
		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
		    VM_ALLOC_WIRED)) == NULL) {
			SLIST_INIT(&free);
			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
			pmap_invalidate_page(pmap, trunc_4mpage(va));
			pmap_free_zero_pages(&free);
			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
			    " in pmap %p", va, pmap);
			return (FALSE);
		}
		if (va < VM_MAXUSER_ADDRESS)
			pmap->pm_stats.resident_count++;
	}
	mptepa = VM_PAGE_TO_PHYS(mpte);

	/*
	 * If the page mapping is in the kernel's address space, then the
	 * KPTmap can provide access to the page table page.  Otherwise,
	 * temporarily map the page table page (mpte) into the kernel's
	 * address space at either PADDR1 or PADDR2.
	 */
	if (va >= KERNBASE)
		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
		if ((*PMAP1 & PG_FRAME) != mptepa) {
			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR1);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		firstpte = PADDR1;
	} else {
		mtx_lock(&PMAP2mutex);
		if ((*PMAP2 & PG_FRAME) != mptepa) {
			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
		}
		firstpte = PADDR2;
	}
	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
	KASSERT((oldpde & PG_A) != 0,
	    ("pmap_demote_pde: oldpde is missing PG_A"));
	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pde: oldpde is missing PG_M"));
	newpte = oldpde & ~PG_PS;
	if ((newpte & PG_PDE_PAT) != 0)
		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
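	/*
	 * Editorial note (not from the original source): the XOR above
	 * relocates the PAT index bit.  In a superpage PDE the PAT bit
	 * lives at bit 12 (PG_PDE_PAT), but in a 4 KB PTE it lives at
	 * bit 7 (PG_PTE_PAT).  When the PDE's PAT bit is set, XORing
	 * with (PG_PDE_PAT | PG_PTE_PAT) clears the former and sets the
	 * latter in one step, preserving the mapping's memory type
	 * across the demotion.
	 */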

	/*
	 * If the page table page is new, initialize it.
	 */
	if (mpte->wire_count == 1) {
		mpte->wire_count = NPTEPG;
		pmap_fill_ptp(firstpte, newpte);
	}
	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
	    ("pmap_demote_pde: firstpte and newpte map different physical"
	    " addresses"));

	/*
	 * If the mapping has changed attributes, update the page table
	 * entries.
	 */
	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
		pmap_fill_ptp(firstpte, newpte);

	/*
	 * Demote the mapping.  This pmap is locked.  The old PDE has
	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
	 * set.  Thus, there is no danger of a race with another
	 * processor changing the setting of PG_A and/or PG_M between
	 * the read above and the store below.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, newpde);
	else if (pmap == kernel_pmap)
		pmap_kenter_pde(va, newpde);
	else
		pde_store(pde, newpde);
	if (firstpte == PADDR2)
		mtx_unlock(&PMAP2mutex);

	/*
	 * Invalidate the recursive mapping of the page table page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));

	/*
	 * Demote the pv entry.  This depends on the earlier demotion
	 * of the mapping.  Specifically, the (re)creation of a per-
	 * page pv entry might trigger the execution of pmap_collect(),
	 * which might reclaim a newly (re)created per-page pv entry
	 * and destroy the associated mapping.  In order to destroy
	 * the mapping, the PDE must have already changed from mapping
	 * the 2mpage to referencing the page table page.
	 */
	if ((oldpde & PG_MANAGED) != 0)
		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);

	pmap_pde_demotions++;
	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * Removes a 2- or 4MB page mapping from the kernel pmap.
 */
static void
pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde;
	vm_paddr_t mptepa;
	vm_page_t mpte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mpte = pmap_lookup_pt_page(pmap, va);
	if (mpte == NULL)
		panic("pmap_remove_kernel_pde: Missing pt page.");

	pmap_remove_pt_page(pmap, mpte);
	mptepa = VM_PAGE_TO_PHYS(mpte);
	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;

	/*
	 * Initialize the page table page.
	 */
	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);

	/*
	 * Remove the mapping.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, newpde);
	else
		pmap_kenter_pde(va, newpde);

	/*
	 * Invalidate the recursive mapping of the page table page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
}

/*
 * pmap_remove_pde: do the things to unmap a superpage in a process
 */
static void
pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free)
{
	struct md_page *pvh;
	pd_entry_t oldpde;
	vm_offset_t eva, va;
	vm_page_t m, mpte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_remove_pde: sva is not 4mpage aligned"));
	oldpde = pte_load_clear(pdq);
	if (oldpde & PG_W)
		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;

	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
	 */
	if (oldpde & PG_G)
		pmap_invalidate_page(kernel_pmap, sva);
	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
	if (oldpde & PG_MANAGED) {
		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
		pmap_pvh_free(pvh, pmap, sva);
		eva = sva + NBPDR;
		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
		    va < eva; va += PAGE_SIZE, m++) {
			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
				vm_page_dirty(m);
			if (oldpde & PG_A)
				vm_page_aflag_set(m, PGA_REFERENCED);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
		}
	}
	if (pmap == kernel_pmap) {
		pmap_remove_kernel_pde(pmap, pdq, sva);
	} else {
		mpte = pmap_lookup_pt_page(pmap, sva);
		if (mpte != NULL) {
			pmap_remove_pt_page(pmap, mpte);
			pmap->pm_stats.resident_count--;
			KASSERT(mpte->wire_count == NPTEPG,
			    ("pmap_remove_pde: pte page wire count error"));
			mpte->wire_count = 0;
			pmap_add_delayed_free_list(mpte, free, FALSE);
			atomic_subtract_int(&cnt.v_wire_count, 1);
		}
	}
}

/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
    struct spglist *free)
{
	pt_entry_t oldpte;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpte = pte_load_clear(ptq);
	KASSERT(oldpte != 0,
	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;
	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
	 */
	if (oldpte & PG_G)
		pmap_invalidate_page(kernel_pmap, va);
	pmap->pm_stats.resident_count -= 1;
	if (oldpte & PG_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		if (oldpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);
		pmap_remove_entry(pmap, m, va);
	}
	return (pmap_unuse_pt(pmap, va, free));
}

/*
 * Remove a single page from a process address space
 */
static void
pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
{
	pt_entry_t *pte;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
		return;
	pmap_remove_pte(pmap, pte, va, free);
	pmap_invalidate_page(pmap, va);
}

/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	struct spglist free;
	int anyvalid;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	anyvalid = 0;
	SLIST_INIT(&free);

	rw_wlock(&pvh_global_lock);
	sched_pin();
	PMAP_LOCK(pmap);

	/*
	 * Special handling of removing one page.  This is a very
	 * common operation and it is easy to short circuit some
	 * code.
	 */
	if ((sva + PAGE_SIZE == eva) &&
	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
		pmap_remove_page(pmap, sva, &free);
		goto out;
	}

	for (; sva < eva; sva = pdnxt) {
		u_int pdirindex;

		/*
		 * Calculate index for next page table.
		 */
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pdnxt < sva)
			pdnxt = eva;
		if (pmap->pm_stats.resident_count == 0)
			break;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			/*
			 * Are we removing the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
				/*
				 * The TLB entry for a PG_G mapping is
				 * invalidated by pmap_remove_pde().
				 */
				if ((ptpaddr & PG_G) == 0)
					anyvalid = 1;
				pmap_remove_pde(pmap,
				    &pmap->pm_pdir[pdirindex], sva, &free);
				continue;
			} else if (!pmap_demote_pde(pmap,
			    &pmap->pm_pdir[pdirindex], sva)) {
				/* The large page mapping was destroyed. */
				continue;
			}
		}

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (pdnxt > eva)
			pdnxt = eva;

		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			if (*pte == 0)
				continue;

			/*
			 * The TLB entry for a PG_G mapping is invalidated
			 * by pmap_remove_pte().
			 */
			if ((*pte & PG_G) == 0)
				anyvalid = 1;
			if (pmap_remove_pte(pmap, pte, sva, &free))
				break;
		}
	}
out:
	sched_unpin();
	if (anyvalid)
		pmap_invalidate_all(pmap);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(&free);
}
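/*
 * Editorial note (not from the original source): pmap_remove() batches
 * its TLB work.  Global (PG_G) mappings are flushed per page with
 * pmap_invalidate_page() as they are torn down, since the %cr3 reload
 * performed by pmap_invalidate_all() does not evict global TLB
 * entries.  For everything else the code merely sets 'anyvalid' and
 * issues a single pmap_invalidate_all() at the end, instead of one
 * shootdown per removed page.
 */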

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 */

void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	pt_entry_t *pte, tpte;
	pd_entry_t *pde;
	vm_offset_t va;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		(void)pmap_demote_pde(pmap, pde, va);
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pmap->pm_stats.resident_count--;
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
		    " a 4mpage in page %p's pv list", m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
		tpte = pte_load_clear(pte);
		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
		    pmap, pv->pv_va));
		if (tpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (tpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, &free);
		pmap_invalidate_page(pmap, pv->pv_va);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	pmap_free_zero_pages(&free);
}

/*
 * pmap_protect_pde: do the things to protect a 4mpage in a process
 */
static boolean_t
pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
{
	pd_entry_t newpde, oldpde;
	vm_offset_t eva, va;
	vm_page_t m;
	boolean_t anychanged;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_protect_pde: sva is not 4mpage aligned"));
	anychanged = FALSE;
retry:
	oldpde = newpde = *pde;
	if (oldpde & PG_MANAGED) {
		eva = sva + NBPDR;
		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
		    va < eva; va += PAGE_SIZE, m++)
			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
				vm_page_dirty(m);
	}
	if ((prot & VM_PROT_WRITE) == 0)
		newpde &= ~(PG_RW | PG_M);
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
#endif
	if (newpde != oldpde) {
		if (!pde_cmpset(pde, oldpde, newpde))
			goto retry;
		if (oldpde & PG_G)
			pmap_invalidate_page(pmap, sva);
		else
			anychanged = TRUE;
	}
	return (anychanged);
}

/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	boolean_t anychanged, pv_lists_locked;

	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
	if (prot == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
		return;
#else
	if (prot & VM_PROT_WRITE)
		return;
#endif

	if (pmap_is_current(pmap))
		pv_lists_locked = FALSE;
	else {
		pv_lists_locked = TRUE;
resume:
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}
	anychanged = FALSE;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		pt_entry_t obits, pbits;
		u_int pdirindex;

		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pdnxt < sva)
			pdnxt = eva;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			/*
			 * Are we protecting the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
				/*
				 * The TLB entry for a PG_G mapping is
				 * invalidated by pmap_protect_pde().
				 */
				if (pmap_protect_pde(pmap,
				    &pmap->pm_pdir[pdirindex], sva, prot))
					anychanged = TRUE;
				continue;
			} else {
				if (!pv_lists_locked) {
					pv_lists_locked = TRUE;
					if (!rw_try_wlock(&pvh_global_lock)) {
						if (anychanged)
							pmap_invalidate_all(
							    pmap);
						PMAP_UNLOCK(pmap);
						goto resume;
					}
					sched_pin();
				}
				if (!pmap_demote_pde(pmap,
				    &pmap->pm_pdir[pdirindex], sva)) {
					/*
					 * The large page mapping was
					 * destroyed.
					 */
					continue;
				}
			}
		}

		if (pdnxt > eva)
			pdnxt = eva;

		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			vm_page_t m;

retry:
			/*
			 * Regardless of whether a pte is 32 or 64 bits in
			 * size, PG_RW, PG_A, and PG_M are among the least
			 * significant 32 bits.
			 */
			obits = pbits = *pte;
			if ((pbits & PG_V) == 0)
				continue;

			if ((prot & VM_PROT_WRITE) == 0) {
				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
				    (PG_MANAGED | PG_M | PG_RW)) {
					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
					vm_page_dirty(m);
				}
				pbits &= ~(PG_RW | PG_M);
			}
#if defined(PAE) || defined(PAE_TABLES)
			if ((prot & VM_PROT_EXECUTE) == 0)
				pbits |= pg_nx;
#endif

			if (pbits != obits) {
#if defined(PAE) || defined(PAE_TABLES)
				if (!atomic_cmpset_64(pte, obits, pbits))
					goto retry;
#else
				if (!atomic_cmpset_int((u_int *)pte, obits,
				    pbits))
					goto retry;
#endif
				if (obits & PG_G)
					pmap_invalidate_page(pmap, sva);
				else
					anychanged = TRUE;
			}
		}
	}
	if (anychanged)
		pmap_invalidate_all(pmap);
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}
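/*
 * Editorial note (not from the original source): the compare-and-swap
 * retry loop above exists because the MMU can set PG_A or PG_M in a
 * PTE concurrently with this code.  Under PAE each PTE is 64 bits, so
 * the update must use atomic_cmpset_64(); on non-PAE a 32-bit cmpset
 * suffices.  If the hardware changed the PTE between the read of
 * 'obits' and the cmpset, the cmpset fails and the loop re-reads the
 * PTE and re-applies the protection change to the fresh value.
 */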

/*
 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
 * For promotion to occur, two conditions must be met: (1) the 4KB page
 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
 * mappings must have identical characteristics.
 *
 * Managed (PG_MANAGED) mappings within the kernel address space are not
 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
 * pmap.
 */
static void
pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde;
	pt_entry_t *firstpte, oldpte, pa, *pte;
	vm_offset_t oldpteva;
	vm_page_t mpte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
	 * either invalid, unused, or does not map the first 4KB physical page
	 * within a 2- or 4MB page.
	 */
	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
setpde:
	newpde = *firstpte;
	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
		    " in pmap %p", va, pmap);
		return;
	}
	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
		    " in pmap %p", va, pmap);
		return;
	}
	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
		/*
		 * When PG_M is already clear, PG_RW can be cleared without
		 * a TLB invalidation.
		 */
		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
		    ~PG_RW))
			goto setpde;
		newpde &= ~PG_RW;
	}

	/*
	 * Examine each of the other PTEs in the specified PTP.  Abort if this
	 * PTE maps an unexpected 4KB physical page or does not have identical
	 * characteristics to the first PTE.
	 */
	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
setpte:
		oldpte = *pte;
		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
			pmap_pde_p_failures++;
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
			    " in pmap %p", va, pmap);
			return;
		}
		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
			/*
			 * When PG_M is already clear, PG_RW can be cleared
			 * without a TLB invalidation.
			 */
			if (!atomic_cmpset_int((u_int *)pte, oldpte,
			    oldpte & ~PG_RW))
				goto setpte;
			oldpte &= ~PG_RW;
			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
			    (va & ~PDRMASK);
			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
			    " in pmap %p", oldpteva, pmap);
		}
		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
			pmap_pde_p_failures++;
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
			    " in pmap %p", va, pmap);
			return;
		}
		pa -= PAGE_SIZE;
	}

	/*
	 * Save the page table page in its current state until the PDE
	 * mapping the superpage is demoted by pmap_demote_pde() or
	 * destroyed by pmap_remove_pde().
	 */
	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
	KASSERT(mpte >= vm_page_array &&
	    mpte < &vm_page_array[vm_page_array_size],
	    ("pmap_promote_pde: page table page is out of range"));
	KASSERT(mpte->pindex == va >> PDRSHIFT,
	    ("pmap_promote_pde: page table page's pindex is wrong"));
	if (pmap_insert_pt_page(pmap, mpte)) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP,
		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
		    pmap);
		return;
	}

	/*
	 * Promote the pv entries.
	 */
	if ((newpde & PG_MANAGED) != 0)
		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);

	/*
	 * Propagate the PAT index to its proper position.
	 */
	if ((newpde & PG_PTE_PAT) != 0)
		newpde ^= PG_PDE_PAT | PG_PTE_PAT;

	/*
	 * Map the superpage.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
	else if (pmap == kernel_pmap)
		pmap_kenter_pde(va, PG_PS | newpde);
	else
		pde_store(pde, PG_PS | newpde);

	pmap_pde_promotions++;
	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
	    " in pmap %p", va, pmap);
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 */
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    u_int flags, int8_t psind)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	pt_entry_t newpte, origpte;
	pv_entry_t pv;
	vm_paddr_t opa, pa;
	vm_page_t mpte, om;
	boolean_t invlva, wired;

	va = trunc_page(va);
	mpte = NULL;
	wired = (flags & PMAP_ENTER_WIRED) != 0;

	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
	    va));
	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
		VM_OBJECT_ASSERT_LOCKED(m->object);

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	sched_pin();

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		mpte = pmap_allocpte(pmap, va, flags);
		if (mpte == NULL) {
			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
			    ("pmap_allocpte failed with sleep allowed"));
			sched_unpin();
			rw_wunlock(&pvh_global_lock);
			PMAP_UNLOCK(pmap);
			return (KERN_RESOURCE_SHORTAGE);
		}
	}

	pde = pmap_pde(pmap, va);
	if ((*pde & PG_PS) != 0)
		panic("pmap_enter: attempted pmap_enter on 4MB page");
	pte = pmap_pte_quick(pmap, va);

	/*
	 * Page Directory table entry not valid, we need a new PT page
	 */
	if (pte == NULL) {
		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
		    (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
	}

	pa = VM_PAGE_TO_PHYS(m);
	om = NULL;
	origpte = *pte;
	opa = origpte & PG_FRAME;

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (origpte && (opa == pa)) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if (wired && ((origpte & PG_W) == 0))
			pmap->pm_stats.wired_count++;
		else if (!wired && (origpte & PG_W))
			pmap->pm_stats.wired_count--;

		/*
		 * Remove extra pte reference
		 */
		if (mpte)
			mpte->wire_count--;

		if (origpte & PG_MANAGED) {
			om = m;
			pa |= PG_MANAGED;
		}
		goto validate;
	}

	pv = NULL;

	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	if (opa) {
		if (origpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (origpte & PG_MANAGED) {
			om = PHYS_TO_VM_PAGE(opa);
			pv = pmap_pvh_remove(&om->md, pmap, va);
		}
		if (mpte != NULL) {
			mpte->wire_count--;
			KASSERT(mpte->wire_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			    " va: 0x%x", va));
		}
	} else
		pmap->pm_stats.resident_count++;

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
		    ("pmap_enter: managed mapping within the clean submap"));
		if (pv == NULL)
			pv = get_pv_entry(pmap, FALSE);
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		pa |= PG_MANAGED;
	} else if (pv != NULL)
		free_pv_entry(pmap, pv);

	/*
	 * Increment counters
	 */
	if (wired)
		pmap->pm_stats.wired_count++;

validate:
	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
	if ((prot & VM_PROT_WRITE) != 0) {
		newpte |= PG_RW;
		if ((newpte & PG_MANAGED) != 0)
			vm_page_aflag_set(m, PGA_WRITEABLE);
	}
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpte |= pg_nx;
#endif
	if (wired)
		newpte |= PG_W;
	if (va < VM_MAXUSER_ADDRESS)
		newpte |= PG_U;
	if (pmap == kernel_pmap)
		newpte |= pgeflag;

	/*
	 * if the mapping or permission bits are different, we need
	 * to update the pte.
	 */
	if ((origpte & ~(PG_M|PG_A)) != newpte) {
		newpte |= PG_A;
		if ((flags & VM_PROT_WRITE) != 0)
			newpte |= PG_M;
		if (origpte & PG_V) {
			invlva = FALSE;
			origpte = pte_load_store(pte, newpte);
			if (origpte & PG_A) {
				if (origpte & PG_MANAGED)
					vm_page_aflag_set(om, PGA_REFERENCED);
				if (opa != VM_PAGE_TO_PHYS(m))
					invlva = TRUE;
#if defined(PAE) || defined(PAE_TABLES)
				if ((origpte & PG_NX) == 0 &&
				    (newpte & PG_NX) != 0)
					invlva = TRUE;
#endif
			}
			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
				if ((origpte & PG_MANAGED) != 0)
					vm_page_dirty(om);
				if ((prot & VM_PROT_WRITE) == 0)
					invlva = TRUE;
			}
			if ((origpte & PG_MANAGED) != 0 &&
			    TAILQ_EMPTY(&om->md.pv_list) &&
			    ((om->flags & PG_FICTITIOUS) != 0 ||
			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
				vm_page_aflag_clear(om, PGA_WRITEABLE);
			if (invlva)
				pmap_invalidate_page(pmap, va);
		} else
			pte_store(pte, newpte);
	}

	/*
	 * If both the page table page and the reservation are fully
	 * populated, then attempt promotion.
	 */
	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0)
		pmap_promote_pde(pmap, pde, va);

	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	return (KERN_SUCCESS);
}

/*
 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
 * blocking, (2) a mapping already exists at the specified virtual address, or
 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
 */
static boolean_t
pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	pd_entry_t *pde, newpde;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pde = pmap_pde(pmap, va);
	if (*pde != 0) {
		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
	    PG_PS | PG_V;
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		newpde |= PG_MANAGED;

		/*
		 * Abort this mapping if its PV entry could not be created.
		 */
		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return (FALSE);
		}
	}
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
#endif
	if (va < VM_MAXUSER_ADDRESS)
		newpde |= PG_U;

	/*
	 * Increment counters.
	 */
	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;

	/*
	 * Map the superpage.
	 */
	pde_store(pde, newpde);

	pmap_pde_mappings++;
	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
 * blocking, (2) a mapping already exists at the specified virtual address, or
 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
 */
static boolean_t
pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	pd_entry_t *pde, newpde;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pde = pmap_pde(pmap, va);
	if (*pde != 0) {
		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
	    PG_PS | PG_V;
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		newpde |= PG_MANAGED;

		/*
		 * Abort this mapping if its PV entry could not be created.
		 */
		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return (FALSE);
		}
	}
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
#endif
	if (va < VM_MAXUSER_ADDRESS)
		newpde |= PG_U;

	/*
	 * Increment counters.
	 */
	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;

	/*
	 * Map the superpage.
	 */
	pde_store(pde, newpde);

	pmap_pde_mappings++;
	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.  Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
 */
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	vm_offset_t va;
	vm_page_t m, mpte;
	vm_pindex_t diff, psize;

	VM_OBJECT_ASSERT_LOCKED(m_start->object);

	psize = atop(end - start);
	mpte = NULL;
	m = m_start;
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
		va = start + ptoa(diff);
		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
		    m->psind == 1 && pg_ps_enabled &&
		    pmap_enter_pde(pmap, va, m, prot))
			m = &m[NBPDR / PAGE_SIZE - 1];
		else
			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
			    mpte);
		m = TAILQ_NEXT(m, listq);
	}
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}
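
/*
 * Worked example for the offset arithmetic above (values hypothetical):
 * if m_start->pindex is 10 and a resident page has pindex 13, it is
 * entered at start + ptoa(3); any page whose diff reaches
 * atop(end - start) lies at or beyond end and terminates the loop.
 */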

/*
 * This code makes some *MAJOR* assumptions:
 * 1. The current pmap and the given pmap exist.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * but is *MUCH* faster than pmap_enter...
 */
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte)
{
	pt_entry_t *pte;
	vm_paddr_t pa;
	struct spglist free;

	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		u_int ptepindex;
		pd_entry_t ptepa;

		/*
		 * Calculate pagetable page index
		 */
		ptepindex = va >> PDRSHIFT;
		if (mpte && (mpte->pindex == ptepindex)) {
			mpte->wire_count++;
		} else {
			/*
			 * Get the page directory entry
			 */
			ptepa = pmap->pm_pdir[ptepindex];

			/*
			 * If the page table page is mapped, we just increment
			 * the hold count, and activate it.
			 */
			if (ptepa) {
				if (ptepa & PG_PS)
					return (NULL);
				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
				mpte->wire_count++;
			} else {
				mpte = _pmap_allocpte(pmap, ptepindex,
				    PMAP_ENTER_NOSLEEP);
				if (mpte == NULL)
					return (mpte);
			}
		}
	} else {
		mpte = NULL;
	}

	/*
	 * This call to vtopte makes the assumption that we are
	 * entering the page into the current pmap.  In order to support
	 * quick entry into any pmap, one would likely use pmap_pte_quick.
	 * But that isn't as quick as vtopte.
	 */
	pte = vtopte(va);
	if (*pte) {
		if (mpte != NULL) {
			mpte->wire_count--;
			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, va, m)) {
		if (mpte != NULL) {
			SLIST_INIT(&free);
			if (pmap_unwire_ptp(pmap, mpte, &free)) {
				pmap_invalidate_page(pmap, va);
				pmap_free_zero_pages(&free);
			}

			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Increment counters
	 */
	pmap->pm_stats.resident_count++;

	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
#if defined(PAE) || defined(PAE_TABLES)
	if ((prot & VM_PROT_EXECUTE) == 0)
		pa |= pg_nx;
#endif

	/*
	 * Now validate mapping with RO protection
	 */
	if ((m->oflags & VPO_UNMANAGED) != 0)
		pte_store(pte, pa | PG_V | PG_U);
	else
		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
	return (mpte);
}

/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	vm_offset_t va;

	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
	pmap_kenter(va, pa);
	invlpg(va);
	return ((void *)crashdumpmap);
}
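
/*
 * Illustrative sketch with hypothetical variables: a kernel dump
 * routine borrows the crashdump map one page at a time, writing each
 * window before moving on to the next physical page.
 */
#if 0
	va = pmap_kenter_temporary(pa, 0);
	error = dump_write(di, va, 0, dumplo, PAGE_SIZE);
#endif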

/*
 * This code maps large physical mmap regions into the
 * processor address space.  Note that some shortcuts
 * are taken, but the code works.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{
	pd_entry_t *pde;
	vm_paddr_t pa, ptepa;
	vm_page_t p;
	int pat_mode;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
	    ("pmap_object_init_pt: non-device object"));
	if (pseflag &&
	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
		if (!vm_object_populate(object, pindex, pindex + atop(size)))
			return;
		p = vm_page_lookup(object, pindex);
		KASSERT(p->valid == VM_PAGE_BITS_ALL,
		    ("pmap_object_init_pt: invalid page %p", p));
		pat_mode = p->md.pat_mode;

		/*
		 * Abort the mapping if the first page is not physically
		 * aligned to a 2/4MB page boundary.
		 */
		ptepa = VM_PAGE_TO_PHYS(p);
		if (ptepa & (NBPDR - 1))
			return;

		/*
		 * Skip the first page.  Abort the mapping if the rest of
		 * the pages are not physically contiguous or have differing
		 * memory attributes.
		 */
		p = TAILQ_NEXT(p, listq);
		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
		    pa += PAGE_SIZE) {
			KASSERT(p->valid == VM_PAGE_BITS_ALL,
			    ("pmap_object_init_pt: invalid page %p", p));
			if (pa != VM_PAGE_TO_PHYS(p) ||
			    pat_mode != p->md.pat_mode)
				return;
			p = TAILQ_NEXT(p, listq);
		}

		/*
		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
		 * "size" is a multiple of 2/4M, adding the PAT setting to
		 * "pa" will not affect the termination of this loop.
		 */
		PMAP_LOCK(pmap);
		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
		    size; pa += NBPDR) {
			pde = pmap_pde(pmap, addr);
			if (*pde == 0) {
				pde_store(pde, pa | PG_PS | PG_M | PG_A |
				    PG_U | PG_RW | PG_V);
				pmap->pm_stats.resident_count += NBPDR /
				    PAGE_SIZE;
				pmap_pde_mappings++;
			}
			/* Else continue on if the PDE is already valid. */
			addr += NBPDR;
		}
		PMAP_UNLOCK(pmap);
	}
}
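
/*
 * Worked example of the preconditions above (hypothetical values,
 * non-PAE kernel where NBPDR is 4MB): a call with addr == 0x20000000,
 * size == 8MB, and a physically contiguous backing object whose first
 * page sits on a 4MB boundary creates exactly two PDE mappings.  If
 * any alignment or contiguity check fails, the function silently makes
 * no mapping at all and 4KB pages are faulted in later instead.
 */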

/*
 * Clear the wired attribute from the mappings for the specified range of
 * addresses in the given pmap.  Every valid mapping within that range
 * must have the wired attribute set.  In contrast, invalid mappings
 * cannot have the wired attribute set, so they are ignored.
 *
 * The wired attribute of the page table entry is not a hardware feature,
 * so there is no need to invalidate any TLB entries.
 */
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t pdnxt;
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t pv_lists_locked;

	if (pmap_is_current(pmap))
		pv_lists_locked = FALSE;
	else {
		pv_lists_locked = TRUE;
resume:
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pdnxt < sva)
			pdnxt = eva;
		pde = pmap_pde(pmap, sva);
		if ((*pde & PG_V) == 0)
			continue;
		if ((*pde & PG_PS) != 0) {
			if ((*pde & PG_W) == 0)
				panic("pmap_unwire: pde %#jx is missing PG_W",
				    (uintmax_t)*pde);

			/*
			 * Are we unwiring the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
				/*
				 * Regardless of whether a pde (or pte) is 32
				 * or 64 bits in size, PG_W is among the least
				 * significant 32 bits.
				 */
				atomic_clear_int((u_int *)pde, PG_W);
				pmap->pm_stats.wired_count -= NBPDR /
				    PAGE_SIZE;
				continue;
			} else {
				if (!pv_lists_locked) {
					pv_lists_locked = TRUE;
					if (!rw_try_wlock(&pvh_global_lock)) {
						PMAP_UNLOCK(pmap);
						/* Repeat sva. */
						goto resume;
					}
					sched_pin();
				}
				if (!pmap_demote_pde(pmap, pde, sva))
					panic("pmap_unwire: demotion failed");
			}
		}
		if (pdnxt > eva)
			pdnxt = eva;
		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			if ((*pte & PG_V) == 0)
				continue;
			if ((*pte & PG_W) == 0)
				panic("pmap_unwire: pte %#jx is missing PG_W",
				    (uintmax_t)*pte);

			/*
			 * PG_W must be cleared atomically.  Although the pmap
			 * lock synchronizes access to PG_W, another processor
			 * could be setting PG_M and/or PG_A concurrently.
			 *
			 * PG_W is among the least significant 32 bits.
			 */
			atomic_clear_int((u_int *)pte, PG_W);
			pmap->pm_stats.wired_count--;
		}
	}
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}
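
/*
 * Hedged usage sketch: the VM system calls this when a wiring, e.g.
 * one established by mlock(2), is released for a map entry.  The
 * variables are hypothetical.
 */
#if 0
	pmap_unwire(vm_map_pmap(map), entry->start, entry->end);
#endif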

/*
 * Copy the range specified by src_addr/len
 * from the source map to the range dst_addr/len
 * in the destination map.
 *
 * This routine is only advisory and need not do anything.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	struct spglist free;
	vm_offset_t addr;
	vm_offset_t end_addr = src_addr + len;
	vm_offset_t pdnxt;

	if (dst_addr != src_addr)
		return;

	if (!pmap_is_current(src_pmap))
		return;

	rw_wlock(&pvh_global_lock);
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}
	sched_pin();
	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
		pt_entry_t *src_pte, *dst_pte;
		vm_page_t dstmpte, srcmpte;
		pd_entry_t srcptepaddr;
		u_int ptepindex;

		KASSERT(addr < UPT_MIN_ADDRESS,
		    ("pmap_copy: invalid to pmap_copy page tables"));

		pdnxt = (addr + NBPDR) & ~PDRMASK;
		if (pdnxt < addr)
			pdnxt = end_addr;
		ptepindex = addr >> PDRSHIFT;

		srcptepaddr = src_pmap->pm_pdir[ptepindex];
		if (srcptepaddr == 0)
			continue;

		if (srcptepaddr & PG_PS) {
			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
				continue;
			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
			    ((srcptepaddr & PG_MANAGED) == 0 ||
			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
			    PG_PS_FRAME))) {
				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
				    ~PG_W;
				dst_pmap->pm_stats.resident_count +=
				    NBPDR / PAGE_SIZE;
			}
			continue;
		}

		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
		KASSERT(srcmpte->wire_count > 0,
		    ("pmap_copy: source page table page is unused"));

		if (pdnxt > end_addr)
			pdnxt = end_addr;

		src_pte = vtopte(addr);
		while (addr < pdnxt) {
			pt_entry_t ptetemp;
			ptetemp = *src_pte;
			/*
			 * We only virtual-copy managed pages.
			 */
			if ((ptetemp & PG_MANAGED) != 0) {
				dstmpte = pmap_allocpte(dst_pmap, addr,
				    PMAP_ENTER_NOSLEEP);
				if (dstmpte == NULL)
					goto out;
				dst_pte = pmap_pte_quick(dst_pmap, addr);
				if (*dst_pte == 0 &&
				    pmap_try_insert_pv_entry(dst_pmap, addr,
				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
					/*
					 * Clear the wired, modified, and
					 * accessed (referenced) bits
					 * during the copy.
					 */
					*dst_pte = ptetemp & ~(PG_W | PG_M |
					    PG_A);
					dst_pmap->pm_stats.resident_count++;
				} else {
					SLIST_INIT(&free);
					if (pmap_unwire_ptp(dst_pmap, dstmpte,
					    &free)) {
						pmap_invalidate_page(dst_pmap,
						    addr);
						pmap_free_zero_pages(&free);
					}
					goto out;
				}
				if (dstmpte->wire_count >= srcmpte->wire_count)
					break;
			}
			addr += PAGE_SIZE;
			src_pte++;
		}
	}
out:
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);
}
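
/*
 * Note on the locking in pmap_copy() above: because two pmap locks are
 * held at once, they are always acquired in ascending address order.
 * Two concurrent calls with the pmaps' roles swapped therefore cannot
 * deadlock on each other's locks.
 */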

static __inline void
pagezero(void *page)
{
#if defined(I686_CPU)
	if (cpu_class == CPUCLASS_686) {
#if defined(CPU_ENABLE_SSE)
		if (cpu_feature & CPUID_SSE2)
			sse2_pagezero(page);
		else
#endif
			i686_pagezero(page);
	} else
#endif
		bzero(page, PAGE_SIZE);
}

/*
 * pmap_zero_page zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.
 */
void
pmap_zero_page(vm_page_t m)
{
	struct sysmaps *sysmaps;

	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
	mtx_lock(&sysmaps->lock);
	if (*sysmaps->CMAP2)
		panic("pmap_zero_page: CMAP2 busy");
	sched_pin();
	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
	    pmap_cache_bits(m->md.pat_mode, 0);
	invlcaddr(sysmaps->CADDR2);
	pagezero(sysmaps->CADDR2);
	*sysmaps->CMAP2 = 0;
	sched_unpin();
	mtx_unlock(&sysmaps->lock);
}

/*
 * pmap_zero_page_area zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.
 *
 * off and size may not cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
	struct sysmaps *sysmaps;

	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
	mtx_lock(&sysmaps->lock);
	if (*sysmaps->CMAP2)
		panic("pmap_zero_page_area: CMAP2 busy");
	sched_pin();
	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
	    pmap_cache_bits(m->md.pat_mode, 0);
	invlcaddr(sysmaps->CADDR2);
	if (off == 0 && size == PAGE_SIZE)
		pagezero(sysmaps->CADDR2);
	else
		bzero((char *)sysmaps->CADDR2 + off, size);
	*sysmaps->CMAP2 = 0;
	sched_unpin();
	mtx_unlock(&sysmaps->lock);
}

/*
 * pmap_zero_page_idle zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.  This
 * is intended to be called from the vm_pagezero process only and
 * outside of Giant.
 */
void
pmap_zero_page_idle(vm_page_t m)
{

	if (*CMAP3)
		panic("pmap_zero_page_idle: CMAP3 busy");
	sched_pin();
	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
	    pmap_cache_bits(m->md.pat_mode, 0);
	invlcaddr(CADDR3);
	pagezero(CADDR3);
	*CMAP3 = 0;
	sched_unpin();
}

/*
 * pmap_copy_page copies the specified (machine independent)
 * page by mapping the page into virtual memory and using
 * bcopy to copy the page, one machine dependent page at a
 * time.
 */
void
pmap_copy_page(vm_page_t src, vm_page_t dst)
{
	struct sysmaps *sysmaps;

	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
	mtx_lock(&sysmaps->lock);
	if (*sysmaps->CMAP1)
		panic("pmap_copy_page: CMAP1 busy");
	if (*sysmaps->CMAP2)
		panic("pmap_copy_page: CMAP2 busy");
	sched_pin();
	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
	    pmap_cache_bits(src->md.pat_mode, 0);
	invlcaddr(sysmaps->CADDR1);
	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
	    pmap_cache_bits(dst->md.pat_mode, 0);
	invlcaddr(sysmaps->CADDR2);
	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
	*sysmaps->CMAP1 = 0;
	*sysmaps->CMAP2 = 0;
	sched_unpin();
	mtx_unlock(&sysmaps->lock);
}
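
/*
 * Hedged note on the pattern above: CADDR1/CADDR2 are small per-CPU
 * KVA windows and CMAP1/CMAP2 their PTEs.  sched_pin() keeps the
 * thread on the CPU that owns the window, so a local invlcaddr() of
 * the window address suffices and no cross-CPU TLB shootdown is
 * required.
 */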

int unmapped_buf_allowed = 1;

void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
    vm_offset_t b_offset, int xfersize)
{
	struct sysmaps *sysmaps;
	vm_page_t a_pg, b_pg;
	char *a_cp, *b_cp;
	vm_offset_t a_pg_offset, b_pg_offset;
	int cnt;

	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
	mtx_lock(&sysmaps->lock);
	if (*sysmaps->CMAP1 != 0)
		panic("pmap_copy_pages: CMAP1 busy");
	if (*sysmaps->CMAP2 != 0)
		panic("pmap_copy_pages: CMAP2 busy");
	sched_pin();
	while (xfersize > 0) {
		a_pg = ma[a_offset >> PAGE_SHIFT];
		a_pg_offset = a_offset & PAGE_MASK;
		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
		b_pg = mb[b_offset >> PAGE_SHIFT];
		b_pg_offset = b_offset & PAGE_MASK;
		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
		*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
		    pmap_cache_bits(a_pg->md.pat_mode, 0);
		invlcaddr(sysmaps->CADDR1);
		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
		invlcaddr(sysmaps->CADDR2);
		a_cp = sysmaps->CADDR1 + a_pg_offset;
		b_cp = sysmaps->CADDR2 + b_pg_offset;
		bcopy(a_cp, b_cp, cnt);
		a_offset += cnt;
		b_offset += cnt;
		xfersize -= cnt;
	}
	*sysmaps->CMAP1 = 0;
	*sysmaps->CMAP2 = 0;
	sched_unpin();
	mtx_unlock(&sysmaps->lock);
}
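
/*
 * Worked example for the loop above (hypothetical values): copying 6KB
 * starting at a_offset 0x800, with b_offset page-aligned, touches
 * pages ma[0] and ma[1]; the first iteration copies
 * min(6KB, PAGE_SIZE - 0x800) == 2KB, after which both offsets advance
 * and the next source page is mapped through CMAP1.
 */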

/*
 * Returns true if the pmap's pv is one of the first
 * 16 pvs linked to from this page.  This count may
 * be changed upwards or downwards in the future; it
 * is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	int loops = 0;
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_page_exists_quick: page %p is not managed", m));
	rv = FALSE;
	rw_wlock(&pvh_global_lock);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		if (PV_PMAP(pv) == pmap) {
			rv = TRUE;
			break;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			if (PV_PMAP(pv) == pmap) {
				rv = TRUE;
				break;
			}
			loops++;
			if (loops >= 16)
				break;
		}
	}
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	int count;

	count = 0;
	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (count);
	rw_wlock(&pvh_global_lock);
	count = pmap_pvh_wired_mappings(&m->md, count);
	if ((m->flags & PG_FICTITIOUS) == 0) {
		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
		    count);
	}
	rw_wunlock(&pvh_global_lock);
	return (count);
}

/*
 * pmap_pvh_wired_mappings:
 *
 *	Return the updated number "count" of managed mappings that are wired.
 */
static int
pmap_pvh_wired_mappings(struct md_page *pvh, int count)
{
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		if ((*pte & PG_W) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	sched_unpin();
	return (count);
}

/*
 * Returns TRUE if the given page is mapped individually or as part of
 * a 4mpage.  Otherwise, returns FALSE.
 */
boolean_t
pmap_page_is_mapped(vm_page_t m)
{
	boolean_t rv;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (FALSE);
	rw_wlock(&pvh_global_lock);
	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Remove all pages from the specified address space; this aids process
 * exit speeds.  This code is special cased for the current process
 * only, but can have the more generic (and slightly slower) mode
 * enabled.  This is much faster than pmap_remove in the case of running
 * down an entire address space.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pt_entry_t *pte, tpte;
	vm_page_t m, mpte, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct spglist free;
	int field, idx;
	int32_t bit;
	uint32_t inuse, bitmask;
	int allfree;

	if (pmap != PCPU_GET(curpmap)) {
		printf("warning: pmap_remove_pages called with non-current pmap\n");
		return;
	}
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	sched_pin();
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
		    pc->pc_pmap));
		allfree = 1;
		for (field = 0; field < _NPCM; field++) {
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = bsfl(inuse);
				bitmask = 1UL << bit;
				idx = field * 32 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pte = pmap_pde(pmap, pv->pv_va);
				tpte = *pte;
				if ((tpte & PG_PS) == 0) {
					pte = vtopte(pv->pv_va);
					tpte = *pte & ~PG_PTE_PAT;
				}

				if (tpte == 0) {
					printf(
					    "TPTE at %p IS ZERO @ VA %08x\n",
					    pte, pv->pv_va);
					panic("bad pte");
				}

				/*
				 * We cannot remove wired pages from a
				 * process' mapping at this time.
				 */
				if (tpte & PG_W) {
					allfree = 0;
					continue;
				}

				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				KASSERT(m->phys_addr == (tpte & PG_FRAME),
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad tpte %#jx",
				    (uintmax_t)tpte));

				pte_clear(pte);

				/*
				 * Update the vm_page_t clean/reference bits.
				 */
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
					if ((tpte & PG_PS) != 0) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
					} else
						vm_page_dirty(m);
				}

				/* Mark free */
				PV_STAT(pv_entry_frees++);
				PV_STAT(pv_entry_spare++);
				pv_entry_count--;
				pc->pc_map[field] |= bitmask;
				if ((tpte & PG_PS) != 0) {
					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							if (TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
					if (mpte != NULL) {
						pmap_remove_pt_page(pmap, mpte);
						pmap->pm_stats.resident_count--;
						KASSERT(mpte->wire_count == NPTEPG,
						    ("pmap_remove_pages: pte page wire count error"));
						mpte->wire_count = 0;
						pmap_add_delayed_free_list(mpte, &free, FALSE);
						atomic_subtract_int(&cnt.v_wire_count, 1);
					}
				} else {
					pmap->pm_stats.resident_count--;
					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
					if (TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m, PGA_WRITEABLE);
					}
					pmap_unuse_pt(pmap, pv->pv_va, &free);
				}
			}
		}
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	sched_unpin();
	pmap_invalidate_all(pmap);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(&free);
}
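
/*
 * Hedged usage sketch: exit(2) and execve(2) tear down an entire user
 * address space with one call instead of a full pmap_remove() walk.
 * The process pointer is hypothetical.
 */
#if 0
	pmap_remove_pages(vmspace_pmap(p->p_vmspace));
#endif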

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_modified: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
	 * is clear, no PTEs can have PG_M set.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return (FALSE);
	rw_wlock(&pvh_global_lock);
	rv = pmap_is_modified_pvh(&m->md) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Returns TRUE if any of the given mappings were used to modify
 * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
 * mappings are supported.
 */
static boolean_t
pmap_is_modified_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	rv = FALSE;
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	sched_unpin();
	return (rv);
}

/*
 * pmap_is_prefaultable:
 *
 *	Return whether or not the specified virtual address is eligible
 *	for prefault.
 */
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t rv;

	rv = FALSE;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr);
	if (*pde != 0 && (*pde & PG_PS) == 0) {
		pte = vtopte(addr);
		rv = *pte == 0;
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * pmap_is_referenced:
 *
 *	Return whether or not the specified physical page was referenced
 *	in any physical maps.
 */
boolean_t
pmap_is_referenced(vm_page_t m)
{
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	rw_wlock(&pvh_global_lock);
	rv = pmap_is_referenced_pvh(&m->md) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
	rw_wunlock(&pvh_global_lock);
	return (rv);
}

/*
 * Returns TRUE if any of the given mappings were referenced and FALSE
 * otherwise.  Both page and 4mpage mappings are supported.
 */
static boolean_t
pmap_is_referenced_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	rv = FALSE;
	sched_pin();
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte_quick(pmap, pv->pv_va);
		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	sched_unpin();
	return (rv);
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t next_pv, pv;
	pmap_t pmap;
	pd_entry_t *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * set by another thread while the object is locked.  Thus,
	 * if PGA_WRITEABLE is clear, no page table entries need updating.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return;
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		if ((*pde & PG_RW) != 0)
			(void)pmap_demote_pde(pmap, pde, va);
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
		    " a 4mpage in page %p's pv list", m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
retry:
		oldpte = *pte;
		if ((oldpte & PG_RW) != 0) {
			/*
			 * Regardless of whether a pte is 32 or 64 bits
			 * in size, PG_RW and PG_M are among the least
			 * significant 32 bits.
			 */
			if (!atomic_cmpset_int((u_int *)pte, oldpte,
			    oldpte & ~(PG_RW | PG_M)))
				goto retry;
			if ((oldpte & PG_M) != 0)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
}
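
/*
 * Hedged usage sketch: the pageout and object teardown paths call this
 * before treating a page as clean, so that no mapping can dirty the
 * page afterwards.
 */
#if 0
	pmap_remove_write(m);
	KASSERT((m->aflags & PGA_WRITEABLE) == 0,
	    ("page %p is still writeable", m));
#endif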

#define	PMAP_TS_REFERENCED_MAX	5

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	XXX: The exact number of bits to check and clear is a matter that
 *	should be tested and standardized at some point in the future for
 *	optimal aging of shared pages.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_paddr_t pa;
	int rtval = 0;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	pa = VM_PAGE_TO_PHYS(m);
	pvh = pa_to_pvh(pa);
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0 ||
	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		if ((*pde & PG_A) != 0) {
			/*
			 * Since this reference bit is shared by either 1024
			 * or 512 4KB pages, it should not be cleared every
			 * time it is tested.  Apply a simple "hash" function
			 * on the physical page number, the virtual superpage
			 * number, and the pmap address to select one 4KB page
			 * out of the 1024 or 512 on which testing the
			 * reference bit will result in clearing that bit.
			 * This function is designed to avoid the selection of
			 * the same 4KB page for every 2- or 4MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
			    (*pde & PG_W) == 0) {
				atomic_clear_int((u_int *)pde, PG_A);
				pmap_invalidate_page(pmap, pv->pv_va);
			}
			rtval++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
		}
		if (rtval >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0,
		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
		    m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
		if ((*pte & PG_A) != 0) {
			atomic_clear_int((u_int *)pte, PG_A);
			pmap_invalidate_page(pmap, pv->pv_va);
			rtval++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
	    PMAP_TS_REFERENCED_MAX);
out:
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	return (rtval);
}
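
/*
 * Worked example of the superpage "hash" above (hypothetical values,
 * non-PAE kernel: PAGE_SHIFT == 12, PDRSHIFT == 22, NPTEPG == 1024):
 * with pa == 0x12345000, pv->pv_va == 0x00800000, and the pmap at
 * 0xc1234500, the expression is
 * (0x12345 ^ 0x2 ^ 0xc1234500) & 0x3ff == 0x247, which is nonzero, so
 * PG_A is left set for this mapping; only the one mapping per
 * superpage whose hash is zero has its reference bit cleared.
 */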

/*
 * Apply the given advice to the specified range of addresses within the
 * given pmap.  Depending on the advice, clear the referenced and/or
 * modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte;
	vm_offset_t pdnxt;
	vm_page_t m;
	boolean_t anychanged, pv_lists_locked;

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;
	if (pmap_is_current(pmap))
		pv_lists_locked = FALSE;
	else {
		pv_lists_locked = TRUE;
resume:
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}
	anychanged = FALSE;
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pdnxt < sva)
			pdnxt = eva;
		pde = pmap_pde(pmap, sva);
		oldpde = *pde;
		if ((oldpde & PG_V) == 0)
			continue;
		else if ((oldpde & PG_PS) != 0) {
			if ((oldpde & PG_MANAGED) == 0)
				continue;
			if (!pv_lists_locked) {
				pv_lists_locked = TRUE;
				if (!rw_try_wlock(&pvh_global_lock)) {
					if (anychanged)
						pmap_invalidate_all(pmap);
					PMAP_UNLOCK(pmap);
					goto resume;
				}
				sched_pin();
			}
			if (!pmap_demote_pde(pmap, pde, sva)) {
				/*
				 * The large page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Since the underlying page
			 * table page is fully populated, this removal never
			 * frees a page table page.
			 */
			if ((oldpde & PG_W) == 0) {
				pte = pmap_pte_quick(pmap, sva);
				KASSERT((*pte & PG_V) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_pte(pmap, pte, sva, NULL);
				anychanged = TRUE;
			}
		}
		if (pdnxt > eva)
			pdnxt = eva;
		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
			    PG_V))
				continue;
			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
					vm_page_dirty(m);
				}
				atomic_clear_int((u_int *)pte, PG_M | PG_A);
			} else if ((*pte & PG_A) != 0)
				atomic_clear_int((u_int *)pte, PG_A);
			else
				continue;
			if ((*pte & PG_G) != 0)
				pmap_invalidate_page(pmap, sva);
			else
				anychanged = TRUE;
		}
	}
	if (anychanged)
		pmap_invalidate_all(pmap);
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}
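
/*
 * Hedged usage sketch: madvise(2) reaches this function through
 * vm_map_madvise(); the map and addresses are hypothetical.
 */
#if 0
	pmap_advise(vm_map_pmap(map), start, end, MADV_FREE);
#endif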

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t next_pv, pv;
	pmap_t pmap;
	pd_entry_t oldpde, *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	KASSERT(!vm_page_xbusied(m),
	    ("pmap_clear_modify: page %p is exclusive busied", m));

	/*
	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
	 * If the object containing the page is locked and the page is not
	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
	 */
	if ((m->aflags & PGA_WRITEABLE) == 0)
		return;
	rw_wlock(&pvh_global_lock);
	sched_pin();
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_RW) != 0) {
			if (pmap_demote_pde(pmap, pde, va)) {
				if ((oldpde & PG_W) == 0) {
					/*
					 * Write protect the mapping to a
					 * single page so that a subsequent
					 * write access may repromote.
					 */
					va += VM_PAGE_TO_PHYS(m) - (oldpde &
					    PG_PS_FRAME);
					pte = pmap_pte_quick(pmap, va);
					oldpte = *pte;
					if ((oldpte & PG_V) != 0) {
						/*
						 * Regardless of whether a pte
						 * is 32 or 64 bits in size,
						 * PG_RW and PG_M are among the
						 * least significant 32 bits.
						 */
						while (!atomic_cmpset_int(
						    (u_int *)pte, oldpte,
						    oldpte & ~(PG_M | PG_RW)))
							oldpte = *pte;
						vm_page_dirty(m);
						pmap_invalidate_page(pmap, va);
					}
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
		    " a 4mpage in page %p's pv list", m));
		pte = pmap_pte_quick(pmap, pv->pv_va);
		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
			/*
			 * Regardless of whether a pte is 32 or 64 bits
			 * in size, PG_M is among the least significant
			 * 32 bits.
			 */
			atomic_clear_int((u_int *)pte, PG_M);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
}

/*
 * Miscellaneous support routines follow
 */

/* Adjust the cache mode for a 4KB page mapped via a PTE. */
static __inline void
pmap_pte_attr(pt_entry_t *pte, int cache_bits)
{
	u_int opte, npte;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PTE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opte = *(u_int *)pte;
		npte = opte & ~PG_PTE_CACHE;
		npte |= cache_bits;
	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
}

/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
static __inline void
pmap_pde_attr(pd_entry_t *pde, int cache_bits)
{
	u_int opde, npde;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PDE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opde = *(u_int *)pde;
		npde = opde & ~PG_PDE_CACHE;
		npde |= cache_bits;
	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
}
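
/*
 * Note on the compare-and-set loops above: the cmpset fails, and the
 * loop retries, only when another processor changed the entry between
 * the read and the update, e.g. the MMU setting PG_A or PG_M
 * concurrently.  The "npte != opte" test also ends the loop early when
 * the desired cache bits are already in place.
 */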

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space.  Return a pointer to where it is mapped.  This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 */
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	vm_size_t tmpsize;
	int i;

	offset = pa & PAGE_MASK;
	size = round_page(offset + size);
	pa = pa & PG_FRAME;

	if (pa < KERNLOAD && pa + size <= KERNLOAD)
		va = KERNBASE + pa;
	else if (!pmap_initialized) {
		va = 0;
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (ppim->va == 0) {
				ppim->pa = pa;
				ppim->sz = size;
				ppim->mode = mode;
				ppim->va = virtual_avail;
				virtual_avail += size;
				va = ppim->va;
				break;
			}
		}
		if (va == 0)
			panic("%s: too many preinit mappings", __func__);
	} else {
		/*
		 * If we have a preinit mapping, re-use it.
		 */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (ppim->pa == pa && ppim->sz == size &&
			    ppim->mode == mode)
				return ((void *)(ppim->va + offset));
		}
		va = kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);
	}
	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
	pmap_invalidate_cache_range(va, va + size, FALSE);
	return ((void *)(va + offset));
}

void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
}

void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
}

void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t offset;
	int i;

	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
		return;
	offset = va & PAGE_MASK;
	size = round_page(offset + size);
	va = trunc_page(va);
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va && ppim->sz == size) {
			if (pmap_initialized)
				return;
			ppim->pa = 0;
			ppim->va = 0;
			ppim->sz = 0;
			ppim->mode = 0;
			if (va + size == virtual_avail)
				virtual_avail = va;
			return;
		}
	}
	if (pmap_initialized)
		kva_free(va, size);
}
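
/*
 * Hedged usage sketch: a driver typically maps a device BAR for
 * programmed I/O and unmaps it on detach (addresses hypothetical):
 */
#if 0
	regs = pmap_mapdev(0xfeb00000, 0x1000);
	/* ... access device registers through "regs" ... */
	pmap_unmapdev((vm_offset_t)regs, 0x1000);
#endif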

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	m->md.pat_mode = ma;
	if ((m->flags & PG_FICTITIOUS) != 0)
		return;

	/*
	 * If "m" is a normal page, flush it from the cache.
	 * See pmap_invalidate_cache_range().
	 *
	 * First, try to find an existing mapping of the page by sf
	 * buffer.  sf_buf_invalidate_cache() modifies mapping and
	 * flushes the cache.
	 */
	if (sf_buf_invalidate_cache(m))
		return;

	/*
	 * If page is not mapped by sf buffer, but CPU does not
	 * support self snoop, map the page transient and do
	 * invalidation.  In the worst case, whole cache is flushed by
	 * pmap_invalidate_cache_range().
	 */
	if ((cpu_feature & CPUID_SS) == 0)
		pmap_flush_page(m);
}

static void
pmap_flush_page(vm_page_t m)
{
	struct sysmaps *sysmaps;
	vm_offset_t sva, eva;

	if ((cpu_feature & CPUID_CLFSH) != 0) {
		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
		mtx_lock(&sysmaps->lock);
		if (*sysmaps->CMAP2)
			panic("pmap_flush_page: CMAP2 busy");
		sched_pin();
		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
		invlcaddr(sysmaps->CADDR2);
		sva = (vm_offset_t)sysmaps->CADDR2;
		eva = sva + PAGE_SIZE;

		/*
		 * Use mfence despite the ordering implied by
		 * mtx_{un,}lock() because clflush is not guaranteed
		 * to be ordered by any other instruction.
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
		*sysmaps->CMAP2 = 0;
		sched_unpin();
		mtx_unlock(&sysmaps->lock);
	} else
		pmap_invalidate_cache();
}

/*
 * Changes the specified virtual address range's memory type to that given by
 * the parameter "mode".  The specified virtual address range must be
 * completely contained within the kernel map.
 *
 * Returns zero if the change completed successfully, and either EINVAL or
 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 * of the virtual address range was not mapped, and ENOMEM is returned if
 * there was insufficient memory available to complete the change.
 */
int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
	vm_offset_t base, offset, tmpva;
	pd_entry_t *pde;
	pt_entry_t *pte;
	int cache_bits_pte, cache_bits_pde;
	boolean_t changed;

	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	/*
	 * Only supported on kernel virtual addresses above the recursive map.
	 */
	if (base < VM_MIN_KERNEL_ADDRESS)
		return (EINVAL);

	cache_bits_pde = pmap_cache_bits(mode, 1);
	cache_bits_pte = pmap_cache_bits(mode, 0);
	changed = FALSE;

	/*
	 * Pages that aren't mapped aren't supported.  Also break down
	 * 2/4MB pages into 4KB pages if required.
	 */
	PMAP_LOCK(kernel_pmap);
	for (tmpva = base; tmpva < base + size; ) {
		pde = pmap_pde(kernel_pmap, tmpva);
		if (*pde == 0) {
			PMAP_UNLOCK(kernel_pmap);
			return (EINVAL);
		}
		if (*pde & PG_PS) {
			/*
			 * If the current 2/4MB page already has
			 * the required memory type, then we need not
			 * demote this page.  Just increment tmpva to
			 * the next 2/4MB page frame.
			 */
			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
				tmpva = trunc_4mpage(tmpva) + NBPDR;
				continue;
			}

			/*
			 * If the current offset aligns with a 2/4MB
			 * page frame and there is at least 2/4MB left
			 * within the range, then we need not break
			 * down this page into 4KB pages.
			 */
			if ((tmpva & PDRMASK) == 0 &&
			    tmpva + PDRMASK < base + size) {
				tmpva += NBPDR;
				continue;
			}
			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
				PMAP_UNLOCK(kernel_pmap);
				return (ENOMEM);
			}
		}
		pte = vtopte(tmpva);
		if (*pte == 0) {
			PMAP_UNLOCK(kernel_pmap);
			return (EINVAL);
		}
		tmpva += PAGE_SIZE;
	}
	PMAP_UNLOCK(kernel_pmap);

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode if required.
	 */
	for (tmpva = base; tmpva < base + size; ) {
		pde = pmap_pde(kernel_pmap, tmpva);
		if (*pde & PG_PS) {
			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
				pmap_pde_attr(pde, cache_bits_pde);
				changed = TRUE;
			}
			tmpva = trunc_4mpage(tmpva) + NBPDR;
		} else {
			pte = vtopte(tmpva);
			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
				pmap_pte_attr(pte, cache_bits_pte);
				changed = TRUE;
			}
			tmpva += PAGE_SIZE;
		}
	}

	/*
	 * Flush CPU caches to make sure any data isn't cached that
	 * shouldn't be, etc.
	 */
	if (changed) {
		pmap_invalidate_range(kernel_pmap, base, tmpva);
		pmap_invalidate_cache_range(base, tmpva, FALSE);
	}
	return (0);
}
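
/*
 * Hedged usage sketch: a graphics driver might switch an already
 * mapped framebuffer to write-combining (variables hypothetical):
 */
#if 0
	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
	if (error != 0)
		printf("write-combining not enabled: %d\n", error);
#endif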

/*
 * Perform the pmap work for mincore(2).
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
	pd_entry_t *pdep;
	pt_entry_t *ptep, pte;
	vm_paddr_t pa;
	int val;

	PMAP_LOCK(pmap);
retry:
	pdep = pmap_pde(pmap, addr);
	if (*pdep != 0) {
		if (*pdep & PG_PS) {
			pte = *pdep;
			/* Compute the physical address of the 4KB page. */
			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
			    PG_FRAME;
			val = MINCORE_SUPER;
		} else {
			ptep = pmap_pte(pmap, addr);
			pte = *ptep;
			pmap_pte_release(ptep);
			pa = pte & PG_FRAME;
			val = 0;
		}
	} else {
		pte = 0;
		pa = 0;
		val = 0;
	}
	if ((pte & PG_V) != 0) {
		val |= MINCORE_INCORE;
		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((pte & PG_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
			goto retry;
	} else
		PA_UNLOCK_COND(*locked_pa);
	PMAP_UNLOCK(pmap);
	return (val);
}
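
/*
 * Worked example of the returned bits (hypothetical state): for a
 * resident, referenced, dirty 4KB address within a 2/4MB superpage
 * mapping, val is MINCORE_SUPER | MINCORE_INCORE | MINCORE_MODIFIED |
 * MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED |
 * MINCORE_REFERENCED_OTHER.
 */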

void
pmap_activate(struct thread *td)
{
	pmap_t pmap, oldpmap;
	u_int cpuid;
	u_int32_t cr3;

	critical_enter();
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	oldpmap = PCPU_GET(curpmap);
	cpuid = PCPU_GET(cpuid);
#if defined(SMP)
	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_CLR(cpuid, &oldpmap->pm_active);
	CPU_SET(cpuid, &pmap->pm_active);
#endif
#if defined(PAE) || defined(PAE_TABLES)
	cr3 = vtophys(pmap->pm_pdpt);
#else
	cr3 = vtophys(pmap->pm_pdir);
#endif
	/*
	 * pmap_activate is for the current thread on the current cpu
	 */
	td->td_pcb->pcb_cr3 = cr3;
	load_cr3(cr3);
	PCPU_SET(curpmap, pmap);
	critical_exit();
}

void
pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < NBPDR)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & PDRMASK;
	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
	    (*addr & PDRMASK) == superpage_offset)
		return;
	if ((*addr & PDRMASK) < superpage_offset)
		*addr = (*addr & ~PDRMASK) + superpage_offset;
	else
		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}

#if defined(PMAP_DEBUG)
int
pmap_pid_dump(int pid)
{
	pmap_t pmap;
	struct proc *p;
	int npte = 0;
	int index;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_pid != pid)
			continue;

		if (p->p_vmspace) {
			int i, j;
			index = 0;
			pmap = vmspace_pmap(p->p_vmspace);
			for (i = 0; i < NPDEPTD; i++) {
				pd_entry_t *pde;
				pt_entry_t *pte;
				vm_offset_t base = i << PDRSHIFT;

				pde = &pmap->pm_pdir[i];
				if (pde && pmap_pde_v(pde)) {
					for (j = 0; j < NPTEPG; j++) {
						vm_offset_t va = base +
						    (j << PAGE_SHIFT);
						if (va >= (vm_offset_t)
						    VM_MIN_KERNEL_ADDRESS) {
							if (index) {
								index = 0;
								printf("\n");
							}
							sx_sunlock(&allproc_lock);
							return (npte);
						}
						pte = pmap_pte(pmap, va);
						if (pte && pmap_pte_v(pte)) {
							pt_entry_t pa;
							vm_page_t m;
							pa = *pte;
							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
							    va, pa, m->hold_count, m->wire_count, m->flags);
							npte++;
							index++;
							if (index >= 2) {
								index = 0;
								printf("\n");
							} else {
								printf(" ");
							}
						}
					}
				}
			}
		}
	}
	sx_sunlock(&allproc_lock);
	return (npte);
}
#endif

#if defined(DEBUG)

static void pads(pmap_t pm);
void pmap_pvdump(vm_paddr_t pa);

/* Print the address space of a pmap. */
static void
pads(pmap_t pm)
{
	int i, j;
	vm_paddr_t va;
	pt_entry_t *ptep;

	if (pm == kernel_pmap)
		return;
	for (i = 0; i < NPDEPTD; i++)
		if (pm->pm_pdir[i])
			for (j = 0; j < NPTEPG; j++) {
				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
				if (pm == kernel_pmap && va < KERNBASE)
					continue;
				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
					continue;
				ptep = pmap_pte(pm, va);
				if (pmap_pte_v(ptep))
					printf("%x:%x ", va, *ptep);
			}
}

void
pmap_pvdump(vm_paddr_t pa)
{
	pv_entry_t pv;
	pmap_t pmap;
	vm_page_t m;

	printf("pa %x", pa);
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
		pads(pmap);
	}
	printf(" ");
}
#endif