pmap-v6.c revision 325238
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 6 * Copyright (c) 2014-2016 Svatopluk Kraus <skra@FreeBSD.org> 7 * Copyright (c) 2014-2016 Michal Meloun <mmel@FreeBSD.org> 8 * All rights reserved. 9 * 10 * This code is derived from software contributed to Berkeley by 11 * the Systems Programming Group of the University of Utah Computer 12 * Science Department and William Jolitz of UUNET Technologies Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 39 */ 40/*- 41 * Copyright (c) 2003 Networks Associates Technology, Inc. 42 * All rights reserved. 43 * 44 * This software was developed for the FreeBSD Project by Jake Burkholder, 45 * Safeport Network Services, and Network Associates Laboratories, the 46 * Security Research Division of Network Associates, Inc. under 47 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 48 * CHATS research program. 49 * 50 * Redistribution and use in source and binary forms, with or without 51 * modification, are permitted provided that the following conditions 52 * are met: 53 * 1. Redistributions of source code must retain the above copyright 54 * notice, this list of conditions and the following disclaimer. 55 * 2. Redistributions in binary form must reproduce the above copyright 56 * notice, this list of conditions and the following disclaimer in the 57 * documentation and/or other materials provided with the distribution. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 */ 71 72#include <sys/cdefs.h> 73__FBSDID("$FreeBSD: stable/11/sys/arm/arm/pmap-v6.c 325238 2017-10-31 15:06:26Z markj $"); 74 75/* 76 * Manages physical address maps. 77 * 78 * Since the information managed by this module is 79 * also stored by the logical address mapping module, 80 * this module may throw away valid virtual-to-physical 81 * mappings at almost any time. However, invalidations 82 * of virtual-to-physical mappings must be done as 83 * requested. 84 * 85 * In order to cope with hardware architectures which 86 * make virtual-to-physical map invalidates expensive, 87 * this module may delay invalidate or reduced protection 88 * operations until such time as they are actually 89 * necessary. This module is given full information as 90 * to which processors are currently using which maps, 91 * and to when physical maps must be made correct. 92 */ 93 94#include "opt_vm.h" 95#include "opt_pmap.h" 96#include "opt_ddb.h" 97 98#include <sys/param.h> 99#include <sys/systm.h> 100#include <sys/kernel.h> 101#include <sys/ktr.h> 102#include <sys/lock.h> 103#include <sys/proc.h> 104#include <sys/rwlock.h> 105#include <sys/malloc.h> 106#include <sys/vmmeter.h> 107#include <sys/malloc.h> 108#include <sys/mman.h> 109#include <sys/sf_buf.h> 110#include <sys/smp.h> 111#include <sys/sched.h> 112#include <sys/sysctl.h> 113 114#ifdef DDB 115#include <ddb/ddb.h> 116#endif 117 118#include <machine/physmem.h> 119 120#include <vm/vm.h> 121#include <vm/uma.h> 122#include <vm/pmap.h> 123#include <vm/vm_param.h> 124#include <vm/vm_kern.h> 125#include <vm/vm_object.h> 126#include <vm/vm_map.h> 127#include <vm/vm_page.h> 128#include <vm/vm_pageout.h> 129#include <vm/vm_phys.h> 130#include <vm/vm_extern.h> 131#include <vm/vm_reserv.h> 132#include <sys/lock.h> 133#include <sys/mutex.h> 134 135#include <machine/md_var.h> 136#include <machine/pmap_var.h> 137#include <machine/cpu.h> 138#include <machine/pcb.h> 139#include <machine/sf_buf.h> 140#ifdef SMP 141#include <machine/smp.h> 142#endif 143 144#ifndef PMAP_SHPGPERPROC 145#define PMAP_SHPGPERPROC 200 146#endif 147 148#ifndef DIAGNOSTIC 149#define PMAP_INLINE __inline 150#else 151#define PMAP_INLINE 152#endif 153 154#ifdef PMAP_DEBUG 155static void pmap_zero_page_check(vm_page_t m); 156void pmap_debug(int level); 157int pmap_pid_dump(int pid); 158 159#define PDEBUG(_lev_,_stat_) \ 160 if (pmap_debug_level >= (_lev_)) \ 161 ((_stat_)) 162#define dprintf printf 163int pmap_debug_level = 1; 164#else /* PMAP_DEBUG */ 165#define PDEBUG(_lev_,_stat_) /* Nothing */ 166#define dprintf(x, arg...) 167#endif /* PMAP_DEBUG */ 168 169/* 170 * Level 2 page tables map definion ('max' is excluded). 
171 */ 172 173#define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) 174#define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) 175 176#define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) 177#define UPT2V_MAX_ADDRESS \ 178 ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) 179 180/* 181 * Promotion to a 1MB (PTE1) page mapping requires that the corresponding 182 * 4KB (PTE2) page mappings have identical settings for the following fields: 183 */ 184#define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ 185 PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ 186 PTE2_ATTR_MASK) 187 188#define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ 189 PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ 190 PTE1_ATTR_MASK) 191 192#define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ 193 (((l2_attr) & L2_C) ? L1_S_C : 0) | \ 194 (((l2_attr) & L2_B) ? L1_S_B : 0) | \ 195 (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ 196 (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ 197 (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ 198 (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ 199 (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ 200 (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ 201 (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ 202 (((l2_attr) & PTE2_W) ? PTE1_W : 0)) 203 204#define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ 205 (((l1_attr) & L1_S_C) ? L2_C : 0) | \ 206 (((l1_attr) & L1_S_B) ? L2_B : 0) | \ 207 (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ 208 (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ 209 (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ 210 (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ 211 (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ 212 (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ 213 (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ 214 (((l1_attr) & PTE1_W) ? PTE2_W : 0)) 215 216/* 217 * PTE2 descriptors creation macros. 218 */ 219#define PTE2_ATTR_DEFAULT vm_memattr_to_pte2(VM_MEMATTR_DEFAULT) 220#define PTE2_ATTR_PT vm_memattr_to_pte2(pt_memattr) 221 222#define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT) 223#define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT) 224 225#define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT) 226#define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT) 227 228#define PV_STATS 229#ifdef PV_STATS 230#define PV_STAT(x) do { x ; } while (0) 231#else 232#define PV_STAT(x) do { } while (0) 233#endif 234 235/* 236 * The boot_pt1 is used temporary in very early boot stage as L1 page table. 237 * We can init many things with no memory allocation thanks to its static 238 * allocation and this brings two main advantages: 239 * (1) other cores can be started very simply, 240 * (2) various boot loaders can be supported as its arguments can be processed 241 * in virtual address space and can be moved to safe location before 242 * first allocation happened. 243 * Only disadvantage is that boot_pt1 is used only in very early boot stage. 244 * However, the table is uninitialized and so lays in bss. Therefore kernel 245 * image size is not influenced. 246 * 247 * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and 248 * CPU suspend/resume game. 
249 */ 250extern pt1_entry_t boot_pt1[]; 251 252vm_paddr_t base_pt1; 253pt1_entry_t *kern_pt1; 254pt2_entry_t *kern_pt2tab; 255pt2_entry_t *PT2MAP; 256 257static uint32_t ttb_flags; 258static vm_memattr_t pt_memattr; 259ttb_entry_t pmap_kern_ttb; 260 261struct pmap kernel_pmap_store; 262LIST_HEAD(pmaplist, pmap); 263static struct pmaplist allpmaps; 264static struct mtx allpmaps_lock; 265 266vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 267vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 268 269static vm_offset_t kernel_vm_end_new; 270vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; 271vm_offset_t vm_max_kernel_address; 272vm_paddr_t kernel_l1pa; 273 274static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; 275 276/* 277 * Data for the pv entry allocation mechanism 278 */ 279static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 280static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 281static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ 282static int shpgperproc = PMAP_SHPGPERPROC; 283 284struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 285int pv_maxchunks; /* How many chunks we have KVA for */ 286vm_offset_t pv_vafree; /* freelist stored in the PTE */ 287 288vm_paddr_t first_managed_pa; 289#define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) 290 291/* 292 * All those kernel PT submaps that BSD is so fond of 293 */ 294static pt2_entry_t *CMAP3; 295static caddr_t CADDR3; 296caddr_t _tmppt = 0; 297 298struct msgbuf *msgbufp = NULL; /* XXX move it to machdep.c */ 299 300/* 301 * Crashdump maps. 302 */ 303static caddr_t crashdumpmap; 304 305static pt2_entry_t *PMAP1 = NULL, *PMAP2; 306static pt2_entry_t *PADDR1 = NULL, *PADDR2; 307#ifdef DDB 308static pt2_entry_t *PMAP3; 309static pt2_entry_t *PADDR3; 310static int PMAP3cpu __unused; /* for SMP only */ 311#endif 312#ifdef SMP 313static int PMAP1cpu; 314static int PMAP1changedcpu; 315SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 316 &PMAP1changedcpu, 0, 317 "Number of times pmap_pte2_quick changed CPU with same PMAP1"); 318#endif 319static int PMAP1changed; 320SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 321 &PMAP1changed, 0, 322 "Number of times pmap_pte2_quick changed PMAP1"); 323static int PMAP1unchanged; 324SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 325 &PMAP1unchanged, 0, 326 "Number of times pmap_pte2_quick didn't change PMAP1"); 327static struct mtx PMAP2mutex; 328 329static __inline void pt2_wirecount_init(vm_page_t m); 330static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, 331 vm_offset_t va); 332void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); 333 334/* 335 * Function to set the debug level of the pmap code. 336 */ 337#ifdef PMAP_DEBUG 338void 339pmap_debug(int level) 340{ 341 342 pmap_debug_level = level; 343 dprintf("pmap_debug: level=%d\n", pmap_debug_level); 344} 345#endif /* PMAP_DEBUG */ 346 347/* 348 * This table must corespond with memory attribute configuration in vm.h. 349 * First entry is used for normal system mapping. 350 * 351 * Device memory is always marked as shared. 352 * Normal memory is shared only in SMP . 353 * Not outer shareable bits are not used yet. 354 * Class 6 cannot be used on ARM11. 
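 *
 * Each tex_class[] entry packs four fields using the TEXDEF_* shifts and
 * masks defined below: memory type in bits [1:0], inner cache mode in
 * bits [3:2], outer cache mode in bits [5:4], and the not-outer-shareable
 * (NOS) bit in bit 6. The TEX() helper macro below builds such an entry.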
355 */ 356#define TEXDEF_TYPE_SHIFT 0 357#define TEXDEF_TYPE_MASK 0x3 358#define TEXDEF_INNER_SHIFT 2 359#define TEXDEF_INNER_MASK 0x3 360#define TEXDEF_OUTER_SHIFT 4 361#define TEXDEF_OUTER_MASK 0x3 362#define TEXDEF_NOS_SHIFT 6 363#define TEXDEF_NOS_MASK 0x1 364 365#define TEX(t, i, o, s) \ 366 ((t) << TEXDEF_TYPE_SHIFT) | \ 367 ((i) << TEXDEF_INNER_SHIFT) | \ 368 ((o) << TEXDEF_OUTER_SHIFT | \ 369 ((s) << TEXDEF_NOS_SHIFT)) 370 371static uint32_t tex_class[8] = { 372/* type inner cache outer cache */ 373 TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ 374 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ 375 TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ 376 TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ 377 TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ 378 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ 379 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ 380 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ 381}; 382#undef TEX 383 384static uint32_t pte2_attr_tab[8] = { 385 PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ 386 PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ 387 PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ 388 PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ 389 PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 390 0, /* 5 - NOT USED YET */ 391 0, /* 6 - NOT USED YET */ 392 0 /* 7 - NOT USED YET */ 393}; 394CTASSERT(VM_MEMATTR_WB_WA == 0); 395CTASSERT(VM_MEMATTR_NOCACHE == 1); 396CTASSERT(VM_MEMATTR_DEVICE == 2); 397CTASSERT(VM_MEMATTR_SO == 3); 398CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); 399 400static inline uint32_t 401vm_memattr_to_pte2(vm_memattr_t ma) 402{ 403 404 KASSERT((u_int)ma < 5, ("%s: bad vm_memattr_t %d", __func__, ma)); 405 return (pte2_attr_tab[(u_int)ma]); 406} 407 408static inline uint32_t 409vm_page_pte2_attr(vm_page_t m) 410{ 411 412 return (vm_memattr_to_pte2(m->md.pat_mode)); 413} 414 415/* 416 * Convert TEX definition entry to TTB flags. 417 */ 418static uint32_t 419encode_ttb_flags(int idx) 420{ 421 uint32_t inner, outer, nos, reg; 422 423 inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & 424 TEXDEF_INNER_MASK; 425 outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & 426 TEXDEF_OUTER_MASK; 427 nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & 428 TEXDEF_NOS_MASK; 429 430 reg = nos << 5; 431 reg |= outer << 3; 432 if (cpuinfo.coherent_walk) 433 reg |= (inner & 0x1) << 6; 434 reg |= (inner & 0x2) >> 1; 435#ifdef SMP 436 reg |= 1 << 1; 437#endif 438 return reg; 439} 440 441/* 442 * Set TEX remapping registers in current CPU. 443 */ 444void 445pmap_set_tex(void) 446{ 447 uint32_t prrr, nmrr; 448 uint32_t type, inner, outer, nos; 449 int i; 450 451#ifdef PMAP_PTE_NOCACHE 452 /* XXX fixme */ 453 if (cpuinfo.coherent_walk) { 454 pt_memattr = VM_MEMATTR_WB_WA; 455 ttb_flags = encode_ttb_flags(0); 456 } 457 else { 458 pt_memattr = VM_MEMATTR_NOCACHE; 459 ttb_flags = encode_ttb_flags(1); 460 } 461#else 462 pt_memattr = VM_MEMATTR_WB_WA; 463 ttb_flags = encode_ttb_flags(0); 464#endif 465 466 prrr = 0; 467 nmrr = 0; 468 469 /* Build remapping register from TEX classes. 
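         * Each class i contributes its type bits to PRRR[2i+1:2i] and its
         * NOS bit to PRRR[24+i]; its inner and outer cacheability bits go
         * to NMRR[2i+1:2i] and NMRR[2i+17:2i+16], respectively.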
         */
	for (i = 0; i < 8; i++) {
		type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) &
		    TEXDEF_TYPE_MASK;
		inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) &
		    TEXDEF_INNER_MASK;
		outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) &
		    TEXDEF_OUTER_MASK;
		nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) &
		    TEXDEF_NOS_MASK;

		prrr |= type << (i * 2);
		prrr |= nos << (i + 24);
		nmrr |= inner << (i * 2);
		nmrr |= outer << (i * 2 + 16);
	}
	/* Add shareable bits for device memory. */
	prrr |= PRRR_DS0 | PRRR_DS1;

	/* Add shareable bits for normal memory in SMP case. */
#ifdef SMP
	prrr |= PRRR_NS1;
#endif
	cp15_prrr_set(prrr);
	cp15_nmrr_set(nmrr);

	/* Caches are disabled, so full TLB flush should be enough. */
	tlb_flush_all_local();
}

/*
 * Remap one vm_memattr class to another one. This can be useful as a
 * workaround for SOC errata, e.g. if devices must be accessed using
 * SO memory class.
 *
 * !!! Please note that this function is an absolute last resort.
 * It should not be used under normal circumstances. !!!
 *
 * Usage rules:
 * - it shall be called after pmap_bootstrap_prepare() and before
 *   cpu_mp_start() (thus only on boot CPU). In practice, it's expected
 *   to be called from platform_attach() or platform_late_init().
 *
 * - if remapping doesn't change the caching mode, or an uncached class
 *   is remapped to any kind of cached one, then no other restriction exists.
 *
 * - if pmap_remap_vm_attr() changes the caching mode, but both (original and
 *   remapped) remain cached, then the caller is responsible for calling
 *   dcache_wbinv_poc_all().
 *
 * - remapping of any kind of cached class to uncached is not permitted.
 */
void
pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr)
{
	int old_idx, new_idx;

	/* Map VM memattrs to indices into the tex_class table. */
	old_idx = pte2_attr_tab[(int)old_attr];
	new_idx = pte2_attr_tab[(int)new_attr];

	/* Replace TEX attribute and apply it. */
	tex_class[old_idx] = tex_class[new_idx];
	pmap_set_tex();
}

/*
 * KERNBASE must be a multiple of NPT2_IN_PG * PTE1_SIZE. In other words,
 * KERNBASE is mapped by the first L2 page table in an L2 page table page.
 * The same constraint is met due to PT2MAP being placed just under KERNBASE.
 */
CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0);
CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE);

/*
 * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general.
 * For now, anyhow, the following check must be fulfilled.
 */
CTASSERT(PAGE_SIZE == PTE2_SIZE);
/*
 * We don't want to mess up MI code with all MMU and PMAP definitions,
 * so some things, which depend on other ones, are defined independently.
 * Now, it is time to check that we don't screw up something.
 */
CTASSERT(PDRSHIFT == PTE1_SHIFT);
/*
 * Check L1 and L2 page table entry definitions for consistency.
 */
CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1));
CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2));
/*
 * Check L2 page tables page consistency.
 */
CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2));
CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG);
/*
 * Check PT2TAB consistency.
 * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG.
 * This division should have no remainder.
 */
CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG));

/*
 * A PT2MAP magic.
 *
 * All level 2 page tables (PT2s) are mapped continuously and accordingly
 * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can
 * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page
 * must be used together, but not necessarily at once. The first PT2 in a
 * page must map things at a correctly aligned address and the others must
 * follow in the right order.
 */
#define NB_IN_PT2TAB	(PT2TAB_ENTRIES * sizeof(pt2_entry_t))
#define NPT2_IN_PT2TAB	(NB_IN_PT2TAB / NB_IN_PT2)
#define NPG_IN_PT2TAB	(NB_IN_PT2TAB / PAGE_SIZE)

/*
 * Check PT2TAB consistency.
 * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2.
 * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE.
 * Both divisions should have no remainder.
 */
CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2));
CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE));
/*
 * The implementation was made general, however, with the assumption
 * below in mind. In case of another value of NPG_IN_PT2TAB,
 * the code should be rechecked once more.
 */
CTASSERT(NPG_IN_PT2TAB == 1);

/*
 * Get offset of PT2 in a page
 * associated with given PT1 index.
 */
static __inline u_int
page_pt2off(u_int pt1_idx)
{

	return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2);
}

/*
 * Get physical address of PT2
 * associated with given PT2s page and PT1 index.
 */
static __inline vm_paddr_t
page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx)
{

	return (pgpa + page_pt2off(pt1_idx));
}

/*
 * Get first entry of PT2
 * associated with given PT2s page and PT1 index.
 */
static __inline pt2_entry_t *
page_pt2(vm_offset_t pgva, u_int pt1_idx)
{

	return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx)));
}

/*
 * Get the virtual address of the PT2s page (mapped in PT2MAP)
 * which holds the PT2 which holds the entry which maps the given
 * virtual address.
636 */ 637static __inline vm_offset_t 638pt2map_pt2pg(vm_offset_t va) 639{ 640 641 va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); 642 return ((vm_offset_t)pt2map_entry(va)); 643} 644 645/***************************************************************************** 646 * 647 * THREE pmap initialization milestones exist: 648 * 649 * locore.S 650 * -> fundamental init (including MMU) in ASM 651 * 652 * initarm() 653 * -> fundamental init continues in C 654 * -> first available physical address is known 655 * 656 * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) 657 * -> basic (safe) interface for physical address allocation is made 658 * -> basic (safe) interface for virtual mapping is made 659 * -> limited not SMP coherent work is possible 660 * 661 * -> more fundamental init continues in C 662 * -> locks and some more things are available 663 * -> all fundamental allocations and mappings are done 664 * 665 * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) 666 * -> phys_avail[] and virtual_avail is set 667 * -> control is passed to vm subsystem 668 * -> physical and virtual address allocation are off limit 669 * -> low level mapping functions, some SMP coherent, 670 * are available, which cannot be used before vm subsystem 671 * is being inited 672 * 673 * mi_startup() 674 * -> vm subsystem is being inited 675 * 676 * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) 677 * -> pmap is fully inited 678 * 679 *****************************************************************************/ 680 681/***************************************************************************** 682 * 683 * PMAP first stage initialization and utility functions 684 * for pre-bootstrap epoch. 685 * 686 * After pmap_bootstrap_prepare() is called, the following functions 687 * can be used: 688 * 689 * (1) strictly only for this stage functions for physical page allocations, 690 * virtual space allocations, and mappings: 691 * 692 * vm_paddr_t pmap_preboot_get_pages(u_int num); 693 * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); 694 * vm_offset_t pmap_preboot_reserve_pages(u_int num); 695 * vm_offset_t pmap_preboot_get_vpages(u_int num); 696 * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 697 * vm_prot_t prot, vm_memattr_t attr); 698 * 699 * (2) for all stages: 700 * 701 * vm_paddr_t pmap_kextract(vm_offset_t va); 702 * 703 * NOTE: This is not SMP coherent stage. 704 * 705 *****************************************************************************/ 706 707#define KERNEL_P2V(pa) \ 708 ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) 709#define KERNEL_V2P(va) \ 710 ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) 711 712static vm_paddr_t last_paddr; 713 714/* 715 * Pre-bootstrap epoch page allocator. 716 */ 717vm_paddr_t 718pmap_preboot_get_pages(u_int num) 719{ 720 vm_paddr_t ret; 721 722 ret = last_paddr; 723 last_paddr += num * PAGE_SIZE; 724 725 return (ret); 726} 727 728/* 729 * The fundamental initialization of PMAP stuff. 730 * 731 * Some things already happened in locore.S and some things could happen 732 * before pmap_bootstrap_prepare() is called, so let's recall what is done: 733 * 1. Caches are disabled. 734 * 2. We are running on virtual addresses already with 'boot_pt1' 735 * as L1 page table. 736 * 3. So far, all virtual addresses can be converted to physical ones and 737 * vice versa by the following macros: 738 * KERNEL_P2V(pa) .... physical to virtual ones, 739 * KERNEL_V2P(va) .... 
virtual to physical ones. 740 * 741 * What is done herein: 742 * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. 743 * 2. PT2MAP magic is brought to live. 744 * 3. Basic preboot functions for page allocations and mappings can be used. 745 * 4. Everything is prepared for L1 cache enabling. 746 * 747 * Variations: 748 * 1. To use second TTB register, so kernel and users page tables will be 749 * separated. This way process forking - pmap_pinit() - could be faster, 750 * it saves physical pages and KVA per a process, and it's simple change. 751 * However, it will lead, due to hardware matter, to the following: 752 * (a) 2G space for kernel and 2G space for users. 753 * (b) 1G space for kernel in low addresses and 3G for users above it. 754 * A question is: Is the case (b) really an option? Note that case (b) 755 * does save neither physical memory and KVA. 756 */ 757void 758pmap_bootstrap_prepare(vm_paddr_t last) 759{ 760 vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; 761 vm_offset_t pt2pg_va; 762 pt1_entry_t *pte1p; 763 pt2_entry_t *pte2p; 764 u_int i; 765 uint32_t actlr_mask, actlr_set, l1_attr; 766 767 /* 768 * Now, we are going to make real kernel mapping. Note that we are 769 * already running on some mapping made in locore.S and we expect 770 * that it's large enough to ensure nofault access to physical memory 771 * allocated herein before switch. 772 * 773 * As kernel image and everything needed before are and will be mapped 774 * by section mappings, we align last physical address to PTE1_SIZE. 775 */ 776 last_paddr = pte1_roundup(last); 777 778 /* 779 * Allocate and zero page(s) for kernel L1 page table. 780 * 781 * Note that it's first allocation on space which was PTE1_SIZE 782 * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. 783 */ 784 base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); 785 kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); 786 bzero((void*)kern_pt1, NB_IN_PT1); 787 pte1_sync_range(kern_pt1, NB_IN_PT1); 788 789 /* Allocate and zero page(s) for kernel PT2TAB. */ 790 pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); 791 kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); 792 bzero(kern_pt2tab, NB_IN_PT2TAB); 793 pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); 794 795 /* Allocate and zero page(s) for kernel L2 page tables. */ 796 pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); 797 pt2pg_va = KERNEL_P2V(pt2pg_pa); 798 size = NKPT2PG * PAGE_SIZE; 799 bzero((void*)pt2pg_va, size); 800 pte2_sync_range((pt2_entry_t *)pt2pg_va, size); 801 802 /* 803 * Add a physical memory segment (vm_phys_seg) corresponding to the 804 * preallocated pages for kernel L2 page tables so that vm_page 805 * structures representing these pages will be created. The vm_page 806 * structures are required for promotion of the corresponding kernel 807 * virtual addresses to section mappings. 808 */ 809 vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); 810 811 /* 812 * Insert allocated L2 page table pages to PT2TAB and make 813 * link to all PT2s in L1 page table. See how kernel_vm_end 814 * is initialized. 815 * 816 * We play simple and safe. So every KVA will have underlaying 817 * L2 page table, even kernel image mapped by sections. 818 */ 819 pte2p = kern_pt2tab_entry(KERNBASE); 820 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) 821 pt2tab_store(pte2p++, PTE2_KPT(pa)); 822 823 pte1p = kern_pte1(KERNBASE); 824 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) 825 pte1_store(pte1p++, PTE1_LINK(pa)); 826 827 /* Make section mappings for kernel. 
*/ 828 l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); 829 pte1p = kern_pte1(KERNBASE); 830 for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) 831 pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); 832 833 /* 834 * Get free and aligned space for PT2MAP and make L1 page table links 835 * to L2 page tables held in PT2TAB. 836 * 837 * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t 838 * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus 839 * each entry in PT2TAB maps all PT2s in a page. This implies that 840 * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. 841 */ 842 PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); 843 pte1p = kern_pte1((vm_offset_t)PT2MAP); 844 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 845 pte1_store(pte1p++, PTE1_LINK(pa)); 846 } 847 848 /* 849 * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. 850 * Each pmap will hold own PT2TAB, so the mapping should be not global. 851 */ 852 pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); 853 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 854 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 855 } 856 857 /* 858 * Choose correct L2 page table and make mappings for allocations 859 * made herein which replaces temporary locore.S mappings after a while. 860 * Note that PT2MAP cannot be used until we switch to kern_pt1. 861 * 862 * Note, that these allocations started aligned on 1M section and 863 * kernel PT1 was allocated first. Making of mappings must follow 864 * order of physical allocations as we've used KERNEL_P2V() macro 865 * for virtual addresses resolution. 866 */ 867 pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); 868 pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); 869 870 pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); 871 872 /* Make mapping for kernel L1 page table. */ 873 for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) 874 pte2_store(pte2p++, PTE2_KPT(pa)); 875 876 /* Make mapping for kernel PT2TAB. */ 877 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) 878 pte2_store(pte2p++, PTE2_KPT(pa)); 879 880 /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ 881 pmap_kern_ttb = base_pt1 | ttb_flags; 882 cpuinfo_get_actlr_modifier(&actlr_mask, &actlr_set); 883 reinit_mmu(pmap_kern_ttb, actlr_mask, actlr_set); 884 /* 885 * Initialize the first available KVA. As kernel image is mapped by 886 * sections, we are leaving some gap behind. 887 */ 888 virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; 889} 890 891/* 892 * Setup L2 page table page for given KVA. 893 * Used in pre-bootstrap epoch. 894 * 895 * Note that we have allocated NKPT2PG pages for L2 page tables in advance 896 * and used them for mapping KVA starting from KERNBASE. However, this is not 897 * enough. Vectors and devices need L2 page tables too. Note that they are 898 * even above VM_MAX_KERNEL_ADDRESS. 899 */ 900static __inline vm_paddr_t 901pmap_preboot_pt2pg_setup(vm_offset_t va) 902{ 903 pt2_entry_t *pte2p, pte2; 904 vm_paddr_t pt2pg_pa; 905 906 /* Get associated entry in PT2TAB. */ 907 pte2p = kern_pt2tab_entry(va); 908 909 /* Just return, if PT2s page exists already. */ 910 pte2 = pt2tab_load(pte2p); 911 if (pte2_is_valid(pte2)) 912 return (pte2_pa(pte2)); 913 914 KASSERT(va >= VM_MAX_KERNEL_ADDRESS, 915 ("%s: NKPT2PG too small", __func__)); 916 917 /* 918 * Allocate page for PT2s and insert it to PT2TAB. 919 * In other words, map it into PT2MAP space. 
920 */ 921 pt2pg_pa = pmap_preboot_get_pages(1); 922 pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); 923 924 /* Zero all PT2s in allocated page. */ 925 bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); 926 pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); 927 928 return (pt2pg_pa); 929} 930 931/* 932 * Setup L2 page table for given KVA. 933 * Used in pre-bootstrap epoch. 934 */ 935static void 936pmap_preboot_pt2_setup(vm_offset_t va) 937{ 938 pt1_entry_t *pte1p; 939 vm_paddr_t pt2pg_pa, pt2_pa; 940 941 /* Setup PT2's page. */ 942 pt2pg_pa = pmap_preboot_pt2pg_setup(va); 943 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); 944 945 /* Insert PT2 to PT1. */ 946 pte1p = kern_pte1(va); 947 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 948} 949 950/* 951 * Get L2 page entry associated with given KVA. 952 * Used in pre-bootstrap epoch. 953 */ 954static __inline pt2_entry_t* 955pmap_preboot_vtopte2(vm_offset_t va) 956{ 957 pt1_entry_t *pte1p; 958 959 /* Setup PT2 if needed. */ 960 pte1p = kern_pte1(va); 961 if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ 962 pmap_preboot_pt2_setup(va); 963 964 return (pt2map_entry(va)); 965} 966 967/* 968 * Pre-bootstrap epoch page(s) mapping(s). 969 */ 970void 971pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) 972{ 973 u_int i; 974 pt2_entry_t *pte2p; 975 976 /* Map all the pages. */ 977 for (i = 0; i < num; i++) { 978 pte2p = pmap_preboot_vtopte2(va); 979 pte2_store(pte2p, PTE2_KRW(pa)); 980 va += PAGE_SIZE; 981 pa += PAGE_SIZE; 982 } 983} 984 985/* 986 * Pre-bootstrap epoch virtual space alocator. 987 */ 988vm_offset_t 989pmap_preboot_reserve_pages(u_int num) 990{ 991 u_int i; 992 vm_offset_t start, va; 993 pt2_entry_t *pte2p; 994 995 /* Allocate virtual space. */ 996 start = va = virtual_avail; 997 virtual_avail += num * PAGE_SIZE; 998 999 /* Zero the mapping. */ 1000 for (i = 0; i < num; i++) { 1001 pte2p = pmap_preboot_vtopte2(va); 1002 pte2_store(pte2p, 0); 1003 va += PAGE_SIZE; 1004 } 1005 1006 return (start); 1007} 1008 1009/* 1010 * Pre-bootstrap epoch page(s) allocation and mapping(s). 1011 */ 1012vm_offset_t 1013pmap_preboot_get_vpages(u_int num) 1014{ 1015 vm_paddr_t pa; 1016 vm_offset_t va; 1017 1018 /* Allocate physical page(s). */ 1019 pa = pmap_preboot_get_pages(num); 1020 1021 /* Allocate virtual space. */ 1022 va = virtual_avail; 1023 virtual_avail += num * PAGE_SIZE; 1024 1025 /* Map and zero all. */ 1026 pmap_preboot_map_pages(pa, va, num); 1027 bzero((void *)va, num * PAGE_SIZE); 1028 1029 return (va); 1030} 1031 1032/* 1033 * Pre-bootstrap epoch page mapping(s) with attributes. 1034 */ 1035void 1036pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 1037 vm_prot_t prot, vm_memattr_t attr) 1038{ 1039 u_int num; 1040 u_int l1_attr, l1_prot, l2_prot, l2_attr; 1041 pt1_entry_t *pte1p; 1042 pt2_entry_t *pte2p; 1043 1044 l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; 1045 l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1046 l2_attr = vm_memattr_to_pte2(attr); 1047 l1_prot = ATTR_TO_L1(l2_prot); 1048 l1_attr = ATTR_TO_L1(l2_attr); 1049 1050 /* Map all the pages. 
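         * Whenever both 'va' and 'pa' are PTE1 (1 MB section) aligned and
         * at least PTE1_SIZE bytes remain, a single L1 section mapping is
         * installed; otherwise the range is mapped with individual 4 KB
         * L2 pages.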
*/ 1051 num = round_page(size); 1052 while (num > 0) { 1053 if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { 1054 pte1p = kern_pte1(va); 1055 pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); 1056 va += PTE1_SIZE; 1057 pa += PTE1_SIZE; 1058 num -= PTE1_SIZE; 1059 } else { 1060 pte2p = pmap_preboot_vtopte2(va); 1061 pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); 1062 va += PAGE_SIZE; 1063 pa += PAGE_SIZE; 1064 num -= PAGE_SIZE; 1065 } 1066 } 1067} 1068 1069/* 1070 * Extract from the kernel page table the physical address 1071 * that is mapped by the given virtual address "va". 1072 */ 1073vm_paddr_t 1074pmap_kextract(vm_offset_t va) 1075{ 1076 vm_paddr_t pa; 1077 pt1_entry_t pte1; 1078 pt2_entry_t pte2; 1079 1080 pte1 = pte1_load(kern_pte1(va)); 1081 if (pte1_is_section(pte1)) { 1082 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1083 } else if (pte1_is_link(pte1)) { 1084 /* 1085 * We should beware of concurrent promotion that changes 1086 * pte1 at this point. However, it's not a problem as PT2 1087 * page is preserved by promotion in PT2TAB. So even if 1088 * it happens, using of PT2MAP is still safe. 1089 * 1090 * QQQ: However, concurrent removing is a problem which 1091 * ends in abort on PT2MAP space. Locking must be used 1092 * to deal with this. 1093 */ 1094 pte2 = pte2_load(pt2map_entry(va)); 1095 pa = pte2_pa(pte2) | (va & PTE2_OFFSET); 1096 } 1097 else { 1098 panic("%s: va %#x pte1 %#x", __func__, va, pte1); 1099 } 1100 return (pa); 1101} 1102 1103/* 1104 * Extract from the kernel page table the physical address 1105 * that is mapped by the given virtual address "va". Also 1106 * return L2 page table entry which maps the address. 1107 * 1108 * This is only intended to be used for panic dumps. 1109 */ 1110vm_paddr_t 1111pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) 1112{ 1113 vm_paddr_t pa; 1114 pt1_entry_t pte1; 1115 pt2_entry_t pte2; 1116 1117 pte1 = pte1_load(kern_pte1(va)); 1118 if (pte1_is_section(pte1)) { 1119 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1120 pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; 1121 } else if (pte1_is_link(pte1)) { 1122 pte2 = pte2_load(pt2map_entry(va)); 1123 pa = pte2_pa(pte2); 1124 } else { 1125 pte2 = 0; 1126 pa = 0; 1127 } 1128 if (pte2p != NULL) 1129 *pte2p = pte2; 1130 return (pa); 1131} 1132 1133/***************************************************************************** 1134 * 1135 * PMAP second stage initialization and utility functions 1136 * for bootstrap epoch. 1137 * 1138 * After pmap_bootstrap() is called, the following functions for 1139 * mappings can be used: 1140 * 1141 * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); 1142 * void pmap_kremove(vm_offset_t va); 1143 * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, 1144 * int prot); 1145 * 1146 * NOTE: This is not SMP coherent stage. And physical page allocation is not 1147 * allowed during this stage. 1148 * 1149 *****************************************************************************/ 1150 1151/* 1152 * Initialize kernel PMAP locks and lists, kernel_pmap itself, and 1153 * reserve various virtual spaces for temporary mappings. 1154 */ 1155void 1156pmap_bootstrap(vm_offset_t firstaddr) 1157{ 1158 pt2_entry_t *unused __unused; 1159 struct pcpu *pc; 1160 1161 /* 1162 * Initialize the kernel pmap (which is statically allocated). 
1163 */ 1164 PMAP_LOCK_INIT(kernel_pmap); 1165 kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ 1166 kernel_pmap->pm_pt1 = kern_pt1; 1167 kernel_pmap->pm_pt2tab = kern_pt2tab; 1168 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1169 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1170 1171 /* 1172 * Initialize the global pv list lock. 1173 */ 1174 rw_init(&pvh_global_lock, "pmap pv global"); 1175 1176 LIST_INIT(&allpmaps); 1177 1178 /* 1179 * Request a spin mutex so that changes to allpmaps cannot be 1180 * preempted by smp_rendezvous_cpus(). 1181 */ 1182 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 1183 mtx_lock_spin(&allpmaps_lock); 1184 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 1185 mtx_unlock_spin(&allpmaps_lock); 1186 1187 /* 1188 * Reserve some special page table entries/VA space for temporary 1189 * mapping of pages. 1190 */ 1191#define SYSMAP(c, p, v, n) do { \ 1192 v = (c)pmap_preboot_reserve_pages(n); \ 1193 p = pt2map_entry((vm_offset_t)v); \ 1194 } while (0) 1195 1196 /* 1197 * Local CMAP1/CMAP2 are used for zeroing and copying pages. 1198 * Local CMAP2 is also used for data cache cleaning. 1199 * Global CMAP3 is used for the idle process page zeroing. 1200 */ 1201 pc = get_pcpu(); 1202 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1203 SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); 1204 SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); 1205 SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); 1206 SYSMAP(caddr_t, CMAP3, CADDR3, 1); 1207 1208 /* 1209 * Crashdump maps. 1210 */ 1211 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); 1212 1213 /* 1214 * _tmppt is used for reading arbitrary physical pages via /dev/mem. 1215 */ 1216 SYSMAP(caddr_t, unused, _tmppt, 1); 1217 1218 /* 1219 * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), 1220 * respectively. PADDR3 is used by pmap_pte2_ddb(). 1221 */ 1222 SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); 1223 SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); 1224#ifdef DDB 1225 SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); 1226#endif 1227 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 1228 1229 /* 1230 * Note that in very short time in initarm(), we are going to 1231 * initialize phys_avail[] array and no further page allocation 1232 * can happen after that until vm subsystem will be initialized. 1233 */ 1234 kernel_vm_end_new = kernel_vm_end; 1235 virtual_end = vm_max_kernel_address; 1236} 1237 1238static void 1239pmap_init_reserved_pages(void) 1240{ 1241 struct pcpu *pc; 1242 vm_offset_t pages; 1243 int i; 1244 1245 CPU_FOREACH(i) { 1246 pc = pcpu_find(i); 1247 /* 1248 * Skip if the mapping has already been initialized, 1249 * i.e. this is the BSP. 1250 */ 1251 if (pc->pc_cmap1_addr != 0) 1252 continue; 1253 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1254 pages = kva_alloc(PAGE_SIZE * 3); 1255 if (pages == 0) 1256 panic("%s: unable to allocate KVA", __func__); 1257 pc->pc_cmap1_pte2p = pt2map_entry(pages); 1258 pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); 1259 pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); 1260 pc->pc_cmap1_addr = (caddr_t)pages; 1261 pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); 1262 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 1263 } 1264} 1265SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 1266 1267/* 1268 * The function can already be use in second initialization stage. 1269 * As such, the function DOES NOT call pmap_growkernel() where PT2 1270 * allocation can happen. 
So if used, be sure that PT2 for given 1271 * virtual address is allocated already! 1272 * 1273 * Add a wired page to the kva. 1274 * Note: not SMP coherent. 1275 */ 1276static __inline void 1277pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, 1278 uint32_t attr) 1279{ 1280 pt1_entry_t *pte1p; 1281 pt2_entry_t *pte2p; 1282 1283 pte1p = kern_pte1(va); 1284 if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ 1285 /* 1286 * This is a very low level function, so PT2 and particularly 1287 * PT2PG associated with given virtual address must be already 1288 * allocated. It's a pain mainly during pmap initialization 1289 * stage. However, called after pmap initialization with 1290 * virtual address not under kernel_vm_end will lead to 1291 * the same misery. 1292 */ 1293 if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) 1294 panic("%s: kernel PT2 not allocated!", __func__); 1295 } 1296 1297 pte2p = pt2map_entry(va); 1298 pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); 1299} 1300 1301PMAP_INLINE void 1302pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1303{ 1304 1305 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); 1306} 1307 1308/* 1309 * Remove a page from the kernel pagetables. 1310 * Note: not SMP coherent. 1311 */ 1312PMAP_INLINE void 1313pmap_kremove(vm_offset_t va) 1314{ 1315 pt2_entry_t *pte2p; 1316 1317 pte2p = pt2map_entry(va); 1318 pte2_clear(pte2p); 1319} 1320 1321/* 1322 * Share new kernel PT2PG with all pmaps. 1323 * The caller is responsible for maintaining TLB consistency. 1324 */ 1325static void 1326pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) 1327{ 1328 pmap_t pmap; 1329 pt2_entry_t *pte2p; 1330 1331 mtx_lock_spin(&allpmaps_lock); 1332 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1333 pte2p = pmap_pt2tab_entry(pmap, va); 1334 pt2tab_store(pte2p, npte2); 1335 } 1336 mtx_unlock_spin(&allpmaps_lock); 1337} 1338 1339/* 1340 * Share new kernel PTE1 with all pmaps. 1341 * The caller is responsible for maintaining TLB consistency. 1342 */ 1343static void 1344pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) 1345{ 1346 pmap_t pmap; 1347 pt1_entry_t *pte1p; 1348 1349 mtx_lock_spin(&allpmaps_lock); 1350 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1351 pte1p = pmap_pte1(pmap, va); 1352 pte1_store(pte1p, npte1); 1353 } 1354 mtx_unlock_spin(&allpmaps_lock); 1355} 1356 1357/* 1358 * Used to map a range of physical addresses into kernel 1359 * virtual address space. 1360 * 1361 * The value passed in '*virt' is a suggested virtual address for 1362 * the mapping. Architectures which can support a direct-mapped 1363 * physical to virtual region can return the appropriate address 1364 * within that region, leaving '*virt' unchanged. Other 1365 * architectures should map the pages starting at '*virt' and 1366 * update '*virt' with the first usable address after the mapped 1367 * region. 1368 * 1369 * NOTE: Read the comments above pmap_kenter_prot_attr() as 1370 * the function is used herein! 1371 */ 1372vm_offset_t 1373pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1374{ 1375 vm_offset_t va, sva; 1376 vm_paddr_t pte1_offset; 1377 pt1_entry_t npte1; 1378 uint32_t l1prot, l2prot; 1379 uint32_t l1attr, l2attr; 1380 1381 PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," 1382 " prot = %d\n", __func__, *virt, start, end, end - start, prot)); 1383 1384 l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; 1385 l2prot |= (prot & VM_PROT_EXECUTE) ? 
PTE2_X : PTE2_NX; 1386 l1prot = ATTR_TO_L1(l2prot); 1387 1388 l2attr = PTE2_ATTR_DEFAULT; 1389 l1attr = ATTR_TO_L1(l2attr); 1390 1391 va = *virt; 1392 /* 1393 * Does the physical address range's size and alignment permit at 1394 * least one section mapping to be created? 1395 */ 1396 pte1_offset = start & PTE1_OFFSET; 1397 if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= 1398 PTE1_SIZE) { 1399 /* 1400 * Increase the starting virtual address so that its alignment 1401 * does not preclude the use of section mappings. 1402 */ 1403 if ((va & PTE1_OFFSET) < pte1_offset) 1404 va = pte1_trunc(va) + pte1_offset; 1405 else if ((va & PTE1_OFFSET) > pte1_offset) 1406 va = pte1_roundup(va) + pte1_offset; 1407 } 1408 sva = va; 1409 while (start < end) { 1410 if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { 1411 KASSERT((va & PTE1_OFFSET) == 0, 1412 ("%s: misaligned va %#x", __func__, va)); 1413 npte1 = PTE1_KERN(start, l1prot, l1attr); 1414 pmap_kenter_pte1(va, npte1); 1415 va += PTE1_SIZE; 1416 start += PTE1_SIZE; 1417 } else { 1418 pmap_kenter_prot_attr(va, start, l2prot, l2attr); 1419 va += PAGE_SIZE; 1420 start += PAGE_SIZE; 1421 } 1422 } 1423 tlb_flush_range(sva, va - sva); 1424 *virt = va; 1425 return (sva); 1426} 1427 1428/* 1429 * Make a temporary mapping for a physical address. 1430 * This is only intended to be used for panic dumps. 1431 */ 1432void * 1433pmap_kenter_temporary(vm_paddr_t pa, int i) 1434{ 1435 vm_offset_t va; 1436 1437 /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ 1438 1439 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 1440 pmap_kenter(va, pa); 1441 tlb_flush_local(va); 1442 return ((void *)crashdumpmap); 1443} 1444 1445 1446/************************************* 1447 * 1448 * TLB & cache maintenance routines. 1449 * 1450 *************************************/ 1451 1452/* 1453 * We inline these within pmap.c for speed. 1454 */ 1455PMAP_INLINE void 1456pmap_tlb_flush(pmap_t pmap, vm_offset_t va) 1457{ 1458 1459 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1460 tlb_flush(va); 1461} 1462 1463PMAP_INLINE void 1464pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) 1465{ 1466 1467 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1468 tlb_flush_range(sva, size); 1469} 1470 1471/* 1472 * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. 1473 * Requirements: 1474 * - Must deal with pages in order to ensure that none of the PTE2_* bits 1475 * are ever set, PTE2_V in particular. 1476 * - Assumes we can write to pte2s without pte2_store() atomic ops. 1477 * - Assumes nothing will ever test these addresses for 0 to indicate 1478 * no mapping instead of correctly checking PTE2_V. 1479 * - Assumes a vm_offset_t will fit in a pte2 (true for arm). 1480 * Because PTE2_V is never set, there can be no mappings to invalidate. 1481 */ 1482static vm_offset_t 1483pmap_pte2list_alloc(vm_offset_t *head) 1484{ 1485 pt2_entry_t *pte2p; 1486 vm_offset_t va; 1487 1488 va = *head; 1489 if (va == 0) 1490 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 1491 pte2p = pt2map_entry(va); 1492 *head = *pte2p; 1493 if (*head & PTE2_V) 1494 panic("%s: va with PTE2_V set!", __func__); 1495 *pte2p = 0; 1496 return (va); 1497} 1498 1499static void 1500pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) 1501{ 1502 pt2_entry_t *pte2p; 1503 1504 if (va & PTE2_V) 1505 panic("%s: freeing va with PTE2_V set!", __func__); 1506 pte2p = pt2map_entry(va); 1507 *pte2p = *head; /* virtual! 
PTE2_V is 0 though */ 1508 *head = va; 1509} 1510 1511static void 1512pmap_pte2list_init(vm_offset_t *head, void *base, int npages) 1513{ 1514 int i; 1515 vm_offset_t va; 1516 1517 *head = 0; 1518 for (i = npages - 1; i >= 0; i--) { 1519 va = (vm_offset_t)base + i * PAGE_SIZE; 1520 pmap_pte2list_free(head, va); 1521 } 1522} 1523 1524/***************************************************************************** 1525 * 1526 * PMAP third and final stage initialization. 1527 * 1528 * After pmap_init() is called, PMAP subsystem is fully initialized. 1529 * 1530 *****************************************************************************/ 1531 1532SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 1533 1534SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 1535 "Max number of PV entries"); 1536SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 1537 "Page share factor per proc"); 1538 1539static u_long nkpt2pg = NKPT2PG; 1540SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, 1541 &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); 1542 1543static int sp_enabled = 1; 1544SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 1545 &sp_enabled, 0, "Are large page mappings enabled?"); 1546 1547static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD, 0, 1548 "1MB page mapping counters"); 1549 1550static u_long pmap_pte1_demotions; 1551SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, 1552 &pmap_pte1_demotions, 0, "1MB page demotions"); 1553 1554static u_long pmap_pte1_mappings; 1555SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, 1556 &pmap_pte1_mappings, 0, "1MB page mappings"); 1557 1558static u_long pmap_pte1_p_failures; 1559SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, 1560 &pmap_pte1_p_failures, 0, "1MB page promotion failures"); 1561 1562static u_long pmap_pte1_promotions; 1563SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, 1564 &pmap_pte1_promotions, 0, "1MB page promotions"); 1565 1566static u_long pmap_pte1_kern_demotions; 1567SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, 1568 &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); 1569 1570static u_long pmap_pte1_kern_promotions; 1571SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, 1572 &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); 1573 1574static __inline ttb_entry_t 1575pmap_ttb_get(pmap_t pmap) 1576{ 1577 1578 return (vtophys(pmap->pm_pt1) | ttb_flags); 1579} 1580 1581/* 1582 * Initialize a vm_page's machine-dependent fields. 1583 * 1584 * Variations: 1585 * 1. Pages for L2 page tables are always not managed. So, pv_list and 1586 * pt2_wirecount can share same physical space. However, proper 1587 * initialization on a page alloc for page tables and reinitialization 1588 * on the page free must be ensured. 1589 */ 1590void 1591pmap_page_init(vm_page_t m) 1592{ 1593 1594 TAILQ_INIT(&m->md.pv_list); 1595 pt2_wirecount_init(m); 1596 m->md.pat_mode = VM_MEMATTR_DEFAULT; 1597} 1598 1599/* 1600 * Virtualization for faster way how to zero whole page. 1601 */ 1602static __inline void 1603pagezero(void *page) 1604{ 1605 1606 bzero(page, PAGE_SIZE); 1607} 1608 1609/* 1610 * Zero L2 page table page. 1611 * Use same KVA as in pmap_zero_page(). 
1612 */ 1613static __inline vm_paddr_t 1614pmap_pt2pg_zero(vm_page_t m) 1615{ 1616 pt2_entry_t *cmap2_pte2p; 1617 vm_paddr_t pa; 1618 struct pcpu *pc; 1619 1620 pa = VM_PAGE_TO_PHYS(m); 1621 1622 /* 1623 * XXX: For now, we map whole page even if it's already zero, 1624 * to sync it even if the sync is only DSB. 1625 */ 1626 sched_pin(); 1627 pc = get_pcpu(); 1628 cmap2_pte2p = pc->pc_cmap2_pte2p; 1629 mtx_lock(&pc->pc_cmap_lock); 1630 if (pte2_load(cmap2_pte2p) != 0) 1631 panic("%s: CMAP2 busy", __func__); 1632 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 1633 vm_page_pte2_attr(m))); 1634 /* Even VM_ALLOC_ZERO request is only advisory. */ 1635 if ((m->flags & PG_ZERO) == 0) 1636 pagezero(pc->pc_cmap2_addr); 1637 pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); 1638 pte2_clear(cmap2_pte2p); 1639 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 1640 1641 /* 1642 * Unpin the thread before releasing the lock. Otherwise the thread 1643 * could be rescheduled while still bound to the current CPU, only 1644 * to unpin itself immediately upon resuming execution. 1645 */ 1646 sched_unpin(); 1647 mtx_unlock(&pc->pc_cmap_lock); 1648 1649 return (pa); 1650} 1651 1652/* 1653 * Init just allocated page as L2 page table(s) holder 1654 * and return its physical address. 1655 */ 1656static __inline vm_paddr_t 1657pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) 1658{ 1659 vm_paddr_t pa; 1660 pt2_entry_t *pte2p; 1661 1662 /* Check page attributes. */ 1663 if (m->md.pat_mode != pt_memattr) 1664 pmap_page_set_memattr(m, pt_memattr); 1665 1666 /* Zero page and init wire counts. */ 1667 pa = pmap_pt2pg_zero(m); 1668 pt2_wirecount_init(m); 1669 1670 /* 1671 * Map page to PT2MAP address space for given pmap. 1672 * Note that PT2MAP space is shared with all pmaps. 1673 */ 1674 if (pmap == kernel_pmap) 1675 pmap_kenter_pt2tab(va, PTE2_KPT(pa)); 1676 else { 1677 pte2p = pmap_pt2tab_entry(pmap, va); 1678 pt2tab_store(pte2p, PTE2_KPT_NG(pa)); 1679 } 1680 1681 return (pa); 1682} 1683 1684/* 1685 * Initialize the pmap module. 1686 * Called by vm_init, to initialize any structures that the pmap 1687 * system needs to map virtual memory. 1688 */ 1689void 1690pmap_init(void) 1691{ 1692 vm_size_t s; 1693 pt2_entry_t *pte2p, pte2; 1694 u_int i, pte1_idx, pv_npg; 1695 1696 PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); 1697 1698 /* 1699 * Initialize the vm page array entries for kernel pmap's 1700 * L2 page table pages allocated in advance. 1701 */ 1702 pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); 1703 pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); 1704 for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { 1705 vm_paddr_t pa; 1706 vm_page_t m; 1707 1708 pte2 = pte2_load(pte2p); 1709 KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); 1710 1711 pa = pte2_pa(pte2); 1712 m = PHYS_TO_VM_PAGE(pa); 1713 KASSERT(m >= vm_page_array && 1714 m < &vm_page_array[vm_page_array_size], 1715 ("%s: L2 page table page is out of range", __func__)); 1716 1717 m->pindex = pte1_idx; 1718 m->phys_addr = pa; 1719 pte1_idx += NPT2_IN_PG; 1720 } 1721 1722 /* 1723 * Initialize the address space (zone) for the pv entries. Set a 1724 * high water mark so that the system can recover from excessive 1725 * numbers of pv entries. 
1726 */ 1727 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1728 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 1729 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1730 pv_entry_max = roundup(pv_entry_max, _NPCPV); 1731 pv_entry_high_water = 9 * (pv_entry_max / 10); 1732 1733 /* 1734 * Are large page mappings enabled? 1735 */ 1736 TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); 1737 if (sp_enabled) { 1738 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1739 ("%s: can't assign to pagesizes[1]", __func__)); 1740 pagesizes[1] = PTE1_SIZE; 1741 } 1742 1743 /* 1744 * Calculate the size of the pv head table for sections. 1745 * Handle the possibility that "vm_phys_segs[...].end" is zero. 1746 * Note that the table is only for sections which could be promoted. 1747 */ 1748 first_managed_pa = pte1_trunc(vm_phys_segs[0].start); 1749 pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) 1750 - first_managed_pa) / PTE1_SIZE + 1; 1751 1752 /* 1753 * Allocate memory for the pv head table for sections. 1754 */ 1755 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1756 s = round_page(s); 1757 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 1758 M_WAITOK | M_ZERO); 1759 for (i = 0; i < pv_npg; i++) 1760 TAILQ_INIT(&pv_table[i].pv_list); 1761 1762 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 1763 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 1764 if (pv_chunkbase == NULL) 1765 panic("%s: not enough kvm for pv chunks", __func__); 1766 pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 1767} 1768 1769/* 1770 * Add a list of wired pages to the kva 1771 * this routine is only used for temporary 1772 * kernel mappings that do not need to have 1773 * page modification or references recorded. 1774 * Note that old mappings are simply written 1775 * over. The page *must* be wired. 1776 * Note: SMP coherent. Uses a ranged shootdown IPI. 1777 */ 1778void 1779pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1780{ 1781 u_int anychanged; 1782 pt2_entry_t *epte2p, *pte2p, pte2; 1783 vm_page_t m; 1784 vm_paddr_t pa; 1785 1786 anychanged = 0; 1787 pte2p = pt2map_entry(sva); 1788 epte2p = pte2p + count; 1789 while (pte2p < epte2p) { 1790 m = *ma++; 1791 pa = VM_PAGE_TO_PHYS(m); 1792 pte2 = pte2_load(pte2p); 1793 if ((pte2_pa(pte2) != pa) || 1794 (pte2_attr(pte2) != vm_page_pte2_attr(m))) { 1795 anychanged++; 1796 pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, 1797 vm_page_pte2_attr(m))); 1798 } 1799 pte2p++; 1800 } 1801 if (__predict_false(anychanged)) 1802 tlb_flush_range(sva, count * PAGE_SIZE); 1803} 1804 1805/* 1806 * This routine tears out page mappings from the 1807 * kernel -- it is meant only for temporary mappings. 1808 * Note: SMP coherent. Uses a ranged shootdown IPI. 1809 */ 1810void 1811pmap_qremove(vm_offset_t sva, int count) 1812{ 1813 vm_offset_t va; 1814 1815 va = sva; 1816 while (count-- > 0) { 1817 pmap_kremove(va); 1818 va += PAGE_SIZE; 1819 } 1820 tlb_flush_range(sva, va - sva); 1821} 1822 1823/* 1824 * Are we current address space or kernel? 1825 */ 1826static __inline int 1827pmap_is_current(pmap_t pmap) 1828{ 1829 1830 return (pmap == kernel_pmap || 1831 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); 1832} 1833 1834/* 1835 * If the given pmap is not the current or kernel pmap, the returned 1836 * pte2 must be released by passing it to pmap_pte2_release(). 
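 *
 * Typical use is to look the entry up, read it, and release it, as
 * pmap_extract() below does, e.g.:
 *
 *      pte2p = pmap_pte2(pmap, va);
 *      pte2 = pte2_load(pte2p);
 *      pmap_pte2_release(pte2p);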
1837 */ 1838static pt2_entry_t * 1839pmap_pte2(pmap_t pmap, vm_offset_t va) 1840{ 1841 pt1_entry_t pte1; 1842 vm_paddr_t pt2pg_pa; 1843 1844 pte1 = pte1_load(pmap_pte1(pmap, va)); 1845 if (pte1_is_section(pte1)) 1846 panic("%s: attempt to map PTE1", __func__); 1847 if (pte1_is_link(pte1)) { 1848 /* Are we current address space or kernel? */ 1849 if (pmap_is_current(pmap)) 1850 return (pt2map_entry(va)); 1851 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1852 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1853 mtx_lock(&PMAP2mutex); 1854 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 1855 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 1856 tlb_flush((vm_offset_t)PADDR2); 1857 } 1858 return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1859 } 1860 return (NULL); 1861} 1862 1863/* 1864 * Releases a pte2 that was obtained from pmap_pte2(). 1865 * Be prepared for the pte2p being NULL. 1866 */ 1867static __inline void 1868pmap_pte2_release(pt2_entry_t *pte2p) 1869{ 1870 1871 if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { 1872 mtx_unlock(&PMAP2mutex); 1873 } 1874} 1875 1876/* 1877 * Super fast pmap_pte2 routine best used when scanning 1878 * the pv lists. This eliminates many coarse-grained 1879 * invltlb calls. Note that many of the pv list 1880 * scans are across different pmaps. It is very wasteful 1881 * to do an entire tlb flush for checking a single mapping. 1882 * 1883 * If the given pmap is not the current pmap, pvh_global_lock 1884 * must be held and curthread pinned to a CPU. 1885 */ 1886static pt2_entry_t * 1887pmap_pte2_quick(pmap_t pmap, vm_offset_t va) 1888{ 1889 pt1_entry_t pte1; 1890 vm_paddr_t pt2pg_pa; 1891 1892 pte1 = pte1_load(pmap_pte1(pmap, va)); 1893 if (pte1_is_section(pte1)) 1894 panic("%s: attempt to map PTE1", __func__); 1895 if (pte1_is_link(pte1)) { 1896 /* Are we current address space or kernel? */ 1897 if (pmap_is_current(pmap)) 1898 return (pt2map_entry(va)); 1899 rw_assert(&pvh_global_lock, RA_WLOCKED); 1900 KASSERT(curthread->td_pinned > 0, 1901 ("%s: curthread not pinned", __func__)); 1902 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1903 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1904 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 1905 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 1906#ifdef SMP 1907 PMAP1cpu = PCPU_GET(cpuid); 1908#endif 1909 tlb_flush_local((vm_offset_t)PADDR1); 1910 PMAP1changed++; 1911 } else 1912#ifdef SMP 1913 if (PMAP1cpu != PCPU_GET(cpuid)) { 1914 PMAP1cpu = PCPU_GET(cpuid); 1915 tlb_flush_local((vm_offset_t)PADDR1); 1916 PMAP1changedcpu++; 1917 } else 1918#endif 1919 PMAP1unchanged++; 1920 return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1921 } 1922 return (NULL); 1923} 1924 1925/* 1926 * Routine: pmap_extract 1927 * Function: 1928 * Extract the physical page address associated 1929 * with the given map/virtual_address pair. 
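 *	Returns 0 when no valid mapping exists for the given address.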
1930 */ 1931vm_paddr_t 1932pmap_extract(pmap_t pmap, vm_offset_t va) 1933{ 1934 vm_paddr_t pa; 1935 pt1_entry_t pte1; 1936 pt2_entry_t *pte2p; 1937 1938 PMAP_LOCK(pmap); 1939 pte1 = pte1_load(pmap_pte1(pmap, va)); 1940 if (pte1_is_section(pte1)) 1941 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1942 else if (pte1_is_link(pte1)) { 1943 pte2p = pmap_pte2(pmap, va); 1944 pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); 1945 pmap_pte2_release(pte2p); 1946 } else 1947 pa = 0; 1948 PMAP_UNLOCK(pmap); 1949 return (pa); 1950} 1951 1952/* 1953 * Routine: pmap_extract_and_hold 1954 * Function: 1955 * Atomically extract and hold the physical page 1956 * with the given pmap and virtual address pair 1957 * if that mapping permits the given protection. 1958 */ 1959vm_page_t 1960pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1961{ 1962 vm_paddr_t pa, lockpa; 1963 pt1_entry_t pte1; 1964 pt2_entry_t pte2, *pte2p; 1965 vm_page_t m; 1966 1967 lockpa = 0; 1968 m = NULL; 1969 PMAP_LOCK(pmap); 1970retry: 1971 pte1 = pte1_load(pmap_pte1(pmap, va)); 1972 if (pte1_is_section(pte1)) { 1973 if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { 1974 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1975 if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) 1976 goto retry; 1977 m = PHYS_TO_VM_PAGE(pa); 1978 vm_page_hold(m); 1979 } 1980 } else if (pte1_is_link(pte1)) { 1981 pte2p = pmap_pte2(pmap, va); 1982 pte2 = pte2_load(pte2p); 1983 pmap_pte2_release(pte2p); 1984 if (pte2_is_valid(pte2) && 1985 (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { 1986 pa = pte2_pa(pte2); 1987 if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) 1988 goto retry; 1989 m = PHYS_TO_VM_PAGE(pa); 1990 vm_page_hold(m); 1991 } 1992 } 1993 PA_UNLOCK_COND(lockpa); 1994 PMAP_UNLOCK(pmap); 1995 return (m); 1996} 1997 1998/* 1999 * Grow the number of kernel L2 page table entries, if needed. 2000 */ 2001void 2002pmap_growkernel(vm_offset_t addr) 2003{ 2004 vm_page_t m; 2005 vm_paddr_t pt2pg_pa, pt2_pa; 2006 pt1_entry_t pte1; 2007 pt2_entry_t pte2; 2008 2009 PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); 2010 /* 2011 * All the time kernel_vm_end is first KVA for which underlying 2012 * L2 page table is either not allocated or linked from L1 page table 2013 * (not considering sections). Except for two possible cases: 2014 * 2015 * (1) in the very beginning as long as pmap_growkernel() was 2016 * not called, it could be first unused KVA (which is not 2017 * rounded up to PTE1_SIZE), 2018 * 2019 * (2) when all KVA space is mapped and kernel_map->max_offset 2020 * address is not rounded up to PTE1_SIZE. (For example, 2021 * it could be 0xFFFFFFFF.) 2022 */ 2023 kernel_vm_end = pte1_roundup(kernel_vm_end); 2024 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2025 addr = roundup2(addr, PTE1_SIZE); 2026 if (addr - 1 >= kernel_map->max_offset) 2027 addr = kernel_map->max_offset; 2028 while (kernel_vm_end < addr) { 2029 pte1 = pte1_load(kern_pte1(kernel_vm_end)); 2030 if (pte1_is_valid(pte1)) { 2031 kernel_vm_end += PTE1_SIZE; 2032 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2033 kernel_vm_end = kernel_map->max_offset; 2034 break; 2035 } 2036 continue; 2037 } 2038 2039 /* 2040 * kernel_vm_end_new is used in pmap_pinit() when kernel 2041 * mappings are entered to new pmap all at once to avoid race 2042 * between pmap_kenter_pte1() and kernel_vm_end increase. 2043 * The same aplies to pmap_kenter_pt2tab(). 
2044 */ 2045 kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; 2046 2047 pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); 2048 if (!pte2_is_valid(pte2)) { 2049 /* 2050 * Install new PT2s page into kernel PT2TAB. 2051 */ 2052 m = vm_page_alloc(NULL, 2053 pte1_index(kernel_vm_end) & ~PT2PG_MASK, 2054 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2055 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2056 if (m == NULL) 2057 panic("%s: no memory to grow kernel", __func__); 2058 /* 2059 * QQQ: To link all new L2 page tables from L1 page 2060 * table now and so pmap_kenter_pte1() them 2061 * at once together with pmap_kenter_pt2tab() 2062 * could be nice speed up. However, 2063 * pmap_growkernel() does not happen so often... 2064 * QQQ: The other TTBR is another option. 2065 */ 2066 pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, 2067 m); 2068 } else 2069 pt2pg_pa = pte2_pa(pte2); 2070 2071 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); 2072 pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); 2073 2074 kernel_vm_end = kernel_vm_end_new; 2075 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2076 kernel_vm_end = kernel_map->max_offset; 2077 break; 2078 } 2079 } 2080} 2081 2082static int 2083kvm_size(SYSCTL_HANDLER_ARGS) 2084{ 2085 unsigned long ksize = vm_max_kernel_address - KERNBASE; 2086 2087 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2088} 2089SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2090 0, 0, kvm_size, "IU", "Size of KVM"); 2091 2092static int 2093kvm_free(SYSCTL_HANDLER_ARGS) 2094{ 2095 unsigned long kfree = vm_max_kernel_address - kernel_vm_end; 2096 2097 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2098} 2099SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2100 0, 0, kvm_free, "IU", "Amount of KVM free"); 2101 2102/*********************************************** 2103 * 2104 * Pmap allocation/deallocation routines. 2105 * 2106 ***********************************************/ 2107 2108/* 2109 * Initialize the pmap for the swapper process. 2110 */ 2111void 2112pmap_pinit0(pmap_t pmap) 2113{ 2114 PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); 2115 2116 PMAP_LOCK_INIT(pmap); 2117 2118 /* 2119 * Kernel page table directory and pmap stuff around is already 2120 * initialized, we are using it right now and here. So, finish 2121 * only PMAP structures initialization for process0 ... 2122 * 2123 * Since the L1 page table and PT2TAB is shared with the kernel pmap, 2124 * which is already included in the list "allpmaps", this pmap does 2125 * not need to be inserted into that list. 
2126 */ 2127 pmap->pm_pt1 = kern_pt1; 2128 pmap->pm_pt2tab = kern_pt2tab; 2129 CPU_ZERO(&pmap->pm_active); 2130 PCPU_SET(curpmap, pmap); 2131 TAILQ_INIT(&pmap->pm_pvchunk); 2132 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2133 CPU_SET(0, &pmap->pm_active); 2134} 2135 2136static __inline void 2137pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, 2138 vm_offset_t eva) 2139{ 2140 u_int idx, count; 2141 2142 idx = pte1_index(sva); 2143 count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); 2144 bcopy(spte1p + idx, dpte1p + idx, count); 2145} 2146 2147static __inline void 2148pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, 2149 vm_offset_t eva) 2150{ 2151 u_int idx, count; 2152 2153 idx = pt2tab_index(sva); 2154 count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); 2155 bcopy(spte2p + idx, dpte2p + idx, count); 2156} 2157 2158/* 2159 * Initialize a preallocated and zeroed pmap structure, 2160 * such as one in a vmspace structure. 2161 */ 2162int 2163pmap_pinit(pmap_t pmap) 2164{ 2165 pt1_entry_t *pte1p; 2166 pt2_entry_t *pte2p; 2167 vm_paddr_t pa, pt2tab_pa; 2168 u_int i; 2169 2170 PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, 2171 pmap->pm_pt1)); 2172 2173 /* 2174 * No need to allocate L2 page table space yet but we do need 2175 * a valid L1 page table and PT2TAB table. 2176 * 2177 * Install shared kernel mappings to these tables. It's a little 2178 * tricky as some parts of KVA are reserved for vectors, devices, 2179 * and whatever else. These parts are supposed to be above 2180 * vm_max_kernel_address. Thus two regions should be installed: 2181 * 2182 * (1) <KERNBASE, kernel_vm_end), 2183 * (2) <vm_max_kernel_address, 0xFFFFFFFF>. 2184 * 2185 * QQQ: The second region should be stable enough to be installed 2186 * only once in time when the tables are allocated. 2187 * QQQ: Maybe copy of both regions at once could be faster ... 2188 * QQQ: Maybe the other TTBR is an option. 2189 * 2190 * Finally, install own PT2TAB table to these tables. 2191 */ 2192 2193 if (pmap->pm_pt1 == NULL) { 2194 pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(kernel_arena, 2195 NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, 2196 pt_memattr); 2197 if (pmap->pm_pt1 == NULL) 2198 return (0); 2199 } 2200 if (pmap->pm_pt2tab == NULL) { 2201 /* 2202 * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page 2203 * only, what should be the only size for 32 bit systems, 2204 * then we could allocate it with vm_page_alloc() and all 2205 * the stuff needed as other L2 page table pages. 2206 * (2) Note that a process PT2TAB is special L2 page table 2207 * page. Its mapping in kernel_arena is permanent and can 2208 * be used no matter which process is current. Its mapping 2209 * in PT2MAP can be used only for current process. 2210 */ 2211 pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(kernel_arena, 2212 NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); 2213 if (pmap->pm_pt2tab == NULL) { 2214 /* 2215 * QQQ: As struct pmap is allocated from UMA with 2216 * UMA_ZONE_NOFREE flag, it's important to leave 2217 * no allocation in pmap if initialization failed. 2218 */ 2219 kmem_free(kernel_arena, (vm_offset_t)pmap->pm_pt1, 2220 NB_IN_PT1); 2221 pmap->pm_pt1 = NULL; 2222 return (0); 2223 } 2224 /* 2225 * QQQ: Each L2 page table page vm_page_t has pindex set to 2226 * pte1 index of virtual address mapped by this page. 2227 * It's not valid for non kernel PT2TABs themselves. 
2228 * The pindex of these pages can not be altered because 2229 * of the way how they are allocated now. However, it 2230 * should not be a problem. 2231 */ 2232 } 2233 2234 mtx_lock_spin(&allpmaps_lock); 2235 /* 2236 * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), 2237 * kernel_vm_end_new is used here instead of kernel_vm_end. 2238 */ 2239 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, 2240 kernel_vm_end_new - 1); 2241 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 2242 0xFFFFFFFF); 2243 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, 2244 kernel_vm_end_new - 1); 2245 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 2246 0xFFFFFFFF); 2247 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 2248 mtx_unlock_spin(&allpmaps_lock); 2249 2250 /* 2251 * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. 2252 * I.e. self reference mapping. The PT2TAB is private, however mapped 2253 * into shared PT2MAP space, so the mapping should be not global. 2254 */ 2255 pt2tab_pa = vtophys(pmap->pm_pt2tab); 2256 pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); 2257 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 2258 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 2259 } 2260 2261 /* Insert PT2MAP PT2s into pmap PT1. */ 2262 pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); 2263 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 2264 pte1_store(pte1p++, PTE1_LINK(pa)); 2265 } 2266 2267 /* 2268 * Now synchronize new mapping which was made above. 2269 */ 2270 pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); 2271 pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); 2272 2273 CPU_ZERO(&pmap->pm_active); 2274 TAILQ_INIT(&pmap->pm_pvchunk); 2275 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2276 2277 return (1); 2278} 2279 2280#ifdef INVARIANTS 2281static boolean_t 2282pt2tab_user_is_empty(pt2_entry_t *tab) 2283{ 2284 u_int i, end; 2285 2286 end = pt2tab_index(VM_MAXUSER_ADDRESS); 2287 for (i = 0; i < end; i++) 2288 if (tab[i] != 0) return (FALSE); 2289 return (TRUE); 2290} 2291#endif 2292/* 2293 * Release any resources held by the given physical map. 2294 * Called when a pmap initialized by pmap_pinit is being released. 2295 * Should only be called if the map contains no valid mappings. 2296 */ 2297void 2298pmap_release(pmap_t pmap) 2299{ 2300#ifdef INVARIANTS 2301 vm_offset_t start, end; 2302#endif 2303 KASSERT(pmap->pm_stats.resident_count == 0, 2304 ("%s: pmap resident count %ld != 0", __func__, 2305 pmap->pm_stats.resident_count)); 2306 KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), 2307 ("%s: has allocated user PT2(s)", __func__)); 2308 KASSERT(CPU_EMPTY(&pmap->pm_active), 2309 ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); 2310 2311 mtx_lock_spin(&allpmaps_lock); 2312 LIST_REMOVE(pmap, pm_list); 2313 mtx_unlock_spin(&allpmaps_lock); 2314 2315#ifdef INVARIANTS 2316 start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); 2317 end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); 2318 bzero((char *)pmap->pm_pt1 + start, end - start); 2319 2320 start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); 2321 end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); 2322 bzero((char *)pmap->pm_pt2tab + start, end - start); 2323#endif 2324 /* 2325 * We are leaving PT1 and PT2TAB allocated on released pmap, 2326 * so hopefully UMA vmspace_zone will always be inited with 2327 * UMA_ZONE_NOFREE flag. 
2328 */ 2329} 2330 2331/********************************************************* 2332 * 2333 * L2 table pages and their pages management routines. 2334 * 2335 *********************************************************/ 2336 2337/* 2338 * Virtual interface for L2 page table wire counting. 2339 * 2340 * Each L2 page table in a page has own counter which counts a number of 2341 * valid mappings in a table. Global page counter counts mappings in all 2342 * tables in a page plus a single itself mapping in PT2TAB. 2343 * 2344 * During a promotion we leave the associated L2 page table counter 2345 * untouched, so the table (strictly speaking a page which holds it) 2346 * is never freed if promoted. 2347 * 2348 * If a page m->wire_count == 1 then no valid mappings exist in any L2 page 2349 * table in the page and the page itself is only mapped in PT2TAB. 2350 */ 2351 2352static __inline void 2353pt2_wirecount_init(vm_page_t m) 2354{ 2355 u_int i; 2356 2357 /* 2358 * Note: A page m is allocated with VM_ALLOC_WIRED flag and 2359 * m->wire_count should be already set correctly. 2360 * So, there is no need to set it again herein. 2361 */ 2362 for (i = 0; i < NPT2_IN_PG; i++) 2363 m->md.pt2_wirecount[i] = 0; 2364} 2365 2366static __inline void 2367pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) 2368{ 2369 2370 /* 2371 * Note: A just modificated pte2 (i.e. already allocated) 2372 * is acquiring one extra reference which must be 2373 * explicitly cleared. It influences the KASSERTs herein. 2374 * All L2 page tables in a page always belong to the same 2375 * pmap, so we allow only one extra reference for the page. 2376 */ 2377 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), 2378 ("%s: PT2 is overflowing ...", __func__)); 2379 KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), 2380 ("%s: PT2PG is overflowing ...", __func__)); 2381 2382 m->wire_count++; 2383 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; 2384} 2385 2386static __inline void 2387pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) 2388{ 2389 2390 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, 2391 ("%s: PT2 is underflowing ...", __func__)); 2392 KASSERT(m->wire_count > 1, 2393 ("%s: PT2PG is underflowing ...", __func__)); 2394 2395 m->wire_count--; 2396 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; 2397} 2398 2399static __inline void 2400pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) 2401{ 2402 2403 KASSERT(count <= NPTE2_IN_PT2, 2404 ("%s: invalid count %u", __func__, count)); 2405 KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], 2406 ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count, 2407 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); 2408 2409 m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; 2410 m->wire_count += count; 2411 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; 2412 2413 KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), 2414 ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count)); 2415} 2416 2417static __inline uint32_t 2418pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) 2419{ 2420 2421 return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); 2422} 2423 2424static __inline boolean_t 2425pt2_is_empty(vm_page_t m, vm_offset_t va) 2426{ 2427 2428 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); 2429} 2430 2431static __inline boolean_t 2432pt2_is_full(vm_page_t m, vm_offset_t va) 2433{ 2434 2435 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 2436 NPTE2_IN_PT2); 2437} 2438 2439static __inline boolean_t 
2440pt2pg_is_empty(vm_page_t m) 2441{ 2442 2443 return (m->wire_count == 1); 2444} 2445 2446/* 2447 * This routine is called if the L2 page table 2448 * is not mapped correctly. 2449 */ 2450static vm_page_t 2451_pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2452{ 2453 uint32_t pte1_idx; 2454 pt1_entry_t *pte1p; 2455 pt2_entry_t pte2; 2456 vm_page_t m; 2457 vm_paddr_t pt2pg_pa, pt2_pa; 2458 2459 pte1_idx = pte1_index(va); 2460 pte1p = pmap->pm_pt1 + pte1_idx; 2461 2462 KASSERT(pte1_load(pte1p) == 0, 2463 ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, 2464 pte1_load(pte1p))); 2465 2466 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); 2467 if (!pte2_is_valid(pte2)) { 2468 /* 2469 * Install new PT2s page into pmap PT2TAB. 2470 */ 2471 m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, 2472 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2473 if (m == NULL) { 2474 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2475 PMAP_UNLOCK(pmap); 2476 rw_wunlock(&pvh_global_lock); 2477 VM_WAIT; 2478 rw_wlock(&pvh_global_lock); 2479 PMAP_LOCK(pmap); 2480 } 2481 2482 /* 2483 * Indicate the need to retry. While waiting, 2484 * the L2 page table page may have been allocated. 2485 */ 2486 return (NULL); 2487 } 2488 pmap->pm_stats.resident_count++; 2489 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 2490 } else { 2491 pt2pg_pa = pte2_pa(pte2); 2492 m = PHYS_TO_VM_PAGE(pt2pg_pa); 2493 } 2494 2495 pt2_wirecount_inc(m, pte1_idx); 2496 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 2497 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 2498 2499 return (m); 2500} 2501 2502static vm_page_t 2503pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2504{ 2505 u_int pte1_idx; 2506 pt1_entry_t *pte1p, pte1; 2507 vm_page_t m; 2508 2509 pte1_idx = pte1_index(va); 2510retry: 2511 pte1p = pmap->pm_pt1 + pte1_idx; 2512 pte1 = pte1_load(pte1p); 2513 2514 /* 2515 * This supports switching from a 1MB page to a 2516 * normal 4K page. 2517 */ 2518 if (pte1_is_section(pte1)) { 2519 (void)pmap_demote_pte1(pmap, pte1p, va); 2520 /* 2521 * Reload pte1 after demotion. 2522 * 2523 * Note: Demotion can even fail as either PT2 is not find for 2524 * the virtual address or PT2PG can not be allocated. 2525 */ 2526 pte1 = pte1_load(pte1p); 2527 } 2528 2529 /* 2530 * If the L2 page table page is mapped, we just increment the 2531 * hold count, and activate it. 2532 */ 2533 if (pte1_is_link(pte1)) { 2534 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2535 pt2_wirecount_inc(m, pte1_idx); 2536 } else { 2537 /* 2538 * Here if the PT2 isn't mapped, or if it has 2539 * been deallocated. 2540 */ 2541 m = _pmap_allocpte2(pmap, va, flags); 2542 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2543 goto retry; 2544 } 2545 2546 return (m); 2547} 2548 2549static __inline void 2550pmap_free_zero_pages(struct spglist *free) 2551{ 2552 vm_page_t m; 2553 2554 while ((m = SLIST_FIRST(free)) != NULL) { 2555 SLIST_REMOVE_HEAD(free, plinks.s.ss); 2556 /* Preserve the page's PG_ZERO setting. */ 2557 vm_page_free_toq(m); 2558 } 2559} 2560 2561/* 2562 * Schedule the specified unused L2 page table page to be freed. Specifically, 2563 * add the page to the specified list of pages that will be released to the 2564 * physical memory manager after the TLB has been updated. 
2565 */ 2566static __inline void 2567pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) 2568{ 2569 2570 /* 2571 * Put page on a list so that it is released after 2572 * *ALL* TLB shootdown is done 2573 */ 2574#ifdef PMAP_DEBUG 2575 pmap_zero_page_check(m); 2576#endif 2577 m->flags |= PG_ZERO; 2578 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2579} 2580 2581/* 2582 * Unwire L2 page tables page. 2583 */ 2584static void 2585pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) 2586{ 2587 pt1_entry_t *pte1p, opte1 __unused; 2588 pt2_entry_t *pte2p; 2589 uint32_t i; 2590 2591 KASSERT(pt2pg_is_empty(m), 2592 ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); 2593 2594 /* 2595 * Unmap all L2 page tables in the page from L1 page table. 2596 * 2597 * QQQ: Individual L2 page tables (except the last one) can be unmapped 2598 * earlier. However, we are doing that this way. 2599 */ 2600 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 2601 ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); 2602 pte1p = pmap->pm_pt1 + m->pindex; 2603 for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { 2604 KASSERT(m->md.pt2_wirecount[i] == 0, 2605 ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); 2606 opte1 = pte1_load(pte1p); 2607 if (pte1_is_link(opte1)) { 2608 pte1_clear(pte1p); 2609 /* 2610 * Flush intermediate TLB cache. 2611 */ 2612 pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); 2613 } 2614#ifdef INVARIANTS 2615 else 2616 KASSERT((opte1 == 0) || pte1_is_section(opte1), 2617 ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, 2618 pmap, va, opte1, i)); 2619#endif 2620 } 2621 2622 /* 2623 * Unmap the page from PT2TAB. 2624 */ 2625 pte2p = pmap_pt2tab_entry(pmap, va); 2626 (void)pt2tab_load_clear(pte2p); 2627 pmap_tlb_flush(pmap, pt2map_pt2pg(va)); 2628 2629 m->wire_count = 0; 2630 pmap->pm_stats.resident_count--; 2631 2632 /* 2633 * This is a release store so that the ordinary store unmapping 2634 * the L2 page table page is globally performed before TLB shoot- 2635 * down is begun. 2636 */ 2637 atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); 2638} 2639 2640/* 2641 * Decrements a L2 page table page's wire count, which is used to record the 2642 * number of valid page table entries within the page. If the wire count 2643 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2644 * page table page was unmapped and FALSE otherwise. 2645 */ 2646static __inline boolean_t 2647pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2648{ 2649 pt2_wirecount_dec(m, pte1_index(va)); 2650 if (pt2pg_is_empty(m)) { 2651 /* 2652 * QQQ: Wire count is zero, so whole page should be zero and 2653 * we can set PG_ZERO flag to it. 2654 * Note that when promotion is enabled, it takes some 2655 * more efforts. See pmap_unwire_pt2_all() below. 2656 */ 2657 pmap_unwire_pt2pg(pmap, va, m); 2658 pmap_add_delayed_free_list(m, free); 2659 return (TRUE); 2660 } else 2661 return (FALSE); 2662} 2663 2664/* 2665 * Drop a L2 page table page's wire count at once, which is used to record 2666 * the number of valid L2 page table entries within the page. If the wire 2667 * count drops to zero, then the L2 page table page is unmapped. 
2668 */ 2669static __inline void 2670pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, 2671 struct spglist *free) 2672{ 2673 u_int pte1_idx = pte1_index(va); 2674 2675 KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), 2676 ("%s: PT2 page's pindex is wrong", __func__)); 2677 KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx), 2678 ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count, 2679 pt2_wirecount_get(m, pte1_idx))); 2680 2681 /* 2682 * It's possible that the L2 page table was never used. 2683 * This happens when a section was created without promotion. 2684 */ 2685 if (pt2_is_full(m, va)) { 2686 pt2_wirecount_set(m, pte1_idx, 0); 2687 2688 /* 2689 * QQQ: We clear L2 page table now, so when L2 page table page 2690 * is going to be freed, we can set its PG_ZERO flag ... 2691 * This function is called only on section mappings, so 2692 * hopefully it's not too big an overhead. 2693 * 2694 * XXX: If pmap is current, existing PT2MAP mapping could be 2695 * used for zeroing. 2696 */ 2697 pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); 2698 } 2699#ifdef INVARIANTS 2700 else 2701 KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", 2702 __func__, pt2_wirecount_get(m, pte1_idx))); 2703#endif 2704 if (pt2pg_is_empty(m)) { 2705 pmap_unwire_pt2pg(pmap, va, m); 2706 pmap_add_delayed_free_list(m, free); 2707 } 2708} 2709 2710/* 2711 * After removing an L2 page table entry, this routine is used to 2712 * conditionally free the page, and manage the hold/wire counts. 2713 */ 2714static boolean_t 2715pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) 2716{ 2717 pt1_entry_t pte1; 2718 vm_page_t mpte; 2719 2720 if (va >= VM_MAXUSER_ADDRESS) 2721 return (FALSE); 2722 pte1 = pte1_load(pmap_pte1(pmap, va)); 2723 mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2724 return (pmap_unwire_pt2(pmap, va, mpte, free)); 2725} 2726 2727/************************************* 2728 * 2729 * Page management routines.
2730 * 2731 *************************************/ 2732 2733CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2734CTASSERT(_NPCM == 11); 2735CTASSERT(_NPCPV == 336); 2736 2737static __inline struct pv_chunk * 2738pv_to_chunk(pv_entry_t pv) 2739{ 2740 2741 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2742} 2743 2744#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2745 2746#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2747#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2748 2749static const uint32_t pc_freemask[_NPCM] = { 2750 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2751 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2752 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2753 PC_FREE0_9, PC_FREE10 2754}; 2755 2756SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2757 "Current number of pv entries"); 2758 2759#ifdef PV_STATS 2760static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2761 2762SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2763 "Current number of pv entry chunks"); 2764SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2765 "Current number of pv entry chunks allocated"); 2766SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2767 "Current number of pv entry chunks frees"); 2768SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 2769 0, "Number of times tried to get a chunk page but failed."); 2770 2771static long pv_entry_frees, pv_entry_allocs; 2772static int pv_entry_spare; 2773 2774SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2775 "Current number of pv entry frees"); 2776SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 2777 0, "Current number of pv entry allocs"); 2778SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2779 "Current number of spare pv entries"); 2780#endif 2781 2782/* 2783 * Is given page managed? 2784 */ 2785static __inline bool 2786is_managed(vm_paddr_t pa) 2787{ 2788 vm_page_t m; 2789 2790 m = PHYS_TO_VM_PAGE(pa); 2791 if (m == NULL) 2792 return (false); 2793 return ((m->oflags & VPO_UNMANAGED) == 0); 2794} 2795 2796static __inline bool 2797pte1_is_managed(pt1_entry_t pte1) 2798{ 2799 2800 return (is_managed(pte1_pa(pte1))); 2801} 2802 2803static __inline bool 2804pte2_is_managed(pt2_entry_t pte2) 2805{ 2806 2807 return (is_managed(pte2_pa(pte2))); 2808} 2809 2810/* 2811 * We are in a serious low memory condition. Resort to 2812 * drastic measures to free some pages so we can allocate 2813 * another pv entry chunk. 2814 */ 2815static vm_page_t 2816pmap_pv_reclaim(pmap_t locked_pmap) 2817{ 2818 struct pch newtail; 2819 struct pv_chunk *pc; 2820 struct md_page *pvh; 2821 pt1_entry_t *pte1p; 2822 pmap_t pmap; 2823 pt2_entry_t *pte2p, tpte2; 2824 pv_entry_t pv; 2825 vm_offset_t va; 2826 vm_page_t m, m_pc; 2827 struct spglist free; 2828 uint32_t inuse; 2829 int bit, field, freed; 2830 2831 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2832 pmap = NULL; 2833 m_pc = NULL; 2834 SLIST_INIT(&free); 2835 TAILQ_INIT(&newtail); 2836 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2837 SLIST_EMPTY(&free))) { 2838 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2839 if (pmap != pc->pc_pmap) { 2840 if (pmap != NULL) { 2841 if (pmap != locked_pmap) 2842 PMAP_UNLOCK(pmap); 2843 } 2844 pmap = pc->pc_pmap; 2845 /* Avoid deadlock and lock recursion. 
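			 * Locks are taken in pmap address order: a pmap
			 * whose address is greater than locked_pmap's is
			 * locked unconditionally, locked_pmap itself is
			 * already held, and any other pmap is taken only
			 * via PMAP_TRYLOCK().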
*/ 2846 if (pmap > locked_pmap) 2847 PMAP_LOCK(pmap); 2848 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2849 pmap = NULL; 2850 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2851 continue; 2852 } 2853 } 2854 2855 /* 2856 * Destroy every non-wired, 4 KB page mapping in the chunk. 2857 */ 2858 freed = 0; 2859 for (field = 0; field < _NPCM; field++) { 2860 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2861 inuse != 0; inuse &= ~(1UL << bit)) { 2862 bit = ffs(inuse) - 1; 2863 pv = &pc->pc_pventry[field * 32 + bit]; 2864 va = pv->pv_va; 2865 pte1p = pmap_pte1(pmap, va); 2866 if (pte1_is_section(pte1_load(pte1p))) 2867 continue; 2868 pte2p = pmap_pte2(pmap, va); 2869 tpte2 = pte2_load(pte2p); 2870 if ((tpte2 & PTE2_W) == 0) 2871 tpte2 = pte2_load_clear(pte2p); 2872 pmap_pte2_release(pte2p); 2873 if ((tpte2 & PTE2_W) != 0) 2874 continue; 2875 KASSERT(tpte2 != 0, 2876 ("pmap_pv_reclaim: pmap %p va %#x zero pte", 2877 pmap, va)); 2878 pmap_tlb_flush(pmap, va); 2879 m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); 2880 if (pte2_is_dirty(tpte2)) 2881 vm_page_dirty(m); 2882 if ((tpte2 & PTE2_A) != 0) 2883 vm_page_aflag_set(m, PGA_REFERENCED); 2884 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2885 if (TAILQ_EMPTY(&m->md.pv_list) && 2886 (m->flags & PG_FICTITIOUS) == 0) { 2887 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2888 if (TAILQ_EMPTY(&pvh->pv_list)) { 2889 vm_page_aflag_clear(m, 2890 PGA_WRITEABLE); 2891 } 2892 } 2893 pc->pc_map[field] |= 1UL << bit; 2894 pmap_unuse_pt2(pmap, va, &free); 2895 freed++; 2896 } 2897 } 2898 if (freed == 0) { 2899 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2900 continue; 2901 } 2902 /* Every freed mapping is for a 4 KB page. */ 2903 pmap->pm_stats.resident_count -= freed; 2904 PV_STAT(pv_entry_frees += freed); 2905 PV_STAT(pv_entry_spare += freed); 2906 pv_entry_count -= freed; 2907 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2908 for (field = 0; field < _NPCM; field++) 2909 if (pc->pc_map[field] != pc_freemask[field]) { 2910 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2911 pc_list); 2912 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2913 2914 /* 2915 * One freed pv entry in locked_pmap is 2916 * sufficient. 2917 */ 2918 if (pmap == locked_pmap) 2919 goto out; 2920 break; 2921 } 2922 if (field == _NPCM) { 2923 PV_STAT(pv_entry_spare -= _NPCPV); 2924 PV_STAT(pc_chunk_count--); 2925 PV_STAT(pc_chunk_frees++); 2926 /* Entire chunk is free; return it. */ 2927 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2928 pmap_qremove((vm_offset_t)pc, 1); 2929 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2930 break; 2931 } 2932 } 2933out: 2934 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2935 if (pmap != NULL) { 2936 if (pmap != locked_pmap) 2937 PMAP_UNLOCK(pmap); 2938 } 2939 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2940 m_pc = SLIST_FIRST(&free); 2941 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2942 /* Recycle a freed page table page. 
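		 * pmap_unwire_pt2pg() zeroed the page's wire count and
		 * dropped v_wire_count, so both are restored here before
		 * the page is reused as a pv chunk page.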
*/ 2943 m_pc->wire_count = 1; 2944 atomic_add_int(&vm_cnt.v_wire_count, 1); 2945 } 2946 pmap_free_zero_pages(&free); 2947 return (m_pc); 2948} 2949 2950static void 2951free_pv_chunk(struct pv_chunk *pc) 2952{ 2953 vm_page_t m; 2954 2955 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2956 PV_STAT(pv_entry_spare -= _NPCPV); 2957 PV_STAT(pc_chunk_count--); 2958 PV_STAT(pc_chunk_frees++); 2959 /* entire chunk is free, return it */ 2960 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2961 pmap_qremove((vm_offset_t)pc, 1); 2962 vm_page_unwire(m, PQ_NONE); 2963 vm_page_free(m); 2964 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2965} 2966 2967/* 2968 * Free the pv_entry back to the free list. 2969 */ 2970static void 2971free_pv_entry(pmap_t pmap, pv_entry_t pv) 2972{ 2973 struct pv_chunk *pc; 2974 int idx, field, bit; 2975 2976 rw_assert(&pvh_global_lock, RA_WLOCKED); 2977 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2978 PV_STAT(pv_entry_frees++); 2979 PV_STAT(pv_entry_spare++); 2980 pv_entry_count--; 2981 pc = pv_to_chunk(pv); 2982 idx = pv - &pc->pc_pventry[0]; 2983 field = idx / 32; 2984 bit = idx % 32; 2985 pc->pc_map[field] |= 1ul << bit; 2986 for (idx = 0; idx < _NPCM; idx++) 2987 if (pc->pc_map[idx] != pc_freemask[idx]) { 2988 /* 2989 * 98% of the time, pc is already at the head of the 2990 * list. If it isn't already, move it to the head. 2991 */ 2992 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2993 pc)) { 2994 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2995 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2996 pc_list); 2997 } 2998 return; 2999 } 3000 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3001 free_pv_chunk(pc); 3002} 3003 3004/* 3005 * Get a new pv_entry, allocating a block from the system 3006 * when needed. 3007 */ 3008static pv_entry_t 3009get_pv_entry(pmap_t pmap, boolean_t try) 3010{ 3011 static const struct timeval printinterval = { 60, 0 }; 3012 static struct timeval lastprint; 3013 int bit, field; 3014 pv_entry_t pv; 3015 struct pv_chunk *pc; 3016 vm_page_t m; 3017 3018 rw_assert(&pvh_global_lock, RA_WLOCKED); 3019 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3020 PV_STAT(pv_entry_allocs++); 3021 pv_entry_count++; 3022 if (pv_entry_count > pv_entry_high_water) 3023 if (ratecheck(&lastprint, &printinterval)) 3024 printf("Approaching the limit on PV entries, consider " 3025 "increasing either the vm.pmap.shpgperproc or the " 3026 "vm.pmap.pv_entry_max tunable.\n"); 3027retry: 3028 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3029 if (pc != NULL) { 3030 for (field = 0; field < _NPCM; field++) { 3031 if (pc->pc_map[field]) { 3032 bit = ffs(pc->pc_map[field]) - 1; 3033 break; 3034 } 3035 } 3036 if (field < _NPCM) { 3037 pv = &pc->pc_pventry[field * 32 + bit]; 3038 pc->pc_map[field] &= ~(1ul << bit); 3039 /* If this was the last item, move it to tail */ 3040 for (field = 0; field < _NPCM; field++) 3041 if (pc->pc_map[field] != 0) { 3042 PV_STAT(pv_entry_spare--); 3043 return (pv); /* not full, return */ 3044 } 3045 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3046 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3047 PV_STAT(pv_entry_spare--); 3048 return (pv); 3049 } 3050 } 3051 /* 3052 * Access to the pte2list "pv_vafree" is synchronized by the pvh 3053 * global lock. If "pv_vafree" is currently non-empty, it will 3054 * remain non-empty until pmap_pte2list_alloc() completes. 
3055 */ 3056 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 3057 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3058 if (try) { 3059 pv_entry_count--; 3060 PV_STAT(pc_chunk_tryfail++); 3061 return (NULL); 3062 } 3063 m = pmap_pv_reclaim(pmap); 3064 if (m == NULL) 3065 goto retry; 3066 } 3067 PV_STAT(pc_chunk_count++); 3068 PV_STAT(pc_chunk_allocs++); 3069 pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); 3070 pmap_qenter((vm_offset_t)pc, &m, 1); 3071 pc->pc_pmap = pmap; 3072 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 3073 for (field = 1; field < _NPCM; field++) 3074 pc->pc_map[field] = pc_freemask[field]; 3075 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3076 pv = &pc->pc_pventry[0]; 3077 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3078 PV_STAT(pv_entry_spare += _NPCPV - 1); 3079 return (pv); 3080} 3081 3082/* 3083 * Create a pv entry for page at pa for 3084 * (pmap, va). 3085 */ 3086static void 3087pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3088{ 3089 pv_entry_t pv; 3090 3091 rw_assert(&pvh_global_lock, RA_WLOCKED); 3092 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3093 pv = get_pv_entry(pmap, FALSE); 3094 pv->pv_va = va; 3095 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3096} 3097 3098static __inline pv_entry_t 3099pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3100{ 3101 pv_entry_t pv; 3102 3103 rw_assert(&pvh_global_lock, RA_WLOCKED); 3104 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3105 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3106 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3107 break; 3108 } 3109 } 3110 return (pv); 3111} 3112 3113static void 3114pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3115{ 3116 pv_entry_t pv; 3117 3118 pv = pmap_pvh_remove(pvh, pmap, va); 3119 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3120 free_pv_entry(pmap, pv); 3121} 3122 3123static void 3124pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 3125{ 3126 struct md_page *pvh; 3127 3128 rw_assert(&pvh_global_lock, RA_WLOCKED); 3129 pmap_pvh_free(&m->md, pmap, va); 3130 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 3131 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3132 if (TAILQ_EMPTY(&pvh->pv_list)) 3133 vm_page_aflag_clear(m, PGA_WRITEABLE); 3134 } 3135} 3136 3137static void 3138pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3139{ 3140 struct md_page *pvh; 3141 pv_entry_t pv; 3142 vm_offset_t va_last; 3143 vm_page_t m; 3144 3145 rw_assert(&pvh_global_lock, RA_WLOCKED); 3146 KASSERT((pa & PTE1_OFFSET) == 0, 3147 ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); 3148 3149 /* 3150 * Transfer the 1mpage's pv entry for this mapping to the first 3151 * page's pv list. 3152 */ 3153 pvh = pa_to_pvh(pa); 3154 va = pte1_trunc(va); 3155 pv = pmap_pvh_remove(pvh, pmap, va); 3156 KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); 3157 m = PHYS_TO_VM_PAGE(pa); 3158 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3159 /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. 
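	 * That is, one pv entry for every remaining 4KB page of the
	 * section, from va + PAGE_SIZE up to va + PTE1_SIZE - PAGE_SIZE.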
*/ 3160 va_last = va + PTE1_SIZE - PAGE_SIZE; 3161 do { 3162 m++; 3163 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3164 ("pmap_pv_demote_pte1: page %p is not managed", m)); 3165 va += PAGE_SIZE; 3166 pmap_insert_entry(pmap, va, m); 3167 } while (va < va_last); 3168} 3169 3170#if VM_NRESERVLEVEL > 0 3171static void 3172pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3173{ 3174 struct md_page *pvh; 3175 pv_entry_t pv; 3176 vm_offset_t va_last; 3177 vm_page_t m; 3178 3179 rw_assert(&pvh_global_lock, RA_WLOCKED); 3180 KASSERT((pa & PTE1_OFFSET) == 0, 3181 ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); 3182 3183 /* 3184 * Transfer the first page's pv entry for this mapping to the 3185 * 1mpage's pv list. Aside from avoiding the cost of a call 3186 * to get_pv_entry(), a transfer avoids the possibility that 3187 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() 3188 * removes one of the mappings that is being promoted. 3189 */ 3190 m = PHYS_TO_VM_PAGE(pa); 3191 va = pte1_trunc(va); 3192 pv = pmap_pvh_remove(&m->md, pmap, va); 3193 KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); 3194 pvh = pa_to_pvh(pa); 3195 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3196 /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ 3197 va_last = va + PTE1_SIZE - PAGE_SIZE; 3198 do { 3199 m++; 3200 va += PAGE_SIZE; 3201 pmap_pvh_free(&m->md, pmap, va); 3202 } while (va < va_last); 3203} 3204#endif 3205 3206/* 3207 * Conditionally create a pv entry. 3208 */ 3209static boolean_t 3210pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3211{ 3212 pv_entry_t pv; 3213 3214 rw_assert(&pvh_global_lock, RA_WLOCKED); 3215 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3216 if (pv_entry_count < pv_entry_high_water && 3217 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3218 pv->pv_va = va; 3219 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3220 return (TRUE); 3221 } else 3222 return (FALSE); 3223} 3224 3225/* 3226 * Create the pv entries for each of the pages within a section. 3227 */ 3228static boolean_t 3229pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3230{ 3231 struct md_page *pvh; 3232 pv_entry_t pv; 3233 3234 rw_assert(&pvh_global_lock, RA_WLOCKED); 3235 if (pv_entry_count < pv_entry_high_water && 3236 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3237 pv->pv_va = va; 3238 pvh = pa_to_pvh(pa); 3239 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3240 return (TRUE); 3241 } else 3242 return (FALSE); 3243} 3244 3245static inline void 3246pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) 3247{ 3248 3249 /* Kill all the small mappings or the big one only. */ 3250 if (pte1_is_section(npte1)) 3251 pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); 3252 else 3253 pmap_tlb_flush(pmap, pte1_trunc(va)); 3254} 3255 3256/* 3257 * Update kernel pte1 on all pmaps. 3258 * 3259 * The following function is called only on one cpu with disabled interrupts. 3260 * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way 3261 * nobody can invoke explicit hardware table walk during the update of pte1. 3262 * Unsolicited hardware table walk can still happen, invoked by speculative 3263 * data or instruction prefetch or even by speculative hardware table walk. 3264 * 3265 * The break-before-make approach should be implemented here. However, it's 3266 * not so easy to do that for kernel mappings as it would be unhappy to unmap 3267 * itself unexpectedly but voluntarily. 
3268 */ 3269static void 3270pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) 3271{ 3272 pmap_t pmap; 3273 pt1_entry_t *pte1p; 3274 3275 /* 3276 * Get current pmap. Interrupts should be disabled here 3277 * so PCPU_GET() is done atomically. 3278 */ 3279 pmap = PCPU_GET(curpmap); 3280 if (pmap == NULL) 3281 pmap = kernel_pmap; 3282 3283 /* 3284 * (1) Change pte1 on current pmap. 3285 * (2) Flush all obsolete TLB entries on current CPU. 3286 * (3) Change pte1 on all pmaps. 3287 * (4) Flush all obsolete TLB entries on all CPUs in SMP case. 3288 */ 3289 3290 pte1p = pmap_pte1(pmap, va); 3291 pte1_store(pte1p, npte1); 3292 3293 /* Kill all the small mappings or the big one only. */ 3294 if (pte1_is_section(npte1)) { 3295 pmap_pte1_kern_promotions++; 3296 tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); 3297 } else { 3298 pmap_pte1_kern_demotions++; 3299 tlb_flush_local(pte1_trunc(va)); 3300 } 3301 3302 /* 3303 * In SMP case, this function is called when all cpus are at smp 3304 * rendezvous, so there is no need to use 'allpmaps_lock' lock here. 3305 * In UP case, the function is called with this lock locked. 3306 */ 3307 LIST_FOREACH(pmap, &allpmaps, pm_list) { 3308 pte1p = pmap_pte1(pmap, va); 3309 pte1_store(pte1p, npte1); 3310 } 3311 3312#ifdef SMP 3313 /* Kill all the small mappings or the big one only. */ 3314 if (pte1_is_section(npte1)) 3315 tlb_flush_range(pte1_trunc(va), PTE1_SIZE); 3316 else 3317 tlb_flush(pte1_trunc(va)); 3318#endif 3319} 3320 3321#ifdef SMP 3322struct pte1_action { 3323 vm_offset_t va; 3324 pt1_entry_t npte1; 3325 u_int update; /* CPU that updates the PTE1 */ 3326}; 3327 3328static void 3329pmap_update_pte1_action(void *arg) 3330{ 3331 struct pte1_action *act = arg; 3332 3333 if (act->update == PCPU_GET(cpuid)) 3334 pmap_update_pte1_kernel(act->va, act->npte1); 3335} 3336 3337/* 3338 * Change pte1 on current pmap. 3339 * Note that kernel pte1 must be changed on all pmaps. 3340 * 3341 * According to the architecture reference manual published by ARM, 3342 * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. 3343 * According to this manual, UNPREDICTABLE behaviours must never happen in 3344 * a viable system. In contrast, on x86 processors, it is not specified which 3345 * TLB entry mapping the virtual address will be used, but the MMU doesn't 3346 * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone 3347 * Black). 3348 * 3349 * It's a problem when either promotion or demotion is being done. The pte1 3350 * update and appropriate TLB flush must be done atomically in general. 3351 */ 3352static void 3353pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3354 pt1_entry_t npte1) 3355{ 3356 3357 if (pmap == kernel_pmap) { 3358 struct pte1_action act; 3359 3360 sched_pin(); 3361 act.va = va; 3362 act.npte1 = npte1; 3363 act.update = PCPU_GET(cpuid); 3364 smp_rendezvous_cpus(all_cpus, smp_no_rendevous_barrier, 3365 pmap_update_pte1_action, NULL, &act); 3366 sched_unpin(); 3367 } else { 3368 register_t cspr; 3369 3370 /* 3371 * Use break-before-make approach for changing userland 3372 * mappings. It can cause L1 translation aborts on other 3373 * cores in SMP case. So, special treatment is implemented 3374 * in pmap_fault(). To reduce the likelihood that another core 3375 * will be affected by the broken mapping, disable interrupts 3376 * until the mapping change is completed. 
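		 *
		 * The resulting sequence below is:
		 *
		 *	(1) disable interrupts,
		 *	(2) clear the old pte1 ("break"),
		 *	(3) flush the now stale TLB entries,
		 *	(4) store the new pte1 ("make"),
		 *	(5) restore interrupts.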
3377 */ 3378 cspr = disable_interrupts(PSR_I | PSR_F); 3379 pte1_clear(pte1p); 3380 pmap_tlb_flush_pte1(pmap, va, npte1); 3381 pte1_store(pte1p, npte1); 3382 restore_interrupts(cspr); 3383 } 3384} 3385#else 3386static void 3387pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3388 pt1_entry_t npte1) 3389{ 3390 3391 if (pmap == kernel_pmap) { 3392 mtx_lock_spin(&allpmaps_lock); 3393 pmap_update_pte1_kernel(va, npte1); 3394 mtx_unlock_spin(&allpmaps_lock); 3395 } else { 3396 register_t cspr; 3397 3398 /* 3399 * Use break-before-make approach for changing userland 3400 * mappings. It's absolutely safe in UP case when interrupts 3401 * are disabled. 3402 */ 3403 cspr = disable_interrupts(PSR_I | PSR_F); 3404 pte1_clear(pte1p); 3405 pmap_tlb_flush_pte1(pmap, va, npte1); 3406 pte1_store(pte1p, npte1); 3407 restore_interrupts(cspr); 3408 } 3409} 3410#endif 3411 3412#if VM_NRESERVLEVEL > 0 3413/* 3414 * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are 3415 * within a single page table page (PT2) to a single 1MB page mapping. 3416 * For promotion to occur, two conditions must be met: (1) the 4KB page 3417 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3418 * mappings must have identical characteristics. 3419 * 3420 * Managed (PG_MANAGED) mappings within the kernel address space are not 3421 * promoted. The reason is that kernel PTE1s are replicated in each pmap but 3422 * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only 3423 * read the PTE1 from the kernel pmap. 3424 */ 3425static void 3426pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3427{ 3428 pt1_entry_t npte1; 3429 pt2_entry_t *fpte2p, fpte2, fpte2_fav; 3430 pt2_entry_t *pte2p, pte2; 3431 vm_offset_t pteva __unused; 3432 vm_page_t m __unused; 3433 3434 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3435 pmap, va, pte1_load(pte1p), pte1p)); 3436 3437 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3438 3439 /* 3440 * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is 3441 * either invalid, unused, or does not map the first 4KB physical page 3442 * within a 1MB page. 3443 */ 3444 fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); 3445 fpte2 = pte2_load(fpte2p); 3446 if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != 3447 (PTE2_A | PTE2_V)) { 3448 pmap_pte1_p_failures++; 3449 CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", 3450 __func__, va, pmap); 3451 return; 3452 } 3453 if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { 3454 pmap_pte1_p_failures++; 3455 CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", 3456 __func__, va, pmap); 3457 return; 3458 } 3459 if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3460 /* 3461 * When page is not modified, PTE2_RO can be set without 3462 * a TLB invalidation. 3463 */ 3464 fpte2 |= PTE2_RO; 3465 pte2_store(fpte2p, fpte2); 3466 } 3467 3468 /* 3469 * Examine each of the other PTE2s in the specified PT2. Abort if this 3470 * PTE2 maps an unexpected 4KB physical page or does not have identical 3471 * characteristics to the first PTE2. 
3472 */ 3473 fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); 3474 fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ 3475 for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { 3476 pte2 = pte2_load(pte2p); 3477 if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { 3478 pmap_pte1_p_failures++; 3479 CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", 3480 __func__, va, pmap); 3481 return; 3482 } 3483 if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3484 /* 3485 * When page is not modified, PTE2_RO can be set 3486 * without a TLB invalidation. See note above. 3487 */ 3488 pte2 |= PTE2_RO; 3489 pte2_store(pte2p, pte2); 3490 pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & 3491 PTE2_FRAME); 3492 CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", 3493 __func__, pteva, pmap); 3494 } 3495 if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { 3496 pmap_pte1_p_failures++; 3497 CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", 3498 __func__, va, pmap); 3499 return; 3500 } 3501 3502 fpte2_fav -= PTE2_SIZE; 3503 } 3504 /* 3505 * The page table page in its current state will stay in PT2TAB 3506 * until the PTE1 mapping the section is demoted by pmap_demote_pte1() 3507 * or destroyed by pmap_remove_pte1(). 3508 * 3509 * Note that L2 page table size is not equal to PAGE_SIZE. 3510 */ 3511 m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); 3512 KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], 3513 ("%s: PT2 page is out of range", __func__)); 3514 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 3515 ("%s: PT2 page's pindex is wrong", __func__)); 3516 3517 /* 3518 * Get pte1 from pte2 format. 3519 */ 3520 npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; 3521 3522 /* 3523 * Promote the pv entries. 3524 */ 3525 if (pte2_is_managed(fpte2)) 3526 pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); 3527 3528 /* 3529 * Promote the mappings. 3530 */ 3531 pmap_change_pte1(pmap, pte1p, va, npte1); 3532 3533 pmap_pte1_promotions++; 3534 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3535 __func__, va, pmap); 3536 3537 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3538 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3539} 3540#endif /* VM_NRESERVLEVEL > 0 */ 3541 3542/* 3543 * Zero L2 page table page. 3544 */ 3545static __inline void 3546pmap_clear_pt2(pt2_entry_t *fpte2p) 3547{ 3548 pt2_entry_t *pte2p; 3549 3550 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) 3551 pte2_clear(pte2p); 3552 3553} 3554 3555/* 3556 * Removes a 1MB page mapping from the kernel pmap. 3557 */ 3558static void 3559pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3560{ 3561 vm_page_t m; 3562 uint32_t pte1_idx; 3563 pt2_entry_t *fpte2p; 3564 vm_paddr_t pt2_pa; 3565 3566 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3567 m = pmap_pt2_page(pmap, va); 3568 if (m == NULL) 3569 /* 3570 * QQQ: Is this function called only on promoted pte1? 3571 * We certainly do section mappings directly 3572 * (without promotion) in kernel !!! 3573 */ 3574 panic("%s: missing pt2 page", __func__); 3575 3576 pte1_idx = pte1_index(va); 3577 3578 /* 3579 * Initialize the L2 page table. 3580 */ 3581 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3582 pmap_clear_pt2(fpte2p); 3583 3584 /* 3585 * Remove the mapping. 3586 */ 3587 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); 3588 pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); 3589 3590 /* 3591 * QQQ: We do not need to invalidate PT2MAP mapping 3592 * as we did not change it. I.e. 
the L2 page table page 3593 * was and still is mapped the same way. 3594 */ 3595} 3596 3597/* 3598 * Do the things to unmap a section in a process 3599 */ 3600static void 3601pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 3602 struct spglist *free) 3603{ 3604 pt1_entry_t opte1; 3605 struct md_page *pvh; 3606 vm_offset_t eva, va; 3607 vm_page_t m; 3608 3609 PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, 3610 pte1_load(pte1p), pte1p)); 3611 3612 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3613 KASSERT((sva & PTE1_OFFSET) == 0, 3614 ("%s: sva is not 1mpage aligned", __func__)); 3615 3616 /* 3617 * Clear and invalidate the mapping. It should occupy one and only TLB 3618 * entry. So, pmap_tlb_flush() called with aligned address should be 3619 * sufficient. 3620 */ 3621 opte1 = pte1_load_clear(pte1p); 3622 pmap_tlb_flush(pmap, sva); 3623 3624 if (pte1_is_wired(opte1)) 3625 pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; 3626 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 3627 if (pte1_is_managed(opte1)) { 3628 pvh = pa_to_pvh(pte1_pa(opte1)); 3629 pmap_pvh_free(pvh, pmap, sva); 3630 eva = sva + PTE1_SIZE; 3631 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 3632 va < eva; va += PAGE_SIZE, m++) { 3633 if (pte1_is_dirty(opte1)) 3634 vm_page_dirty(m); 3635 if (opte1 & PTE1_A) 3636 vm_page_aflag_set(m, PGA_REFERENCED); 3637 if (TAILQ_EMPTY(&m->md.pv_list) && 3638 TAILQ_EMPTY(&pvh->pv_list)) 3639 vm_page_aflag_clear(m, PGA_WRITEABLE); 3640 } 3641 } 3642 if (pmap == kernel_pmap) { 3643 /* 3644 * L2 page table(s) can't be removed from kernel map as 3645 * kernel counts on it (stuff around pmap_growkernel()). 3646 */ 3647 pmap_remove_kernel_pte1(pmap, pte1p, sva); 3648 } else { 3649 /* 3650 * Get associated L2 page table page. 3651 * It's possible that the page was never allocated. 3652 */ 3653 m = pmap_pt2_page(pmap, sva); 3654 if (m != NULL) 3655 pmap_unwire_pt2_all(pmap, sva, m, free); 3656 } 3657} 3658 3659/* 3660 * Fills L2 page table page with mappings to consecutive physical pages. 3661 */ 3662static __inline void 3663pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) 3664{ 3665 pt2_entry_t *pte2p; 3666 3667 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { 3668 pte2_store(pte2p, npte2); 3669 npte2 += PTE2_SIZE; 3670 } 3671} 3672 3673/* 3674 * Tries to demote a 1MB page mapping. If demotion fails, the 3675 * 1MB page mapping is invalidated. 3676 */ 3677static boolean_t 3678pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3679{ 3680 pt1_entry_t opte1, npte1; 3681 pt2_entry_t *fpte2p, npte2; 3682 vm_paddr_t pt2pg_pa, pt2_pa; 3683 vm_page_t m; 3684 struct spglist free; 3685 uint32_t pte1_idx, isnew = 0; 3686 3687 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3688 pmap, va, pte1_load(pte1p), pte1p)); 3689 3690 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3691 3692 opte1 = pte1_load(pte1p); 3693 KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); 3694 3695 if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { 3696 KASSERT(!pte1_is_wired(opte1), 3697 ("%s: PT2 page for a wired mapping is missing", __func__)); 3698 3699 /* 3700 * Invalidate the 1MB page mapping and return 3701 * "failure" if the mapping was never accessed or the 3702 * allocation of the new page table page fails. 
3703 */ 3704 if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, 3705 pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | 3706 VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { 3707 SLIST_INIT(&free); 3708 pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); 3709 pmap_free_zero_pages(&free); 3710 CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", 3711 __func__, va, pmap); 3712 return (FALSE); 3713 } 3714 if (va < VM_MAXUSER_ADDRESS) 3715 pmap->pm_stats.resident_count++; 3716 3717 isnew = 1; 3718 3719 /* 3720 * We init all L2 page tables in the page even if 3721 * we are going to change everything for one L2 page 3722 * table in a while. 3723 */ 3724 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 3725 } else { 3726 if (va < VM_MAXUSER_ADDRESS) { 3727 if (pt2_is_empty(m, va)) 3728 isnew = 1; /* Demoting section w/o promotion. */ 3729#ifdef INVARIANTS 3730 else 3731 KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" 3732 " count %u", __func__, 3733 pt2_wirecount_get(m, pte1_index(va)))); 3734#endif 3735 } 3736 } 3737 3738 pt2pg_pa = VM_PAGE_TO_PHYS(m); 3739 pte1_idx = pte1_index(va); 3740 /* 3741 * If the pmap is current, then the PT2MAP can provide access to 3742 * the page table page (promoted L2 page tables are not unmapped). 3743 * Otherwise, temporarily map the L2 page table page (m) into 3744 * the kernel's address space at either PADDR1 or PADDR2. 3745 * 3746 * Note that L2 page table size is not equal to PAGE_SIZE. 3747 */ 3748 if (pmap_is_current(pmap)) 3749 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3750 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 3751 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 3752 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 3753#ifdef SMP 3754 PMAP1cpu = PCPU_GET(cpuid); 3755#endif 3756 tlb_flush_local((vm_offset_t)PADDR1); 3757 PMAP1changed++; 3758 } else 3759#ifdef SMP 3760 if (PMAP1cpu != PCPU_GET(cpuid)) { 3761 PMAP1cpu = PCPU_GET(cpuid); 3762 tlb_flush_local((vm_offset_t)PADDR1); 3763 PMAP1changedcpu++; 3764 } else 3765#endif 3766 PMAP1unchanged++; 3767 fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); 3768 } else { 3769 mtx_lock(&PMAP2mutex); 3770 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 3771 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 3772 tlb_flush((vm_offset_t)PADDR2); 3773 } 3774 fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); 3775 } 3776 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 3777 npte1 = PTE1_LINK(pt2_pa); 3778 3779 KASSERT((opte1 & PTE1_A) != 0, 3780 ("%s: opte1 is missing PTE1_A", __func__)); 3781 KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, 3782 ("%s: opte1 has PTE1_NM", __func__)); 3783 3784 /* 3785 * Get pte2 from pte1 format. 3786 */ 3787 npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; 3788 3789 /* 3790 * If the L2 page table page is new, initialize it. If the mapping 3791 * has changed attributes, update the page table entries. 3792 */ 3793 if (isnew != 0) { 3794 pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); 3795 pmap_fill_pt2(fpte2p, npte2); 3796 } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != 3797 (npte2 & PTE2_PROMOTE)) 3798 pmap_fill_pt2(fpte2p, npte2); 3799 3800 KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), 3801 ("%s: fpte2p and npte2 map different physical addresses", 3802 __func__)); 3803 3804 if (fpte2p == PADDR2) 3805 mtx_unlock(&PMAP2mutex); 3806 3807 /* 3808 * Demote the mapping. This pmap is locked. The old PTE1 has 3809 * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also 3810 * has not PTE1_NM set. 
Thus, there is no danger of a race with 3811 * another processor changing the setting of PTE1_A and/or PTE1_NM 3812 * between the read above and the store below. 3813 */ 3814 pmap_change_pte1(pmap, pte1p, va, npte1); 3815 3816 /* 3817 * Demote the pv entry. This depends on the earlier demotion 3818 * of the mapping. Specifically, the (re)creation of a per- 3819 * page pv entry might trigger the execution of pmap_pv_reclaim(), 3820 * which might reclaim a newly (re)created per-page pv entry 3821 * and destroy the associated mapping. In order to destroy 3822 * the mapping, the PTE1 must have already changed from mapping 3823 * the 1mpage to referencing the page table page. 3824 */ 3825 if (pte1_is_managed(opte1)) 3826 pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); 3827 3828 pmap_pte1_demotions++; 3829 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3830 __func__, va, pmap); 3831 3832 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3833 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3834 return (TRUE); 3835} 3836 3837/* 3838 * Insert the given physical page (p) at 3839 * the specified virtual address (v) in the 3840 * target physical map with the protection requested. 3841 * 3842 * If specified, the page will be wired down, meaning 3843 * that the related pte can not be reclaimed. 3844 * 3845 * NB: This is the only routine which MAY NOT lazy-evaluate 3846 * or lose information. That is, this routine must actually 3847 * insert this page into the given map NOW. 3848 */ 3849int 3850pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3851 u_int flags, int8_t psind) 3852{ 3853 pt1_entry_t *pte1p; 3854 pt2_entry_t *pte2p; 3855 pt2_entry_t npte2, opte2; 3856 pv_entry_t pv; 3857 vm_paddr_t opa, pa; 3858 vm_page_t mpte2, om; 3859 boolean_t wired; 3860 3861 va = trunc_page(va); 3862 mpte2 = NULL; 3863 wired = (flags & PMAP_ENTER_WIRED) != 0; 3864 3865 KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); 3866 KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, 3867 ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, 3868 va)); 3869 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 3870 VM_OBJECT_ASSERT_LOCKED(m->object); 3871 3872 rw_wlock(&pvh_global_lock); 3873 PMAP_LOCK(pmap); 3874 sched_pin(); 3875 3876 /* 3877 * In the case that a page table page is not 3878 * resident, we are creating it here. 3879 */ 3880 if (va < VM_MAXUSER_ADDRESS) { 3881 mpte2 = pmap_allocpte2(pmap, va, flags); 3882 if (mpte2 == NULL) { 3883 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3884 ("pmap_allocpte2 failed with sleep allowed")); 3885 sched_unpin(); 3886 rw_wunlock(&pvh_global_lock); 3887 PMAP_UNLOCK(pmap); 3888 return (KERN_RESOURCE_SHORTAGE); 3889 } 3890 } 3891 pte1p = pmap_pte1(pmap, va); 3892 if (pte1_is_section(pte1_load(pte1p))) 3893 panic("%s: attempted on 1MB page", __func__); 3894 pte2p = pmap_pte2_quick(pmap, va); 3895 if (pte2p == NULL) 3896 panic("%s: invalid L1 page table entry va=%#x", __func__, va); 3897 3898 om = NULL; 3899 pa = VM_PAGE_TO_PHYS(m); 3900 opte2 = pte2_load(pte2p); 3901 opa = pte2_pa(opte2); 3902 /* 3903 * Mapping has not changed, must be protection or wiring change. 3904 */ 3905 if (pte2_is_valid(opte2) && (opa == pa)) { 3906 /* 3907 * Wiring change, just update stats. We don't worry about 3908 * wiring PT2 pages as they remain resident as long as there 3909 * are valid mappings in them. Hence, if a user page is wired, 3910 * the PT2 page will be also. 
3911 */ 3912 if (wired && !pte2_is_wired(opte2)) 3913 pmap->pm_stats.wired_count++; 3914 else if (!wired && pte2_is_wired(opte2)) 3915 pmap->pm_stats.wired_count--; 3916 3917 /* 3918 * Remove extra pte2 reference 3919 */ 3920 if (mpte2) 3921 pt2_wirecount_dec(mpte2, pte1_index(va)); 3922 if (pte2_is_managed(opte2)) 3923 om = m; 3924 goto validate; 3925 } 3926 3927 /* 3928 * QQQ: We think that changing physical address on writeable mapping 3929 * is not safe. Well, maybe on kernel address space with correct 3930 * locking, it can make a sense. However, we have no idea why 3931 * anyone should do that on user address space. Are we wrong? 3932 */ 3933 KASSERT((opa == 0) || (opa == pa) || 3934 !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), 3935 ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", 3936 __func__, pmap, va, opte2, opa, pa, flags, prot)); 3937 3938 pv = NULL; 3939 3940 /* 3941 * Mapping has changed, invalidate old range and fall through to 3942 * handle validating new mapping. 3943 */ 3944 if (opa) { 3945 if (pte2_is_wired(opte2)) 3946 pmap->pm_stats.wired_count--; 3947 if (pte2_is_managed(opte2)) { 3948 om = PHYS_TO_VM_PAGE(opa); 3949 pv = pmap_pvh_remove(&om->md, pmap, va); 3950 } 3951 /* 3952 * Remove extra pte2 reference 3953 */ 3954 if (mpte2 != NULL) 3955 pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); 3956 } else 3957 pmap->pm_stats.resident_count++; 3958 3959 /* 3960 * Enter on the PV list if part of our managed memory. 3961 */ 3962 if ((m->oflags & VPO_UNMANAGED) == 0) { 3963 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3964 ("%s: managed mapping within the clean submap", __func__)); 3965 if (pv == NULL) 3966 pv = get_pv_entry(pmap, FALSE); 3967 pv->pv_va = va; 3968 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3969 } else if (pv != NULL) 3970 free_pv_entry(pmap, pv); 3971 3972 /* 3973 * Increment counters 3974 */ 3975 if (wired) 3976 pmap->pm_stats.wired_count++; 3977 3978validate: 3979 /* 3980 * Now validate mapping with desired protection/wiring. 3981 */ 3982 npte2 = PTE2(pa, PTE2_NM, vm_page_pte2_attr(m)); 3983 if (prot & VM_PROT_WRITE) { 3984 if (pte2_is_managed(npte2)) 3985 vm_page_aflag_set(m, PGA_WRITEABLE); 3986 } 3987 else 3988 npte2 |= PTE2_RO; 3989 if ((prot & VM_PROT_EXECUTE) == 0) 3990 npte2 |= PTE2_NX; 3991 if (wired) 3992 npte2 |= PTE2_W; 3993 if (va < VM_MAXUSER_ADDRESS) 3994 npte2 |= PTE2_U; 3995 if (pmap != kernel_pmap) 3996 npte2 |= PTE2_NG; 3997 3998 /* 3999 * If the mapping or permission bits are different, we need 4000 * to update the pte2. 4001 * 4002 * QQQ: Think again and again what to do 4003 * if the mapping is going to be changed! 4004 */ 4005 if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { 4006 /* 4007 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4008 * is set. Do it now, before the mapping is stored and made 4009 * valid for hardware table walk. If done later, there is a race 4010 * for other threads of current process in lazy loading case. 4011 * Don't do it for kernel memory which is mapped with exec 4012 * permission even if the memory isn't going to hold executable 4013 * code. The only time when icache sync is needed is after 4014 * kernel module is loaded and the relocation info is processed. 4015 * And it's done in elf_cpu_load_file(). 4016 * 4017 * QQQ: (1) Does it exist any better way where 4018 * or how to sync icache? 4019 * (2) Now, we do it on a page basis. 
4020 */ 4021 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4022 m->md.pat_mode == VM_MEMATTR_WB_WA && 4023 (opa != pa || (opte2 & PTE2_NX))) 4024 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4025 4026 npte2 |= PTE2_A; 4027 if (flags & VM_PROT_WRITE) 4028 npte2 &= ~PTE2_NM; 4029 if (opte2 & PTE2_V) { 4030 /* Change mapping with break-before-make approach. */ 4031 opte2 = pte2_load_clear(pte2p); 4032 pmap_tlb_flush(pmap, va); 4033 pte2_store(pte2p, npte2); 4034 if (opte2 & PTE2_A) { 4035 if (pte2_is_managed(opte2)) 4036 vm_page_aflag_set(om, PGA_REFERENCED); 4037 } 4038 if (pte2_is_dirty(opte2)) { 4039 if (pte2_is_managed(opte2)) 4040 vm_page_dirty(om); 4041 } 4042 if (pte2_is_managed(opte2) && 4043 TAILQ_EMPTY(&om->md.pv_list) && 4044 ((om->flags & PG_FICTITIOUS) != 0 || 4045 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4046 vm_page_aflag_clear(om, PGA_WRITEABLE); 4047 } else 4048 pte2_store(pte2p, npte2); 4049 } 4050#if 0 4051 else { 4052 /* 4053 * QQQ: In time when both access and not mofified bits are 4054 * emulated by software, this should not happen. Some 4055 * analysis is need, if this really happen. Missing 4056 * tlb flush somewhere could be the reason. 4057 */ 4058 panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, 4059 va, opte2, npte2); 4060 } 4061#endif 4062 4063#if VM_NRESERVLEVEL > 0 4064 /* 4065 * If both the L2 page table page and the reservation are fully 4066 * populated, then attempt promotion. 4067 */ 4068 if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && 4069 sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && 4070 vm_reserv_level_iffullpop(m) == 0) 4071 pmap_promote_pte1(pmap, pte1p, va); 4072#endif 4073 sched_unpin(); 4074 rw_wunlock(&pvh_global_lock); 4075 PMAP_UNLOCK(pmap); 4076 return (KERN_SUCCESS); 4077} 4078 4079/* 4080 * Do the things to unmap a page in a process. 4081 */ 4082static int 4083pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, 4084 struct spglist *free) 4085{ 4086 pt2_entry_t opte2; 4087 vm_page_t m; 4088 4089 rw_assert(&pvh_global_lock, RA_WLOCKED); 4090 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4091 4092 /* Clear and invalidate the mapping. */ 4093 opte2 = pte2_load_clear(pte2p); 4094 pmap_tlb_flush(pmap, va); 4095 4096 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", 4097 __func__, pmap, va, opte2)); 4098 4099 if (opte2 & PTE2_W) 4100 pmap->pm_stats.wired_count -= 1; 4101 pmap->pm_stats.resident_count -= 1; 4102 if (pte2_is_managed(opte2)) { 4103 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4104 if (pte2_is_dirty(opte2)) 4105 vm_page_dirty(m); 4106 if (opte2 & PTE2_A) 4107 vm_page_aflag_set(m, PGA_REFERENCED); 4108 pmap_remove_entry(pmap, m, va); 4109 } 4110 return (pmap_unuse_pt2(pmap, va, free)); 4111} 4112 4113/* 4114 * Remove a single page from a process address space. 4115 */ 4116static void 4117pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 4118{ 4119 pt2_entry_t *pte2p; 4120 4121 rw_assert(&pvh_global_lock, RA_WLOCKED); 4122 KASSERT(curthread->td_pinned > 0, 4123 ("%s: curthread not pinned", __func__)); 4124 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4125 if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || 4126 !pte2_is_valid(pte2_load(pte2p))) 4127 return; 4128 pmap_remove_pte2(pmap, pte2p, va, free); 4129} 4130 4131/* 4132 * Remove the given range of addresses from the specified map. 4133 * 4134 * It is assumed that the start and end are properly 4135 * rounded to the page size. 
4136 */ 4137void 4138pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4139{ 4140 vm_offset_t nextva; 4141 pt1_entry_t *pte1p, pte1; 4142 pt2_entry_t *pte2p, pte2; 4143 struct spglist free; 4144 4145 /* 4146 * Perform an unsynchronized read. This is, however, safe. 4147 */ 4148 if (pmap->pm_stats.resident_count == 0) 4149 return; 4150 4151 SLIST_INIT(&free); 4152 4153 rw_wlock(&pvh_global_lock); 4154 sched_pin(); 4155 PMAP_LOCK(pmap); 4156 4157 /* 4158 * Special handling of removing one page. A very common 4159 * operation and easy to short circuit some code. 4160 */ 4161 if (sva + PAGE_SIZE == eva) { 4162 pte1 = pte1_load(pmap_pte1(pmap, sva)); 4163 if (pte1_is_link(pte1)) { 4164 pmap_remove_page(pmap, sva, &free); 4165 goto out; 4166 } 4167 } 4168 4169 for (; sva < eva; sva = nextva) { 4170 /* 4171 * Calculate address for next L2 page table. 4172 */ 4173 nextva = pte1_trunc(sva + PTE1_SIZE); 4174 if (nextva < sva) 4175 nextva = eva; 4176 if (pmap->pm_stats.resident_count == 0) 4177 break; 4178 4179 pte1p = pmap_pte1(pmap, sva); 4180 pte1 = pte1_load(pte1p); 4181 4182 /* 4183 * Weed out invalid mappings. Note: we assume that the L1 page 4184 * table is always allocated, and in kernel virtual. 4185 */ 4186 if (pte1 == 0) 4187 continue; 4188 4189 if (pte1_is_section(pte1)) { 4190 /* 4191 * Are we removing the entire large page? If not, 4192 * demote the mapping and fall through. 4193 */ 4194 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4195 pmap_remove_pte1(pmap, pte1p, sva, &free); 4196 continue; 4197 } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4198 /* The large page mapping was destroyed. */ 4199 continue; 4200 } 4201#ifdef INVARIANTS 4202 else { 4203 /* Update pte1 after demotion. */ 4204 pte1 = pte1_load(pte1p); 4205 } 4206#endif 4207 } 4208 4209 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4210 " is not link", __func__, pmap, sva, pte1, pte1p)); 4211 4212 /* 4213 * Limit our scan to either the end of the va represented 4214 * by the current L2 page table page, or to the end of the 4215 * range being removed. 4216 */ 4217 if (nextva > eva) 4218 nextva = eva; 4219 4220 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; 4221 pte2p++, sva += PAGE_SIZE) { 4222 pte2 = pte2_load(pte2p); 4223 if (!pte2_is_valid(pte2)) 4224 continue; 4225 if (pmap_remove_pte2(pmap, pte2p, sva, &free)) 4226 break; 4227 } 4228 } 4229out: 4230 sched_unpin(); 4231 rw_wunlock(&pvh_global_lock); 4232 PMAP_UNLOCK(pmap); 4233 pmap_free_zero_pages(&free); 4234} 4235 4236/* 4237 * Routine: pmap_remove_all 4238 * Function: 4239 * Removes this physical page from 4240 * all physical maps in which it resides. 4241 * Reflects back modify bits to the pager. 4242 * 4243 * Notes: 4244 * Original versions of this routine were very 4245 * inefficient because they iteratively called 4246 * pmap_remove (slow...) 
4247 */ 4248 4249void 4250pmap_remove_all(vm_page_t m) 4251{ 4252 struct md_page *pvh; 4253 pv_entry_t pv; 4254 pmap_t pmap; 4255 pt2_entry_t *pte2p, opte2; 4256 pt1_entry_t *pte1p; 4257 vm_offset_t va; 4258 struct spglist free; 4259 4260 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4261 ("%s: page %p is not managed", __func__, m)); 4262 SLIST_INIT(&free); 4263 rw_wlock(&pvh_global_lock); 4264 sched_pin(); 4265 if ((m->flags & PG_FICTITIOUS) != 0) 4266 goto small_mappings; 4267 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4268 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4269 va = pv->pv_va; 4270 pmap = PV_PMAP(pv); 4271 PMAP_LOCK(pmap); 4272 pte1p = pmap_pte1(pmap, va); 4273 (void)pmap_demote_pte1(pmap, pte1p, va); 4274 PMAP_UNLOCK(pmap); 4275 } 4276small_mappings: 4277 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4278 pmap = PV_PMAP(pv); 4279 PMAP_LOCK(pmap); 4280 pmap->pm_stats.resident_count--; 4281 pte1p = pmap_pte1(pmap, pv->pv_va); 4282 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " 4283 "a 1mpage in page %p's pv list", __func__, m)); 4284 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 4285 opte2 = pte2_load_clear(pte2p); 4286 pmap_tlb_flush(pmap, pv->pv_va); 4287 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", 4288 __func__, pmap, pv->pv_va)); 4289 if (pte2_is_wired(opte2)) 4290 pmap->pm_stats.wired_count--; 4291 if (opte2 & PTE2_A) 4292 vm_page_aflag_set(m, PGA_REFERENCED); 4293 4294 /* 4295 * Update the vm_page_t clean and reference bits. 4296 */ 4297 if (pte2_is_dirty(opte2)) 4298 vm_page_dirty(m); 4299 pmap_unuse_pt2(pmap, pv->pv_va, &free); 4300 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4301 free_pv_entry(pmap, pv); 4302 PMAP_UNLOCK(pmap); 4303 } 4304 vm_page_aflag_clear(m, PGA_WRITEABLE); 4305 sched_unpin(); 4306 rw_wunlock(&pvh_global_lock); 4307 pmap_free_zero_pages(&free); 4308} 4309 4310/* 4311 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4312 * good coding style, a.k.a. 80 character line width limit hell. 4313 */ 4314static __inline void 4315pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, 4316 struct spglist *free) 4317{ 4318 vm_paddr_t pa; 4319 vm_page_t m, mt, mpt2pg; 4320 struct md_page *pvh; 4321 4322 pa = pte1_pa(pte1); 4323 m = PHYS_TO_VM_PAGE(pa); 4324 4325 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4326 __func__, m, m->phys_addr, pa)); 4327 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4328 m < &vm_page_array[vm_page_array_size], 4329 ("%s: bad pte1 %#x", __func__, pte1)); 4330 4331 if (pte1_is_dirty(pte1)) { 4332 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4333 vm_page_dirty(mt); 4334 } 4335 4336 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 4337 pvh = pa_to_pvh(pa); 4338 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4339 if (TAILQ_EMPTY(&pvh->pv_list)) { 4340 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4341 if (TAILQ_EMPTY(&mt->md.pv_list)) 4342 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4343 } 4344 mpt2pg = pmap_pt2_page(pmap, pv->pv_va); 4345 if (mpt2pg != NULL) 4346 pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); 4347} 4348 4349/* 4350 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4351 * good coding style, a.k.a. 80 character line width limit hell. 
4352 */ 4353static __inline void 4354pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, 4355 struct spglist *free) 4356{ 4357 vm_paddr_t pa; 4358 vm_page_t m; 4359 struct md_page *pvh; 4360 4361 pa = pte2_pa(pte2); 4362 m = PHYS_TO_VM_PAGE(pa); 4363 4364 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4365 __func__, m, m->phys_addr, pa)); 4366 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4367 m < &vm_page_array[vm_page_array_size], 4368 ("%s: bad pte2 %#x", __func__, pte2)); 4369 4370 if (pte2_is_dirty(pte2)) 4371 vm_page_dirty(m); 4372 4373 pmap->pm_stats.resident_count--; 4374 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4375 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 4376 pvh = pa_to_pvh(pa); 4377 if (TAILQ_EMPTY(&pvh->pv_list)) 4378 vm_page_aflag_clear(m, PGA_WRITEABLE); 4379 } 4380 pmap_unuse_pt2(pmap, pv->pv_va, free); 4381} 4382 4383/* 4384 * Remove all pages from specified address space this aids process 4385 * exit speeds. Also, this code is special cased for current process 4386 * only, but can have the more generic (and slightly slower) mode enabled. 4387 * This is much faster than pmap_remove in the case of running down 4388 * an entire address space. 4389 */ 4390void 4391pmap_remove_pages(pmap_t pmap) 4392{ 4393 pt1_entry_t *pte1p, pte1; 4394 pt2_entry_t *pte2p, pte2; 4395 pv_entry_t pv; 4396 struct pv_chunk *pc, *npc; 4397 struct spglist free; 4398 int field, idx; 4399 int32_t bit; 4400 uint32_t inuse, bitmask; 4401 boolean_t allfree; 4402 4403 /* 4404 * Assert that the given pmap is only active on the current 4405 * CPU. Unfortunately, we cannot block another CPU from 4406 * activating the pmap while this function is executing. 4407 */ 4408 KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), 4409 ("%s: non-current pmap %p", __func__, pmap)); 4410#if defined(SMP) && defined(INVARIANTS) 4411 { 4412 cpuset_t other_cpus; 4413 4414 sched_pin(); 4415 other_cpus = pmap->pm_active; 4416 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 4417 sched_unpin(); 4418 KASSERT(CPU_EMPTY(&other_cpus), 4419 ("%s: pmap %p active on other cpus", __func__, pmap)); 4420 } 4421#endif 4422 SLIST_INIT(&free); 4423 rw_wlock(&pvh_global_lock); 4424 PMAP_LOCK(pmap); 4425 sched_pin(); 4426 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4427 KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", 4428 __func__, pmap, pc->pc_pmap)); 4429 allfree = TRUE; 4430 for (field = 0; field < _NPCM; field++) { 4431 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 4432 while (inuse != 0) { 4433 bit = ffs(inuse) - 1; 4434 bitmask = 1UL << bit; 4435 idx = field * 32 + bit; 4436 pv = &pc->pc_pventry[idx]; 4437 inuse &= ~bitmask; 4438 4439 /* 4440 * Note that we cannot remove wired pages 4441 * from a process' mapping at this time 4442 */ 4443 pte1p = pmap_pte1(pmap, pv->pv_va); 4444 pte1 = pte1_load(pte1p); 4445 if (pte1_is_section(pte1)) { 4446 if (pte1_is_wired(pte1)) { 4447 allfree = FALSE; 4448 continue; 4449 } 4450 pte1_clear(pte1p); 4451 pmap_remove_pte1_quick(pmap, pte1, pv, 4452 &free); 4453 } 4454 else if (pte1_is_link(pte1)) { 4455 pte2p = pt2map_entry(pv->pv_va); 4456 pte2 = pte2_load(pte2p); 4457 4458 if (!pte2_is_valid(pte2)) { 4459 printf("%s: pmap %p va %#x " 4460 "pte2 %#x\n", __func__, 4461 pmap, pv->pv_va, pte2); 4462 panic("bad pte2"); 4463 } 4464 4465 if (pte2_is_wired(pte2)) { 4466 allfree = FALSE; 4467 continue; 4468 } 4469 pte2_clear(pte2p); 4470 pmap_remove_pte2_quick(pmap, pte2, pv, 4471 &free); 4472 } else { 4473 
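/* The pte1 is neither a section nor a link; this is unexpected. */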
printf("%s: pmap %p va %#x pte1 %#x\n", 4474 __func__, pmap, pv->pv_va, pte1); 4475 panic("bad pte1"); 4476 } 4477 4478 /* Mark free */ 4479 PV_STAT(pv_entry_frees++); 4480 PV_STAT(pv_entry_spare++); 4481 pv_entry_count--; 4482 pc->pc_map[field] |= bitmask; 4483 } 4484 } 4485 if (allfree) { 4486 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4487 free_pv_chunk(pc); 4488 } 4489 } 4490 tlb_flush_all_ng_local(); 4491 sched_unpin(); 4492 rw_wunlock(&pvh_global_lock); 4493 PMAP_UNLOCK(pmap); 4494 pmap_free_zero_pages(&free); 4495} 4496 4497/* 4498 * This code makes some *MAJOR* assumptions: 4499 * 1. Current pmap & pmap exists. 4500 * 2. Not wired. 4501 * 3. Read access. 4502 * 4. No L2 page table pages. 4503 * but is *MUCH* faster than pmap_enter... 4504 */ 4505static vm_page_t 4506pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4507 vm_prot_t prot, vm_page_t mpt2pg) 4508{ 4509 pt2_entry_t *pte2p, pte2; 4510 vm_paddr_t pa; 4511 struct spglist free; 4512 uint32_t l2prot; 4513 4514 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4515 (m->oflags & VPO_UNMANAGED) != 0, 4516 ("%s: managed mapping within the clean submap", __func__)); 4517 rw_assert(&pvh_global_lock, RA_WLOCKED); 4518 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4519 4520 /* 4521 * In the case that a L2 page table page is not 4522 * resident, we are creating it here. 4523 */ 4524 if (va < VM_MAXUSER_ADDRESS) { 4525 u_int pte1_idx; 4526 pt1_entry_t pte1, *pte1p; 4527 vm_paddr_t pt2_pa; 4528 4529 /* 4530 * Get L1 page table things. 4531 */ 4532 pte1_idx = pte1_index(va); 4533 pte1p = pmap_pte1(pmap, va); 4534 pte1 = pte1_load(pte1p); 4535 4536 if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { 4537 /* 4538 * Each of NPT2_IN_PG L2 page tables on the page can 4539 * come here. Make sure that associated L1 page table 4540 * link is established. 4541 * 4542 * QQQ: It comes that we don't establish all links to 4543 * L2 page tables for newly allocated L2 page 4544 * tables page. 4545 */ 4546 KASSERT(!pte1_is_section(pte1), 4547 ("%s: pte1 %#x is section", __func__, pte1)); 4548 if (!pte1_is_link(pte1)) { 4549 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), 4550 pte1_idx); 4551 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 4552 } 4553 pt2_wirecount_inc(mpt2pg, pte1_idx); 4554 } else { 4555 /* 4556 * If the L2 page table page is mapped, we just 4557 * increment the hold count, and activate it. 4558 */ 4559 if (pte1_is_section(pte1)) { 4560 return (NULL); 4561 } else if (pte1_is_link(pte1)) { 4562 mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 4563 pt2_wirecount_inc(mpt2pg, pte1_idx); 4564 } else { 4565 mpt2pg = _pmap_allocpte2(pmap, va, 4566 PMAP_ENTER_NOSLEEP); 4567 if (mpt2pg == NULL) 4568 return (NULL); 4569 } 4570 } 4571 } else { 4572 mpt2pg = NULL; 4573 } 4574 4575 /* 4576 * This call to pt2map_entry() makes the assumption that we are 4577 * entering the page into the current pmap. In order to support 4578 * quick entry into any pmap, one would likely use pmap_pte2_quick(). 4579 * But that isn't as quick as pt2map_entry(). 4580 */ 4581 pte2p = pt2map_entry(va); 4582 pte2 = pte2_load(pte2p); 4583 if (pte2_is_valid(pte2)) { 4584 if (mpt2pg != NULL) { 4585 /* 4586 * Remove extra pte2 reference 4587 */ 4588 pt2_wirecount_dec(mpt2pg, pte1_index(va)); 4589 mpt2pg = NULL; 4590 } 4591 return (NULL); 4592 } 4593 4594 /* 4595 * Enter on the PV list if part of our managed memory. 
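* If the pv entry cannot be allocated without reclamation, drop the
* reference on the L2 page table page taken above and fail.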
4596 */ 4597 if ((m->oflags & VPO_UNMANAGED) == 0 && 4598 !pmap_try_insert_pv_entry(pmap, va, m)) { 4599 if (mpt2pg != NULL) { 4600 SLIST_INIT(&free); 4601 if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { 4602 pmap_tlb_flush(pmap, va); 4603 pmap_free_zero_pages(&free); 4604 } 4605 4606 mpt2pg = NULL; 4607 } 4608 return (NULL); 4609 } 4610 4611 /* 4612 * Increment counters 4613 */ 4614 pmap->pm_stats.resident_count++; 4615 4616 /* 4617 * Now validate mapping with RO protection 4618 */ 4619 pa = VM_PAGE_TO_PHYS(m); 4620 l2prot = PTE2_RO | PTE2_NM; 4621 if (va < VM_MAXUSER_ADDRESS) 4622 l2prot |= PTE2_U | PTE2_NG; 4623 if ((prot & VM_PROT_EXECUTE) == 0) 4624 l2prot |= PTE2_NX; 4625 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4626 /* 4627 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4628 * is set. QQQ: For more info, see comments in pmap_enter(). 4629 */ 4630 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4631 } 4632 pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); 4633 4634 return (mpt2pg); 4635} 4636 4637void 4638pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4639{ 4640 4641 rw_wlock(&pvh_global_lock); 4642 PMAP_LOCK(pmap); 4643 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 4644 rw_wunlock(&pvh_global_lock); 4645 PMAP_UNLOCK(pmap); 4646} 4647 4648/* 4649 * Tries to create 1MB page mapping. Returns TRUE if successful and 4650 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 4651 * blocking, (2) a mapping already exists at the specified virtual address, or 4652 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 4653 */ 4654static boolean_t 4655pmap_enter_pte1(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4656{ 4657 pt1_entry_t *pte1p; 4658 vm_paddr_t pa; 4659 uint32_t l1prot; 4660 4661 rw_assert(&pvh_global_lock, RA_WLOCKED); 4662 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4663 pte1p = pmap_pte1(pmap, va); 4664 if (pte1_is_valid(pte1_load(pte1p))) { 4665 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, 4666 va, pmap); 4667 return (FALSE); 4668 } 4669 if ((m->oflags & VPO_UNMANAGED) == 0) { 4670 /* 4671 * Abort this mapping if its PV entry could not be created. 4672 */ 4673 if (!pmap_pv_insert_pte1(pmap, va, VM_PAGE_TO_PHYS(m))) { 4674 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4675 __func__, va, pmap); 4676 return (FALSE); 4677 } 4678 } 4679 /* 4680 * Increment counters. 4681 */ 4682 pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; 4683 4684 /* 4685 * Map the section. 4686 * 4687 * QQQ: Why VM_PROT_WRITE is not evaluated and the mapping is 4688 * made readonly? 4689 */ 4690 pa = VM_PAGE_TO_PHYS(m); 4691 l1prot = PTE1_RO | PTE1_NM; 4692 if (va < VM_MAXUSER_ADDRESS) 4693 l1prot |= PTE1_U | PTE1_NG; 4694 if ((prot & VM_PROT_EXECUTE) == 0) 4695 l1prot |= PTE1_NX; 4696 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4697 /* 4698 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4699 * is set. QQQ: For more info, see comments in pmap_enter(). 4700 */ 4701 cache_icache_sync_fresh(va, pa, PTE1_SIZE); 4702 } 4703 pte1_store(pte1p, PTE1(pa, l1prot, ATTR_TO_L1(vm_page_pte2_attr(m)))); 4704 4705 pmap_pte1_mappings++; 4706 CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, 4707 pmap); 4708 return (TRUE); 4709} 4710 4711/* 4712 * Maps a sequence of resident pages belonging to the same object. 4713 * The sequence begins with the given page m_start. 
This page is 4714 * mapped at the given virtual address start. Each subsequent page is 4715 * mapped at a virtual address that is offset from start by the same 4716 * amount as the page is offset from m_start within the object. The 4717 * last page in the sequence is the page with the largest offset from 4718 * m_start that can be mapped at a virtual address less than the given 4719 * virtual address end. Not every virtual page between start and end 4720 * is mapped; only those for which a resident page exists with the 4721 * corresponding offset from m_start are mapped. 4722 */ 4723void 4724pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4725 vm_page_t m_start, vm_prot_t prot) 4726{ 4727 vm_offset_t va; 4728 vm_page_t m, mpt2pg; 4729 vm_pindex_t diff, psize; 4730 4731 PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", 4732 __func__, pmap, start, end, m_start, prot)); 4733 4734 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4735 psize = atop(end - start); 4736 mpt2pg = NULL; 4737 m = m_start; 4738 rw_wlock(&pvh_global_lock); 4739 PMAP_LOCK(pmap); 4740 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4741 va = start + ptoa(diff); 4742 if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && 4743 m->psind == 1 && sp_enabled && 4744 pmap_enter_pte1(pmap, va, m, prot)) 4745 m = &m[PTE1_SIZE / PAGE_SIZE - 1]; 4746 else 4747 mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, 4748 mpt2pg); 4749 m = TAILQ_NEXT(m, listq); 4750 } 4751 rw_wunlock(&pvh_global_lock); 4752 PMAP_UNLOCK(pmap); 4753} 4754 4755/* 4756 * This code maps large physical mmap regions into the 4757 * processor address space. Note that some shortcuts 4758 * are taken, but the code works. 4759 */ 4760void 4761pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4762 vm_pindex_t pindex, vm_size_t size) 4763{ 4764 pt1_entry_t *pte1p; 4765 vm_paddr_t pa, pte2_pa; 4766 vm_page_t p; 4767 vm_memattr_t pat_mode; 4768 u_int l1attr, l1prot; 4769 4770 VM_OBJECT_ASSERT_WLOCKED(object); 4771 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4772 ("%s: non-device object", __func__)); 4773 if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { 4774 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4775 return; 4776 p = vm_page_lookup(object, pindex); 4777 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4778 ("%s: invalid page %p", __func__, p)); 4779 pat_mode = p->md.pat_mode; 4780 4781 /* 4782 * Abort the mapping if the first page is not physically 4783 * aligned to a 1MB page boundary. 4784 */ 4785 pte2_pa = VM_PAGE_TO_PHYS(p); 4786 if (pte2_pa & PTE1_OFFSET) 4787 return; 4788 4789 /* 4790 * Skip the first page. Abort the mapping if the rest of 4791 * the pages are not physically contiguous or have differing 4792 * memory attributes. 4793 */ 4794 p = TAILQ_NEXT(p, listq); 4795 for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; 4796 pa += PAGE_SIZE) { 4797 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4798 ("%s: invalid page %p", __func__, p)); 4799 if (pa != VM_PAGE_TO_PHYS(p) || 4800 pat_mode != p->md.pat_mode) 4801 return; 4802 p = TAILQ_NEXT(p, listq); 4803 } 4804 4805 /* 4806 * Map using 1MB pages. 4807 * 4808 * QQQ: Well, we are mapping a section, so same condition must 4809 * be hold like during promotion. It looks that only RW mapping 4810 * is done here, so readonly mapping must be done elsewhere. 
4811 */ 4812 l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; 4813 l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); 4814 PMAP_LOCK(pmap); 4815 for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { 4816 pte1p = pmap_pte1(pmap, addr); 4817 if (!pte1_is_valid(pte1_load(pte1p))) { 4818 pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); 4819 pmap->pm_stats.resident_count += PTE1_SIZE / 4820 PAGE_SIZE; 4821 pmap_pte1_mappings++; 4822 } 4823 /* Else continue on if the PTE1 is already valid. */ 4824 addr += PTE1_SIZE; 4825 } 4826 PMAP_UNLOCK(pmap); 4827 } 4828} 4829 4830/* 4831 * Do the things to protect a 1mpage in a process. 4832 */ 4833static void 4834pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 4835 vm_prot_t prot) 4836{ 4837 pt1_entry_t npte1, opte1; 4838 vm_offset_t eva, va; 4839 vm_page_t m; 4840 4841 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4842 KASSERT((sva & PTE1_OFFSET) == 0, 4843 ("%s: sva is not 1mpage aligned", __func__)); 4844 4845 opte1 = npte1 = pte1_load(pte1p); 4846 if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { 4847 eva = sva + PTE1_SIZE; 4848 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 4849 va < eva; va += PAGE_SIZE, m++) 4850 vm_page_dirty(m); 4851 } 4852 if ((prot & VM_PROT_WRITE) == 0) 4853 npte1 |= PTE1_RO | PTE1_NM; 4854 if ((prot & VM_PROT_EXECUTE) == 0) 4855 npte1 |= PTE1_NX; 4856 4857 /* 4858 * QQQ: Herein, execute permission is never set. 4859 * It only can be cleared. So, no icache 4860 * syncing is needed. 4861 */ 4862 4863 if (npte1 != opte1) { 4864 pte1_store(pte1p, npte1); 4865 pmap_tlb_flush(pmap, sva); 4866 } 4867} 4868 4869/* 4870 * Set the physical protection on the 4871 * specified range of this map as requested. 4872 */ 4873void 4874pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4875{ 4876 boolean_t pv_lists_locked; 4877 vm_offset_t nextva; 4878 pt1_entry_t *pte1p, pte1; 4879 pt2_entry_t *pte2p, opte2, npte2; 4880 4881 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4882 if (prot == VM_PROT_NONE) { 4883 pmap_remove(pmap, sva, eva); 4884 return; 4885 } 4886 4887 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 4888 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 4889 return; 4890 4891 if (pmap_is_current(pmap)) 4892 pv_lists_locked = FALSE; 4893 else { 4894 pv_lists_locked = TRUE; 4895resume: 4896 rw_wlock(&pvh_global_lock); 4897 sched_pin(); 4898 } 4899 4900 PMAP_LOCK(pmap); 4901 for (; sva < eva; sva = nextva) { 4902 /* 4903 * Calculate address for next L2 page table. 4904 */ 4905 nextva = pte1_trunc(sva + PTE1_SIZE); 4906 if (nextva < sva) 4907 nextva = eva; 4908 4909 pte1p = pmap_pte1(pmap, sva); 4910 pte1 = pte1_load(pte1p); 4911 4912 /* 4913 * Weed out invalid mappings. Note: we assume that L1 page 4914 * page table is always allocated, and in kernel virtual. 4915 */ 4916 if (pte1 == 0) 4917 continue; 4918 4919 if (pte1_is_section(pte1)) { 4920 /* 4921 * Are we protecting the entire large page? If not, 4922 * demote the mapping and fall through. 4923 */ 4924 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4925 pmap_protect_pte1(pmap, pte1p, sva, prot); 4926 continue; 4927 } else { 4928 if (!pv_lists_locked) { 4929 pv_lists_locked = TRUE; 4930 if (!rw_try_wlock(&pvh_global_lock)) { 4931 PMAP_UNLOCK(pmap); 4932 goto resume; 4933 } 4934 sched_pin(); 4935 } 4936 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4937 /* 4938 * The large page mapping 4939 * was destroyed. 
4940 */ 4941 continue; 4942 } 4943#ifdef INVARIANTS 4944 else { 4945 /* Update pte1 after demotion */ 4946 pte1 = pte1_load(pte1p); 4947 } 4948#endif 4949 } 4950 } 4951 4952 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4953 " is not link", __func__, pmap, sva, pte1, pte1p)); 4954 4955 /* 4956 * Limit our scan to either the end of the va represented 4957 * by the current L2 page table page, or to the end of the 4958 * range being protected. 4959 */ 4960 if (nextva > eva) 4961 nextva = eva; 4962 4963 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 4964 sva += PAGE_SIZE) { 4965 vm_page_t m; 4966 4967 opte2 = npte2 = pte2_load(pte2p); 4968 if (!pte2_is_valid(opte2)) 4969 continue; 4970 4971 if ((prot & VM_PROT_WRITE) == 0) { 4972 if (pte2_is_managed(opte2) && 4973 pte2_is_dirty(opte2)) { 4974 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4975 vm_page_dirty(m); 4976 } 4977 npte2 |= PTE2_RO | PTE2_NM; 4978 } 4979 4980 if ((prot & VM_PROT_EXECUTE) == 0) 4981 npte2 |= PTE2_NX; 4982 4983 /* 4984 * QQQ: Herein, execute permission is never set. 4985 * It only can be cleared. So, no icache 4986 * syncing is needed. 4987 */ 4988 4989 if (npte2 != opte2) { 4990 pte2_store(pte2p, npte2); 4991 pmap_tlb_flush(pmap, sva); 4992 } 4993 } 4994 } 4995 if (pv_lists_locked) { 4996 sched_unpin(); 4997 rw_wunlock(&pvh_global_lock); 4998 } 4999 PMAP_UNLOCK(pmap); 5000} 5001 5002/* 5003 * pmap_pvh_wired_mappings: 5004 * 5005 * Return the updated number "count" of managed mappings that are wired. 5006 */ 5007static int 5008pmap_pvh_wired_mappings(struct md_page *pvh, int count) 5009{ 5010 pmap_t pmap; 5011 pt1_entry_t pte1; 5012 pt2_entry_t pte2; 5013 pv_entry_t pv; 5014 5015 rw_assert(&pvh_global_lock, RA_WLOCKED); 5016 sched_pin(); 5017 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5018 pmap = PV_PMAP(pv); 5019 PMAP_LOCK(pmap); 5020 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5021 if (pte1_is_section(pte1)) { 5022 if (pte1_is_wired(pte1)) 5023 count++; 5024 } else { 5025 KASSERT(pte1_is_link(pte1), 5026 ("%s: pte1 %#x is not link", __func__, pte1)); 5027 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5028 if (pte2_is_wired(pte2)) 5029 count++; 5030 } 5031 PMAP_UNLOCK(pmap); 5032 } 5033 sched_unpin(); 5034 return (count); 5035} 5036 5037/* 5038 * pmap_page_wired_mappings: 5039 * 5040 * Return the number of managed mappings to the given physical page 5041 * that are wired. 5042 */ 5043int 5044pmap_page_wired_mappings(vm_page_t m) 5045{ 5046 int count; 5047 5048 count = 0; 5049 if ((m->oflags & VPO_UNMANAGED) != 0) 5050 return (count); 5051 rw_wlock(&pvh_global_lock); 5052 count = pmap_pvh_wired_mappings(&m->md, count); 5053 if ((m->flags & PG_FICTITIOUS) == 0) { 5054 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 5055 count); 5056 } 5057 rw_wunlock(&pvh_global_lock); 5058 return (count); 5059} 5060 5061/* 5062 * Returns TRUE if any of the given mappings were used to modify 5063 * physical memory. Otherwise, returns FALSE. Both page and 1mpage 5064 * mappings are supported. 
5065 */ 5066static boolean_t 5067pmap_is_modified_pvh(struct md_page *pvh) 5068{ 5069 pv_entry_t pv; 5070 pt1_entry_t pte1; 5071 pt2_entry_t pte2; 5072 pmap_t pmap; 5073 boolean_t rv; 5074 5075 rw_assert(&pvh_global_lock, RA_WLOCKED); 5076 rv = FALSE; 5077 sched_pin(); 5078 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5079 pmap = PV_PMAP(pv); 5080 PMAP_LOCK(pmap); 5081 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5082 if (pte1_is_section(pte1)) { 5083 rv = pte1_is_dirty(pte1); 5084 } else { 5085 KASSERT(pte1_is_link(pte1), 5086 ("%s: pte1 %#x is not link", __func__, pte1)); 5087 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5088 rv = pte2_is_dirty(pte2); 5089 } 5090 PMAP_UNLOCK(pmap); 5091 if (rv) 5092 break; 5093 } 5094 sched_unpin(); 5095 return (rv); 5096} 5097 5098/* 5099 * pmap_is_modified: 5100 * 5101 * Return whether or not the specified physical page was modified 5102 * in any physical maps. 5103 */ 5104boolean_t 5105pmap_is_modified(vm_page_t m) 5106{ 5107 boolean_t rv; 5108 5109 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5110 ("%s: page %p is not managed", __func__, m)); 5111 5112 /* 5113 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5114 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 5115 * is clear, no PTE2s can have PG_M set. 5116 */ 5117 VM_OBJECT_ASSERT_WLOCKED(m->object); 5118 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5119 return (FALSE); 5120 rw_wlock(&pvh_global_lock); 5121 rv = pmap_is_modified_pvh(&m->md) || 5122 ((m->flags & PG_FICTITIOUS) == 0 && 5123 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5124 rw_wunlock(&pvh_global_lock); 5125 return (rv); 5126} 5127 5128/* 5129 * pmap_is_prefaultable: 5130 * 5131 * Return whether or not the specified virtual address is eligible 5132 * for prefault. 5133 */ 5134boolean_t 5135pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5136{ 5137 pt1_entry_t pte1; 5138 pt2_entry_t pte2; 5139 boolean_t rv; 5140 5141 rv = FALSE; 5142 PMAP_LOCK(pmap); 5143 pte1 = pte1_load(pmap_pte1(pmap, addr)); 5144 if (pte1_is_link(pte1)) { 5145 pte2 = pte2_load(pt2map_entry(addr)); 5146 rv = !pte2_is_valid(pte2) ; 5147 } 5148 PMAP_UNLOCK(pmap); 5149 return (rv); 5150} 5151 5152/* 5153 * Returns TRUE if any of the given mappings were referenced and FALSE 5154 * otherwise. Both page and 1mpage mappings are supported. 5155 */ 5156static boolean_t 5157pmap_is_referenced_pvh(struct md_page *pvh) 5158{ 5159 5160 pv_entry_t pv; 5161 pt1_entry_t pte1; 5162 pt2_entry_t pte2; 5163 pmap_t pmap; 5164 boolean_t rv; 5165 5166 rw_assert(&pvh_global_lock, RA_WLOCKED); 5167 rv = FALSE; 5168 sched_pin(); 5169 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5170 pmap = PV_PMAP(pv); 5171 PMAP_LOCK(pmap); 5172 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5173 if (pte1_is_section(pte1)) { 5174 rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); 5175 } else { 5176 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5177 rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); 5178 } 5179 PMAP_UNLOCK(pmap); 5180 if (rv) 5181 break; 5182 } 5183 sched_unpin(); 5184 return (rv); 5185} 5186 5187/* 5188 * pmap_is_referenced: 5189 * 5190 * Return whether or not the specified physical page was referenced 5191 * in any physical maps. 
5192 */ 5193boolean_t 5194pmap_is_referenced(vm_page_t m) 5195{ 5196 boolean_t rv; 5197 5198 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5199 ("%s: page %p is not managed", __func__, m)); 5200 rw_wlock(&pvh_global_lock); 5201 rv = pmap_is_referenced_pvh(&m->md) || 5202 ((m->flags & PG_FICTITIOUS) == 0 && 5203 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5204 rw_wunlock(&pvh_global_lock); 5205 return (rv); 5206} 5207 5208/* 5209 * pmap_ts_referenced: 5210 * 5211 * Return a count of reference bits for a page, clearing those bits. 5212 * It is not necessary for every reference bit to be cleared, but it 5213 * is necessary that 0 only be returned when there are truly no 5214 * reference bits set. 5215 * 5216 * As an optimization, update the page's dirty field if a modified bit is 5217 * found while counting reference bits. This opportunistic update can be 5218 * performed at low cost and can eliminate the need for some future calls 5219 * to pmap_is_modified(). However, since this function stops after 5220 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5221 * dirty pages. Those dirty pages will only be detected by a future call 5222 * to pmap_is_modified(). 5223 */ 5224int 5225pmap_ts_referenced(vm_page_t m) 5226{ 5227 struct md_page *pvh; 5228 pv_entry_t pv, pvf; 5229 pmap_t pmap; 5230 pt1_entry_t *pte1p, opte1; 5231 pt2_entry_t *pte2p, opte2; 5232 vm_paddr_t pa; 5233 int rtval = 0; 5234 5235 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5236 ("%s: page %p is not managed", __func__, m)); 5237 pa = VM_PAGE_TO_PHYS(m); 5238 pvh = pa_to_pvh(pa); 5239 rw_wlock(&pvh_global_lock); 5240 sched_pin(); 5241 if ((m->flags & PG_FICTITIOUS) != 0 || 5242 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5243 goto small_mappings; 5244 pv = pvf; 5245 do { 5246 pmap = PV_PMAP(pv); 5247 PMAP_LOCK(pmap); 5248 pte1p = pmap_pte1(pmap, pv->pv_va); 5249 opte1 = pte1_load(pte1p); 5250 if (pte1_is_dirty(opte1)) { 5251 /* 5252 * Although "opte1" is mapping a 1MB page, because 5253 * this function is called at a 4KB page granularity, 5254 * we only update the 4KB page under test. 5255 */ 5256 vm_page_dirty(m); 5257 } 5258 if ((opte1 & PTE1_A) != 0) { 5259 /* 5260 * Since this reference bit is shared by 256 4KB pages, 5261 * it should not be cleared every time it is tested. 5262 * Apply a simple "hash" function on the physical page 5263 * number, the virtual section number, and the pmap 5264 * address to select one 4KB page out of the 256 5265 * on which testing the reference bit will result 5266 * in clearing that bit. This function is designed 5267 * to avoid the selection of the same 4KB page 5268 * for every 1MB page mapping. 5269 * 5270 * On demotion, a mapping that hasn't been referenced 5271 * is simply destroyed. To avoid the possibility of a 5272 * subsequent page fault on a demoted wired mapping, 5273 * always leave its reference bit set. Moreover, 5274 * since the section is wired, the current state of 5275 * its reference bit won't affect page replacement. 5276 */ 5277 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ 5278 (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && 5279 !pte1_is_wired(opte1)) { 5280 pte1_clear_bit(pte1p, PTE1_A); 5281 pmap_tlb_flush(pmap, pv->pv_va); 5282 } 5283 rtval++; 5284 } 5285 PMAP_UNLOCK(pmap); 5286 /* Rotate the PV list if it has more than one entry. 
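* The rotation lets the do/while condition advance to the next mapping
* via TAILQ_FIRST() and keeps successive calls from always starting
* with the same mapping.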
*/ 5287 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5288 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5289 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5290 } 5291 if (rtval >= PMAP_TS_REFERENCED_MAX) 5292 goto out; 5293 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5294small_mappings: 5295 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5296 goto out; 5297 pv = pvf; 5298 do { 5299 pmap = PV_PMAP(pv); 5300 PMAP_LOCK(pmap); 5301 pte1p = pmap_pte1(pmap, pv->pv_va); 5302 KASSERT(pte1_is_link(pte1_load(pte1p)), 5303 ("%s: not found a link in page %p's pv list", __func__, m)); 5304 5305 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5306 opte2 = pte2_load(pte2p); 5307 if (pte2_is_dirty(opte2)) 5308 vm_page_dirty(m); 5309 if ((opte2 & PTE2_A) != 0) { 5310 pte2_clear_bit(pte2p, PTE2_A); 5311 pmap_tlb_flush(pmap, pv->pv_va); 5312 rtval++; 5313 } 5314 PMAP_UNLOCK(pmap); 5315 /* Rotate the PV list if it has more than one entry. */ 5316 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5317 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5318 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5319 } 5320 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5321 PMAP_TS_REFERENCED_MAX); 5322out: 5323 sched_unpin(); 5324 rw_wunlock(&pvh_global_lock); 5325 return (rtval); 5326} 5327 5328/* 5329 * Clear the wired attribute from the mappings for the specified range of 5330 * addresses in the given pmap. Every valid mapping within that range 5331 * must have the wired attribute set. In contrast, invalid mappings 5332 * cannot have the wired attribute set, so they are ignored. 5333 * 5334 * The wired attribute of the page table entry is not a hardware feature, 5335 * so there is no need to invalidate any TLB entries. 5336 */ 5337void 5338pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5339{ 5340 vm_offset_t nextva; 5341 pt1_entry_t *pte1p, pte1; 5342 pt2_entry_t *pte2p, pte2; 5343 boolean_t pv_lists_locked; 5344 5345 if (pmap_is_current(pmap)) 5346 pv_lists_locked = FALSE; 5347 else { 5348 pv_lists_locked = TRUE; 5349resume: 5350 rw_wlock(&pvh_global_lock); 5351 sched_pin(); 5352 } 5353 PMAP_LOCK(pmap); 5354 for (; sva < eva; sva = nextva) { 5355 nextva = pte1_trunc(sva + PTE1_SIZE); 5356 if (nextva < sva) 5357 nextva = eva; 5358 5359 pte1p = pmap_pte1(pmap, sva); 5360 pte1 = pte1_load(pte1p); 5361 5362 /* 5363 * Weed out invalid mappings. Note: we assume that L1 page 5364 * page table is always allocated, and in kernel virtual. 5365 */ 5366 if (pte1 == 0) 5367 continue; 5368 5369 if (pte1_is_section(pte1)) { 5370 if (!pte1_is_wired(pte1)) 5371 panic("%s: pte1 %#x not wired", __func__, pte1); 5372 5373 /* 5374 * Are we unwiring the entire large page? If not, 5375 * demote the mapping and fall through. 5376 */ 5377 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 5378 pte1_clear_bit(pte1p, PTE1_W); 5379 pmap->pm_stats.wired_count -= PTE1_SIZE / 5380 PAGE_SIZE; 5381 continue; 5382 } else { 5383 if (!pv_lists_locked) { 5384 pv_lists_locked = TRUE; 5385 if (!rw_try_wlock(&pvh_global_lock)) { 5386 PMAP_UNLOCK(pmap); 5387 /* Repeat sva. 
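* The try-lock on pvh_global_lock failed; drop the pmap lock,
* acquire both locks at "resume", and reprocess the same sva.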
*/ 5388 goto resume; 5389 } 5390 sched_pin(); 5391 } 5392 if (!pmap_demote_pte1(pmap, pte1p, sva)) 5393 panic("%s: demotion failed", __func__); 5394#ifdef INVARIANTS 5395 else { 5396 /* Update pte1 after demotion */ 5397 pte1 = pte1_load(pte1p); 5398 } 5399#endif 5400 } 5401 } 5402 5403 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5404 " is not link", __func__, pmap, sva, pte1, pte1p)); 5405 5406 /* 5407 * Limit our scan to either the end of the va represented 5408 * by the current L2 page table page, or to the end of the 5409 * range being protected. 5410 */ 5411 if (nextva > eva) 5412 nextva = eva; 5413 5414 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5415 sva += PAGE_SIZE) { 5416 pte2 = pte2_load(pte2p); 5417 if (!pte2_is_valid(pte2)) 5418 continue; 5419 if (!pte2_is_wired(pte2)) 5420 panic("%s: pte2 %#x is missing PTE2_W", 5421 __func__, pte2); 5422 5423 /* 5424 * PTE2_W must be cleared atomically. Although the pmap 5425 * lock synchronizes access to PTE2_W, another processor 5426 * could be changing PTE2_NM and/or PTE2_A concurrently. 5427 */ 5428 pte2_clear_bit(pte2p, PTE2_W); 5429 pmap->pm_stats.wired_count--; 5430 } 5431 } 5432 if (pv_lists_locked) { 5433 sched_unpin(); 5434 rw_wunlock(&pvh_global_lock); 5435 } 5436 PMAP_UNLOCK(pmap); 5437} 5438 5439/* 5440 * Clear the write and modified bits in each of the given page's mappings. 5441 */ 5442void 5443pmap_remove_write(vm_page_t m) 5444{ 5445 struct md_page *pvh; 5446 pv_entry_t next_pv, pv; 5447 pmap_t pmap; 5448 pt1_entry_t *pte1p; 5449 pt2_entry_t *pte2p, opte2; 5450 vm_offset_t va; 5451 5452 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5453 ("%s: page %p is not managed", __func__, m)); 5454 5455 /* 5456 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5457 * set by another thread while the object is locked. Thus, 5458 * if PGA_WRITEABLE is clear, no page table entries need updating. 5459 */ 5460 VM_OBJECT_ASSERT_WLOCKED(m->object); 5461 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5462 return; 5463 rw_wlock(&pvh_global_lock); 5464 sched_pin(); 5465 if ((m->flags & PG_FICTITIOUS) != 0) 5466 goto small_mappings; 5467 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5468 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5469 va = pv->pv_va; 5470 pmap = PV_PMAP(pv); 5471 PMAP_LOCK(pmap); 5472 pte1p = pmap_pte1(pmap, va); 5473 if (!(pte1_load(pte1p) & PTE1_RO)) 5474 (void)pmap_demote_pte1(pmap, pte1p, va); 5475 PMAP_UNLOCK(pmap); 5476 } 5477small_mappings: 5478 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5479 pmap = PV_PMAP(pv); 5480 PMAP_LOCK(pmap); 5481 pte1p = pmap_pte1(pmap, pv->pv_va); 5482 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5483 " a section in page %p's pv list", __func__, m)); 5484 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5485 opte2 = pte2_load(pte2p); 5486 if (!(opte2 & PTE2_RO)) { 5487 pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); 5488 if (pte2_is_dirty(opte2)) 5489 vm_page_dirty(m); 5490 pmap_tlb_flush(pmap, pv->pv_va); 5491 } 5492 PMAP_UNLOCK(pmap); 5493 } 5494 vm_page_aflag_clear(m, PGA_WRITEABLE); 5495 sched_unpin(); 5496 rw_wunlock(&pvh_global_lock); 5497} 5498 5499/* 5500 * Apply the given advice to the specified range of addresses within the 5501 * given pmap. Depending on the advice, clear the referenced and/or 5502 * modified flags in each mapping and set the mapped page's dirty field. 
5503 */ 5504void 5505pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5506{ 5507 pt1_entry_t *pte1p, opte1; 5508 pt2_entry_t *pte2p, pte2; 5509 vm_offset_t pdnxt; 5510 vm_page_t m; 5511 boolean_t pv_lists_locked; 5512 5513 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5514 return; 5515 if (pmap_is_current(pmap)) 5516 pv_lists_locked = FALSE; 5517 else { 5518 pv_lists_locked = TRUE; 5519resume: 5520 rw_wlock(&pvh_global_lock); 5521 sched_pin(); 5522 } 5523 PMAP_LOCK(pmap); 5524 for (; sva < eva; sva = pdnxt) { 5525 pdnxt = pte1_trunc(sva + PTE1_SIZE); 5526 if (pdnxt < sva) 5527 pdnxt = eva; 5528 pte1p = pmap_pte1(pmap, sva); 5529 opte1 = pte1_load(pte1p); 5530 if (!pte1_is_valid(opte1)) /* XXX */ 5531 continue; 5532 else if (pte1_is_section(opte1)) { 5533 if (!pte1_is_managed(opte1)) 5534 continue; 5535 if (!pv_lists_locked) { 5536 pv_lists_locked = TRUE; 5537 if (!rw_try_wlock(&pvh_global_lock)) { 5538 PMAP_UNLOCK(pmap); 5539 goto resume; 5540 } 5541 sched_pin(); 5542 } 5543 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5544 /* 5545 * The large page mapping was destroyed. 5546 */ 5547 continue; 5548 } 5549 5550 /* 5551 * Unless the page mappings are wired, remove the 5552 * mapping to a single page so that a subsequent 5553 * access may repromote. Since the underlying L2 page 5554 * table is fully populated, this removal never 5555 * frees a L2 page table page. 5556 */ 5557 if (!pte1_is_wired(opte1)) { 5558 pte2p = pmap_pte2_quick(pmap, sva); 5559 KASSERT(pte2_is_valid(pte2_load(pte2p)), 5560 ("%s: invalid PTE2", __func__)); 5561 pmap_remove_pte2(pmap, pte2p, sva, NULL); 5562 } 5563 } 5564 if (pdnxt > eva) 5565 pdnxt = eva; 5566 for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, 5567 sva += PAGE_SIZE) { 5568 pte2 = pte2_load(pte2p); 5569 if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) 5570 continue; 5571 else if (pte2_is_dirty(pte2)) { 5572 if (advice == MADV_DONTNEED) { 5573 /* 5574 * Future calls to pmap_is_modified() 5575 * can be avoided by making the page 5576 * dirty now. 5577 */ 5578 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 5579 vm_page_dirty(m); 5580 } 5581 pte2_set_bit(pte2p, PTE2_NM); 5582 pte2_clear_bit(pte2p, PTE2_A); 5583 } else if ((pte2 & PTE2_A) != 0) 5584 pte2_clear_bit(pte2p, PTE2_A); 5585 else 5586 continue; 5587 pmap_tlb_flush(pmap, sva); 5588 } 5589 } 5590 if (pv_lists_locked) { 5591 sched_unpin(); 5592 rw_wunlock(&pvh_global_lock); 5593 } 5594 PMAP_UNLOCK(pmap); 5595} 5596 5597/* 5598 * Clear the modify bits on the specified physical page. 5599 */ 5600void 5601pmap_clear_modify(vm_page_t m) 5602{ 5603 struct md_page *pvh; 5604 pv_entry_t next_pv, pv; 5605 pmap_t pmap; 5606 pt1_entry_t *pte1p, opte1; 5607 pt2_entry_t *pte2p, opte2; 5608 vm_offset_t va; 5609 5610 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5611 ("%s: page %p is not managed", __func__, m)); 5612 VM_OBJECT_ASSERT_WLOCKED(m->object); 5613 KASSERT(!vm_page_xbusied(m), 5614 ("%s: page %p is exclusive busy", __func__, m)); 5615 5616 /* 5617 * If the page is not PGA_WRITEABLE, then no PTE2s can have PTE2_NM 5618 * cleared. If the object containing the page is locked and the page 5619 * is not exclusive busied, then PGA_WRITEABLE cannot be concurrently 5620 * set. 
5621 */ 5622 if ((m->flags & PGA_WRITEABLE) == 0) 5623 return; 5624 rw_wlock(&pvh_global_lock); 5625 sched_pin(); 5626 if ((m->flags & PG_FICTITIOUS) != 0) 5627 goto small_mappings; 5628 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5629 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5630 va = pv->pv_va; 5631 pmap = PV_PMAP(pv); 5632 PMAP_LOCK(pmap); 5633 pte1p = pmap_pte1(pmap, va); 5634 opte1 = pte1_load(pte1p); 5635 if (!(opte1 & PTE1_RO)) { 5636 if (pmap_demote_pte1(pmap, pte1p, va) && 5637 !pte1_is_wired(opte1)) { 5638 /* 5639 * Write protect the mapping to a 5640 * single page so that a subsequent 5641 * write access may repromote. 5642 */ 5643 va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); 5644 pte2p = pmap_pte2_quick(pmap, va); 5645 opte2 = pte2_load(pte2p); 5646 if ((opte2 & PTE2_V)) { 5647 pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); 5648 vm_page_dirty(m); 5649 pmap_tlb_flush(pmap, va); 5650 } 5651 } 5652 } 5653 PMAP_UNLOCK(pmap); 5654 } 5655small_mappings: 5656 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5657 pmap = PV_PMAP(pv); 5658 PMAP_LOCK(pmap); 5659 pte1p = pmap_pte1(pmap, pv->pv_va); 5660 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5661 " a section in page %p's pv list", __func__, m)); 5662 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5663 if (pte2_is_dirty(pte2_load(pte2p))) { 5664 pte2_set_bit(pte2p, PTE2_NM); 5665 pmap_tlb_flush(pmap, pv->pv_va); 5666 } 5667 PMAP_UNLOCK(pmap); 5668 } 5669 sched_unpin(); 5670 rw_wunlock(&pvh_global_lock); 5671} 5672 5673 5674/* 5675 * Sets the memory attribute for the specified page. 5676 */ 5677void 5678pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5679{ 5680 pt2_entry_t *cmap2_pte2p; 5681 vm_memattr_t oma; 5682 vm_paddr_t pa; 5683 struct pcpu *pc; 5684 5685 oma = m->md.pat_mode; 5686 m->md.pat_mode = ma; 5687 5688 CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, 5689 VM_PAGE_TO_PHYS(m), oma, ma); 5690 if ((m->flags & PG_FICTITIOUS) != 0) 5691 return; 5692#if 0 5693 /* 5694 * If "m" is a normal page, flush it from the cache. 5695 * 5696 * First, try to find an existing mapping of the page by sf 5697 * buffer. sf_buf_invalidate_cache() modifies mapping and 5698 * flushes the cache. 5699 */ 5700 if (sf_buf_invalidate_cache(m, oma)) 5701 return; 5702#endif 5703 /* 5704 * If page is not mapped by sf buffer, map the page 5705 * transient and do invalidation. 5706 */ 5707 if (ma != oma) { 5708 pa = VM_PAGE_TO_PHYS(m); 5709 sched_pin(); 5710 pc = get_pcpu(); 5711 cmap2_pte2p = pc->pc_cmap2_pte2p; 5712 mtx_lock(&pc->pc_cmap_lock); 5713 if (pte2_load(cmap2_pte2p) != 0) 5714 panic("%s: CMAP2 busy", __func__); 5715 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 5716 vm_memattr_to_pte2(ma))); 5717 dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); 5718 pte2_clear(cmap2_pte2p); 5719 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5720 sched_unpin(); 5721 mtx_unlock(&pc->pc_cmap_lock); 5722 } 5723} 5724 5725/* 5726 * Miscellaneous support routines follow 5727 */ 5728 5729/* 5730 * Returns TRUE if the given page is mapped individually or as part of 5731 * a 1mpage. Otherwise, returns FALSE. 
5732 */ 5733boolean_t 5734pmap_page_is_mapped(vm_page_t m) 5735{ 5736 boolean_t rv; 5737 5738 if ((m->oflags & VPO_UNMANAGED) != 0) 5739 return (FALSE); 5740 rw_wlock(&pvh_global_lock); 5741 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5742 ((m->flags & PG_FICTITIOUS) == 0 && 5743 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5744 rw_wunlock(&pvh_global_lock); 5745 return (rv); 5746} 5747 5748/* 5749 * Returns true if the pmap's pv is one of the first 5750 * 16 pvs linked to from this page. This count may 5751 * be changed upwards or downwards in the future; it 5752 * is only necessary that true be returned for a small 5753 * subset of pmaps for proper page aging. 5754 */ 5755boolean_t 5756pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5757{ 5758 struct md_page *pvh; 5759 pv_entry_t pv; 5760 int loops = 0; 5761 boolean_t rv; 5762 5763 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5764 ("%s: page %p is not managed", __func__, m)); 5765 rv = FALSE; 5766 rw_wlock(&pvh_global_lock); 5767 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5768 if (PV_PMAP(pv) == pmap) { 5769 rv = TRUE; 5770 break; 5771 } 5772 loops++; 5773 if (loops >= 16) 5774 break; 5775 } 5776 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5777 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5778 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5779 if (PV_PMAP(pv) == pmap) { 5780 rv = TRUE; 5781 break; 5782 } 5783 loops++; 5784 if (loops >= 16) 5785 break; 5786 } 5787 } 5788 rw_wunlock(&pvh_global_lock); 5789 return (rv); 5790} 5791 5792/* 5793 * pmap_zero_page zeros the specified hardware page by mapping 5794 * the page into KVM and using bzero to clear its contents. 5795 */ 5796void 5797pmap_zero_page(vm_page_t m) 5798{ 5799 pt2_entry_t *cmap2_pte2p; 5800 struct pcpu *pc; 5801 5802 sched_pin(); 5803 pc = get_pcpu(); 5804 cmap2_pte2p = pc->pc_cmap2_pte2p; 5805 mtx_lock(&pc->pc_cmap_lock); 5806 if (pte2_load(cmap2_pte2p) != 0) 5807 panic("%s: CMAP2 busy", __func__); 5808 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5809 vm_page_pte2_attr(m))); 5810 pagezero(pc->pc_cmap2_addr); 5811 pte2_clear(cmap2_pte2p); 5812 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5813 sched_unpin(); 5814 mtx_unlock(&pc->pc_cmap_lock); 5815} 5816 5817/* 5818 * pmap_zero_page_area zeros the specified hardware page by mapping 5819 * the page into KVM and using bzero to clear its contents. 5820 * 5821 * off and size may not cover an area beyond a single hardware page. 5822 */ 5823void 5824pmap_zero_page_area(vm_page_t m, int off, int size) 5825{ 5826 pt2_entry_t *cmap2_pte2p; 5827 struct pcpu *pc; 5828 5829 sched_pin(); 5830 pc = get_pcpu(); 5831 cmap2_pte2p = pc->pc_cmap2_pte2p; 5832 mtx_lock(&pc->pc_cmap_lock); 5833 if (pte2_load(cmap2_pte2p) != 0) 5834 panic("%s: CMAP2 busy", __func__); 5835 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5836 vm_page_pte2_attr(m))); 5837 if (off == 0 && size == PAGE_SIZE) 5838 pagezero(pc->pc_cmap2_addr); 5839 else 5840 bzero(pc->pc_cmap2_addr + off, size); 5841 pte2_clear(cmap2_pte2p); 5842 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5843 sched_unpin(); 5844 mtx_unlock(&pc->pc_cmap_lock); 5845} 5846 5847/* 5848 * pmap_zero_page_idle zeros the specified hardware page by mapping 5849 * the page into KVM and using bzero to clear its contents. This 5850 * is intended to be called from the vm_pagezero process only and 5851 * outside of Giant. 
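 *
 * Unlike pmap_zero_page() and pmap_zero_page_area() above, which use the
 * per-CPU CMAP2 slot, this variant uses the global CMAP3/CADDR3 window.
 * The per-CPU pattern used above is, in outline (a simplified sketch of
 * the code above; "pa" and "attr" stand for the page's physical address
 * and memory attributes, and error checking is elided):
 *
 *	sched_pin();
 *	pc = get_pcpu();
 *	mtx_lock(&pc->pc_cmap_lock);
 *	pte2_store(pc->pc_cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr));
 *	... access the page through pc->pc_cmap2_addr ...
 *	pte2_clear(pc->pc_cmap2_pte2p);
 *	tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
 *	sched_unpin();
 *	mtx_unlock(&pc->pc_cmap_lock);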
5852 */ 5853void 5854pmap_zero_page_idle(vm_page_t m) 5855{ 5856 5857 if (pte2_load(CMAP3) != 0) 5858 panic("%s: CMAP3 busy", __func__); 5859 sched_pin(); 5860 pte2_store(CMAP3, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5861 vm_page_pte2_attr(m))); 5862 pagezero(CADDR3); 5863 pte2_clear(CMAP3); 5864 tlb_flush((vm_offset_t)CADDR3); 5865 sched_unpin(); 5866} 5867 5868/* 5869 * pmap_copy_page copies the specified (machine independent) 5870 * page by mapping the page into virtual memory and using 5871 * bcopy to copy the page, one machine dependent page at a 5872 * time. 5873 */ 5874void 5875pmap_copy_page(vm_page_t src, vm_page_t dst) 5876{ 5877 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5878 struct pcpu *pc; 5879 5880 sched_pin(); 5881 pc = get_pcpu(); 5882 cmap1_pte2p = pc->pc_cmap1_pte2p; 5883 cmap2_pte2p = pc->pc_cmap2_pte2p; 5884 mtx_lock(&pc->pc_cmap_lock); 5885 if (pte2_load(cmap1_pte2p) != 0) 5886 panic("%s: CMAP1 busy", __func__); 5887 if (pte2_load(cmap2_pte2p) != 0) 5888 panic("%s: CMAP2 busy", __func__); 5889 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), 5890 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); 5891 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), 5892 PTE2_AP_KRW, vm_page_pte2_attr(dst))); 5893 bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); 5894 pte2_clear(cmap1_pte2p); 5895 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5896 pte2_clear(cmap2_pte2p); 5897 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5898 sched_unpin(); 5899 mtx_unlock(&pc->pc_cmap_lock); 5900} 5901 5902int unmapped_buf_allowed = 1; 5903 5904void 5905pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5906 vm_offset_t b_offset, int xfersize) 5907{ 5908 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5909 vm_page_t a_pg, b_pg; 5910 char *a_cp, *b_cp; 5911 vm_offset_t a_pg_offset, b_pg_offset; 5912 struct pcpu *pc; 5913 int cnt; 5914 5915 sched_pin(); 5916 pc = get_pcpu(); 5917 cmap1_pte2p = pc->pc_cmap1_pte2p; 5918 cmap2_pte2p = pc->pc_cmap2_pte2p; 5919 mtx_lock(&pc->pc_cmap_lock); 5920 if (pte2_load(cmap1_pte2p) != 0) 5921 panic("pmap_copy_pages: CMAP1 busy"); 5922 if (pte2_load(cmap2_pte2p) != 0) 5923 panic("pmap_copy_pages: CMAP2 busy"); 5924 while (xfersize > 0) { 5925 a_pg = ma[a_offset >> PAGE_SHIFT]; 5926 a_pg_offset = a_offset & PAGE_MASK; 5927 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5928 b_pg = mb[b_offset >> PAGE_SHIFT]; 5929 b_pg_offset = b_offset & PAGE_MASK; 5930 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5931 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), 5932 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); 5933 tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); 5934 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), 5935 PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); 5936 tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); 5937 a_cp = pc->pc_cmap1_addr + a_pg_offset; 5938 b_cp = pc->pc_cmap2_addr + b_pg_offset; 5939 bcopy(a_cp, b_cp, cnt); 5940 a_offset += cnt; 5941 b_offset += cnt; 5942 xfersize -= cnt; 5943 } 5944 pte2_clear(cmap1_pte2p); 5945 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5946 pte2_clear(cmap2_pte2p); 5947 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5948 sched_unpin(); 5949 mtx_unlock(&pc->pc_cmap_lock); 5950} 5951 5952vm_offset_t 5953pmap_quick_enter_page(vm_page_t m) 5954{ 5955 struct pcpu *pc; 5956 pt2_entry_t *pte2p; 5957 5958 critical_enter(); 5959 pc = get_pcpu(); 5960 pte2p = pc->pc_qmap_pte2p; 5961 5962 KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); 5963 5964 pte2_store(pte2p, 
PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5965 vm_page_pte2_attr(m))); 5966 return (pc->pc_qmap_addr); 5967} 5968 5969void 5970pmap_quick_remove_page(vm_offset_t addr) 5971{ 5972 struct pcpu *pc; 5973 pt2_entry_t *pte2p; 5974 5975 pc = get_pcpu(); 5976 pte2p = pc->pc_qmap_pte2p; 5977 5978 KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); 5979 KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); 5980 5981 pte2_clear(pte2p); 5982 tlb_flush(pc->pc_qmap_addr); 5983 critical_exit(); 5984} 5985 5986/* 5987 * Copy the range specified by src_addr/len 5988 * from the source map to the range dst_addr/len 5989 * in the destination map. 5990 * 5991 * This routine is only advisory and need not do anything. 5992 */ 5993void 5994pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 5995 vm_offset_t src_addr) 5996{ 5997 struct spglist free; 5998 vm_offset_t addr; 5999 vm_offset_t end_addr = src_addr + len; 6000 vm_offset_t nextva; 6001 6002 if (dst_addr != src_addr) 6003 return; 6004 6005 if (!pmap_is_current(src_pmap)) 6006 return; 6007 6008 rw_wlock(&pvh_global_lock); 6009 if (dst_pmap < src_pmap) { 6010 PMAP_LOCK(dst_pmap); 6011 PMAP_LOCK(src_pmap); 6012 } else { 6013 PMAP_LOCK(src_pmap); 6014 PMAP_LOCK(dst_pmap); 6015 } 6016 sched_pin(); 6017 for (addr = src_addr; addr < end_addr; addr = nextva) { 6018 pt2_entry_t *src_pte2p, *dst_pte2p; 6019 vm_page_t dst_mpt2pg, src_mpt2pg; 6020 pt1_entry_t src_pte1; 6021 u_int pte1_idx; 6022 6023 KASSERT(addr < VM_MAXUSER_ADDRESS, 6024 ("%s: invalid to pmap_copy page tables", __func__)); 6025 6026 nextva = pte1_trunc(addr + PTE1_SIZE); 6027 if (nextva < addr) 6028 nextva = end_addr; 6029 6030 pte1_idx = pte1_index(addr); 6031 src_pte1 = src_pmap->pm_pt1[pte1_idx]; 6032 if (pte1_is_section(src_pte1)) { 6033 if ((addr & PTE1_OFFSET) != 0 || 6034 (addr + PTE1_SIZE) > end_addr) 6035 continue; 6036 if (dst_pmap->pm_pt1[pte1_idx] == 0 && 6037 (!pte1_is_managed(src_pte1) || 6038 pmap_pv_insert_pte1(dst_pmap, addr, 6039 pte1_pa(src_pte1)))) { 6040 dst_pmap->pm_pt1[pte1_idx] = src_pte1 & 6041 ~PTE1_W; 6042 dst_pmap->pm_stats.resident_count += 6043 PTE1_SIZE / PAGE_SIZE; 6044 pmap_pte1_mappings++; 6045 } 6046 continue; 6047 } else if (!pte1_is_link(src_pte1)) 6048 continue; 6049 6050 src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); 6051 6052 /* 6053 * We leave PT2s to be linked from PT1 even if they are not 6054 * referenced until all PT2s in a page are without reference. 6055 * 6056 * QQQ: It could be changed ... 6057 */ 6058#if 0 /* single_pt2_link_is_cleared */ 6059 KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, 6060 ("%s: source page table page is unused", __func__)); 6061#else 6062 if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) 6063 continue; 6064#endif 6065 if (nextva > end_addr) 6066 nextva = end_addr; 6067 6068 src_pte2p = pt2map_entry(addr); 6069 while (addr < nextva) { 6070 pt2_entry_t temp_pte2; 6071 temp_pte2 = pte2_load(src_pte2p); 6072 /* 6073 * we only virtual copy managed pages 6074 */ 6075 if (pte2_is_managed(temp_pte2)) { 6076 dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, 6077 PMAP_ENTER_NOSLEEP); 6078 if (dst_mpt2pg == NULL) 6079 goto out; 6080 dst_pte2p = pmap_pte2_quick(dst_pmap, addr); 6081 if (!pte2_is_valid(pte2_load(dst_pte2p)) && 6082 pmap_try_insert_pv_entry(dst_pmap, addr, 6083 PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { 6084 /* 6085 * Clear the wired, modified, and 6086 * accessed (referenced) bits 6087 * during the copy. 
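					 * As a result, the destination
					 * mapping starts out unwired and
					 * unreferenced, and is hardware
					 * read-only (PTE2_NM set), so the
					 * first write through it faults
					 * and marks it modified
					 * independently of the source.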
6088 */ 6089 temp_pte2 &= ~(PTE2_W | PTE2_A); 6090 temp_pte2 |= PTE2_NM; 6091 pte2_store(dst_pte2p, temp_pte2); 6092 dst_pmap->pm_stats.resident_count++; 6093 } else { 6094 SLIST_INIT(&free); 6095 if (pmap_unwire_pt2(dst_pmap, addr, 6096 dst_mpt2pg, &free)) { 6097 pmap_tlb_flush(dst_pmap, addr); 6098 pmap_free_zero_pages(&free); 6099 } 6100 goto out; 6101 } 6102 if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= 6103 pt2_wirecount_get(src_mpt2pg, pte1_idx)) 6104 break; 6105 } 6106 addr += PAGE_SIZE; 6107 src_pte2p++; 6108 } 6109 } 6110out: 6111 sched_unpin(); 6112 rw_wunlock(&pvh_global_lock); 6113 PMAP_UNLOCK(src_pmap); 6114 PMAP_UNLOCK(dst_pmap); 6115} 6116 6117/* 6118 * Increase the starting virtual address of the given mapping if a 6119 * different alignment might result in more section mappings. 6120 */ 6121void 6122pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6123 vm_offset_t *addr, vm_size_t size) 6124{ 6125 vm_offset_t pte1_offset; 6126 6127 if (size < PTE1_SIZE) 6128 return; 6129 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6130 offset += ptoa(object->pg_color); 6131 pte1_offset = offset & PTE1_OFFSET; 6132 if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || 6133 (*addr & PTE1_OFFSET) == pte1_offset) 6134 return; 6135 if ((*addr & PTE1_OFFSET) < pte1_offset) 6136 *addr = pte1_trunc(*addr) + pte1_offset; 6137 else 6138 *addr = pte1_roundup(*addr) + pte1_offset; 6139} 6140 6141void 6142pmap_activate(struct thread *td) 6143{ 6144 pmap_t pmap, oldpmap; 6145 u_int cpuid, ttb; 6146 6147 PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); 6148 6149 critical_enter(); 6150 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6151 oldpmap = PCPU_GET(curpmap); 6152 cpuid = PCPU_GET(cpuid); 6153 6154#if defined(SMP) 6155 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6156 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6157#else 6158 CPU_CLR(cpuid, &oldpmap->pm_active); 6159 CPU_SET(cpuid, &pmap->pm_active); 6160#endif 6161 6162 ttb = pmap_ttb_get(pmap); 6163 6164 /* 6165 * pmap_activate is for the current thread on the current cpu 6166 */ 6167 td->td_pcb->pcb_pagedir = ttb; 6168 cp15_ttbr_set(ttb); 6169 PCPU_SET(curpmap, pmap); 6170 critical_exit(); 6171} 6172 6173/* 6174 * Perform the pmap work for mincore. 
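 *
 * For example, a resident and dirty 1 MB section mapping yields
 * MINCORE_INCORE | MINCORE_SUPER | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER
 * (plus the referenced bits when PTE1_A is set); a resident small page
 * reports the same information without MINCORE_SUPER.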
6175 */ 6176int 6177pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 6178{ 6179 pt1_entry_t *pte1p, pte1; 6180 pt2_entry_t *pte2p, pte2; 6181 vm_paddr_t pa; 6182 bool managed; 6183 int val; 6184 6185 PMAP_LOCK(pmap); 6186retry: 6187 pte1p = pmap_pte1(pmap, addr); 6188 pte1 = pte1_load(pte1p); 6189 if (pte1_is_section(pte1)) { 6190 pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); 6191 managed = pte1_is_managed(pte1); 6192 val = MINCORE_SUPER | MINCORE_INCORE; 6193 if (pte1_is_dirty(pte1)) 6194 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6195 if (pte1 & PTE1_A) 6196 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6197 } else if (pte1_is_link(pte1)) { 6198 pte2p = pmap_pte2(pmap, addr); 6199 pte2 = pte2_load(pte2p); 6200 pmap_pte2_release(pte2p); 6201 pa = pte2_pa(pte2); 6202 managed = pte2_is_managed(pte2); 6203 val = MINCORE_INCORE; 6204 if (pte2_is_dirty(pte2)) 6205 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6206 if (pte2 & PTE2_A) 6207 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6208 } else { 6209 managed = false; 6210 val = 0; 6211 } 6212 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6213 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6214 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 6215 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 6216 goto retry; 6217 } else 6218 PA_UNLOCK_COND(*locked_pa); 6219 PMAP_UNLOCK(pmap); 6220 return (val); 6221} 6222 6223void 6224pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) 6225{ 6226 vm_offset_t sva; 6227 uint32_t l2attr; 6228 6229 KASSERT((size & PAGE_MASK) == 0, 6230 ("%s: device mapping not page-sized", __func__)); 6231 6232 sva = va; 6233 l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); 6234 while (size != 0) { 6235 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); 6236 va += PAGE_SIZE; 6237 pa += PAGE_SIZE; 6238 size -= PAGE_SIZE; 6239 } 6240 tlb_flush_range(sva, va - sva); 6241} 6242 6243void 6244pmap_kremove_device(vm_offset_t va, vm_size_t size) 6245{ 6246 vm_offset_t sva; 6247 6248 KASSERT((size & PAGE_MASK) == 0, 6249 ("%s: device mapping not page-sized", __func__)); 6250 6251 sva = va; 6252 while (size != 0) { 6253 pmap_kremove(va); 6254 va += PAGE_SIZE; 6255 size -= PAGE_SIZE; 6256 } 6257 tlb_flush_range(sva, va - sva); 6258} 6259 6260void 6261pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) 6262{ 6263 6264 pcb->pcb_pagedir = pmap_ttb_get(pmap); 6265} 6266 6267 6268/* 6269 * Clean L1 data cache range by physical address. 6270 * The range must be within a single page. 6271 */ 6272static void 6273pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) 6274{ 6275 pt2_entry_t *cmap2_pte2p; 6276 struct pcpu *pc; 6277 6278 KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, 6279 ("%s: not on single page", __func__)); 6280 6281 sched_pin(); 6282 pc = get_pcpu(); 6283 cmap2_pte2p = pc->pc_cmap2_pte2p; 6284 mtx_lock(&pc->pc_cmap_lock); 6285 if (pte2_load(cmap2_pte2p) != 0) 6286 panic("%s: CMAP2 busy", __func__); 6287 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); 6288 dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); 6289 pte2_clear(cmap2_pte2p); 6290 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6291 sched_unpin(); 6292 mtx_unlock(&pc->pc_cmap_lock); 6293} 6294 6295/* 6296 * Sync instruction cache range which is not mapped yet. 
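 *
 * Note that the va argument is not used below: the d-cache write-back is
 * done through temporary physical-address-based mappings (see
 * pmap_dcache_wb_pou() above) and the i-cache invalidation is global.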
 */
void
cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size)
{
	uint32_t len, offset;
	vm_page_t m;

	/* Write back the d-cache on the given address range. */
	offset = pa & PAGE_MASK;
	for ( ; size != 0; size -= len, pa += len, offset = 0) {
		len = min(PAGE_SIZE - offset, size);
		m = PHYS_TO_VM_PAGE(pa);
		KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
		    __func__, pa));
		pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
	}
	/*
	 * The i-cache is VIPT. The only way to flush all virtual mappings
	 * of a given physical address is to invalidate the whole i-cache.
	 */
	icache_inv_all();
}

void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size)
{

	/* Write back the d-cache on the given address range. */
	if (va >= VM_MIN_KERNEL_ADDRESS) {
		dcache_wb_pou(va, size);
	} else {
		uint32_t len, offset;
		vm_paddr_t pa;
		vm_page_t m;

		offset = va & PAGE_MASK;
		for ( ; size != 0; size -= len, va += len, offset = 0) {
			pa = pmap_extract(pmap, va); /* offset is preserved */
			len = min(PAGE_SIZE - offset, size);
			m = PHYS_TO_VM_PAGE(pa);
			KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
			    __func__, pa));
			pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
		}
	}
	/*
	 * The i-cache is VIPT. The only way to flush all virtual mappings
	 * of a given physical address is to invalidate the whole i-cache.
	 */
	icache_inv_all();
}

/*
 * The implementation of pmap_fault() uses the IN_RANGE2() macro, which
 * depends on the fact that the given range size is a power of 2.
 */
CTASSERT(powerof2(NB_IN_PT1));
CTASSERT(powerof2(PT2MAP_SIZE));

#define IN_RANGE2(addr, start, size)	\
    ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1)))

/*
 * Handle access and R/W emulation faults.
 */
int
pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode)
{
	pt1_entry_t *pte1p, pte1;
	pt2_entry_t *pte2p, pte2;

	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * In the kernel, we should never get an abort with a FAR that lies
	 * within the pmap->pm_pt1 or PT2MAP address spaces. If it happens,
	 * stop here, print a useful abort message, and get to the debugger
	 * if possible; otherwise it likely ends in a never-ending loop of
	 * aborts.
	 */
	if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) {
		/*
		 * All L1 tables should always be mapped and present.
		 * However, we check only the current one here. For user
		 * mode, only a permission abort from a malicious user is
		 * not fatal, and neither is an alignment abort, as it may
		 * have higher priority.
		 */
		if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) {
			CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x",
			    __func__, pmap, pmap->pm_pt1, far);
			panic("%s: pm_pt1 abort", __func__);
		}
		return (KERN_INVALID_ADDRESS);
	}
	if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) {
		/*
		 * PT2MAP should always be mapped and present in the current
		 * L1 table. However, only existing L2 tables are mapped
		 * in PT2MAP. For user mode, only an L2 translation abort or
		 * a permission abort from a malicious user is not fatal,
		 * and neither is an alignment abort, as it may have higher
		 * priority.
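		 *
		 * Both checks rely on IN_RANGE2(addr, start, size), which
		 * holds exactly when addr lies in [start, start + size),
		 * provided start is size-aligned and size is a power of 2;
		 * e.g., addr == start + size - 1 is in range, while
		 * addr == start + size is not.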
		 */
		if (!usermode || (idx != FAULT_ALIGN &&
		    idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) {
			CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x",
			    __func__, pmap, PT2MAP, far);
			panic("%s: PT2MAP abort", __func__);
		}
		return (KERN_INVALID_ADDRESS);
	}

	/*
	 * A pmap lock is used below for handling access and R/W emulation
	 * aborts. They were handled by atomic operations before, so some
	 * analysis of the new situation is needed to answer the following
	 * question: is it safe to use the lock even for these aborts?
	 *
	 * In general, two cases may happen:
	 *
	 * (1) Aborts while the pmap lock is already held - this should not
	 * happen, as the pmap lock is not recursive. Moreover, under the
	 * pmap lock only internal kernel data should be accessed, and such
	 * data should be mapped with the A bit set and the NM bit cleared.
	 * If a double abort happens, then the mapping of the data which
	 * caused it must be fixed. Further, all new mappings are always made
	 * with the A bit set, and the bit can be cleared only on managed
	 * mappings.
	 *
	 * (2) Aborts while another lock (or locks) is held - this can
	 * already happen. However, it makes no difference here whether it is
	 * an access or R/W emulation abort, or some other abort.
	 */

	PMAP_LOCK(pmap);
#ifdef SMP
	/*
	 * Special treatment is needed for the break-before-make approach
	 * used when a pte1 is updated for a userland mapping during section
	 * promotion or demotion. If not caught here, pmap_enter() can find a
	 * section mapping on the faulting address. That is not allowed.
	 */
	if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) {
		PMAP_UNLOCK(pmap);
		return (KERN_SUCCESS);
	}
#endif
	/*
	 * Access bits for page and section. Note that the entry
	 * is not in the TLB yet, so a TLB flush is not necessary.
	 *
	 * QQQ: This is hardware emulation, we do not call userret()
	 *      for aborts from user mode.
	 */
	if (idx == FAULT_ACCESS_L2) {
		pte2p = pt2map_entry(far);
		pte2 = pte2_load(pte2p);
		if (pte2_is_valid(pte2)) {
			pte2_store(pte2p, pte2 | PTE2_A);
			PMAP_UNLOCK(pmap);
			return (KERN_SUCCESS);
		}
	}
	if (idx == FAULT_ACCESS_L1) {
		pte1p = pmap_pte1(pmap, far);
		pte1 = pte1_load(pte1p);
		if (pte1_is_section(pte1)) {
			pte1_store(pte1p, pte1 | PTE1_A);
			PMAP_UNLOCK(pmap);
			return (KERN_SUCCESS);
		}
	}

	/*
	 * Handle modify bits for page and section. Note that the modify
	 * bit is emulated by software, so PTEx_RO is the software read-only
	 * bit and the PTEx_NM flag is the real hardware read-only bit.
	 *
	 * QQQ: This is hardware emulation, we do not call userret()
	 *      for aborts from user mode.
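	 *
	 * In practice, a writable but still clean mapping has PTEx_RO clear
	 * and PTEx_NM set; the first write faults here, the NM bit is
	 * cleared and the stale TLB entry is flushed, after which
	 * pte1_is_dirty()/pte2_is_dirty() report the mapping as modified.
	 * A genuinely read-only mapping keeps its RO bit, neither branch
	 * below matches, and KERN_FAILURE is returned at the end.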
6475 */ 6476 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { 6477 pte2p = pt2map_entry(far); 6478 pte2 = pte2_load(pte2p); 6479 if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && 6480 (pte2 & PTE2_NM)) { 6481 pte2_store(pte2p, pte2 & ~PTE2_NM); 6482 tlb_flush(trunc_page(far)); 6483 PMAP_UNLOCK(pmap); 6484 return (KERN_SUCCESS); 6485 } 6486 } 6487 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { 6488 pte1p = pmap_pte1(pmap, far); 6489 pte1 = pte1_load(pte1p); 6490 if (pte1_is_section(pte1) && !(pte1 & PTE1_RO) && 6491 (pte1 & PTE1_NM)) { 6492 pte1_store(pte1p, pte1 & ~PTE1_NM); 6493 tlb_flush(pte1_trunc(far)); 6494 PMAP_UNLOCK(pmap); 6495 return (KERN_SUCCESS); 6496 } 6497 } 6498 6499 /* 6500 * QQQ: The previous code, mainly fast handling of access and 6501 * modify bits aborts, could be moved to ASM. Now we are 6502 * starting to deal with not fast aborts. 6503 */ 6504 6505#ifdef INVARIANTS 6506 /* 6507 * Read an entry in PT2TAB associated with both pmap and far. 6508 * It's safe because PT2TAB is always mapped. 6509 */ 6510 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); 6511 if (pte2_is_valid(pte2)) { 6512 /* 6513 * Now, when we know that L2 page table is allocated, 6514 * we can use PT2MAP to get L2 page table entry. 6515 */ 6516 pte2 = pte2_load(pt2map_entry(far)); 6517 if (pte2_is_valid(pte2)) { 6518 /* 6519 * If L2 page table entry is valid, make sure that 6520 * L1 page table entry is valid too. Note that we 6521 * leave L2 page entries untouched when promoted. 6522 */ 6523 pte1 = pte1_load(pmap_pte1(pmap, far)); 6524 if (!pte1_is_valid(pte1)) { 6525 panic("%s: missing L1 page entry (%p, %#x)", 6526 __func__, pmap, far); 6527 } 6528 } 6529 } 6530#endif 6531 PMAP_UNLOCK(pmap); 6532 return (KERN_FAILURE); 6533} 6534 6535#if defined(PMAP_DEBUG) 6536/* 6537 * Reusing of KVA used in pmap_zero_page function !!! 6538 */ 6539static void 6540pmap_zero_page_check(vm_page_t m) 6541{ 6542 pt2_entry_t *cmap2_pte2p; 6543 uint32_t *p, *end; 6544 struct pcpu *pc; 6545 6546 sched_pin(); 6547 pc = get_pcpu(); 6548 cmap2_pte2p = pc->pc_cmap2_pte2p; 6549 mtx_lock(&pc->pc_cmap_lock); 6550 if (pte2_load(cmap2_pte2p) != 0) 6551 panic("%s: CMAP2 busy", __func__); 6552 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6553 vm_page_pte2_attr(m))); 6554 end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); 6555 for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) 6556 if (*p != 0) 6557 panic("%s: page %p not zero, va: %p", __func__, m, 6558 pc->pc_cmap2_addr); 6559 pte2_clear(cmap2_pte2p); 6560 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6561 sched_unpin(); 6562 mtx_unlock(&pc->pc_cmap_lock); 6563} 6564 6565int 6566pmap_pid_dump(int pid) 6567{ 6568 pmap_t pmap; 6569 struct proc *p; 6570 int npte2 = 0; 6571 int i, j, index; 6572 6573 sx_slock(&allproc_lock); 6574 FOREACH_PROC_IN_SYSTEM(p) { 6575 if (p->p_pid != pid || p->p_vmspace == NULL) 6576 continue; 6577 index = 0; 6578 pmap = vmspace_pmap(p->p_vmspace); 6579 for (i = 0; i < NPTE1_IN_PT1; i++) { 6580 pt1_entry_t pte1; 6581 pt2_entry_t *pte2p, pte2; 6582 vm_offset_t base, va; 6583 vm_paddr_t pa; 6584 vm_page_t m; 6585 6586 base = i << PTE1_SHIFT; 6587 pte1 = pte1_load(&pmap->pm_pt1[i]); 6588 6589 if (pte1_is_section(pte1)) { 6590 /* 6591 * QQQ: Do something here! 
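				 * One possibility, mirroring the small
				 * page case below (an untested sketch):
				 *
				 *	printf("va: 0x%x, 1MB section,"
				 *	    " pa: 0x%x\n", base,
				 *	    pte1_pa(pte1));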
6592 */ 6593 } else if (pte1_is_link(pte1)) { 6594 for (j = 0; j < NPTE2_IN_PT2; j++) { 6595 va = base + (j << PAGE_SHIFT); 6596 if (va >= VM_MIN_KERNEL_ADDRESS) { 6597 if (index) { 6598 index = 0; 6599 printf("\n"); 6600 } 6601 sx_sunlock(&allproc_lock); 6602 return (npte2); 6603 } 6604 pte2p = pmap_pte2(pmap, va); 6605 pte2 = pte2_load(pte2p); 6606 pmap_pte2_release(pte2p); 6607 if (!pte2_is_valid(pte2)) 6608 continue; 6609 6610 pa = pte2_pa(pte2); 6611 m = PHYS_TO_VM_PAGE(pa); 6612 printf("va: 0x%x, pa: 0x%x, h: %d, w:" 6613 " %d, f: 0x%x", va, pa, 6614 m->hold_count, m->wire_count, 6615 m->flags); 6616 npte2++; 6617 index++; 6618 if (index >= 2) { 6619 index = 0; 6620 printf("\n"); 6621 } else { 6622 printf(" "); 6623 } 6624 } 6625 } 6626 } 6627 } 6628 sx_sunlock(&allproc_lock); 6629 return (npte2); 6630} 6631 6632#endif 6633 6634#ifdef DDB 6635static pt2_entry_t * 6636pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) 6637{ 6638 pt1_entry_t pte1; 6639 vm_paddr_t pt2pg_pa; 6640 6641 pte1 = pte1_load(pmap_pte1(pmap, va)); 6642 if (!pte1_is_link(pte1)) 6643 return (NULL); 6644 6645 if (pmap_is_current(pmap)) 6646 return (pt2map_entry(va)); 6647 6648 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 6649 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 6650 if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { 6651 pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); 6652#ifdef SMP 6653 PMAP3cpu = PCPU_GET(cpuid); 6654#endif 6655 tlb_flush_local((vm_offset_t)PADDR3); 6656 } 6657#ifdef SMP 6658 else if (PMAP3cpu != PCPU_GET(cpuid)) { 6659 PMAP3cpu = PCPU_GET(cpuid); 6660 tlb_flush_local((vm_offset_t)PADDR3); 6661 } 6662#endif 6663 return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 6664} 6665 6666static void 6667dump_pmap(pmap_t pmap) 6668{ 6669 6670 printf("pmap %p\n", pmap); 6671 printf(" pm_pt1: %p\n", pmap->pm_pt1); 6672 printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); 6673 printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); 6674} 6675 6676DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) 6677{ 6678 6679 pmap_t pmap; 6680 LIST_FOREACH(pmap, &allpmaps, pm_list) { 6681 dump_pmap(pmap); 6682 } 6683} 6684 6685static int 6686pte2_class(pt2_entry_t pte2) 6687{ 6688 int cls; 6689 6690 cls = (pte2 >> 2) & 0x03; 6691 cls |= (pte2 >> 4) & 0x04; 6692 return (cls); 6693} 6694 6695static void 6696dump_section(pmap_t pmap, uint32_t pte1_idx) 6697{ 6698} 6699 6700static void 6701dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) 6702{ 6703 uint32_t i; 6704 vm_offset_t va; 6705 pt2_entry_t *pte2p, pte2; 6706 vm_page_t m; 6707 6708 va = pte1_idx << PTE1_SHIFT; 6709 pte2p = pmap_pte2_ddb(pmap, va); 6710 for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { 6711 pte2 = pte2_load(pte2p); 6712 if (pte2 == 0) 6713 continue; 6714 if (!pte2_is_valid(pte2)) { 6715 printf(" 0x%08X: 0x%08X", va, pte2); 6716 if (!invalid_ok) 6717 printf(" - not valid !!!"); 6718 printf("\n"); 6719 continue; 6720 } 6721 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 6722 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, 6723 pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); 6724 if (m != NULL) { 6725 printf(" v:%d h:%d w:%d f:0x%04X\n", m->valid, 6726 m->hold_count, m->wire_count, m->flags); 6727 } else { 6728 printf("\n"); 6729 } 6730 } 6731} 6732 6733static __inline boolean_t 6734is_pv_chunk_space(vm_offset_t va) 6735{ 6736 6737 if ((((vm_offset_t)pv_chunkbase) <= va) && 6738 (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) 6739 return (TRUE); 6740 return (FALSE); 6741} 6742 6743DB_SHOW_COMMAND(pmap, 
pmap_pmap_print) 6744{ 6745 /* XXX convert args. */ 6746 pmap_t pmap = (pmap_t)addr; 6747 pt1_entry_t pte1; 6748 pt2_entry_t pte2; 6749 vm_offset_t va, eva; 6750 vm_page_t m; 6751 uint32_t i; 6752 boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; 6753 6754 if (have_addr) { 6755 pmap_t pm; 6756 6757 LIST_FOREACH(pm, &allpmaps, pm_list) 6758 if (pm == pmap) break; 6759 if (pm == NULL) { 6760 printf("given pmap %p is not in allpmaps list\n", pmap); 6761 return; 6762 } 6763 } else 6764 pmap = PCPU_GET(curpmap); 6765 6766 eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; 6767 dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ 6768 6769 printf("pmap: 0x%08X\n", (uint32_t)pmap); 6770 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6771 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6772 6773 for(i = 0; i < NPTE1_IN_PT1; i++) { 6774 pte1 = pte1_load(&pmap->pm_pt1[i]); 6775 if (pte1 == 0) 6776 continue; 6777 va = i << PTE1_SHIFT; 6778 if (va >= eva) 6779 break; 6780 6781 if (pte1_is_section(pte1)) { 6782 printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, 6783 !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); 6784 dump_section(pmap, i); 6785 } else if (pte1_is_link(pte1)) { 6786 dump_link_ok = TRUE; 6787 invalid_ok = FALSE; 6788 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6789 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 6790 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", 6791 va, pte1, pte2, m); 6792 if (is_pv_chunk_space(va)) { 6793 printf(" - pv_chunk space"); 6794 if (dump_pv_chunk) 6795 invalid_ok = TRUE; 6796 else 6797 dump_link_ok = FALSE; 6798 } 6799 else if (m != NULL) 6800 printf(" w:%d w2:%u", m->wire_count, 6801 pt2_wirecount_get(m, pte1_index(va))); 6802 if (pte2 == 0) 6803 printf(" !!! pt2tab entry is ZERO"); 6804 else if (pte2_pa(pte1) != pte2_pa(pte2)) 6805 printf(" !!! pt2tab entry is DIFFERENT - m: %p", 6806 PHYS_TO_VM_PAGE(pte2_pa(pte2))); 6807 printf("\n"); 6808 if (dump_link_ok) 6809 dump_link(pmap, i, invalid_ok); 6810 } else 6811 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6812 } 6813} 6814 6815static void 6816dump_pt2tab(pmap_t pmap) 6817{ 6818 uint32_t i; 6819 pt2_entry_t pte2; 6820 vm_offset_t va; 6821 vm_paddr_t pa; 6822 vm_page_t m; 6823 6824 printf("PT2TAB:\n"); 6825 for (i = 0; i < PT2TAB_ENTRIES; i++) { 6826 pte2 = pte2_load(&pmap->pm_pt2tab[i]); 6827 if (!pte2_is_valid(pte2)) 6828 continue; 6829 va = i << PT2TAB_SHIFT; 6830 pa = pte2_pa(pte2); 6831 m = PHYS_TO_VM_PAGE(pa); 6832 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, 6833 pte2_class(pte2), !!(pte2 & PTE2_S), m); 6834 if (m != NULL) 6835 printf(" , h: %d, w: %d, f: 0x%04X pidx: %lld", 6836 m->hold_count, m->wire_count, m->flags, m->pindex); 6837 printf("\n"); 6838 } 6839} 6840 6841DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) 6842{ 6843 /* XXX convert args. 
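	 * From the ddb(4) prompt this command is invoked as
	 * "show pmap_pt2tab"; it always dumps the current pmap, as an
	 * explicit address argument is not supported below.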
*/ 6844 pmap_t pmap = (pmap_t)addr; 6845 pt1_entry_t pte1; 6846 pt2_entry_t pte2; 6847 vm_offset_t va; 6848 uint32_t i, start; 6849 6850 if (have_addr) { 6851 printf("supported only on current pmap\n"); 6852 return; 6853 } 6854 6855 pmap = PCPU_GET(curpmap); 6856 printf("curpmap: 0x%08X\n", (uint32_t)pmap); 6857 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6858 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6859 6860 start = pte1_index((vm_offset_t)PT2MAP); 6861 for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { 6862 pte1 = pte1_load(&pmap->pm_pt1[i]); 6863 if (pte1 == 0) 6864 continue; 6865 va = i << PTE1_SHIFT; 6866 if (pte1_is_section(pte1)) { 6867 printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, 6868 !!(pte1 & PTE1_S)); 6869 dump_section(pmap, i); 6870 } else if (pte1_is_link(pte1)) { 6871 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6872 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, 6873 pte1, pte2); 6874 if (pte2 == 0) 6875 printf(" !!! pt2tab entry is ZERO\n"); 6876 } else 6877 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6878 } 6879 dump_pt2tab(pmap); 6880} 6881#endif 6882