pmap.c revision 324400
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: stable/11/sys/i386/i386/pmap.c 324400 2017-10-07 21:13:54Z alc $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	Since the information managed by this module is
84 *	also stored by the logical address mapping module,
85 *	this module may throw away valid virtual-to-physical
86 *	mappings at almost any time.  However, invalidations
87 *	of virtual-to-physical mappings must be done as
88 *	requested.
89 *
90 *	In order to cope with hardware architectures which
91 *	make virtual-to-physical map invalidates expensive,
92 *	this module may delay invalidate or reduced protection
93 *	operations until such time as they are actually
94 *	necessary.  This module is given full information as
95 *	to which processors are currently using which maps,
96 *	and to when physical maps must be made correct.
97 */
98
99#include "opt_apic.h"
100#include "opt_cpu.h"
101#include "opt_pmap.h"
102#include "opt_smp.h"
103#include "opt_xbox.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/kernel.h>
108#include <sys/ktr.h>
109#include <sys/lock.h>
110#include <sys/malloc.h>
111#include <sys/mman.h>
112#include <sys/msgbuf.h>
113#include <sys/mutex.h>
114#include <sys/proc.h>
115#include <sys/rwlock.h>
116#include <sys/sf_buf.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/smp.h>
122
123#include <vm/vm.h>
124#include <vm/vm_param.h>
125#include <vm/vm_kern.h>
126#include <vm/vm_page.h>
127#include <vm/vm_map.h>
128#include <vm/vm_object.h>
129#include <vm/vm_extern.h>
130#include <vm/vm_pageout.h>
131#include <vm/vm_pager.h>
132#include <vm/vm_phys.h>
133#include <vm/vm_radix.h>
134#include <vm/vm_reserv.h>
135#include <vm/uma.h>
136
137#ifdef DEV_APIC
138#include <sys/bus.h>
139#include <machine/intr_machdep.h>
140#include <x86/apicvar.h>
141#endif
142#include <machine/cpu.h>
143#include <machine/cputypes.h>
144#include <machine/md_var.h>
145#include <machine/pcb.h>
146#include <machine/specialreg.h>
147#ifdef SMP
148#include <machine/smp.h>
149#endif
150
151#ifdef XBOX
152#include <machine/xbox.h>
153#endif
154
155#ifndef PMAP_SHPGPERPROC
156#define PMAP_SHPGPERPROC 200
157#endif
158
159#if !defined(DIAGNOSTIC)
160#ifdef __GNUC_GNU_INLINE__
161#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
162#else
163#define PMAP_INLINE	extern inline
164#endif
165#else
166#define PMAP_INLINE
167#endif
168
169#ifdef PV_STATS
170#define PV_STAT(x)	do { x ; } while (0)
171#else
172#define PV_STAT(x)	do { } while (0)
173#endif
174
175#define	pa_index(pa)	((pa) >> PDRSHIFT)
176#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
177
178/*
179 * Get PDEs and PTEs for user/kernel address space
180 */
181#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
182#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
183
184#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
185#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
186#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
187#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
188#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
189
190#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
191    atomic_clear_int((u_int *)(pte), PG_W))
192#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
193
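/*
 * Illustrative note (not part of the original source): assuming a non-PAE
 * configuration where PDRSHIFT is 22, a kernel virtual address such as
 * 0xc0123456 resolves through the macros above as
 *
 *	pmap_pde(kernel_pmap, 0xc0123456)
 *	    == &kernel_pmap->pm_pdir[0xc0123456 >> 22]
 *	    == &kernel_pmap->pm_pdir[768]
 *
 * i.e. the top VA bits select a page directory entry.  pa_index() and
 * pa_to_pvh() apply the same shift to a physical address to locate the
 * pv header for the 2/4MB superpage containing it.
 */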
194struct pmap kernel_pmap_store;
195LIST_HEAD(pmaplist, pmap);
196static struct pmaplist allpmaps;
197static struct mtx allpmaps_lock;
198
199vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
200vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
201int pgeflag = 0;		/* PG_G or-in */
202int pseflag = 0;		/* PG_PS or-in */
203
204static int nkpt = NKPT;
205vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
206extern u_int32_t KERNend;
207extern u_int32_t KPTphys;
208
209#if defined(PAE) || defined(PAE_TABLES)
210pt_entry_t pg_nx;
211static uma_zone_t pdptzone;
212#endif
213
214static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
215
216static int pat_works = 1;
217SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
218    "Is page attribute table fully functional?");
219
220static int pg_ps_enabled = 1;
221SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
222    &pg_ps_enabled, 0, "Are large page mappings enabled?");
223
224#define	PAT_INDEX_SIZE	8
225static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
226
227/*
228 * pmap_mapdev() support prior to pmap initialization (e.g., the console)
229 */
230#define	PMAP_PREINIT_MAPPING_COUNT	8
231static struct pmap_preinit_mapping {
232	vm_paddr_t	pa;
233	vm_offset_t	va;
234	vm_size_t	sz;
235	int		mode;
236} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
237static int pmap_initialized;
238
239static struct rwlock_padalign pvh_global_lock;
240
241/*
242 * Data for the pv entry allocation mechanism
243 */
244static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
245static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
246static struct md_page *pv_table;
247static int shpgperproc = PMAP_SHPGPERPROC;
248
249struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
250int pv_maxchunks;			/* How many chunks we have KVA for */
251vm_offset_t pv_vafree;			/* freelist stored in the PTE */
252
253/*
254 * All those kernel PT submaps that BSD is so fond of
255 */
256pt_entry_t *CMAP3;
257static pd_entry_t *KPTD;
258caddr_t ptvmmap = 0;
259caddr_t CADDR3;
260struct msgbuf *msgbufp = NULL;
261
262/*
263 * Crashdump maps.
264 */
265static caddr_t crashdumpmap;
266
267static pt_entry_t *PMAP1 = NULL, *PMAP2;
268static pt_entry_t *PADDR1 = NULL, *PADDR2;
269#ifdef SMP
270static int PMAP1cpu;
271static int PMAP1changedcpu;
272SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
273	   &PMAP1changedcpu, 0,
274	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
275#endif
276static int PMAP1changed;
277SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
278	   &PMAP1changed, 0,
279	   "Number of times pmap_pte_quick changed PMAP1");
280static int PMAP1unchanged;
281SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
282	   &PMAP1unchanged, 0,
283	   "Number of times pmap_pte_quick didn't change PMAP1");
284static struct mtx PMAP2mutex;
285
286static void	free_pv_chunk(struct pv_chunk *pc);
287static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
288static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
289static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
290static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
291static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
292static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
293static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
294		    vm_offset_t va);
295static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
296
297static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
298static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
299    vm_prot_t prot);
300static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
301    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
302static void pmap_flush_page(vm_page_t m);
303static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
304static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
305		    pd_entry_t pde);
306static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
307static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
308static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
309static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
310static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
311static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
312static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
313static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
314    vm_prot_t prot);
315static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
316static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
317    struct spglist *free);
318static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
319    struct spglist *free);
320static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
321static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
322    struct spglist *free);
323static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
324					vm_offset_t va);
325static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
326static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
327    vm_page_t m);
328static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
329    pd_entry_t newpde);
330static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
331
332static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
333
334static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
335static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
336static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
337static void pmap_pte_release(pt_entry_t *pte);
338static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
339#if defined(PAE) || defined(PAE_TABLES)
340static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
341    int wait);
342#endif
343static void pmap_set_pg(void);
344
345static __inline void pagezero(void *page);
346
347CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
348CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
349
350/*
351 * If you get an error here, then you set KVA_PAGES wrong! See the
352 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be a
353 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
354 */
355CTASSERT(KERNBASE % (1 << 24) == 0);
356
357/*
358 *	Bootstrap the system enough to run with virtual memory.
359 *
360 *	On the i386 this is called after mapping has already been enabled
361 *	and just syncs the pmap module with what has already been done.
362 *	[We can't call it easily with mapping off since the kernel is not
363 *	mapped with PA == VA, hence we would have to relocate every address
364 *	from the linked base (virtual) address "KERNBASE" to the actual
365 *	(physical) address starting relative to 0]
366 */
367void
368pmap_bootstrap(vm_paddr_t firstaddr)
369{
370	vm_offset_t va;
371	pt_entry_t *pte, *unused;
372	struct pcpu *pc;
373	int i;
374
375	/*
376	 * Add a physical memory segment (vm_phys_seg) corresponding to the
377	 * preallocated kernel page table pages so that vm_page structures
378	 * representing these pages will be created.  The vm_page structures
379	 * are required for promotion of the corresponding kernel virtual
380	 * addresses to superpage mappings.
381	 */
382	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
383
384	/*
385	 * Initialize the first available kernel virtual address.  However,
386	 * using "firstaddr" may waste a few pages of the kernel virtual
387	 * address space, because locore may not have mapped every physical
388	 * page that it allocated.  Preferably, locore would provide a first
389	 * unused virtual address in addition to "firstaddr".
390	 */
391	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
392
393	virtual_end = VM_MAX_KERNEL_ADDRESS;
394
395	/*
396	 * Initialize the kernel pmap (which is statically allocated).
397	 */
398	PMAP_LOCK_INIT(kernel_pmap);
399	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
400#if defined(PAE) || defined(PAE_TABLES)
401	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
402#endif
403	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
404	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
405
406 	/*
407	 * Initialize the global pv list lock.
408	 */
409	rw_init(&pvh_global_lock, "pmap pv global");
410
411	LIST_INIT(&allpmaps);
412
413	/*
414	 * Request a spin mutex so that changes to allpmaps cannot be
415	 * preempted by smp_rendezvous_cpus().  Otherwise,
416	 * pmap_update_pde_kernel() could access allpmaps while it is
417	 * being changed.
418	 */
419	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
420	mtx_lock_spin(&allpmaps_lock);
421	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
422	mtx_unlock_spin(&allpmaps_lock);
423
424	/*
425	 * Reserve some special page table entries/VA space for temporary
426	 * mapping of pages.
427	 */
428#define	SYSMAP(c, p, v, n)	\
429	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
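	/*
	 * Illustrative expansion (not part of the original source): the
	 * invocation SYSMAP(caddr_t, CMAP3, CADDR3, 1) below becomes
	 *
	 *	CADDR3 = (caddr_t)va; va += (1 * PAGE_SIZE);
	 *	CMAP3 = pte; pte += 1;
	 *
	 * i.e. it hands out one page of KVA starting at "va" and records
	 * the PTE that maps it, so the page can later be pointed at
	 * arbitrary physical memory with a single pte_store().
	 */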
430
431	va = virtual_avail;
432	pte = vtopte(va);
433
434
435	/*
436	 * Initialize temporary map objects on the current CPU for use
437	 * during early boot.
438	 * CMAP1/CMAP2 are used for zeroing and copying pages.
439	 * CMAP3 is used for the idle process page zeroing.
440	 */
441	pc = get_pcpu();
442	mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
443	SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
444	SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
445	SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)
446
447	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
448
449	/*
450	 * Crashdump maps.
451	 */
452	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
453
454	/*
455	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
456	 */
457	SYSMAP(caddr_t, unused, ptvmmap, 1)
458
459	/*
460	 * msgbufp is used to map the system message buffer.
461	 */
462	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
463
464	/*
465	 * KPTmap is used by pmap_kextract().
466	 *
467	 * KPTmap is first initialized by locore.  However, that initial
468	 * KPTmap can only support NKPT page table pages.  Here, a larger
469	 * KPTmap is created that can support KVA_PAGES page table pages.
470	 */
471	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
472
473	for (i = 0; i < NKPT; i++)
474		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
475
476	/*
477	 * Adjust the start of the KPTD and KPTmap so that the implementation
478	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
479	 */
480	KPTD -= KPTDI;
481	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
482
483	/*
484	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
485	 * respectively.
486	 */
487	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
488	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
489
490	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
491
492	virtual_avail = va;
493
494	/*
495	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
496	 * physical memory region that is used by the ACPI wakeup code.  This
497	 * mapping must not have PG_G set.
498	 */
499#ifdef XBOX
500	/* FIXME: This is gross, but needed for the XBOX. Since we are at such
501	 * an early stage, we cannot yet neatly map video memory ... :-(
502	 * Better fixes are very welcome! */
503	if (!arch_i386_is_xbox)
504#endif
505	for (i = 1; i < NKPT; i++)
506		PTD[i] = 0;
507
508	/*
509	 * Initialize the PAT MSR if present.
510	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
511	 * side-effect, invalidates stale PG_G TLB entries that might
512	 * have been created in our pre-boot environment.  We assume
513	 * that PAT support implies PGE and in reverse, PGE presence
514	 * comes with PAT.  Both features were added for Pentium Pro.
515	 */
516	pmap_init_pat();
517
518	/* Turn on PG_G on kernel page(s) */
519	pmap_set_pg();
520}
521
522static void
523pmap_init_reserved_pages(void)
524{
525	struct pcpu *pc;
526	vm_offset_t pages;
527	int i;
528
529	CPU_FOREACH(i) {
530		pc = pcpu_find(i);
531		/*
532		 * Skip if the mapping has already been initialized,
533		 * i.e. this is the BSP.
534		 */
535		if (pc->pc_cmap_addr1 != 0)
536			continue;
537		mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
538		pages = kva_alloc(PAGE_SIZE * 3);
539		if (pages == 0)
540			panic("%s: unable to allocate KVA", __func__);
541		pc->pc_cmap_pte1 = vtopte(pages);
542		pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
543		pc->pc_cmap_addr1 = (caddr_t)pages;
544		pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
545		pc->pc_qmap_addr = pages + (PAGE_SIZE * 2);
546	}
547}
548
549SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);
550
551/*
552 * Setup the PAT MSR.
553 */
554void
555pmap_init_pat(void)
556{
557	int pat_table[PAT_INDEX_SIZE];
558	uint64_t pat_msr;
559	u_long cr0, cr4;
560	int i;
561
562	/* Set default PAT index table. */
563	for (i = 0; i < PAT_INDEX_SIZE; i++)
564		pat_table[i] = -1;
565	pat_table[PAT_WRITE_BACK] = 0;
566	pat_table[PAT_WRITE_THROUGH] = 1;
567	pat_table[PAT_UNCACHEABLE] = 3;
568	pat_table[PAT_WRITE_COMBINING] = 3;
569	pat_table[PAT_WRITE_PROTECTED] = 3;
570	pat_table[PAT_UNCACHED] = 3;
571
572	/*
573	 * Bail if this CPU doesn't implement PAT.
574	 * We assume that PAT support implies PGE.
575	 */
576	if ((cpu_feature & CPUID_PAT) == 0) {
577		for (i = 0; i < PAT_INDEX_SIZE; i++)
578			pat_index[i] = pat_table[i];
579		pat_works = 0;
580		return;
581	}
582
583	/*
584	 * Due to some Intel errata, we can only safely use the lower 4
585	 * PAT entries.
586	 *
587	 *   Intel Pentium III Processor Specification Update
588	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
589	 * or Mode C Paging)
590	 *
591	 *   Intel Pentium IV  Processor Specification Update
592	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
593	 */
594	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
595	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
596		pat_works = 0;
597
598	/* Initialize default PAT entries. */
599	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
600	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
601	    PAT_VALUE(2, PAT_UNCACHED) |
602	    PAT_VALUE(3, PAT_UNCACHEABLE) |
603	    PAT_VALUE(4, PAT_WRITE_BACK) |
604	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
605	    PAT_VALUE(6, PAT_UNCACHED) |
606	    PAT_VALUE(7, PAT_UNCACHEABLE);
607
608	if (pat_works) {
609		/*
610		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
611		 * Program 5 and 6 as WP and WC.
612		 * Leave 4 and 7 as WB and UC.
613		 */
614		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
615		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
616		    PAT_VALUE(6, PAT_WRITE_COMBINING);
617		pat_table[PAT_UNCACHED] = 2;
618		pat_table[PAT_WRITE_PROTECTED] = 5;
619		pat_table[PAT_WRITE_COMBINING] = 6;
620	} else {
621		/*
622		 * Just replace PAT Index 2 with WC instead of UC-.
623		 */
624		pat_msr &= ~PAT_MASK(2);
625		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
626		pat_table[PAT_WRITE_COMBINING] = 2;
627	}
628
629	/* Disable PGE. */
630	cr4 = rcr4();
631	load_cr4(cr4 & ~CR4_PGE);
632
633	/* Disable caches (CD = 1, NW = 0). */
634	cr0 = rcr0();
635	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
636
637	/* Flushes caches and TLBs. */
638	wbinvd();
639	invltlb();
640
641	/* Update PAT and index table. */
642	wrmsr(MSR_PAT, pat_msr);
643	for (i = 0; i < PAT_INDEX_SIZE; i++)
644		pat_index[i] = pat_table[i];
645
646	/* Flush caches and TLBs again. */
647	wbinvd();
648	invltlb();
649
650	/* Restore caches and PGE. */
651	load_cr0(cr0);
652	load_cr4(cr4);
653}
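/*
 * Informational summary derived from the code above (not in the original
 * source): on CPUs where pat_works remains set, the PAT MSR ends up
 * programmed as
 *
 *	index:	0	1	2	3	4	5	6	7
 *	type:	WB	WT	UC-	UC	WB	WP	WC	UC
 *
 * whereas on older CPUs only index 2 is repurposed, from UC- to WC.  The
 * pat_index[] table records which of these indices backs each caching mode.
 */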
654
655/*
656 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
657 */
658static void
659pmap_set_pg(void)
660{
661	pt_entry_t *pte;
662	vm_offset_t va, endva;
663
664	if (pgeflag == 0)
665		return;
666
667	endva = KERNBASE + KERNend;
668
669	if (pseflag) {
670		va = KERNBASE + KERNLOAD;
671		while (va  < endva) {
672			pdir_pde(PTD, va) |= pgeflag;
673			invltlb();	/* Flush non-PG_G entries. */
674			va += NBPDR;
675		}
676	} else {
677		va = (vm_offset_t)btext;
678		while (va < endva) {
679			pte = vtopte(va);
680			if (*pte)
681				*pte |= pgeflag;
682			invltlb();	/* Flush non-PG_G entries. */
683			va += PAGE_SIZE;
684		}
685	}
686}
687
688/*
689 * Initialize a vm_page's machine-dependent fields.
690 */
691void
692pmap_page_init(vm_page_t m)
693{
694
695	TAILQ_INIT(&m->md.pv_list);
696	m->md.pat_mode = PAT_WRITE_BACK;
697}
698
699#if defined(PAE) || defined(PAE_TABLES)
700static void *
701pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
702{
703
704	/* Inform UMA that this allocator uses kernel_map/object. */
705	*flags = UMA_SLAB_KERNEL;
706	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
707	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
708}
709#endif
710
711/*
712 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
713 * Requirements:
714 *  - Must deal with pages in order to ensure that none of the PG_* bits
715 *    are ever set, PG_V in particular.
716 *  - Assumes we can write to ptes without pte_store() atomic ops, even
717 *    on PAE systems.  This should be ok.
718 *  - Assumes nothing will ever test these addresses for 0 to indicate
719 *    no mapping instead of correctly checking PG_V.
720 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
721 * Because PG_V is never set, there can be no mappings to invalidate.
722 */
723static vm_offset_t
724pmap_ptelist_alloc(vm_offset_t *head)
725{
726	pt_entry_t *pte;
727	vm_offset_t va;
728
729	va = *head;
730	if (va == 0)
731		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
732	pte = vtopte(va);
733	*head = *pte;
734	if (*head & PG_V)
735		panic("pmap_ptelist_alloc: va with PG_V set!");
736	*pte = 0;
737	return (va);
738}
739
740static void
741pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
742{
743	pt_entry_t *pte;
744
745	if (va & PG_V)
746		panic("pmap_ptelist_free: freeing va with PG_V set!");
747	pte = vtopte(va);
748	*pte = *head;		/* virtual! PG_V is 0 though */
749	*head = va;
750}
751
752static void
753pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
754{
755	int i;
756	vm_offset_t va;
757
758	*head = 0;
759	for (i = npages - 1; i >= 0; i--) {
760		va = (vm_offset_t)base + i * PAGE_SIZE;
761		pmap_ptelist_free(head, va);
762	}
763}
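/*
 * Informal sketch of the mechanism above (not in the original source): each
 * free KVA page's PTE holds the virtual address of the next free page,
 * forming a singly linked list rooted at *head:
 *
 *	*head -> va1;  *vtopte(va1) == va2;  ...  *vtopte(vaN) == 0
 *
 * Because the stored values are page-aligned virtual addresses, PG_V (bit 0)
 * is never set, so the hardware never treats these entries as valid
 * mappings.  pmap_ptelist_alloc() pops the head; pmap_ptelist_free() pushes
 * a va back onto the front of the list.
 */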
764
765
766/*
767 *	Initialize the pmap module.
768 *	Called by vm_init, to initialize any structures that the pmap
769 *	system needs to map virtual memory.
770 */
771void
772pmap_init(void)
773{
774	struct pmap_preinit_mapping *ppim;
775	vm_page_t mpte;
776	vm_size_t s;
777	int i, pv_npg;
778
779	/*
780	 * Initialize the vm page array entries for the kernel pmap's
781	 * page table pages.
782	 */
783	for (i = 0; i < NKPT; i++) {
784		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
785		KASSERT(mpte >= vm_page_array &&
786		    mpte < &vm_page_array[vm_page_array_size],
787		    ("pmap_init: page table page is out of range"));
788		mpte->pindex = i + KPTDI;
789		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
790	}
791
792	/*
793	 * Initialize the address space (zone) for the pv entries.  Set a
794	 * high water mark so that the system can recover from excessive
795	 * numbers of pv entries.
796	 */
797	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
798	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
799	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
800	pv_entry_max = roundup(pv_entry_max, _NPCPV);
801	pv_entry_high_water = 9 * (pv_entry_max / 10);
802
803	/*
804	 * If the kernel is running on a virtual machine, then it must assume
805	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
806	 * be prepared for the hypervisor changing the vendor and family that
807	 * are reported by CPUID.  Consequently, the workaround for AMD Family
808	 * 10h Erratum 383 is enabled if the processor's feature set does not
809	 * include at least one feature that is only supported by older Intel
810	 * or newer AMD processors.
811	 */
812	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
813	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
814	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
815	    AMDID2_FMA4)) == 0)
816		workaround_erratum383 = 1;
817
818	/*
819	 * Are large page mappings supported and enabled?
820	 */
821	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
822	if (pseflag == 0)
823		pg_ps_enabled = 0;
824	else if (pg_ps_enabled) {
825		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
826		    ("pmap_init: can't assign to pagesizes[1]"));
827		pagesizes[1] = NBPDR;
828	}
829
830	/*
831	 * Calculate the size of the pv head table for superpages.
832	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
833	 */
834	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
835	    PAGE_SIZE) / NBPDR + 1;
836
837	/*
838	 * Allocate memory for the pv head table for superpages.
839	 */
840	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
841	s = round_page(s);
842	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
843	    M_WAITOK | M_ZERO);
844	for (i = 0; i < pv_npg; i++)
845		TAILQ_INIT(&pv_table[i].pv_list);
846
847	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
848	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
849	if (pv_chunkbase == NULL)
850		panic("pmap_init: not enough kvm for pv chunks");
851	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
852#if defined(PAE) || defined(PAE_TABLES)
853	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
854	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
855	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
856	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
857#endif
858
859	pmap_initialized = 1;
860	if (!bootverbose)
861		return;
862	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
863		ppim = pmap_preinit_mapping + i;
864		if (ppim->va == 0)
865			continue;
866		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
867		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
868	}
869}
870
871
872SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
873	"Max number of PV entries");
874SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
875	"Page share factor per proc");
876
877static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
878    "2/4MB page mapping counters");
879
880static u_long pmap_pde_demotions;
881SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
882    &pmap_pde_demotions, 0, "2/4MB page demotions");
883
884static u_long pmap_pde_mappings;
885SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
886    &pmap_pde_mappings, 0, "2/4MB page mappings");
887
888static u_long pmap_pde_p_failures;
889SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
890    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
891
892static u_long pmap_pde_promotions;
893SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
894    &pmap_pde_promotions, 0, "2/4MB page promotions");
895
896/***************************************************
897 * Low level helper routines.....
898 ***************************************************/
899
900/*
901 * Determine the appropriate bits to set in a PTE or PDE for a specified
902 * caching mode.
903 */
904int
905pmap_cache_bits(int mode, boolean_t is_pde)
906{
907	int cache_bits, pat_flag, pat_idx;
908
909	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
910		panic("Unknown caching mode %d\n", mode);
911
912	/* The PAT bit is different for PTE's and PDE's. */
913	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
914
915	/* Map the caching mode to a PAT index. */
916	pat_idx = pat_index[mode];
917
918	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
919	cache_bits = 0;
920	if (pat_idx & 0x4)
921		cache_bits |= pat_flag;
922	if (pat_idx & 0x2)
923		cache_bits |= PG_NC_PCD;
924	if (pat_idx & 0x1)
925		cache_bits |= PG_NC_PWT;
926	return (cache_bits);
927}
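/*
 * Worked example (informational, not in the original source): with
 * pat_works set, a write-combining mapping uses PAT index 6 (binary 110),
 * so pmap_cache_bits(PAT_WRITE_COMBINING, ...) returns the PAT bit
 * (PG_PDE_PAT or PG_PTE_PAT) together with PG_NC_PCD and leaves PG_NC_PWT
 * clear.  The hardware recombines PAT/PCD/PWT to select entry 6 of the PAT
 * MSR programmed in pmap_init_pat().
 */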
928
929/*
930 * The caller is responsible for maintaining TLB consistency.
931 */
932static void
933pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
934{
935	pd_entry_t *pde;
936	pmap_t pmap;
937	boolean_t PTD_updated;
938
939	PTD_updated = FALSE;
940	mtx_lock_spin(&allpmaps_lock);
941	LIST_FOREACH(pmap, &allpmaps, pm_list) {
942		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
943		    PG_FRAME))
944			PTD_updated = TRUE;
945		pde = pmap_pde(pmap, va);
946		pde_store(pde, newpde);
947	}
948	mtx_unlock_spin(&allpmaps_lock);
949	KASSERT(PTD_updated,
950	    ("pmap_kenter_pde: current page table is not in allpmaps"));
951}
952
953/*
954 * After changing the page size for the specified virtual address in the page
955 * table, flush the corresponding entries from the processor's TLB.  Only the
956 * calling processor's TLB is affected.
957 *
958 * The calling thread must be pinned to a processor.
959 */
960static void
961pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
962{
963	u_long cr4;
964
965	if ((newpde & PG_PS) == 0)
966		/* Demotion: flush a specific 2MB page mapping. */
967		invlpg(va);
968	else if ((newpde & PG_G) == 0)
969		/*
970		 * Promotion: flush every 4KB page mapping from the TLB
971		 * because there are too many to flush individually.
972		 */
973		invltlb();
974	else {
975		/*
976		 * Promotion: flush every 4KB page mapping from the TLB,
977		 * including any global (PG_G) mappings.
978		 */
979		cr4 = rcr4();
980		load_cr4(cr4 & ~CR4_PGE);
981		/*
982		 * Although preemption at this point could be detrimental to
983		 * performance, it would not lead to an error.  PG_G is simply
984		 * ignored if CR4.PGE is clear.  Moreover, in case this block
985		 * is re-entered, the load_cr4() either above or below will
986		 * modify CR4.PGE flushing the TLB.
987		 */
988		load_cr4(cr4 | CR4_PGE);
989	}
990}
991
992void
993invltlb_glob(void)
994{
995	uint64_t cr4;
996
997	if (pgeflag == 0) {
998		invltlb();
999	} else {
1000		cr4 = rcr4();
1001		load_cr4(cr4 & ~CR4_PGE);
1002		load_cr4(cr4 | CR4_PGE);
1003	}
1004}
1005
1006
1007#ifdef SMP
1008/*
1009 * For SMP, these functions have to use the IPI mechanism for coherence.
1010 *
1011 * N.B.: Before calling any of the following TLB invalidation functions,
1012 * the calling processor must ensure that all stores updating a non-
1013 * kernel page table are globally performed.  Otherwise, another
1014 * processor could cache an old, pre-update entry without being
1015 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1016 * active on another processor after its pm_active field is checked by
1017 * one of the following functions but before a store updating the page
1018 * table is globally performed. (2) The pmap becomes active on another
1019 * processor before its pm_active field is checked but due to
1020 * speculative loads one of the following functions still reads the
1021 * pmap as inactive on the other processor.
1022 *
1023 * The kernel page table is exempt because its pm_active field is
1024 * immutable.  The kernel page table is always active on every
1025 * processor.
1026 */
1027void
1028pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1029{
1030	cpuset_t *mask, other_cpus;
1031	u_int cpuid;
1032
1033	sched_pin();
1034	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1035		invlpg(va);
1036		mask = &all_cpus;
1037	} else {
1038		cpuid = PCPU_GET(cpuid);
1039		other_cpus = all_cpus;
1040		CPU_CLR(cpuid, &other_cpus);
1041		if (CPU_ISSET(cpuid, &pmap->pm_active))
1042			invlpg(va);
1043		CPU_AND(&other_cpus, &pmap->pm_active);
1044		mask = &other_cpus;
1045	}
1046	smp_masked_invlpg(*mask, va);
1047	sched_unpin();
1048}
1049
1050/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
1051#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
1052
1053void
1054pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1055{
1056	cpuset_t *mask, other_cpus;
1057	vm_offset_t addr;
1058	u_int cpuid;
1059
1060	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
1061		pmap_invalidate_all(pmap);
1062		return;
1063	}
1064
1065	sched_pin();
1066	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1067		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1068			invlpg(addr);
1069		mask = &all_cpus;
1070	} else {
1071		cpuid = PCPU_GET(cpuid);
1072		other_cpus = all_cpus;
1073		CPU_CLR(cpuid, &other_cpus);
1074		if (CPU_ISSET(cpuid, &pmap->pm_active))
1075			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1076				invlpg(addr);
1077		CPU_AND(&other_cpus, &pmap->pm_active);
1078		mask = &other_cpus;
1079	}
1080	smp_masked_invlpg_range(*mask, sva, eva);
1081	sched_unpin();
1082}
1083
1084void
1085pmap_invalidate_all(pmap_t pmap)
1086{
1087	cpuset_t *mask, other_cpus;
1088	u_int cpuid;
1089
1090	sched_pin();
1091	if (pmap == kernel_pmap) {
1092		invltlb_glob();
1093		mask = &all_cpus;
1094	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
1095		invltlb();
1096		mask = &all_cpus;
1097	} else {
1098		cpuid = PCPU_GET(cpuid);
1099		other_cpus = all_cpus;
1100		CPU_CLR(cpuid, &other_cpus);
1101		if (CPU_ISSET(cpuid, &pmap->pm_active))
1102			invltlb();
1103		CPU_AND(&other_cpus, &pmap->pm_active);
1104		mask = &other_cpus;
1105	}
1106	smp_masked_invltlb(*mask, pmap);
1107	sched_unpin();
1108}
1109
1110void
1111pmap_invalidate_cache(void)
1112{
1113
1114	sched_pin();
1115	wbinvd();
1116	smp_cache_flush();
1117	sched_unpin();
1118}
1119
1120struct pde_action {
1121	cpuset_t invalidate;	/* processors that invalidate their TLB */
1122	vm_offset_t va;
1123	pd_entry_t *pde;
1124	pd_entry_t newpde;
1125	u_int store;		/* processor that updates the PDE */
1126};
1127
1128static void
1129pmap_update_pde_kernel(void *arg)
1130{
1131	struct pde_action *act = arg;
1132	pd_entry_t *pde;
1133	pmap_t pmap;
1134
1135	if (act->store == PCPU_GET(cpuid)) {
1136
1137		/*
1138		 * Elsewhere, this operation requires allpmaps_lock for
1139		 * synchronization.  Here, it does not because it is being
1140		 * performed in the context of an all_cpus rendezvous.
1141		 */
1142		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1143			pde = pmap_pde(pmap, act->va);
1144			pde_store(pde, act->newpde);
1145		}
1146	}
1147}
1148
1149static void
1150pmap_update_pde_user(void *arg)
1151{
1152	struct pde_action *act = arg;
1153
1154	if (act->store == PCPU_GET(cpuid))
1155		pde_store(act->pde, act->newpde);
1156}
1157
1158static void
1159pmap_update_pde_teardown(void *arg)
1160{
1161	struct pde_action *act = arg;
1162
1163	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1164		pmap_update_pde_invalidate(act->va, act->newpde);
1165}
1166
1167/*
1168 * Change the page size for the specified virtual address in a way that
1169 * prevents any possibility of the TLB ever having two entries that map the
1170 * same virtual address using different page sizes.  This is the recommended
1171 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1172 * machine check exception for a TLB state that is improperly diagnosed as a
1173 * hardware error.
1174 */
1175static void
1176pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1177{
1178	struct pde_action act;
1179	cpuset_t active, other_cpus;
1180	u_int cpuid;
1181
1182	sched_pin();
1183	cpuid = PCPU_GET(cpuid);
1184	other_cpus = all_cpus;
1185	CPU_CLR(cpuid, &other_cpus);
1186	if (pmap == kernel_pmap)
1187		active = all_cpus;
1188	else
1189		active = pmap->pm_active;
1190	if (CPU_OVERLAP(&active, &other_cpus)) {
1191		act.store = cpuid;
1192		act.invalidate = active;
1193		act.va = va;
1194		act.pde = pde;
1195		act.newpde = newpde;
1196		CPU_SET(cpuid, &active);
1197		smp_rendezvous_cpus(active,
1198		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1199		    pmap_update_pde_kernel : pmap_update_pde_user,
1200		    pmap_update_pde_teardown, &act);
1201	} else {
1202		if (pmap == kernel_pmap)
1203			pmap_kenter_pde(va, newpde);
1204		else
1205			pde_store(pde, newpde);
1206		if (CPU_ISSET(cpuid, &active))
1207			pmap_update_pde_invalidate(va, newpde);
1208	}
1209	sched_unpin();
1210}
1211#else /* !SMP */
1212/*
1213 * Normal, non-SMP, 486+ invalidation functions.
1214 * We inline these within pmap.c for speed.
1215 */
1216PMAP_INLINE void
1217pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1218{
1219
1220	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1221		invlpg(va);
1222}
1223
1224PMAP_INLINE void
1225pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1226{
1227	vm_offset_t addr;
1228
1229	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1230		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1231			invlpg(addr);
1232}
1233
1234PMAP_INLINE void
1235pmap_invalidate_all(pmap_t pmap)
1236{
1237
1238	if (pmap == kernel_pmap)
1239		invltlb_glob();
1240	else if (!CPU_EMPTY(&pmap->pm_active))
1241		invltlb();
1242}
1243
1244PMAP_INLINE void
1245pmap_invalidate_cache(void)
1246{
1247
1248	wbinvd();
1249}
1250
1251static void
1252pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1253{
1254
1255	if (pmap == kernel_pmap)
1256		pmap_kenter_pde(va, newpde);
1257	else
1258		pde_store(pde, newpde);
1259	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1260		pmap_update_pde_invalidate(va, newpde);
1261}
1262#endif /* !SMP */
1263
1264static void
1265pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
1266{
1267
1268	/*
1269	 * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was
1270	 * created by a promotion that did not invalidate the 512 or 1024 4KB
1271	 * page mappings that might exist in the TLB.  Consequently, at this
1272	 * point, the TLB may hold both 4KB and 2- or 4MB page mappings for
1273	 * the address range [va, va + NBPDR).  Therefore, the entire range
1274	 * must be invalidated here.  In contrast, when PG_PROMOTED is clear,
1275	 * the TLB will not hold any 4KB page mappings for the address range
1276	 * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the
1277	 * 2- or 4MB page mapping from the TLB.
1278	 */
1279	if ((pde & PG_PROMOTED) != 0)
1280		pmap_invalidate_range(pmap, va, va + NBPDR - 1);
1281	else
1282		pmap_invalidate_page(pmap, va);
1283}
1284
1285#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1286
1287void
1288pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1289{
1290
1291	if (force) {
1292		sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
1293	} else {
1294		KASSERT((sva & PAGE_MASK) == 0,
1295		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1296		KASSERT((eva & PAGE_MASK) == 0,
1297		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1298	}
1299
1300	if ((cpu_feature & CPUID_SS) != 0 && !force)
1301		; /* If "Self Snoop" is supported and allowed, do nothing. */
1302	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
1303	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1304#ifdef DEV_APIC
1305		/*
1306		 * XXX: Some CPUs fault, hang, or trash the local APIC
1307		 * registers if we use CLFLUSH on the local APIC
1308		 * range.  The local APIC is always uncached, so we
1309		 * don't need to flush for that range anyway.
1310		 */
1311		if (pmap_kextract(sva) == lapic_paddr)
1312			return;
1313#endif
1314		/*
1315		 * Otherwise, do per-cache line flush.  Use the sfence
1316		 * instruction to ensure that previous stores are
1317		 * included in the write-back.  The processor
1318		 * propagates the flush to other processors in the cache
1319		 * coherence domain.
1320		 */
1321		sfence();
1322		for (; sva < eva; sva += cpu_clflush_line_size)
1323			clflushopt(sva);
1324		sfence();
1325	} else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1326	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1327#ifdef DEV_APIC
1328		if (pmap_kextract(sva) == lapic_paddr)
1329			return;
1330#endif
1331		/*
1332		 * Writes are ordered by CLFLUSH on Intel CPUs.
1333		 */
1334		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1335			mfence();
1336		for (; sva < eva; sva += cpu_clflush_line_size)
1337			clflush(sva);
1338		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1339			mfence();
1340	} else {
1341
1342		/*
1343		 * No targeted cache flush methods are supported by CPU,
1344		 * or the supplied range is bigger than 2MB.
1345		 * Globally invalidate cache.
1346		 */
1347		pmap_invalidate_cache();
1348	}
1349}
1350
1351void
1352pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1353{
1354	int i;
1355
1356	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1357	    (cpu_feature & CPUID_CLFSH) == 0) {
1358		pmap_invalidate_cache();
1359	} else {
1360		for (i = 0; i < count; i++)
1361			pmap_flush_page(pages[i]);
1362	}
1363}
1364
1365/*
1366 * Are we current address space or kernel?
1367 */
1368static __inline int
1369pmap_is_current(pmap_t pmap)
1370{
1371
1372	return (pmap == kernel_pmap || pmap ==
1373	    vmspace_pmap(curthread->td_proc->p_vmspace));
1374}
1375
1376/*
1377 * If the given pmap is not the current or kernel pmap, the returned pte must
1378 * be released by passing it to pmap_pte_release().
1379 */
1380pt_entry_t *
1381pmap_pte(pmap_t pmap, vm_offset_t va)
1382{
1383	pd_entry_t newpf;
1384	pd_entry_t *pde;
1385
1386	pde = pmap_pde(pmap, va);
1387	if (*pde & PG_PS)
1388		return (pde);
1389	if (*pde != 0) {
1390		/* are we current address space or kernel? */
1391		if (pmap_is_current(pmap))
1392			return (vtopte(va));
1393		mtx_lock(&PMAP2mutex);
1394		newpf = *pde & PG_FRAME;
1395		if ((*PMAP2 & PG_FRAME) != newpf) {
1396			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1397			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1398		}
1399		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1400	}
1401	return (NULL);
1402}
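/*
 * Informational note (not in the original source): when "pmap" is neither
 * the current nor the kernel pmap, its page table pages are not reachable
 * through the recursive mapping, so the code above borrows the reserved
 * PMAP2/PADDR2 slot: it points the PMAP2 PTE at the foreign page table page
 * and returns the requested entry's alias inside PADDR2.  PMAP2mutex stays
 * held until pmap_pte_release() is called, which is why callers of
 * pmap_pte() must release the returned pte.
 */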
1403
1404/*
1405 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1406 * being NULL.
1407 */
1408static __inline void
1409pmap_pte_release(pt_entry_t *pte)
1410{
1411
1412	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1413		mtx_unlock(&PMAP2mutex);
1414}
1415
1416/*
1417 * NB:  The sequence of updating a page table followed by accesses to the
1418 * corresponding pages is subject to the situation described in the "AMD64
1419 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
1420 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
1421 * right after modifying the PTE bits is crucial.
1422 */
1423static __inline void
1424invlcaddr(void *caddr)
1425{
1426
1427	invlpg((u_int)caddr);
1428}
1429
1430/*
1431 * Super fast pmap_pte routine best used when scanning
1432 * the pv lists.  This eliminates many coarse-grained
1433 * invltlb calls.  Note that many of the pv list
1434 * scans are across different pmaps.  It is very wasteful
1435 * to do an entire invltlb for checking a single mapping.
1436 *
1437 * If the given pmap is not the current pmap, pvh_global_lock
1438 * must be held and curthread pinned to a CPU.
1439 */
1440static pt_entry_t *
1441pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1442{
1443	pd_entry_t newpf;
1444	pd_entry_t *pde;
1445
1446	pde = pmap_pde(pmap, va);
1447	if (*pde & PG_PS)
1448		return (pde);
1449	if (*pde != 0) {
1450		/* are we current address space or kernel? */
1451		if (pmap_is_current(pmap))
1452			return (vtopte(va));
1453		rw_assert(&pvh_global_lock, RA_WLOCKED);
1454		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1455		newpf = *pde & PG_FRAME;
1456		if ((*PMAP1 & PG_FRAME) != newpf) {
1457			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1458#ifdef SMP
1459			PMAP1cpu = PCPU_GET(cpuid);
1460#endif
1461			invlcaddr(PADDR1);
1462			PMAP1changed++;
1463		} else
1464#ifdef SMP
1465		if (PMAP1cpu != PCPU_GET(cpuid)) {
1466			PMAP1cpu = PCPU_GET(cpuid);
1467			invlcaddr(PADDR1);
1468			PMAP1changedcpu++;
1469		} else
1470#endif
1471			PMAP1unchanged++;
1472		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1473	}
1474	return (0);
1475}
1476
1477/*
1478 *	Routine:	pmap_extract
1479 *	Function:
1480 *		Extract the physical page address associated
1481 *		with the given map/virtual_address pair.
1482 */
1483vm_paddr_t
1484pmap_extract(pmap_t pmap, vm_offset_t va)
1485{
1486	vm_paddr_t rtval;
1487	pt_entry_t *pte;
1488	pd_entry_t pde;
1489
1490	rtval = 0;
1491	PMAP_LOCK(pmap);
1492	pde = pmap->pm_pdir[va >> PDRSHIFT];
1493	if (pde != 0) {
1494		if ((pde & PG_PS) != 0)
1495			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1496		else {
1497			pte = pmap_pte(pmap, va);
1498			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1499			pmap_pte_release(pte);
1500		}
1501	}
1502	PMAP_UNLOCK(pmap);
1503	return (rtval);
1504}
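/*
 * Worked example (illustrative, non-PAE values assumed): if the PDE for va
 * maps a 4MB page at physical address 0x00800000, then for va = 0xc0123456
 * (PDRMASK == 0x3fffff) the routine returns
 *
 *	0x00800000 | (0xc0123456 & 0x3fffff) == 0x00923456
 *
 * For a 4KB mapping, only the low 12 bits of va are combined with the
 * page frame taken from the PTE instead.
 */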
1505
1506/*
1507 *	Routine:	pmap_extract_and_hold
1508 *	Function:
1509 *		Atomically extract and hold the physical page
1510 *		with the given pmap and virtual address pair
1511 *		if that mapping permits the given protection.
1512 */
1513vm_page_t
1514pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1515{
1516	pd_entry_t pde;
1517	pt_entry_t pte, *ptep;
1518	vm_page_t m;
1519	vm_paddr_t pa;
1520
1521	pa = 0;
1522	m = NULL;
1523	PMAP_LOCK(pmap);
1524retry:
1525	pde = *pmap_pde(pmap, va);
1526	if (pde != 0) {
1527		if (pde & PG_PS) {
1528			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1529				if (vm_page_pa_tryrelock(pmap, (pde &
1530				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1531					goto retry;
1532				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1533				    (va & PDRMASK));
1534				vm_page_hold(m);
1535			}
1536		} else {
1537			ptep = pmap_pte(pmap, va);
1538			pte = *ptep;
1539			pmap_pte_release(ptep);
1540			if (pte != 0 &&
1541			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1542				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1543				    &pa))
1544					goto retry;
1545				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1546				vm_page_hold(m);
1547			}
1548		}
1549	}
1550	PA_UNLOCK_COND(pa);
1551	PMAP_UNLOCK(pmap);
1552	return (m);
1553}
1554
1555/***************************************************
1556 * Low level mapping routines.....
1557 ***************************************************/
1558
1559/*
1560 * Add a wired page to the kva.
1561 * Note: not SMP coherent.
1562 *
1563 * This function may be used before pmap_bootstrap() is called.
1564 */
1565PMAP_INLINE void
1566pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1567{
1568	pt_entry_t *pte;
1569
1570	pte = vtopte(va);
1571	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1572}
1573
1574static __inline void
1575pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1576{
1577	pt_entry_t *pte;
1578
1579	pte = vtopte(va);
1580	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1581}
1582
1583/*
1584 * Remove a page from the kernel pagetables.
1585 * Note: not SMP coherent.
1586 *
1587 * This function may be used before pmap_bootstrap() is called.
1588 */
1589PMAP_INLINE void
1590pmap_kremove(vm_offset_t va)
1591{
1592	pt_entry_t *pte;
1593
1594	pte = vtopte(va);
1595	pte_clear(pte);
1596}
1597
1598/*
1599 *	Used to map a range of physical addresses into kernel
1600 *	virtual address space.
1601 *
1602 *	The value passed in '*virt' is a suggested virtual address for
1603 *	the mapping. Architectures which can support a direct-mapped
1604 *	physical to virtual region can return the appropriate address
1605 *	within that region, leaving '*virt' unchanged. Other
1606 *	architectures should map the pages starting at '*virt' and
1607 *	update '*virt' with the first usable address after the mapped
1608 *	region.
1609 */
1610vm_offset_t
1611pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1612{
1613	vm_offset_t va, sva;
1614	vm_paddr_t superpage_offset;
1615	pd_entry_t newpde;
1616
1617	va = *virt;
1618	/*
1619	 * Does the physical address range's size and alignment permit at
1620	 * least one superpage mapping to be created?
1621	 */
1622	superpage_offset = start & PDRMASK;
1623	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1624		/*
1625		 * Increase the starting virtual address so that its alignment
1626		 * does not preclude the use of superpage mappings.
1627		 */
1628		if ((va & PDRMASK) < superpage_offset)
1629			va = (va & ~PDRMASK) + superpage_offset;
1630		else if ((va & PDRMASK) > superpage_offset)
1631			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1632	}
1633	sva = va;
1634	while (start < end) {
1635		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1636		    pseflag) {
1637			KASSERT((va & PDRMASK) == 0,
1638			    ("pmap_map: misaligned va %#x", va));
1639			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1640			pmap_kenter_pde(va, newpde);
1641			va += NBPDR;
1642			start += NBPDR;
1643		} else {
1644			pmap_kenter(va, start);
1645			va += PAGE_SIZE;
1646			start += PAGE_SIZE;
1647		}
1648	}
1649	pmap_invalidate_range(kernel_pmap, sva, va);
1650	*virt = va;
1651	return (sva);
1652}
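/*
 * Usage sketch (hypothetical values, not from the original source): a
 * caller mapping a 16MB physical range early in boot might do
 *
 *	vm_offset_t va = virtual_avail;
 *	vm_offset_t sva = pmap_map(&va, 0x01000000, 0x02000000,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *	virtual_avail = va;
 *
 * On i386 there is no direct map, so the range is entered starting at *virt,
 * preferring 2/4MB PDE mappings when alignment allows, *virt is advanced
 * past the mapped region, and the returned sva is where "start" was mapped.
 */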
1653
1654
1655/*
1656 * Add a list of wired pages to the kva.
1657 * This routine is only used for temporary
1658 * kernel mappings that do not need to have
1659 * page modification or references recorded.
1660 * Note that old mappings are simply written
1661 * over.  The page *must* be wired.
1662 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1663 */
1664void
1665pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1666{
1667	pt_entry_t *endpte, oldpte, pa, *pte;
1668	vm_page_t m;
1669
1670	oldpte = 0;
1671	pte = vtopte(sva);
1672	endpte = pte + count;
1673	while (pte < endpte) {
1674		m = *ma++;
1675		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1676		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1677			oldpte |= *pte;
1678			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1679		}
1680		pte++;
1681	}
1682	if (__predict_false((oldpte & PG_V) != 0))
1683		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1684		    PAGE_SIZE);
1685}
1686
1687/*
1688 * This routine tears out page mappings from the
1689 * kernel -- it is meant only for temporary mappings.
1690 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1691 */
1692void
1693pmap_qremove(vm_offset_t sva, int count)
1694{
1695	vm_offset_t va;
1696
1697	va = sva;
1698	while (count-- > 0) {
1699		pmap_kremove(va);
1700		va += PAGE_SIZE;
1701	}
1702	pmap_invalidate_range(kernel_pmap, sva, va);
1703}
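/*
 * Usage sketch (illustrative, not from the original source): callers
 * typically pair the two routines above around a short-lived mapping:
 *
 *	vm_offset_t kva = kva_alloc(npages * PAGE_SIZE);
 *	pmap_qenter(kva, ma, npages);
 *	... access the wired pages in "ma" through kva ...
 *	pmap_qremove(kva, npages);
 *	kva_free(kva, npages * PAGE_SIZE);
 *
 * pmap_qremove() performs the ranged TLB shootdown itself; pmap_qenter()
 * only issues one if it overwrote previously valid entries.
 */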
1704
1705/***************************************************
1706 * Page table page management routines.....
1707 ***************************************************/
1708static __inline void
1709pmap_free_zero_pages(struct spglist *free)
1710{
1711	vm_page_t m;
1712	int count;
1713
1714	for (count = 0; (m = SLIST_FIRST(free)) != NULL; count++) {
1715		SLIST_REMOVE_HEAD(free, plinks.s.ss);
1716		/* Preserve the page's PG_ZERO setting. */
1717		vm_page_free_toq(m);
1718	}
1719	atomic_subtract_int(&vm_cnt.v_wire_count, count);
1720}
1721
1722/*
1723 * Schedule the specified unused page table page to be freed.  Specifically,
1724 * add the page to the specified list of pages that will be released to the
1725 * physical memory manager after the TLB has been updated.
1726 */
1727static __inline void
1728pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1729    boolean_t set_PG_ZERO)
1730{
1731
1732	if (set_PG_ZERO)
1733		m->flags |= PG_ZERO;
1734	else
1735		m->flags &= ~PG_ZERO;
1736	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1737}
1738
1739/*
1740 * Inserts the specified page table page into the specified pmap's collection
1741 * of idle page table pages.  Each of a pmap's page table pages is responsible
1742 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1743 * ordered by this virtual address range.
1744 */
1745static __inline int
1746pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1747{
1748
1749	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1750	return (vm_radix_insert(&pmap->pm_root, mpte));
1751}
1752
1753/*
1754 * Removes the page table page mapping the specified virtual address from the
1755 * specified pmap's collection of idle page table pages, and returns it.
1756 * Otherwise, returns NULL if there is no page table page corresponding to the
1757 * specified virtual address.
1758 */
1759static __inline vm_page_t
1760pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1761{
1762
1763	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1764	return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
1765}
1766
1767/*
1768 * Decrements a page table page's wire count, which is used to record the
1769 * number of valid page table entries within the page.  If the wire count
1770 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1771 * page table page was unmapped and FALSE otherwise.
1772 */
1773static inline boolean_t
1774pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1775{
1776
1777	--m->wire_count;
1778	if (m->wire_count == 0) {
1779		_pmap_unwire_ptp(pmap, m, free);
1780		return (TRUE);
1781	} else
1782		return (FALSE);
1783}
1784
1785static void
1786_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1787{
1788	vm_offset_t pteva;
1789
1790	/*
1791	 * unmap the page table page
1792	 */
1793	pmap->pm_pdir[m->pindex] = 0;
1794	--pmap->pm_stats.resident_count;
1795
1796	/*
1797	 * Invalidate the TLB entry for the removed recursive mapping
1798	 * of the page table page so the removal takes effect immediately.
1799	 */
1800	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1801	pmap_invalidate_page(pmap, pteva);
1802
1803	/*
1804	 * Put the page on a list so that it is released only after
1805	 * *all* TLB shootdown is done.
1806	 */
1807	pmap_add_delayed_free_list(m, free, TRUE);
1808}
1809
1810/*
1811 * After removing a page table entry, this routine is used to
1812 * conditionally free the page, and manage the hold/wire counts.
1813 */
1814static int
1815pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1816{
1817	pd_entry_t ptepde;
1818	vm_page_t mpte;
1819
1820	if (va >= VM_MAXUSER_ADDRESS)
1821		return (0);
1822	ptepde = *pmap_pde(pmap, va);
1823	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1824	return (pmap_unwire_ptp(pmap, mpte, free));
1825}
1826
1827/*
1828 * Initialize the pmap for the swapper process.
1829 */
1830void
1831pmap_pinit0(pmap_t pmap)
1832{
1833
1834	PMAP_LOCK_INIT(pmap);
1835	/*
1836	 * Since the page table directory is shared with the kernel pmap,
1837	 * which is already included in the list "allpmaps", this pmap does
1838	 * not need to be inserted into that list.
1839	 */
1840	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1841#if defined(PAE) || defined(PAE_TABLES)
1842	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1843#endif
1844	pmap->pm_root.rt_root = 0;
1845	CPU_ZERO(&pmap->pm_active);
1846	PCPU_SET(curpmap, pmap);
1847	TAILQ_INIT(&pmap->pm_pvchunk);
1848	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1849}
1850
1851/*
1852 * Initialize a preallocated and zeroed pmap structure,
1853 * such as one in a vmspace structure.
1854 */
1855int
1856pmap_pinit(pmap_t pmap)
1857{
1858	vm_page_t m, ptdpg[NPGPTD];
1859	vm_paddr_t pa;
1860	int i;
1861
1862	/*
1863	 * No need to allocate page table space yet but we do need a valid
1864	 * page directory table.
1865	 */
1866	if (pmap->pm_pdir == NULL) {
1867		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
1868		if (pmap->pm_pdir == NULL)
1869			return (0);
1870#if defined(PAE) || defined(PAE_TABLES)
1871		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1872		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1873		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1874		    ("pmap_pinit: pdpt misaligned"));
1875		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1876		    ("pmap_pinit: pdpt above 4g"));
1877#endif
1878		pmap->pm_root.rt_root = 0;
1879	}
1880	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1881	    ("pmap_pinit: pmap has reserved page table page(s)"));
1882
1883	/*
1884	 * allocate the page directory page(s)
1885	 */
1886	for (i = 0; i < NPGPTD;) {
1887		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1888		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1889		if (m == NULL)
1890			VM_WAIT;
1891		else {
1892			ptdpg[i++] = m;
1893		}
1894	}
1895
1896	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1897
1898	for (i = 0; i < NPGPTD; i++)
1899		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1900			pagezero(pmap->pm_pdir + (i * NPDEPG));
1901
1902	mtx_lock_spin(&allpmaps_lock);
1903	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1904	/* Copy the kernel page table directory entries. */
1905	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1906	mtx_unlock_spin(&allpmaps_lock);
1907
1908	/* Install the self-referential address mapping entries. */
1909	for (i = 0; i < NPGPTD; i++) {
1910		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1911		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1912#if defined(PAE) || defined(PAE_TABLES)
1913		pmap->pm_pdpt[i] = pa | PG_V;
1914#endif
1915	}
1916
1917	CPU_ZERO(&pmap->pm_active);
1918	TAILQ_INIT(&pmap->pm_pvchunk);
1919	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1920
1921	return (1);
1922}
1923
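/*
 * A note on the self-referential entries installed by pmap_pinit(): by
 * pointing the PTDPTDI slots of the page directory back at the page
 * directory pages themselves, the pmap's page tables become visible as
 * ordinary memory in a fixed VA window whenever the pmap is active.
 * This is the recursive page table mapping that vtopte() and the
 * PTmap-based accesses elsewhere in this file depend on.
 */
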
1924/*
1925 * This routine allocates a zero-filled page table page and installs
1926 * it when the needed page table page is not already mapped.
1927 */
1928static vm_page_t
1929_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
1930{
1931	vm_paddr_t ptepa;
1932	vm_page_t m;
1933
1934	/*
1935	 * Allocate a page table page.
1936	 */
1937	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1938	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1939		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
1940			PMAP_UNLOCK(pmap);
1941			rw_wunlock(&pvh_global_lock);
1942			VM_WAIT;
1943			rw_wlock(&pvh_global_lock);
1944			PMAP_LOCK(pmap);
1945		}
1946
1947		/*
1948		 * Indicate the need to retry.  While waiting, the page table
1949		 * page may have been allocated.
1950		 */
1951		return (NULL);
1952	}
1953	if ((m->flags & PG_ZERO) == 0)
1954		pmap_zero_page(m);
1955
1956	/*
1957	 * Map the pagetable page into the process address space, if
1958	 * it isn't already there.
1959	 */
1960
1961	pmap->pm_stats.resident_count++;
1962
1963	ptepa = VM_PAGE_TO_PHYS(m);
1964	pmap->pm_pdir[ptepindex] =
1965		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1966
1967	return (m);
1968}
1969
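/*
 * The page directory entry created by _pmap_allocpte() deliberately
 * carries PG_U and PG_RW: on x86 the effective access rights of a 4KB
 * mapping are the combination of its PDE and PTE bits, so the real
 * protection is enforced by the individual PTEs installed later.
 */
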
1970static vm_page_t
1971pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
1972{
1973	u_int ptepindex;
1974	pd_entry_t ptepa;
1975	vm_page_t m;
1976
1977	/*
1978	 * Calculate pagetable page index
1979	 */
1980	ptepindex = va >> PDRSHIFT;
1981retry:
1982	/*
1983	 * Get the page directory entry
1984	 */
1985	ptepa = pmap->pm_pdir[ptepindex];
1986
1987	/*
1988	 * This supports switching from a 4MB page to a
1989	 * normal 4K page.
1990	 */
1991	if (ptepa & PG_PS) {
1992		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1993		ptepa = pmap->pm_pdir[ptepindex];
1994	}
1995
1996	/*
1997	 * If the page table page is mapped, we just increment the
1998	 * hold count, and activate it.
1999	 */
2000	if (ptepa) {
2001		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
2002		m->wire_count++;
2003	} else {
2004		/*
2005		 * Here if the pte page isn't mapped, or if it has
2006		 * been deallocated.
2007		 */
2008		m = _pmap_allocpte(pmap, ptepindex, flags);
2009		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
2010			goto retry;
2011	}
2012	return (m);
2013}
2014
2015
2016/***************************************************
2017 * Pmap allocation/deallocation routines.
2018 ***************************************************/
2019
2020/*
2021 * Release any resources held by the given physical map.
2022 * Called when a pmap initialized by pmap_pinit is being released.
2023 * Should only be called if the map contains no valid mappings.
2024 */
2025void
2026pmap_release(pmap_t pmap)
2027{
2028	vm_page_t m, ptdpg[NPGPTD];
2029	int i;
2030
2031	KASSERT(pmap->pm_stats.resident_count == 0,
2032	    ("pmap_release: pmap resident count %ld != 0",
2033	    pmap->pm_stats.resident_count));
2034	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2035	    ("pmap_release: pmap has reserved page table page(s)"));
2036	KASSERT(CPU_EMPTY(&pmap->pm_active),
2037	    ("releasing active pmap %p", pmap));
2038
2039	mtx_lock_spin(&allpmaps_lock);
2040	LIST_REMOVE(pmap, pm_list);
2041	mtx_unlock_spin(&allpmaps_lock);
2042
2043	for (i = 0; i < NPGPTD; i++)
2044		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2045		    PG_FRAME);
2046
2047	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2048	    sizeof(*pmap->pm_pdir));
2049
2050	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2051
2052	for (i = 0; i < NPGPTD; i++) {
2053		m = ptdpg[i];
2054#if defined(PAE) || defined(PAE_TABLES)
2055		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2056		    ("pmap_release: got wrong ptd page"));
2057#endif
2058		m->wire_count--;
2059		vm_page_free_zero(m);
2060	}
2061	atomic_subtract_int(&vm_cnt.v_wire_count, NPGPTD);
2062}
2063
2064static int
2065kvm_size(SYSCTL_HANDLER_ARGS)
2066{
2067	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2068
2069	return (sysctl_handle_long(oidp, &ksize, 0, req));
2070}
2071SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2072    0, 0, kvm_size, "IU", "Size of KVM");
2073
2074static int
2075kvm_free(SYSCTL_HANDLER_ARGS)
2076{
2077	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2078
2079	return (sysctl_handle_long(oidp, &kfree, 0, req));
2080}
2081SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2082    0, 0, kvm_free, "IU", "Amount of KVM free");
2083
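/*
 * Both values are exported read-only and can be inspected from userland
 * with sysctl(8), e.g. "sysctl vm.kvm_size vm.kvm_free"; the reported
 * numbers are in bytes.
 */
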
2084/*
2085 * grow the number of kernel page table entries, if needed
2086 */
2087void
2088pmap_growkernel(vm_offset_t addr)
2089{
2090	vm_paddr_t ptppaddr;
2091	vm_page_t nkpg;
2092	pd_entry_t newpdir;
2093
2094	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2095	addr = roundup2(addr, NBPDR);
2096	if (addr - 1 >= kernel_map->max_offset)
2097		addr = kernel_map->max_offset;
2098	while (kernel_vm_end < addr) {
2099		if (pdir_pde(PTD, kernel_vm_end)) {
2100			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2101			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2102				kernel_vm_end = kernel_map->max_offset;
2103				break;
2104			}
2105			continue;
2106		}
2107
2108		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2109		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2110		    VM_ALLOC_ZERO);
2111		if (nkpg == NULL)
2112			panic("pmap_growkernel: no memory to grow kernel");
2113
2114		nkpt++;
2115
2116		if ((nkpg->flags & PG_ZERO) == 0)
2117			pmap_zero_page(nkpg);
2118		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2119		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2120		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2121
2122		pmap_kenter_pde(kernel_vm_end, newpdir);
2123		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2124		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2125			kernel_vm_end = kernel_map->max_offset;
2126			break;
2127		}
2128	}
2129}
2130
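/*
 * Note that growing the kernel page table must become visible to every
 * address space: i386 kernel PDEs are replicated in each pmap's page
 * directory (see the allpmaps list), and pmap_kenter_pde() above takes
 * care of propagating the newly created PDE beyond the KPTD template.
 */
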
2131
2132/***************************************************
2133 * page management routines.
2134 ***************************************************/
2135
2136CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2137CTASSERT(_NPCM == 11);
2138CTASSERT(_NPCPV == 336);
2139
2140static __inline struct pv_chunk *
2141pv_to_chunk(pv_entry_t pv)
2142{
2143
2144	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2145}
2146
2147#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2148
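/*
 * A small worked example of the machinery above: pv chunks are
 * page-sized and page-aligned, so masking off the PAGE_MASK bits of any
 * pv entry's address recovers its enclosing chunk, and the owning pmap
 * is then one dereference away:
 *
 *	pv_entry_t pv = ...;			(any entry within a chunk)
 *	struct pv_chunk *pc = pv_to_chunk(pv);	(round down to the page)
 *	pmap_t pmap = PV_PMAP(pv);		(same as pc->pc_pmap)
 */
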
2149#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2150#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2151
2152static const uint32_t pc_freemask[_NPCM] = {
2153	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2154	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2155	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2156	PC_FREE0_9, PC_FREE10
2157};
2158
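/*
 * Layout arithmetic behind the constants above (assuming the usual i386
 * definitions of struct pv_chunk and struct pv_entry): a chunk is one
 * 4096-byte page, its header and 11-word pc_map[] bitmap take 64 bytes,
 * and the remaining 4032 bytes hold 336 12-byte pv entries.  The bitmap
 * provides 11 * 32 = 352 bits for only 336 entries, so the top 16 bits
 * of pc_map[10] are never used, which is why PC_FREE10 is 0x0000ffff.
 */
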
2159SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2160	"Current number of pv entries");
2161
2162#ifdef PV_STATS
2163static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2164
2165SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2166	"Current number of pv entry chunks");
2167SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2168	"Total number of pv entry chunks allocated");
2169SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2170	"Total number of pv entry chunks freed");
2171SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2172	"Number of failed attempts to allocate a pv entry chunk page");
2173
2174static long pv_entry_frees, pv_entry_allocs;
2175static int pv_entry_spare;
2176
2177SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2178	"Total number of pv entry frees");
2179SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2180	"Total number of pv entry allocs");
2181SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2182	"Current number of spare pv entries");
2183#endif
2184
2185/*
2186 * We are in a serious low memory condition.  Resort to
2187 * drastic measures to free some pages so we can allocate
2188 * another pv entry chunk.
2189 */
2190static vm_page_t
2191pmap_pv_reclaim(pmap_t locked_pmap)
2192{
2193	struct pch newtail;
2194	struct pv_chunk *pc;
2195	struct md_page *pvh;
2196	pd_entry_t *pde;
2197	pmap_t pmap;
2198	pt_entry_t *pte, tpte;
2199	pv_entry_t pv;
2200	vm_offset_t va;
2201	vm_page_t m, m_pc;
2202	struct spglist free;
2203	uint32_t inuse;
2204	int bit, field, freed;
2205
2206	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2207	pmap = NULL;
2208	m_pc = NULL;
2209	SLIST_INIT(&free);
2210	TAILQ_INIT(&newtail);
2211	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2212	    SLIST_EMPTY(&free))) {
2213		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2214		if (pmap != pc->pc_pmap) {
2215			if (pmap != NULL) {
2216				pmap_invalidate_all(pmap);
2217				if (pmap != locked_pmap)
2218					PMAP_UNLOCK(pmap);
2219			}
2220			pmap = pc->pc_pmap;
2221			/* Avoid deadlock and lock recursion. */
2222			if (pmap > locked_pmap)
2223				PMAP_LOCK(pmap);
2224			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2225				pmap = NULL;
2226				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2227				continue;
2228			}
2229		}
2230
2231		/*
2232		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2233		 */
2234		freed = 0;
2235		for (field = 0; field < _NPCM; field++) {
2236			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2237			    inuse != 0; inuse &= ~(1UL << bit)) {
2238				bit = bsfl(inuse);
2239				pv = &pc->pc_pventry[field * 32 + bit];
2240				va = pv->pv_va;
2241				pde = pmap_pde(pmap, va);
2242				if ((*pde & PG_PS) != 0)
2243					continue;
2244				pte = pmap_pte(pmap, va);
2245				tpte = *pte;
2246				if ((tpte & PG_W) == 0)
2247					tpte = pte_load_clear(pte);
2248				pmap_pte_release(pte);
2249				if ((tpte & PG_W) != 0)
2250					continue;
2251				KASSERT(tpte != 0,
2252				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2253				    pmap, va));
2254				if ((tpte & PG_G) != 0)
2255					pmap_invalidate_page(pmap, va);
2256				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2257				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2258					vm_page_dirty(m);
2259				if ((tpte & PG_A) != 0)
2260					vm_page_aflag_set(m, PGA_REFERENCED);
2261				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2262				if (TAILQ_EMPTY(&m->md.pv_list) &&
2263				    (m->flags & PG_FICTITIOUS) == 0) {
2264					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2265					if (TAILQ_EMPTY(&pvh->pv_list)) {
2266						vm_page_aflag_clear(m,
2267						    PGA_WRITEABLE);
2268					}
2269				}
2270				pc->pc_map[field] |= 1UL << bit;
2271				pmap_unuse_pt(pmap, va, &free);
2272				freed++;
2273			}
2274		}
2275		if (freed == 0) {
2276			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2277			continue;
2278		}
2279		/* Every freed mapping is for a 4 KB page. */
2280		pmap->pm_stats.resident_count -= freed;
2281		PV_STAT(pv_entry_frees += freed);
2282		PV_STAT(pv_entry_spare += freed);
2283		pv_entry_count -= freed;
2284		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2285		for (field = 0; field < _NPCM; field++)
2286			if (pc->pc_map[field] != pc_freemask[field]) {
2287				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2288				    pc_list);
2289				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2290
2291				/*
2292				 * One freed pv entry in locked_pmap is
2293				 * sufficient.
2294				 */
2295				if (pmap == locked_pmap)
2296					goto out;
2297				break;
2298			}
2299		if (field == _NPCM) {
2300			PV_STAT(pv_entry_spare -= _NPCPV);
2301			PV_STAT(pc_chunk_count--);
2302			PV_STAT(pc_chunk_frees++);
2303			/* Entire chunk is free; return it. */
2304			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2305			pmap_qremove((vm_offset_t)pc, 1);
2306			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2307			break;
2308		}
2309	}
2310out:
2311	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2312	if (pmap != NULL) {
2313		pmap_invalidate_all(pmap);
2314		if (pmap != locked_pmap)
2315			PMAP_UNLOCK(pmap);
2316	}
2317	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
2318		m_pc = SLIST_FIRST(&free);
2319		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2320		/* Recycle a freed page table page. */
2321		m_pc->wire_count = 1;
2322	}
2323	pmap_free_zero_pages(&free);
2324	return (m_pc);
2325}
2326
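/*
 * A note on the locking in pmap_pv_reclaim(): chunks on pv_chunks may
 * belong to pmaps other than locked_pmap, so each victim pmap must be
 * locked as well.  The lock is taken unconditionally only when the
 * victim's address is greater than locked_pmap's; otherwise
 * PMAP_TRYLOCK() is used and the chunk is skipped on failure.  This
 * ordering avoids deadlock and lock recursion against locked_pmap.
 */
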
2327/*
2328 * Free the pv_entry back to its pv chunk's free set.
2329 */
2330static void
2331free_pv_entry(pmap_t pmap, pv_entry_t pv)
2332{
2333	struct pv_chunk *pc;
2334	int idx, field, bit;
2335
2336	rw_assert(&pvh_global_lock, RA_WLOCKED);
2337	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2338	PV_STAT(pv_entry_frees++);
2339	PV_STAT(pv_entry_spare++);
2340	pv_entry_count--;
2341	pc = pv_to_chunk(pv);
2342	idx = pv - &pc->pc_pventry[0];
2343	field = idx / 32;
2344	bit = idx % 32;
2345	pc->pc_map[field] |= 1ul << bit;
2346	for (idx = 0; idx < _NPCM; idx++)
2347		if (pc->pc_map[idx] != pc_freemask[idx]) {
2348			/*
2349			 * 98% of the time, pc is already at the head of the
2350			 * list.  If it isn't already, move it to the head.
2351			 */
2352			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2353			    pc)) {
2354				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2355				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2356				    pc_list);
2357			}
2358			return;
2359		}
2360	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2361	free_pv_chunk(pc);
2362}
2363
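/*
 * Index arithmetic used by free_pv_entry() (and its inverse in
 * get_pv_entry()): an entry's position within its chunk is
 * idx = pv - &pc->pc_pventry[0]; the bitmap word is idx / 32 and the
 * bit within it is idx % 32.  For a hypothetical idx of 70, that is
 * pc_map[2], bit 6.  A set bit marks a free slot, so freeing sets the
 * bit and allocating clears it.
 */
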
2364static void
2365free_pv_chunk(struct pv_chunk *pc)
2366{
2367	vm_page_t m;
2368
2369 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2370	PV_STAT(pv_entry_spare -= _NPCPV);
2371	PV_STAT(pc_chunk_count--);
2372	PV_STAT(pc_chunk_frees++);
2373	/* entire chunk is free, return it */
2374	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2375	pmap_qremove((vm_offset_t)pc, 1);
2376	vm_page_unwire(m, PQ_NONE);
2377	vm_page_free(m);
2378	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2379}
2380
2381/*
2382 * get a new pv_entry, allocating a block from the system
2383 * when needed.
2384 */
2385static pv_entry_t
2386get_pv_entry(pmap_t pmap, boolean_t try)
2387{
2388	static const struct timeval printinterval = { 60, 0 };
2389	static struct timeval lastprint;
2390	int bit, field;
2391	pv_entry_t pv;
2392	struct pv_chunk *pc;
2393	vm_page_t m;
2394
2395	rw_assert(&pvh_global_lock, RA_WLOCKED);
2396	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2397	PV_STAT(pv_entry_allocs++);
2398	pv_entry_count++;
2399	if (pv_entry_count > pv_entry_high_water)
2400		if (ratecheck(&lastprint, &printinterval))
2401			printf("Approaching the limit on PV entries, consider "
2402			    "increasing either the vm.pmap.shpgperproc or the "
2403			    "vm.pmap.pv_entry_max tunable.\n");
2404retry:
2405	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2406	if (pc != NULL) {
2407		for (field = 0; field < _NPCM; field++) {
2408			if (pc->pc_map[field]) {
2409				bit = bsfl(pc->pc_map[field]);
2410				break;
2411			}
2412		}
2413		if (field < _NPCM) {
2414			pv = &pc->pc_pventry[field * 32 + bit];
2415			pc->pc_map[field] &= ~(1ul << bit);
2416			/* If this was the last free entry, move the chunk to the tail. */
2417			for (field = 0; field < _NPCM; field++)
2418				if (pc->pc_map[field] != 0) {
2419					PV_STAT(pv_entry_spare--);
2420					return (pv);	/* not full, return */
2421				}
2422			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2423			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2424			PV_STAT(pv_entry_spare--);
2425			return (pv);
2426		}
2427	}
2428	/*
2429	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2430	 * global lock.  If "pv_vafree" is currently non-empty, it will
2431	 * remain non-empty until pmap_ptelist_alloc() completes.
2432	 */
2433	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2434	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2435		if (try) {
2436			pv_entry_count--;
2437			PV_STAT(pc_chunk_tryfail++);
2438			return (NULL);
2439		}
2440		m = pmap_pv_reclaim(pmap);
2441		if (m == NULL)
2442			goto retry;
2443	}
2444	PV_STAT(pc_chunk_count++);
2445	PV_STAT(pc_chunk_allocs++);
2446	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2447	pmap_qenter((vm_offset_t)pc, &m, 1);
2448	pc->pc_pmap = pmap;
2449	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2450	for (field = 1; field < _NPCM; field++)
2451		pc->pc_map[field] = pc_freemask[field];
2452	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2453	pv = &pc->pc_pventry[0];
2454	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2455	PV_STAT(pv_entry_spare += _NPCPV - 1);
2456	return (pv);
2457}
2458
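/*
 * Summary of the allocation strategy above: an entry is first carved out
 * of a partially full chunk on the pmap's pm_pvchunk list.  Failing
 * that, a fresh chunk is backed by a newly allocated wired page, mapped
 * at a KVA slot taken from pv_vafree, and bit 0 of pc_map[0] is
 * pre-consumed for the entry being returned.  Callers passing "try" get
 * NULL on shortage, while the others fall back to pmap_pv_reclaim() and
 * retry.
 */
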
2459static __inline pv_entry_t
2460pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2461{
2462	pv_entry_t pv;
2463
2464	rw_assert(&pvh_global_lock, RA_WLOCKED);
2465	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2466		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2467			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2468			break;
2469		}
2470	}
2471	return (pv);
2472}
2473
2474static void
2475pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2476{
2477	struct md_page *pvh;
2478	pv_entry_t pv;
2479	vm_offset_t va_last;
2480	vm_page_t m;
2481
2482	rw_assert(&pvh_global_lock, RA_WLOCKED);
2483	KASSERT((pa & PDRMASK) == 0,
2484	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2485
2486	/*
2487	 * Transfer the 4mpage's pv entry for this mapping to the first
2488	 * page's pv list.
2489	 */
2490	pvh = pa_to_pvh(pa);
2491	va = trunc_4mpage(va);
2492	pv = pmap_pvh_remove(pvh, pmap, va);
2493	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2494	m = PHYS_TO_VM_PAGE(pa);
2495	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2496	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2497	va_last = va + NBPDR - PAGE_SIZE;
2498	do {
2499		m++;
2500		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2501		    ("pmap_pv_demote_pde: page %p is not managed", m));
2502		va += PAGE_SIZE;
2503		pmap_insert_entry(pmap, va, m);
2504	} while (va < va_last);
2505}
2506
2507static void
2508pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2509{
2510	struct md_page *pvh;
2511	pv_entry_t pv;
2512	vm_offset_t va_last;
2513	vm_page_t m;
2514
2515	rw_assert(&pvh_global_lock, RA_WLOCKED);
2516	KASSERT((pa & PDRMASK) == 0,
2517	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2518
2519	/*
2520	 * Transfer the first page's pv entry for this mapping to the
2521	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2522	 * to get_pv_entry(), a transfer avoids the possibility that
2523	 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
2524	 * removes one of the mappings that is being promoted.
2525	 */
2526	m = PHYS_TO_VM_PAGE(pa);
2527	va = trunc_4mpage(va);
2528	pv = pmap_pvh_remove(&m->md, pmap, va);
2529	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2530	pvh = pa_to_pvh(pa);
2531	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2532	/* Free the remaining NPTEPG - 1 pv entries. */
2533	va_last = va + NBPDR - PAGE_SIZE;
2534	do {
2535		m++;
2536		va += PAGE_SIZE;
2537		pmap_pvh_free(&m->md, pmap, va);
2538	} while (va < va_last);
2539}
2540
2541static void
2542pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2543{
2544	pv_entry_t pv;
2545
2546	pv = pmap_pvh_remove(pvh, pmap, va);
2547	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2548	free_pv_entry(pmap, pv);
2549}
2550
2551static void
2552pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2553{
2554	struct md_page *pvh;
2555
2556	rw_assert(&pvh_global_lock, RA_WLOCKED);
2557	pmap_pvh_free(&m->md, pmap, va);
2558	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2559		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2560		if (TAILQ_EMPTY(&pvh->pv_list))
2561			vm_page_aflag_clear(m, PGA_WRITEABLE);
2562	}
2563}
2564
2565/*
2566 * Create a pv entry for the page at pa, recording the mapping
2567 * (pmap, va).
2568 */
2569static void
2570pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2571{
2572	pv_entry_t pv;
2573
2574	rw_assert(&pvh_global_lock, RA_WLOCKED);
2575	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2576	pv = get_pv_entry(pmap, FALSE);
2577	pv->pv_va = va;
2578	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2579}
2580
2581/*
2582 * Conditionally create a pv entry.
2583 */
2584static boolean_t
2585pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2586{
2587	pv_entry_t pv;
2588
2589	rw_assert(&pvh_global_lock, RA_WLOCKED);
2590	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2591	if (pv_entry_count < pv_entry_high_water &&
2592	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2593		pv->pv_va = va;
2594		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2595		return (TRUE);
2596	} else
2597		return (FALSE);
2598}
2599
2600/*
2601 * Create the pv entries for each of the pages within a superpage.
2602 */
2603static boolean_t
2604pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2605{
2606	struct md_page *pvh;
2607	pv_entry_t pv;
2608
2609	rw_assert(&pvh_global_lock, RA_WLOCKED);
2610	if (pv_entry_count < pv_entry_high_water &&
2611	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2612		pv->pv_va = va;
2613		pvh = pa_to_pvh(pa);
2614		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2615		return (TRUE);
2616	} else
2617		return (FALSE);
2618}
2619
2620/*
2621 * Fills a page table page with mappings to consecutive physical pages.
2622 */
2623static void
2624pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2625{
2626	pt_entry_t *pte;
2627
2628	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2629		*pte = newpte;
2630		newpte += PAGE_SIZE;
2631	}
2632}
2633
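/*
 * The increment above works because a PTE's attribute bits occupy the
 * low 12 bits and the frame number starts at bit 12, so adding
 * PAGE_SIZE (0x1000) advances the mapped frame by exactly one 4KB page
 * without disturbing the attributes.  For a hypothetical newpte of
 * 0x00400067 the PTP is filled with 0x00400067, 0x00401067, 0x00402067,
 * and so on.
 */
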
2634/*
2635 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2636 * 2- or 4MB page mapping is invalidated.
2637 */
2638static boolean_t
2639pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2640{
2641	pd_entry_t newpde, oldpde;
2642	pt_entry_t *firstpte, newpte;
2643	vm_paddr_t mptepa;
2644	vm_page_t mpte;
2645	struct spglist free;
2646	vm_offset_t sva;
2647
2648	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2649	oldpde = *pde;
2650	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2651	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2652	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
2653	    NULL) {
2654		KASSERT((oldpde & PG_W) == 0,
2655		    ("pmap_demote_pde: page table page for a wired mapping"
2656		    " is missing"));
2657
2658		/*
2659		 * Invalidate the 2- or 4MB page mapping and return
2660		 * "failure" if the mapping was never accessed or the
2661		 * allocation of the new page table page fails.
2662		 */
2663		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2664		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2665		    VM_ALLOC_WIRED)) == NULL) {
2666			SLIST_INIT(&free);
2667			sva = trunc_4mpage(va);
2668			pmap_remove_pde(pmap, pde, sva, &free);
2669			if ((oldpde & PG_G) == 0)
2670				pmap_invalidate_pde_page(pmap, sva, oldpde);
2671			pmap_free_zero_pages(&free);
2672			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2673			    " in pmap %p", va, pmap);
2674			return (FALSE);
2675		}
2676		if (va < VM_MAXUSER_ADDRESS)
2677			pmap->pm_stats.resident_count++;
2678	}
2679	mptepa = VM_PAGE_TO_PHYS(mpte);
2680
2681	/*
2682	 * If the page mapping is in the kernel's address space, then the
2683	 * KPTmap can provide access to the page table page.  Otherwise,
2684	 * temporarily map the page table page (mpte) into the kernel's
2685	 * address space at either PADDR1 or PADDR2.
2686	 */
2687	if (va >= KERNBASE)
2688		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2689	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2690		if ((*PMAP1 & PG_FRAME) != mptepa) {
2691			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2692#ifdef SMP
2693			PMAP1cpu = PCPU_GET(cpuid);
2694#endif
2695			invlcaddr(PADDR1);
2696			PMAP1changed++;
2697		} else
2698#ifdef SMP
2699		if (PMAP1cpu != PCPU_GET(cpuid)) {
2700			PMAP1cpu = PCPU_GET(cpuid);
2701			invlcaddr(PADDR1);
2702			PMAP1changedcpu++;
2703		} else
2704#endif
2705			PMAP1unchanged++;
2706		firstpte = PADDR1;
2707	} else {
2708		mtx_lock(&PMAP2mutex);
2709		if ((*PMAP2 & PG_FRAME) != mptepa) {
2710			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2711			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2712		}
2713		firstpte = PADDR2;
2714	}
2715	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2716	KASSERT((oldpde & PG_A) != 0,
2717	    ("pmap_demote_pde: oldpde is missing PG_A"));
2718	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2719	    ("pmap_demote_pde: oldpde is missing PG_M"));
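	/*
	 * Convert the PDE's attribute bits into PTE form: PG_PS has no
	 * meaning in a PTE and is cleared, and the PAT bit occupies a
	 * different position in the two formats (PG_PDE_PAT vs.
	 * PG_PTE_PAT), so it is relocated if it was set.
	 */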
2720	newpte = oldpde & ~PG_PS;
2721	if ((newpte & PG_PDE_PAT) != 0)
2722		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2723
2724	/*
2725	 * If the page table page is new, initialize it.
2726	 */
2727	if (mpte->wire_count == 1) {
2728		mpte->wire_count = NPTEPG;
2729		pmap_fill_ptp(firstpte, newpte);
2730	}
2731	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2732	    ("pmap_demote_pde: firstpte and newpte map different physical"
2733	    " addresses"));
2734
2735	/*
2736	 * If the mapping has changed attributes, update the page table
2737	 * entries.
2738	 */
2739	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2740		pmap_fill_ptp(firstpte, newpte);
2741
2742	/*
2743	 * Demote the mapping.  This pmap is locked.  The old PDE has
2744	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2745	 * set.  Thus, there is no danger of a race with another
2746	 * processor changing the setting of PG_A and/or PG_M between
2747	 * the read above and the store below.
2748	 */
2749	if (workaround_erratum383)
2750		pmap_update_pde(pmap, va, pde, newpde);
2751	else if (pmap == kernel_pmap)
2752		pmap_kenter_pde(va, newpde);
2753	else
2754		pde_store(pde, newpde);
2755	if (firstpte == PADDR2)
2756		mtx_unlock(&PMAP2mutex);
2757
2758	/*
2759	 * Invalidate the recursive mapping of the page table page.
2760	 */
2761	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2762
2763	/*
2764	 * Demote the pv entry.  This depends on the earlier demotion
2765	 * of the mapping.  Specifically, the (re)creation of a per-
2766	 * page pv entry might trigger the execution of pmap_pv_reclaim(),
2767	 * which might reclaim a newly (re)created per-page pv entry
2768	 * and destroy the associated mapping.  In order to destroy
2769	 * the mapping, the PDE must have already changed from mapping
2770	 * the 2- or 4MB page to referencing the page table page.
2771	 */
2772	if ((oldpde & PG_MANAGED) != 0)
2773		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2774
2775	pmap_pde_demotions++;
2776	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2777	    " in pmap %p", va, pmap);
2778	return (TRUE);
2779}
2780
2781/*
2782 * Removes a 2- or 4MB page mapping from the kernel pmap.
2783 */
2784static void
2785pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2786{
2787	pd_entry_t newpde;
2788	vm_paddr_t mptepa;
2789	vm_page_t mpte;
2790
2791	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2792	mpte = pmap_remove_pt_page(pmap, va);
2793	if (mpte == NULL)
2794		panic("pmap_remove_kernel_pde: Missing pt page.");
2795
2796	mptepa = VM_PAGE_TO_PHYS(mpte);
2797	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
2798
2799	/*
2800	 * Initialize the page table page.
2801	 */
2802	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
2803
2804	/*
2805	 * Remove the mapping.
2806	 */
2807	if (workaround_erratum383)
2808		pmap_update_pde(pmap, va, pde, newpde);
2809	else
2810		pmap_kenter_pde(va, newpde);
2811
2812	/*
2813	 * Invalidate the recursive mapping of the page table page.
2814	 */
2815	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2816}
2817
2818/*
2819 * pmap_remove_pde: Removes a 2- or 4MB page mapping from the given pmap.
2820 */
2821static void
2822pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2823    struct spglist *free)
2824{
2825	struct md_page *pvh;
2826	pd_entry_t oldpde;
2827	vm_offset_t eva, va;
2828	vm_page_t m, mpte;
2829
2830	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2831	KASSERT((sva & PDRMASK) == 0,
2832	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2833	oldpde = pte_load_clear(pdq);
2834	if (oldpde & PG_W)
2835		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2836
2837	/*
2838	 * Machines that don't support invlpg also don't support
2839	 * PG_G.
2840	 */
2841	if ((oldpde & PG_G) != 0)
2842		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
2843
2844	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2845	if (oldpde & PG_MANAGED) {
2846		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2847		pmap_pvh_free(pvh, pmap, sva);
2848		eva = sva + NBPDR;
2849		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2850		    va < eva; va += PAGE_SIZE, m++) {
2851			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2852				vm_page_dirty(m);
2853			if (oldpde & PG_A)
2854				vm_page_aflag_set(m, PGA_REFERENCED);
2855			if (TAILQ_EMPTY(&m->md.pv_list) &&
2856			    TAILQ_EMPTY(&pvh->pv_list))
2857				vm_page_aflag_clear(m, PGA_WRITEABLE);
2858		}
2859	}
2860	if (pmap == kernel_pmap) {
2861		pmap_remove_kernel_pde(pmap, pdq, sva);
2862	} else {
2863		mpte = pmap_remove_pt_page(pmap, sva);
2864		if (mpte != NULL) {
2865			pmap->pm_stats.resident_count--;
2866			KASSERT(mpte->wire_count == NPTEPG,
2867			    ("pmap_remove_pde: pte page wire count error"));
2868			mpte->wire_count = 0;
2869			pmap_add_delayed_free_list(mpte, free, FALSE);
2870		}
2871	}
2872}
2873
2874/*
2875 * pmap_remove_pte: Removes a 4KB page mapping from the given pmap.
2876 */
2877static int
2878pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2879    struct spglist *free)
2880{
2881	pt_entry_t oldpte;
2882	vm_page_t m;
2883
2884	rw_assert(&pvh_global_lock, RA_WLOCKED);
2885	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2886	oldpte = pte_load_clear(ptq);
2887	KASSERT(oldpte != 0,
2888	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2889	if (oldpte & PG_W)
2890		pmap->pm_stats.wired_count -= 1;
2891	/*
2892	 * Machines that don't support invlpg also don't support
2893	 * PG_G.
2894	 */
2895	if (oldpte & PG_G)
2896		pmap_invalidate_page(kernel_pmap, va);
2897	pmap->pm_stats.resident_count -= 1;
2898	if (oldpte & PG_MANAGED) {
2899		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2900		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2901			vm_page_dirty(m);
2902		if (oldpte & PG_A)
2903			vm_page_aflag_set(m, PGA_REFERENCED);
2904		pmap_remove_entry(pmap, m, va);
2905	}
2906	return (pmap_unuse_pt(pmap, va, free));
2907}
2908
2909/*
2910 * Remove a single page from a process address space
2911 */
2912static void
2913pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
2914{
2915	pt_entry_t *pte;
2916
2917	rw_assert(&pvh_global_lock, RA_WLOCKED);
2918	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2919	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2920	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2921		return;
2922	pmap_remove_pte(pmap, pte, va, free);
2923	pmap_invalidate_page(pmap, va);
2924}
2925
2926/*
2927 *	Remove the given range of addresses from the specified map.
2928 *
2929 *	It is assumed that the start and end are properly
2930 *	rounded to the page size.
2931 */
2932void
2933pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2934{
2935	vm_offset_t pdnxt;
2936	pd_entry_t ptpaddr;
2937	pt_entry_t *pte;
2938	struct spglist free;
2939	int anyvalid;
2940
2941	/*
2942	 * Perform an unsynchronized read.  This is, however, safe.
2943	 */
2944	if (pmap->pm_stats.resident_count == 0)
2945		return;
2946
2947	anyvalid = 0;
2948	SLIST_INIT(&free);
2949
2950	rw_wlock(&pvh_global_lock);
2951	sched_pin();
2952	PMAP_LOCK(pmap);
2953
2954	/*
2955	 * Special handling for removing a single page: it is a very
2956	 * common operation, so it is worth short-circuiting the
2957	 * general loop below.
2958	 */
2959	if ((sva + PAGE_SIZE == eva) &&
2960	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2961		pmap_remove_page(pmap, sva, &free);
2962		goto out;
2963	}
2964
2965	for (; sva < eva; sva = pdnxt) {
2966		u_int pdirindex;
2967
2968		/*
2969		 * Calculate index for next page table.
2970		 */
2971		pdnxt = (sva + NBPDR) & ~PDRMASK;
2972		if (pdnxt < sva)
2973			pdnxt = eva;
2974		if (pmap->pm_stats.resident_count == 0)
2975			break;
2976
2977		pdirindex = sva >> PDRSHIFT;
2978		ptpaddr = pmap->pm_pdir[pdirindex];
2979
2980		/*
2981		 * Weed out invalid mappings. Note: we assume that the page
2982		 * directory table is always allocated, and in kernel virtual.
2983		 */
2984		if (ptpaddr == 0)
2985			continue;
2986
2987		/*
2988		 * Check for large page.
2989		 */
2990		if ((ptpaddr & PG_PS) != 0) {
2991			/*
2992			 * Are we removing the entire large page?  If not,
2993			 * demote the mapping and fall through.
2994			 */
2995			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2996				/*
2997				 * The TLB entry for a PG_G mapping is
2998				 * invalidated by pmap_remove_pde().
2999				 */
3000				if ((ptpaddr & PG_G) == 0)
3001					anyvalid = 1;
3002				pmap_remove_pde(pmap,
3003				    &pmap->pm_pdir[pdirindex], sva, &free);
3004				continue;
3005			} else if (!pmap_demote_pde(pmap,
3006			    &pmap->pm_pdir[pdirindex], sva)) {
3007				/* The large page mapping was destroyed. */
3008				continue;
3009			}
3010		}
3011
3012		/*
3013		 * Limit our scan to either the end of the va represented
3014		 * by the current page table page, or to the end of the
3015		 * range being removed.
3016		 */
3017		if (pdnxt > eva)
3018			pdnxt = eva;
3019
3020		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3021		    sva += PAGE_SIZE) {
3022			if (*pte == 0)
3023				continue;
3024
3025			/*
3026			 * The TLB entry for a PG_G mapping is invalidated
3027			 * by pmap_remove_pte().
3028			 */
3029			if ((*pte & PG_G) == 0)
3030				anyvalid = 1;
3031			if (pmap_remove_pte(pmap, pte, sva, &free))
3032				break;
3033		}
3034	}
3035out:
3036	sched_unpin();
3037	if (anyvalid)
3038		pmap_invalidate_all(pmap);
3039	rw_wunlock(&pvh_global_lock);
3040	PMAP_UNLOCK(pmap);
3041	pmap_free_zero_pages(&free);
3042}
3043
3044/*
3045 *	Routine:	pmap_remove_all
3046 *	Function:
3047 *		Removes this physical page from
3048 *		all physical maps in which it resides.
3049 *		Reflects back modify bits to the pager.
3050 *
3051 *	Notes:
3052 *		Original versions of this routine were very
3053 *		inefficient because they iteratively called
3054 *		pmap_remove (slow...)
3055 */
3056
3057void
3058pmap_remove_all(vm_page_t m)
3059{
3060	struct md_page *pvh;
3061	pv_entry_t pv;
3062	pmap_t pmap;
3063	pt_entry_t *pte, tpte;
3064	pd_entry_t *pde;
3065	vm_offset_t va;
3066	struct spglist free;
3067
3068	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3069	    ("pmap_remove_all: page %p is not managed", m));
3070	SLIST_INIT(&free);
3071	rw_wlock(&pvh_global_lock);
3072	sched_pin();
3073	if ((m->flags & PG_FICTITIOUS) != 0)
3074		goto small_mappings;
3075	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3076	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3077		va = pv->pv_va;
3078		pmap = PV_PMAP(pv);
3079		PMAP_LOCK(pmap);
3080		pde = pmap_pde(pmap, va);
3081		(void)pmap_demote_pde(pmap, pde, va);
3082		PMAP_UNLOCK(pmap);
3083	}
3084small_mappings:
3085	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3086		pmap = PV_PMAP(pv);
3087		PMAP_LOCK(pmap);
3088		pmap->pm_stats.resident_count--;
3089		pde = pmap_pde(pmap, pv->pv_va);
3090		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3091		    " a 4mpage in page %p's pv list", m));
3092		pte = pmap_pte_quick(pmap, pv->pv_va);
3093		tpte = pte_load_clear(pte);
3094		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3095		    pmap, pv->pv_va));
3096		if (tpte & PG_W)
3097			pmap->pm_stats.wired_count--;
3098		if (tpte & PG_A)
3099			vm_page_aflag_set(m, PGA_REFERENCED);
3100
3101		/*
3102		 * Update the vm_page_t clean and reference bits.
3103		 */
3104		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3105			vm_page_dirty(m);
3106		pmap_unuse_pt(pmap, pv->pv_va, &free);
3107		pmap_invalidate_page(pmap, pv->pv_va);
3108		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3109		free_pv_entry(pmap, pv);
3110		PMAP_UNLOCK(pmap);
3111	}
3112	vm_page_aflag_clear(m, PGA_WRITEABLE);
3113	sched_unpin();
3114	rw_wunlock(&pvh_global_lock);
3115	pmap_free_zero_pages(&free);
3116}
3117
3118/*
3119 * pmap_protect_pde: Applies the given protection to a 4MB page mapping.
3120 */
3121static boolean_t
3122pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3123{
3124	pd_entry_t newpde, oldpde;
3125	vm_offset_t eva, va;
3126	vm_page_t m;
3127	boolean_t anychanged;
3128
3129	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3130	KASSERT((sva & PDRMASK) == 0,
3131	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3132	anychanged = FALSE;
3133retry:
3134	oldpde = newpde = *pde;
3135	if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
3136	    (PG_MANAGED | PG_M | PG_RW)) {
3137		eva = sva + NBPDR;
3138		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3139		    va < eva; va += PAGE_SIZE, m++)
3140			vm_page_dirty(m);
3141	}
3142	if ((prot & VM_PROT_WRITE) == 0)
3143		newpde &= ~(PG_RW | PG_M);
3144#if defined(PAE) || defined(PAE_TABLES)
3145	if ((prot & VM_PROT_EXECUTE) == 0)
3146		newpde |= pg_nx;
3147#endif
3148	if (newpde != oldpde) {
3149		/*
3150		 * As an optimization to future operations on this PDE, clear
3151		 * PG_PROMOTED.  The impending invalidation will remove any
3152		 * lingering 4KB page mappings from the TLB.
3153		 */
3154		if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED))
3155			goto retry;
3156		if ((oldpde & PG_G) != 0)
3157			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
3158		else
3159			anychanged = TRUE;
3160	}
3161	return (anychanged);
3162}
3163
3164/*
3165 *	Set the physical protection on the
3166 *	specified range of this map as requested.
3167 */
3168void
3169pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3170{
3171	vm_offset_t pdnxt;
3172	pd_entry_t ptpaddr;
3173	pt_entry_t *pte;
3174	boolean_t anychanged, pv_lists_locked;
3175
3176	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3177	if (prot == VM_PROT_NONE) {
3178		pmap_remove(pmap, sva, eva);
3179		return;
3180	}
3181
3182#if defined(PAE) || defined(PAE_TABLES)
3183	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3184	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3185		return;
3186#else
3187	if (prot & VM_PROT_WRITE)
3188		return;
3189#endif
3190
3191	if (pmap_is_current(pmap))
3192		pv_lists_locked = FALSE;
3193	else {
3194		pv_lists_locked = TRUE;
3195resume:
3196		rw_wlock(&pvh_global_lock);
3197		sched_pin();
3198	}
3199	anychanged = FALSE;
3200
3201	PMAP_LOCK(pmap);
3202	for (; sva < eva; sva = pdnxt) {
3203		pt_entry_t obits, pbits;
3204		u_int pdirindex;
3205
3206		pdnxt = (sva + NBPDR) & ~PDRMASK;
3207		if (pdnxt < sva)
3208			pdnxt = eva;
3209
3210		pdirindex = sva >> PDRSHIFT;
3211		ptpaddr = pmap->pm_pdir[pdirindex];
3212
3213		/*
3214		 * Weed out invalid mappings. Note: we assume that the page
3215		 * directory table is always allocated, and in kernel virtual.
3216		 */
3217		if (ptpaddr == 0)
3218			continue;
3219
3220		/*
3221		 * Check for large page.
3222		 */
3223		if ((ptpaddr & PG_PS) != 0) {
3224			/*
3225			 * Are we protecting the entire large page?  If not,
3226			 * demote the mapping and fall through.
3227			 */
3228			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3229				/*
3230				 * The TLB entry for a PG_G mapping is
3231				 * invalidated by pmap_protect_pde().
3232				 */
3233				if (pmap_protect_pde(pmap,
3234				    &pmap->pm_pdir[pdirindex], sva, prot))
3235					anychanged = TRUE;
3236				continue;
3237			} else {
3238				if (!pv_lists_locked) {
3239					pv_lists_locked = TRUE;
3240					if (!rw_try_wlock(&pvh_global_lock)) {
3241						if (anychanged)
3242							pmap_invalidate_all(
3243							    pmap);
3244						PMAP_UNLOCK(pmap);
3245						goto resume;
3246					}
3247					sched_pin();
3248				}
3249				if (!pmap_demote_pde(pmap,
3250				    &pmap->pm_pdir[pdirindex], sva)) {
3251					/*
3252					 * The large page mapping was
3253					 * destroyed.
3254					 */
3255					continue;
3256				}
3257			}
3258		}
3259
3260		if (pdnxt > eva)
3261			pdnxt = eva;
3262
3263		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3264		    sva += PAGE_SIZE) {
3265			vm_page_t m;
3266
3267retry:
3268			/*
3269			 * Regardless of whether a pte is 32 or 64 bits in
3270			 * size, PG_RW, PG_A, and PG_M are among the least
3271			 * significant 32 bits.
3272			 */
3273			obits = pbits = *pte;
3274			if ((pbits & PG_V) == 0)
3275				continue;
3276
3277			if ((prot & VM_PROT_WRITE) == 0) {
3278				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3279				    (PG_MANAGED | PG_M | PG_RW)) {
3280					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3281					vm_page_dirty(m);
3282				}
3283				pbits &= ~(PG_RW | PG_M);
3284			}
3285#if defined(PAE) || defined(PAE_TABLES)
3286			if ((prot & VM_PROT_EXECUTE) == 0)
3287				pbits |= pg_nx;
3288#endif
3289
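			/*
			 * The compare-and-set below may fail if the CPU's
			 * page table walker concurrently sets PG_A or PG_M
			 * in this PTE; in that case the PTE is simply
			 * re-read and the new bits are recomputed.
			 */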
3290			if (pbits != obits) {
3291#if defined(PAE) || defined(PAE_TABLES)
3292				if (!atomic_cmpset_64(pte, obits, pbits))
3293					goto retry;
3294#else
3295				if (!atomic_cmpset_int((u_int *)pte, obits,
3296				    pbits))
3297					goto retry;
3298#endif
3299				if (obits & PG_G)
3300					pmap_invalidate_page(pmap, sva);
3301				else
3302					anychanged = TRUE;
3303			}
3304		}
3305	}
3306	if (anychanged)
3307		pmap_invalidate_all(pmap);
3308	if (pv_lists_locked) {
3309		sched_unpin();
3310		rw_wunlock(&pvh_global_lock);
3311	}
3312	PMAP_UNLOCK(pmap);
3313}
3314
3315/*
3316 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3317 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3318 * For promotion to occur, two conditions must be met: (1) the 4KB page
3319 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3320 * mappings must have identical characteristics.
3321 *
3322 * Managed (PG_MANAGED) mappings within the kernel address space are not
3323 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3324 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3325 * pmap.
3326 */
3327static void
3328pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3329{
3330	pd_entry_t newpde;
3331	pt_entry_t *firstpte, oldpte, pa, *pte;
3332	vm_offset_t oldpteva;
3333	vm_page_t mpte;
3334
3335	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3336
3337	/*
3338	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3339	 * either invalid, unused, or does not map the first 4KB physical page
3340	 * within a 2- or 4MB page.
3341	 */
3342	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3343setpde:
3344	newpde = *firstpte;
3345	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3346		pmap_pde_p_failures++;
3347		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3348		    " in pmap %p", va, pmap);
3349		return;
3350	}
3351	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3352		pmap_pde_p_failures++;
3353		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3354		    " in pmap %p", va, pmap);
3355		return;
3356	}
3357	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3358		/*
3359		 * When PG_M is already clear, PG_RW can be cleared without
3360		 * a TLB invalidation.
3361		 */
3362		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3363		    ~PG_RW))
3364			goto setpde;
3365		newpde &= ~PG_RW;
3366	}
3367
3368	/*
3369	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3370	 * PTE maps an unexpected 4KB physical page or does not have identical
3371	 * characteristics to the first PTE.
3372	 */
3373	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3374	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3375setpte:
3376		oldpte = *pte;
3377		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3378			pmap_pde_p_failures++;
3379			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3380			    " in pmap %p", va, pmap);
3381			return;
3382		}
3383		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3384			/*
3385			 * When PG_M is already clear, PG_RW can be cleared
3386			 * without a TLB invalidation.
3387			 */
3388			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3389			    oldpte & ~PG_RW))
3390				goto setpte;
3391			oldpte &= ~PG_RW;
3392			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3393			    (va & ~PDRMASK);
3394			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3395			    " in pmap %p", oldpteva, pmap);
3396		}
3397		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3398			pmap_pde_p_failures++;
3399			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3400			    " in pmap %p", va, pmap);
3401			return;
3402		}
3403		pa -= PAGE_SIZE;
3404	}
3405
3406	/*
3407	 * Save the page table page in its current state until the PDE
3408	 * mapping the superpage is demoted by pmap_demote_pde() or
3409	 * destroyed by pmap_remove_pde().
3410	 */
3411	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3412	KASSERT(mpte >= vm_page_array &&
3413	    mpte < &vm_page_array[vm_page_array_size],
3414	    ("pmap_promote_pde: page table page is out of range"));
3415	KASSERT(mpte->pindex == va >> PDRSHIFT,
3416	    ("pmap_promote_pde: page table page's pindex is wrong"));
3417	if (pmap_insert_pt_page(pmap, mpte)) {
3418		pmap_pde_p_failures++;
3419		CTR2(KTR_PMAP,
3420		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
3421		    pmap);
3422		return;
3423	}
3424
3425	/*
3426	 * Promote the pv entries.
3427	 */
3428	if ((newpde & PG_MANAGED) != 0)
3429		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3430
3431	/*
3432	 * Propagate the PAT index to its proper position.
3433	 */
3434	if ((newpde & PG_PTE_PAT) != 0)
3435		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3436
3437	/*
3438	 * Map the superpage.
3439	 */
3440	if (workaround_erratum383)
3441		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3442	else if (pmap == kernel_pmap)
3443		pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde);
3444	else
3445		pde_store(pde, PG_PROMOTED | PG_PS | newpde);
3446
3447	pmap_pde_promotions++;
3448	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3449	    " in pmap %p", va, pmap);
3450}
3451
3452/*
3453 *	Insert the given physical page (m) at
3454 *	the specified virtual address (va) in the
3455 *	target physical map with the protection requested.
3456 *
3457 *	If specified, the page will be wired down, meaning
3458 *	that the related pte cannot be reclaimed.
3459 *
3460 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3461 *	or lose information.  That is, this routine must actually
3462 *	insert this page into the given map NOW.
3463 */
3464int
3465pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3466    u_int flags, int8_t psind)
3467{
3468	pd_entry_t *pde;
3469	pt_entry_t *pte;
3470	pt_entry_t newpte, origpte;
3471	pv_entry_t pv;
3472	vm_paddr_t opa, pa;
3473	vm_page_t mpte, om;
3474	boolean_t invlva, wired;
3475
3476	va = trunc_page(va);
3477	mpte = NULL;
3478	wired = (flags & PMAP_ENTER_WIRED) != 0;
3479
3480	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3481	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3482	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3483	    va));
3484	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
3485		VM_OBJECT_ASSERT_LOCKED(m->object);
3486
3487	rw_wlock(&pvh_global_lock);
3488	PMAP_LOCK(pmap);
3489	sched_pin();
3490
3491	pde = pmap_pde(pmap, va);
3492	if (va < VM_MAXUSER_ADDRESS) {
3493		/*
3494		 * va is for UVA.
3495		 * In the case that a page table page is not resident,
3496		 * we are creating it here.  pmap_allocpte() handles
3497		 * demotion.
3498		 */
3499		mpte = pmap_allocpte(pmap, va, flags);
3500		if (mpte == NULL) {
3501			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
3502			    ("pmap_allocpte failed with sleep allowed"));
3503			sched_unpin();
3504			rw_wunlock(&pvh_global_lock);
3505			PMAP_UNLOCK(pmap);
3506			return (KERN_RESOURCE_SHORTAGE);
3507		}
3508	} else {
3509		/*
3510		 * va is for KVA, so pmap_demote_pde() will never fail
3511		 * to install a page table page.  PG_V is also
3512		 * asserted by pmap_demote_pde().
3513		 */
3514		KASSERT(pde != NULL && (*pde & PG_V) != 0,
3515		    ("KVA %#x invalid pde pdir %#jx", va,
3516		    (uintmax_t)pmap->pm_pdir[PTDPTDI]));
3517		if ((*pde & PG_PS) != 0)
3518			pmap_demote_pde(pmap, pde, va);
3519	}
3520	pte = pmap_pte_quick(pmap, va);
3521
3522	/*
3523	 * Page Directory table entry is not valid, which should not
3524	 * happen.  We should have either allocated the page table
3525	 * page or demoted the existing mapping above.
3526	 */
3527	if (pte == NULL) {
3528		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3529		    (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3530	}
3531
3532	pa = VM_PAGE_TO_PHYS(m);
3533	om = NULL;
3534	origpte = *pte;
3535	opa = origpte & PG_FRAME;
3536
3537	/*
3538	 * Mapping has not changed, must be protection or wiring change.
3539	 */
3540	if (origpte && (opa == pa)) {
3541		/*
3542		 * Wiring change, just update stats. We don't worry about
3543		 * wiring PT pages as they remain resident as long as there
3544		 * are valid mappings in them. Hence, if a user page is wired,
3545		 * the PT page will be also.
3546		 */
3547		if (wired && ((origpte & PG_W) == 0))
3548			pmap->pm_stats.wired_count++;
3549		else if (!wired && (origpte & PG_W))
3550			pmap->pm_stats.wired_count--;
3551
3552		/*
3553		 * Remove extra pte reference
3554		 */
3555		if (mpte)
3556			mpte->wire_count--;
3557
3558		if (origpte & PG_MANAGED) {
3559			om = m;
3560			pa |= PG_MANAGED;
3561		}
3562		goto validate;
3563	}
3564
3565	pv = NULL;
3566
3567	/*
3568	 * Mapping has changed, invalidate old range and fall through to
3569	 * handle validating new mapping.
3570	 */
3571	if (opa) {
3572		if (origpte & PG_W)
3573			pmap->pm_stats.wired_count--;
3574		if (origpte & PG_MANAGED) {
3575			om = PHYS_TO_VM_PAGE(opa);
3576			pv = pmap_pvh_remove(&om->md, pmap, va);
3577		}
3578		if (mpte != NULL) {
3579			mpte->wire_count--;
3580			KASSERT(mpte->wire_count > 0,
3581			    ("pmap_enter: missing reference to page table page,"
3582			     " va: 0x%x", va));
3583		}
3584	} else
3585		pmap->pm_stats.resident_count++;
3586
3587	/*
3588	 * Enter on the PV list if part of our managed memory.
3589	 */
3590	if ((m->oflags & VPO_UNMANAGED) == 0) {
3591		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3592		    ("pmap_enter: managed mapping within the clean submap"));
3593		if (pv == NULL)
3594			pv = get_pv_entry(pmap, FALSE);
3595		pv->pv_va = va;
3596		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3597		pa |= PG_MANAGED;
3598	} else if (pv != NULL)
3599		free_pv_entry(pmap, pv);
3600
3601	/*
3602	 * Increment counters
3603	 */
3604	if (wired)
3605		pmap->pm_stats.wired_count++;
3606
3607validate:
3608	/*
3609	 * Now validate mapping with desired protection/wiring.
3610	 */
3611	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3612	if ((prot & VM_PROT_WRITE) != 0) {
3613		newpte |= PG_RW;
3614		if ((newpte & PG_MANAGED) != 0)
3615			vm_page_aflag_set(m, PGA_WRITEABLE);
3616	}
3617#if defined(PAE) || defined(PAE_TABLES)
3618	if ((prot & VM_PROT_EXECUTE) == 0)
3619		newpte |= pg_nx;
3620#endif
3621	if (wired)
3622		newpte |= PG_W;
3623	if (va < VM_MAXUSER_ADDRESS)
3624		newpte |= PG_U;
3625	if (pmap == kernel_pmap)
3626		newpte |= pgeflag;
3627
3628	/*
3629	 * if the mapping or permission bits are different, we need
3630	 * to update the pte.
3631	 */
3632	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3633		newpte |= PG_A;
3634		if ((flags & VM_PROT_WRITE) != 0)
3635			newpte |= PG_M;
3636		if (origpte & PG_V) {
3637			invlva = FALSE;
3638			origpte = pte_load_store(pte, newpte);
3639			if (origpte & PG_A) {
3640				if (origpte & PG_MANAGED)
3641					vm_page_aflag_set(om, PGA_REFERENCED);
3642				if (opa != VM_PAGE_TO_PHYS(m))
3643					invlva = TRUE;
3644#if defined(PAE) || defined(PAE_TABLES)
3645				if ((origpte & PG_NX) == 0 &&
3646				    (newpte & PG_NX) != 0)
3647					invlva = TRUE;
3648#endif
3649			}
3650			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3651				if ((origpte & PG_MANAGED) != 0)
3652					vm_page_dirty(om);
3653				if ((prot & VM_PROT_WRITE) == 0)
3654					invlva = TRUE;
3655			}
3656			if ((origpte & PG_MANAGED) != 0 &&
3657			    TAILQ_EMPTY(&om->md.pv_list) &&
3658			    ((om->flags & PG_FICTITIOUS) != 0 ||
3659			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3660				vm_page_aflag_clear(om, PGA_WRITEABLE);
3661			if (invlva)
3662				pmap_invalidate_page(pmap, va);
3663		} else
3664			pte_store(pte, newpte);
3665	}
3666
3667	/*
3668	 * If both the page table page and the reservation are fully
3669	 * populated, then attempt promotion.
3670	 */
3671	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3672	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3673	    vm_reserv_level_iffullpop(m) == 0)
3674		pmap_promote_pde(pmap, pde, va);
3675
3676	sched_unpin();
3677	rw_wunlock(&pvh_global_lock);
3678	PMAP_UNLOCK(pmap);
3679	return (KERN_SUCCESS);
3680}
3681
3682/*
3683 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3684 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3685 * blocking, (2) a mapping already exists at the specified virtual address, or
3686 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3687 */
3688static boolean_t
3689pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3690{
3691	pd_entry_t *pde, newpde;
3692
3693	rw_assert(&pvh_global_lock, RA_WLOCKED);
3694	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3695	pde = pmap_pde(pmap, va);
3696	if (*pde != 0) {
3697		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3698		    " in pmap %p", va, pmap);
3699		return (FALSE);
3700	}
3701	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3702	    PG_PS | PG_V;
3703	if ((m->oflags & VPO_UNMANAGED) == 0) {
3704		newpde |= PG_MANAGED;
3705
3706		/*
3707		 * Abort this mapping if its PV entry could not be created.
3708		 */
3709		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3710			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3711			    " in pmap %p", va, pmap);
3712			return (FALSE);
3713		}
3714	}
3715#if defined(PAE) || defined(PAE_TABLES)
3716	if ((prot & VM_PROT_EXECUTE) == 0)
3717		newpde |= pg_nx;
3718#endif
3719	if (va < VM_MAXUSER_ADDRESS)
3720		newpde |= PG_U;
3721
3722	/*
3723	 * Increment counters.
3724	 */
3725	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3726
3727	/*
3728	 * Map the superpage.  (This is not a promoted mapping; there will not
3729	 * be any lingering 4KB page mappings in the TLB.)
3730	 */
3731	pde_store(pde, newpde);
3732
3733	pmap_pde_mappings++;
3734	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3735	    " in pmap %p", va, pmap);
3736	return (TRUE);
3737}
3738
3739/*
3740 * Maps a sequence of resident pages belonging to the same object.
3741 * The sequence begins with the given page m_start.  This page is
3742 * mapped at the given virtual address start.  Each subsequent page is
3743 * mapped at a virtual address that is offset from start by the same
3744 * amount as the page is offset from m_start within the object.  The
3745 * last page in the sequence is the page with the largest offset from
3746 * m_start that can be mapped at a virtual address less than the given
3747 * virtual address end.  Not every virtual page between start and end
3748 * is mapped; only those for which a resident page exists with the
3749 * corresponding offset from m_start are mapped.
3750 */
3751void
3752pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3753    vm_page_t m_start, vm_prot_t prot)
3754{
3755	vm_offset_t va;
3756	vm_page_t m, mpte;
3757	vm_pindex_t diff, psize;
3758
3759	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3760
3761	psize = atop(end - start);
3762	mpte = NULL;
3763	m = m_start;
3764	rw_wlock(&pvh_global_lock);
3765	PMAP_LOCK(pmap);
3766	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3767		va = start + ptoa(diff);
3768		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3769		    m->psind == 1 && pg_ps_enabled &&
3770		    pmap_enter_pde(pmap, va, m, prot))
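			/*
			 * A superpage was mapped; skip ahead to its last
			 * 4KB page so that TAILQ_NEXT() below resumes with
			 * the first page following the superpage.
			 */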
3771			m = &m[NBPDR / PAGE_SIZE - 1];
3772		else
3773			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3774			    mpte);
3775		m = TAILQ_NEXT(m, listq);
3776	}
3777	rw_wunlock(&pvh_global_lock);
3778	PMAP_UNLOCK(pmap);
3779}
3780
3781/*
3782 * This code makes some *MAJOR* assumptions:
3783 * 1. The current pmap and the given pmap exist.
3784 * 2. Not wired.
3785 * 3. Read access.
3786 * 4. No page table pages.
3787 * It is, however, *MUCH* faster than pmap_enter...
3788 */
3789
3790void
3791pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3792{
3793
3794	rw_wlock(&pvh_global_lock);
3795	PMAP_LOCK(pmap);
3796	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3797	rw_wunlock(&pvh_global_lock);
3798	PMAP_UNLOCK(pmap);
3799}
3800
3801static vm_page_t
3802pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3803    vm_prot_t prot, vm_page_t mpte)
3804{
3805	pt_entry_t *pte;
3806	vm_paddr_t pa;
3807	struct spglist free;
3808
3809	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3810	    (m->oflags & VPO_UNMANAGED) != 0,
3811	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3812	rw_assert(&pvh_global_lock, RA_WLOCKED);
3813	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3814
3815	/*
3816	 * In the case that a page table page is not
3817	 * resident, we are creating it here.
3818	 */
3819	if (va < VM_MAXUSER_ADDRESS) {
3820		u_int ptepindex;
3821		pd_entry_t ptepa;
3822
3823		/*
3824		 * Calculate pagetable page index
3825		 */
3826		ptepindex = va >> PDRSHIFT;
3827		if (mpte && (mpte->pindex == ptepindex)) {
3828			mpte->wire_count++;
3829		} else {
3830			/*
3831			 * Get the page directory entry
3832			 */
3833			ptepa = pmap->pm_pdir[ptepindex];
3834
3835			/*
3836			 * If the page table page is mapped, we just
3837			 * increment its wire count.
3838			 */
3839			if (ptepa) {
3840				if (ptepa & PG_PS)
3841					return (NULL);
3842				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3843				mpte->wire_count++;
3844			} else {
3845				mpte = _pmap_allocpte(pmap, ptepindex,
3846				    PMAP_ENTER_NOSLEEP);
3847				if (mpte == NULL)
3848					return (mpte);
3849			}
3850		}
3851	} else {
3852		mpte = NULL;
3853	}
3854
3855	/*
3856	 * This call to vtopte makes the assumption that we are
3857	 * entering the page into the current pmap.  In order to support
3858	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3859	 * But that isn't as quick as vtopte.
3860	 */
3861	pte = vtopte(va);
3862	if (*pte) {
3863		if (mpte != NULL) {
3864			mpte->wire_count--;
3865			mpte = NULL;
3866		}
3867		return (mpte);
3868	}
3869
3870	/*
3871	 * Enter on the PV list if part of our managed memory.
3872	 */
3873	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3874	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3875		if (mpte != NULL) {
3876			SLIST_INIT(&free);
3877			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3878				pmap_invalidate_page(pmap, va);
3879				pmap_free_zero_pages(&free);
3880			}
3881
3882			mpte = NULL;
3883		}
3884		return (mpte);
3885	}
3886
3887	/*
3888	 * Increment counters
3889	 */
3890	pmap->pm_stats.resident_count++;
3891
3892	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3893#if defined(PAE) || defined(PAE_TABLES)
3894	if ((prot & VM_PROT_EXECUTE) == 0)
3895		pa |= pg_nx;
3896#endif
3897
3898	/*
3899	 * Now validate mapping with RO protection
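	 * The mapping is entered read-only; a later write access will
	 * fault and be resolved through vm_fault() and pmap_enter().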
3900	 */
3901	if ((m->oflags & VPO_UNMANAGED) != 0)
3902		pte_store(pte, pa | PG_V | PG_U);
3903	else
3904		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3905	return (mpte);
3906}
3907
3908/*
3909 * Make a temporary mapping for a physical address.  This is only intended
3910 * to be used for panic dumps.
3911 */
3912void *
3913pmap_kenter_temporary(vm_paddr_t pa, int i)
3914{
3915	vm_offset_t va;
3916
3917	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3918	pmap_kenter(va, pa);
3919	invlpg(va);
3920	return ((void *)crashdumpmap);
3921}
3922
3923/*
3924 * This code maps large physical mmap regions into the
3925 * processor address space.  Note that some shortcuts
3926 * are taken, but the code works.
3927 */
3928void
3929pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3930    vm_pindex_t pindex, vm_size_t size)
3931{
3932	pd_entry_t *pde;
3933	vm_paddr_t pa, ptepa;
3934	vm_page_t p;
3935	int pat_mode;
3936
3937	VM_OBJECT_ASSERT_WLOCKED(object);
3938	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3939	    ("pmap_object_init_pt: non-device object"));
3940	if (pseflag &&
3941	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3942		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3943			return;
3944		p = vm_page_lookup(object, pindex);
3945		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3946		    ("pmap_object_init_pt: invalid page %p", p));
3947		pat_mode = p->md.pat_mode;
3948
3949		/*
3950		 * Abort the mapping if the first page is not physically
3951		 * aligned to a 2/4MB page boundary.
3952		 */
3953		ptepa = VM_PAGE_TO_PHYS(p);
3954		if (ptepa & (NBPDR - 1))
3955			return;
3956
3957		/*
3958		 * Skip the first page.  Abort the mapping if the rest of
3959		 * the pages are not physically contiguous or have differing
3960		 * memory attributes.
3961		 */
3962		p = TAILQ_NEXT(p, listq);
3963		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3964		    pa += PAGE_SIZE) {
3965			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3966			    ("pmap_object_init_pt: invalid page %p", p));
3967			if (pa != VM_PAGE_TO_PHYS(p) ||
3968			    pat_mode != p->md.pat_mode)
3969				return;
3970			p = TAILQ_NEXT(p, listq);
3971		}
3972
3973		/*
3974		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3975		 * "size" is a multiple of 2/4M, adding the PAT setting to
3976		 * "pa" will not affect the termination of this loop.
3977		 */
3978		PMAP_LOCK(pmap);
3979		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3980		    size; pa += NBPDR) {
3981			pde = pmap_pde(pmap, addr);
3982			if (*pde == 0) {
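				/*
				 * Preset PG_A and PG_M: these mappings are
				 * unmanaged, so their accessed and modified
				 * state is never inspected or cleared.
				 */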
3983				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3984				    PG_U | PG_RW | PG_V);
3985				pmap->pm_stats.resident_count += NBPDR /
3986				    PAGE_SIZE;
3987				pmap_pde_mappings++;
3988			}
3989			/* Else continue on if the PDE is already valid. */
3990			addr += NBPDR;
3991		}
3992		PMAP_UNLOCK(pmap);
3993	}
3994}
3995
3996/*
3997 *	Clear the wired attribute from the mappings for the specified range of
3998 *	addresses in the given pmap.  Every valid mapping within that range
3999 *	must have the wired attribute set.  In contrast, invalid mappings
4000 *	cannot have the wired attribute set, so they are ignored.
4001 *
4002 *	The wired attribute of the page table entry is not a hardware feature,
4003 *	so there is no need to invalidate any TLB entries.
4004 */
4005void
4006pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4007{
4008	vm_offset_t pdnxt;
4009	pd_entry_t *pde;
4010	pt_entry_t *pte;
4011	boolean_t pv_lists_locked;
4012
4013	if (pmap_is_current(pmap))
4014		pv_lists_locked = FALSE;
4015	else {
4016		pv_lists_locked = TRUE;
4017resume:
4018		rw_wlock(&pvh_global_lock);
4019		sched_pin();
4020	}
4021	PMAP_LOCK(pmap);
4022	for (; sva < eva; sva = pdnxt) {
4023		pdnxt = (sva + NBPDR) & ~PDRMASK;
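		/* Handle wraparound at the top of the address space. */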
4024		if (pdnxt < sva)
4025			pdnxt = eva;
4026		pde = pmap_pde(pmap, sva);
4027		if ((*pde & PG_V) == 0)
4028			continue;
4029		if ((*pde & PG_PS) != 0) {
4030			if ((*pde & PG_W) == 0)
4031				panic("pmap_unwire: pde %#jx is missing PG_W",
4032				    (uintmax_t)*pde);
4033
4034			/*
4035			 * Are we unwiring the entire large page?  If not,
4036			 * demote the mapping and fall through.
4037			 */
4038			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
4039				/*
4040				 * Regardless of whether a pde (or pte) is 32
4041				 * or 64 bits in size, PG_W is among the least
4042				 * significant 32 bits.
4043				 */
4044				atomic_clear_int((u_int *)pde, PG_W);
4045				pmap->pm_stats.wired_count -= NBPDR /
4046				    PAGE_SIZE;
4047				continue;
4048			} else {
4049				if (!pv_lists_locked) {
4050					pv_lists_locked = TRUE;
4051					if (!rw_try_wlock(&pvh_global_lock)) {
4052						PMAP_UNLOCK(pmap);
4053						/* Repeat sva. */
4054						goto resume;
4055					}
4056					sched_pin();
4057				}
4058				if (!pmap_demote_pde(pmap, pde, sva))
4059					panic("pmap_unwire: demotion failed");
4060			}
4061		}
4062		if (pdnxt > eva)
4063			pdnxt = eva;
4064		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
4065		    sva += PAGE_SIZE) {
4066			if ((*pte & PG_V) == 0)
4067				continue;
4068			if ((*pte & PG_W) == 0)
4069				panic("pmap_unwire: pte %#jx is missing PG_W",
4070				    (uintmax_t)*pte);
4071
4072			/*
4073			 * PG_W must be cleared atomically.  Although the pmap
4074			 * lock synchronizes access to PG_W, another processor
4075			 * could be setting PG_M and/or PG_A concurrently.
4076			 *
4077			 * PG_W is among the least significant 32 bits.
4078			 */
4079			atomic_clear_int((u_int *)pte, PG_W);
4080			pmap->pm_stats.wired_count--;
4081		}
4082	}
4083	if (pv_lists_locked) {
4084		sched_unpin();
4085		rw_wunlock(&pvh_global_lock);
4086	}
4087	PMAP_UNLOCK(pmap);
4088}
4089
4090
4091/*
4092 *	Copy the range specified by src_addr/len
4093 *	from the source map to the range dst_addr/len
4094 *	in the destination map.
4095 *
4096 *	This routine is only advisory and need not do anything.
4097 */
4098
4099void
4100pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4101    vm_offset_t src_addr)
4102{
4103	struct spglist free;
4104	vm_offset_t addr;
4105	vm_offset_t end_addr = src_addr + len;
4106	vm_offset_t pdnxt;
4107
4108	if (dst_addr != src_addr)
4109		return;
4110
4111	if (!pmap_is_current(src_pmap))
4112		return;
4113
4114	rw_wlock(&pvh_global_lock);
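	/*
	 * Lock the two pmaps in a consistent order (by address) to avoid
	 * deadlocking against a concurrent copy in the opposite direction.
	 */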
4115	if (dst_pmap < src_pmap) {
4116		PMAP_LOCK(dst_pmap);
4117		PMAP_LOCK(src_pmap);
4118	} else {
4119		PMAP_LOCK(src_pmap);
4120		PMAP_LOCK(dst_pmap);
4121	}
4122	sched_pin();
4123	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4124		pt_entry_t *src_pte, *dst_pte;
4125		vm_page_t dstmpte, srcmpte;
4126		pd_entry_t srcptepaddr;
4127		u_int ptepindex;
4128
4129		KASSERT(addr < UPT_MIN_ADDRESS,
4130		    ("pmap_copy: invalid to pmap_copy page tables"));
4131
4132		pdnxt = (addr + NBPDR) & ~PDRMASK;
4133		if (pdnxt < addr)
4134			pdnxt = end_addr;
4135		ptepindex = addr >> PDRSHIFT;
4136
4137		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4138		if (srcptepaddr == 0)
4139			continue;
4140
4141		if (srcptepaddr & PG_PS) {
4142			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4143				continue;
4144			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4145			    ((srcptepaddr & PG_MANAGED) == 0 ||
4146			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4147			    PG_PS_FRAME))) {
4148				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4149				    ~PG_W;
4150				dst_pmap->pm_stats.resident_count +=
4151				    NBPDR / PAGE_SIZE;
4152				pmap_pde_mappings++;
4153			}
4154			continue;
4155		}
4156
4157		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4158		KASSERT(srcmpte->wire_count > 0,
4159		    ("pmap_copy: source page table page is unused"));
4160
4161		if (pdnxt > end_addr)
4162			pdnxt = end_addr;
4163
4164		src_pte = vtopte(addr);
4165		while (addr < pdnxt) {
4166			pt_entry_t ptetemp;
4167			ptetemp = *src_pte;
4168			/*
4169			 * We only copy mappings of managed pages.
4170			 */
4171			if ((ptetemp & PG_MANAGED) != 0) {
4172				dstmpte = pmap_allocpte(dst_pmap, addr,
4173				    PMAP_ENTER_NOSLEEP);
4174				if (dstmpte == NULL)
4175					goto out;
4176				dst_pte = pmap_pte_quick(dst_pmap, addr);
4177				if (*dst_pte == 0 &&
4178				    pmap_try_insert_pv_entry(dst_pmap, addr,
4179				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4180					/*
4181					 * Clear the wired, modified, and
4182					 * accessed (referenced) bits
4183					 * during the copy.
4184					 */
4185					*dst_pte = ptetemp & ~(PG_W | PG_M |
4186					    PG_A);
4187					dst_pmap->pm_stats.resident_count++;
4188	 			} else {
4189					SLIST_INIT(&free);
4190					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4191					    &free)) {
4192						pmap_invalidate_page(dst_pmap,
4193						    addr);
4194						pmap_free_zero_pages(&free);
4195					}
4196					goto out;
4197				}
4198				if (dstmpte->wire_count >= srcmpte->wire_count)
4199					break;
4200			}
4201			addr += PAGE_SIZE;
4202			src_pte++;
4203		}
4204	}
4205out:
4206	sched_unpin();
4207	rw_wunlock(&pvh_global_lock);
4208	PMAP_UNLOCK(src_pmap);
4209	PMAP_UNLOCK(dst_pmap);
4210}
4211
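/*
 * Zero a page using the best method available: SSE2-optimized zeroing on
 * SSE2-capable 686-class CPUs, the i686 zeroing routine otherwise, and a
 * plain bzero() as the fallback.
 */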
4212static __inline void
4213pagezero(void *page)
4214{
4215#if defined(I686_CPU)
4216	if (cpu_class == CPUCLASS_686) {
4217		if (cpu_feature & CPUID_SSE2)
4218			sse2_pagezero(page);
4219		else
4220			i686_pagezero(page);
4221	} else
4222#endif
4223		bzero(page, PAGE_SIZE);
4224}
4225
4226/*
4227 *	pmap_zero_page zeros the specified hardware page by mapping
4228 *	the page into KVM and using bzero to clear its contents.
4229 */
4230void
4231pmap_zero_page(vm_page_t m)
4232{
4233	pt_entry_t *cmap_pte2;
4234	struct pcpu *pc;
4235
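	/*
	 * Each CPU has a private one-page mapping window (pc_cmap_addr2,
	 * controlled by pc_cmap_pte2).  Pin to this CPU so that the thread
	 * keeps using the window that it is about to lock.
	 */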
4236	sched_pin();
4237	pc = get_pcpu();
4238	cmap_pte2 = pc->pc_cmap_pte2;
4239	mtx_lock(&pc->pc_cmap_lock);
4240	if (*cmap_pte2)
4241		panic("pmap_zero_page: CMAP2 busy");
4242	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4243	    pmap_cache_bits(m->md.pat_mode, 0);
4244	invlcaddr(pc->pc_cmap_addr2);
4245	pagezero(pc->pc_cmap_addr2);
4246	*cmap_pte2 = 0;
4247
4248	/*
4249	 * Unpin the thread before releasing the lock.  Otherwise the thread
4250	 * could be rescheduled while still bound to the current CPU, only
4251	 * to unpin itself immediately upon resuming execution.
4252	 */
4253	sched_unpin();
4254	mtx_unlock(&pc->pc_cmap_lock);
4255}
4256
4257/*
4258 *	pmap_zero_page_area zeros the specified hardware page by mapping
4259 *	the page into KVM and using bzero to clear its contents.
4260 *
4261 *	off and size may not cover an area beyond a single hardware page.
4262 */
4263void
4264pmap_zero_page_area(vm_page_t m, int off, int size)
4265{
4266	pt_entry_t *cmap_pte2;
4267	struct pcpu *pc;
4268
4269	sched_pin();
4270	pc = get_pcpu();
4271	cmap_pte2 = pc->pc_cmap_pte2;
4272	mtx_lock(&pc->pc_cmap_lock);
4273	if (*cmap_pte2)
4274		panic("pmap_zero_page_area: CMAP2 busy");
4275	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4276	    pmap_cache_bits(m->md.pat_mode, 0);
4277	invlcaddr(pc->pc_cmap_addr2);
4278	if (off == 0 && size == PAGE_SIZE)
4279		pagezero(pc->pc_cmap_addr2);
4280	else
4281		bzero(pc->pc_cmap_addr2 + off, size);
4282	*cmap_pte2 = 0;
4283	sched_unpin();
4284	mtx_unlock(&pc->pc_cmap_lock);
4285}
4286
4287/*
4288 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4289 *	the page into KVM and using bzero to clear its contents.  This
4290 *	is intended to be called from the vm_pagezero process only and
4291 *	outside of Giant.
4292 */
4293void
4294pmap_zero_page_idle(vm_page_t m)
4295{
4296
4297	if (*CMAP3)
4298		panic("pmap_zero_page_idle: CMAP3 busy");
4299	sched_pin();
4300	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4301	    pmap_cache_bits(m->md.pat_mode, 0);
4302	invlcaddr(CADDR3);
4303	pagezero(CADDR3);
4304	*CMAP3 = 0;
4305	sched_unpin();
4306}
4307
4308/*
4309 *	pmap_copy_page copies the specified (machine independent)
4310 *	page by mapping the page into virtual memory and using
4311 *	bcopy to copy the page, one machine dependent page at a
4312 *	time.
4313 */
4314void
4315pmap_copy_page(vm_page_t src, vm_page_t dst)
4316{
4317	pt_entry_t *cmap_pte1, *cmap_pte2;
4318	struct pcpu *pc;
4319
4320	sched_pin();
4321	pc = get_pcpu();
4322	cmap_pte1 = pc->pc_cmap_pte1;
4323	cmap_pte2 = pc->pc_cmap_pte2;
4324	mtx_lock(&pc->pc_cmap_lock);
4325	if (*cmap_pte1)
4326		panic("pmap_copy_page: CMAP1 busy");
4327	if (*cmap_pte2)
4328		panic("pmap_copy_page: CMAP2 busy");
4329	*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4330	    pmap_cache_bits(src->md.pat_mode, 0);
4331	invlcaddr(pc->pc_cmap_addr1);
4332	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4333	    pmap_cache_bits(dst->md.pat_mode, 0);
4334	invlcaddr(pc->pc_cmap_addr2);
4335	bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE);
4336	*cmap_pte1 = 0;
4337	*cmap_pte2 = 0;
4338	sched_unpin();
4339	mtx_unlock(&pc->pc_cmap_lock);
4340}
4341
4342int unmapped_buf_allowed = 1;
4343
4344void
4345pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4346    vm_offset_t b_offset, int xfersize)
4347{
4348	vm_page_t a_pg, b_pg;
4349	char *a_cp, *b_cp;
4350	vm_offset_t a_pg_offset, b_pg_offset;
4351	pt_entry_t *cmap_pte1, *cmap_pte2;
4352	struct pcpu *pc;
4353	int cnt;
4354
4355	sched_pin();
4356	pc = get_pcpu();
4357	cmap_pte1 = pc->pc_cmap_pte1;
4358	cmap_pte2 = pc->pc_cmap_pte2;
4359	mtx_lock(&pc->pc_cmap_lock);
4360	if (*cmap_pte1 != 0)
4361		panic("pmap_copy_pages: CMAP1 busy");
4362	if (*cmap_pte2 != 0)
4363		panic("pmap_copy_pages: CMAP2 busy");
4364	while (xfersize > 0) {
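		/*
		 * Copy at most the bytes remaining in both the current
		 * source page and the current destination page.
		 */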
4365		a_pg = ma[a_offset >> PAGE_SHIFT];
4366		a_pg_offset = a_offset & PAGE_MASK;
4367		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4368		b_pg = mb[b_offset >> PAGE_SHIFT];
4369		b_pg_offset = b_offset & PAGE_MASK;
4370		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4371		*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4372		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4373		invlcaddr(pc->pc_cmap_addr1);
4374		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4375		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4376		invlcaddr(pc->pc_cmap_addr2);
4377		a_cp = pc->pc_cmap_addr1 + a_pg_offset;
4378		b_cp = pc->pc_cmap_addr2 + b_pg_offset;
4379		bcopy(a_cp, b_cp, cnt);
4380		a_offset += cnt;
4381		b_offset += cnt;
4382		xfersize -= cnt;
4383	}
4384	*cmap_pte1 = 0;
4385	*cmap_pte2 = 0;
4386	sched_unpin();
4387	mtx_unlock(&pc->pc_cmap_lock);
4388}
4389
4390/*
4391 * Returns true if the pmap's pv is one of the first
4392 * 16 pvs linked to from this page.  This count may
4393 * be changed upwards or downwards in the future; it
4394 * is only necessary that true be returned for a small
4395 * subset of pmaps for proper page aging.
4396 */
4397boolean_t
4398pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4399{
4400	struct md_page *pvh;
4401	pv_entry_t pv;
4402	int loops = 0;
4403	boolean_t rv;
4404
4405	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4406	    ("pmap_page_exists_quick: page %p is not managed", m));
4407	rv = FALSE;
4408	rw_wlock(&pvh_global_lock);
4409	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4410		if (PV_PMAP(pv) == pmap) {
4411			rv = TRUE;
4412			break;
4413		}
4414		loops++;
4415		if (loops >= 16)
4416			break;
4417	}
4418	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4419		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4420		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4421			if (PV_PMAP(pv) == pmap) {
4422				rv = TRUE;
4423				break;
4424			}
4425			loops++;
4426			if (loops >= 16)
4427				break;
4428		}
4429	}
4430	rw_wunlock(&pvh_global_lock);
4431	return (rv);
4432}
4433
4434/*
4435 *	pmap_page_wired_mappings:
4436 *
4437 *	Return the number of managed mappings to the given physical page
4438 *	that are wired.
4439 */
4440int
4441pmap_page_wired_mappings(vm_page_t m)
4442{
4443	int count;
4444
4445	count = 0;
4446	if ((m->oflags & VPO_UNMANAGED) != 0)
4447		return (count);
4448	rw_wlock(&pvh_global_lock);
4449	count = pmap_pvh_wired_mappings(&m->md, count);
4450	if ((m->flags & PG_FICTITIOUS) == 0) {
4451	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4452	        count);
4453	}
4454	rw_wunlock(&pvh_global_lock);
4455	return (count);
4456}
4457
4458/*
4459 *	pmap_pvh_wired_mappings:
4460 *
4461 *	Return the updated number "count" of managed mappings that are wired.
4462 */
4463static int
4464pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4465{
4466	pmap_t pmap;
4467	pt_entry_t *pte;
4468	pv_entry_t pv;
4469
4470	rw_assert(&pvh_global_lock, RA_WLOCKED);
4471	sched_pin();
4472	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4473		pmap = PV_PMAP(pv);
4474		PMAP_LOCK(pmap);
4475		pte = pmap_pte_quick(pmap, pv->pv_va);
4476		if ((*pte & PG_W) != 0)
4477			count++;
4478		PMAP_UNLOCK(pmap);
4479	}
4480	sched_unpin();
4481	return (count);
4482}
4483
4484/*
4485 * Returns TRUE if the given page is mapped individually or as part of
4486 * a 4mpage.  Otherwise, returns FALSE.
4487 */
4488boolean_t
4489pmap_page_is_mapped(vm_page_t m)
4490{
4491	boolean_t rv;
4492
4493	if ((m->oflags & VPO_UNMANAGED) != 0)
4494		return (FALSE);
4495	rw_wlock(&pvh_global_lock);
4496	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4497	    ((m->flags & PG_FICTITIOUS) == 0 &&
4498	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4499	rw_wunlock(&pvh_global_lock);
4500	return (rv);
4501}
4502
4503/*
4504 * Remove all pages from the specified address space;
4505 * this aids process exit speeds.  Also, this code is
4506 * special cased for the current process only, but it
4507 * can have the more generic (and slightly slower)
4508 * mode enabled.  This is much faster than pmap_remove
4509 * in the case of running down an entire address space.
4510 */
4511void
4512pmap_remove_pages(pmap_t pmap)
4513{
4514	pt_entry_t *pte, tpte;
4515	vm_page_t m, mpte, mt;
4516	pv_entry_t pv;
4517	struct md_page *pvh;
4518	struct pv_chunk *pc, *npc;
4519	struct spglist free;
4520	int field, idx;
4521	int32_t bit;
4522	uint32_t inuse, bitmask;
4523	int allfree;
4524
4525	if (pmap != PCPU_GET(curpmap)) {
4526		printf("warning: pmap_remove_pages called with non-current pmap\n");
4527		return;
4528	}
4529	SLIST_INIT(&free);
4530	rw_wlock(&pvh_global_lock);
4531	PMAP_LOCK(pmap);
4532	sched_pin();
4533	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4534		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4535		    pc->pc_pmap));
4536		allfree = 1;
4537		for (field = 0; field < _NPCM; field++) {
4538			inuse = ~pc->pc_map[field] & pc_freemask[field];
4539			while (inuse != 0) {
4540				bit = bsfl(inuse);
4541				bitmask = 1UL << bit;
4542				idx = field * 32 + bit;
4543				pv = &pc->pc_pventry[idx];
4544				inuse &= ~bitmask;
4545
4546				pte = pmap_pde(pmap, pv->pv_va);
4547				tpte = *pte;
4548				if ((tpte & PG_PS) == 0) {
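					/*
					 * In a 4KB PTE the PG_PS bit position
					 * is the PAT bit; mask it off so the
					 * PG_PS tests below treat this as a
					 * small mapping.
					 */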
4549					pte = vtopte(pv->pv_va);
4550					tpte = *pte & ~PG_PTE_PAT;
4551				}
4552
4553				if (tpte == 0) {
4554					printf(
4555					    "TPTE at %p  IS ZERO @ VA %08x\n",
4556					    pte, pv->pv_va);
4557					panic("bad pte");
4558				}
4559
4560/*
4561 * We cannot remove wired pages from a process' mapping at this time
4562 */
4563				if (tpte & PG_W) {
4564					allfree = 0;
4565					continue;
4566				}
4567
4568				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4569				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4570				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4571				    m, (uintmax_t)m->phys_addr,
4572				    (uintmax_t)tpte));
4573
4574				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4575				    m < &vm_page_array[vm_page_array_size],
4576				    ("pmap_remove_pages: bad tpte %#jx",
4577				    (uintmax_t)tpte));
4578
4579				pte_clear(pte);
4580
4581				/*
4582				 * Update the vm_page_t clean/reference bits.
4583				 */
4584				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4585					if ((tpte & PG_PS) != 0) {
4586						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4587							vm_page_dirty(mt);
4588					} else
4589						vm_page_dirty(m);
4590				}
4591
4592				/* Mark free */
4593				PV_STAT(pv_entry_frees++);
4594				PV_STAT(pv_entry_spare++);
4595				pv_entry_count--;
4596				pc->pc_map[field] |= bitmask;
4597				if ((tpte & PG_PS) != 0) {
4598					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4599					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4600					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4601					if (TAILQ_EMPTY(&pvh->pv_list)) {
4602						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4603							if (TAILQ_EMPTY(&mt->md.pv_list))
4604								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4605					}
4606					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
4607					if (mpte != NULL) {
4608						pmap->pm_stats.resident_count--;
4609						KASSERT(mpte->wire_count == NPTEPG,
4610						    ("pmap_remove_pages: pte page wire count error"));
4611						mpte->wire_count = 0;
4612						pmap_add_delayed_free_list(mpte, &free, FALSE);
4613					}
4614				} else {
4615					pmap->pm_stats.resident_count--;
4616					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4617					if (TAILQ_EMPTY(&m->md.pv_list) &&
4618					    (m->flags & PG_FICTITIOUS) == 0) {
4619						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4620						if (TAILQ_EMPTY(&pvh->pv_list))
4621							vm_page_aflag_clear(m, PGA_WRITEABLE);
4622					}
4623					pmap_unuse_pt(pmap, pv->pv_va, &free);
4624				}
4625			}
4626		}
4627		if (allfree) {
4628			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4629			free_pv_chunk(pc);
4630		}
4631	}
4632	sched_unpin();
4633	pmap_invalidate_all(pmap);
4634	rw_wunlock(&pvh_global_lock);
4635	PMAP_UNLOCK(pmap);
4636	pmap_free_zero_pages(&free);
4637}
4638
4639/*
4640 *	pmap_is_modified:
4641 *
4642 *	Return whether or not the specified physical page was modified
4643 *	in any physical maps.
4644 */
4645boolean_t
4646pmap_is_modified(vm_page_t m)
4647{
4648	boolean_t rv;
4649
4650	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4651	    ("pmap_is_modified: page %p is not managed", m));
4652
4653	/*
4654	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4655	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4656	 * is clear, no PTEs can have PG_M set.
4657	 */
4658	VM_OBJECT_ASSERT_WLOCKED(m->object);
4659	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4660		return (FALSE);
4661	rw_wlock(&pvh_global_lock);
4662	rv = pmap_is_modified_pvh(&m->md) ||
4663	    ((m->flags & PG_FICTITIOUS) == 0 &&
4664	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4665	rw_wunlock(&pvh_global_lock);
4666	return (rv);
4667}
4668
4669/*
4670 * Returns TRUE if any of the given mappings were used to modify
4671 * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
4672 * mappings are supported.
4673 */
4674static boolean_t
4675pmap_is_modified_pvh(struct md_page *pvh)
4676{
4677	pv_entry_t pv;
4678	pt_entry_t *pte;
4679	pmap_t pmap;
4680	boolean_t rv;
4681
4682	rw_assert(&pvh_global_lock, RA_WLOCKED);
4683	rv = FALSE;
4684	sched_pin();
4685	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4686		pmap = PV_PMAP(pv);
4687		PMAP_LOCK(pmap);
4688		pte = pmap_pte_quick(pmap, pv->pv_va);
4689		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4690		PMAP_UNLOCK(pmap);
4691		if (rv)
4692			break;
4693	}
4694	sched_unpin();
4695	return (rv);
4696}
4697
4698/*
4699 *	pmap_is_prefaultable:
4700 *
4701 *	Return whether or not the specified virtual address is eligible
4702 *	for prefault.
4703 */
4704boolean_t
4705pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4706{
4707	pd_entry_t *pde;
4708	pt_entry_t *pte;
4709	boolean_t rv;
4710
4711	rv = FALSE;
4712	PMAP_LOCK(pmap);
4713	pde = pmap_pde(pmap, addr);
4714	if (*pde != 0 && (*pde & PG_PS) == 0) {
4715		pte = vtopte(addr);
4716		rv = *pte == 0;
4717	}
4718	PMAP_UNLOCK(pmap);
4719	return (rv);
4720}
4721
4722/*
4723 *	pmap_is_referenced:
4724 *
4725 *	Return whether or not the specified physical page was referenced
4726 *	in any physical maps.
4727 */
4728boolean_t
4729pmap_is_referenced(vm_page_t m)
4730{
4731	boolean_t rv;
4732
4733	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4734	    ("pmap_is_referenced: page %p is not managed", m));
4735	rw_wlock(&pvh_global_lock);
4736	rv = pmap_is_referenced_pvh(&m->md) ||
4737	    ((m->flags & PG_FICTITIOUS) == 0 &&
4738	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4739	rw_wunlock(&pvh_global_lock);
4740	return (rv);
4741}
4742
4743/*
4744 * Returns TRUE if any of the given mappings were referenced and FALSE
4745 * otherwise.  Both page and 4mpage mappings are supported.
4746 */
4747static boolean_t
4748pmap_is_referenced_pvh(struct md_page *pvh)
4749{
4750	pv_entry_t pv;
4751	pt_entry_t *pte;
4752	pmap_t pmap;
4753	boolean_t rv;
4754
4755	rw_assert(&pvh_global_lock, RA_WLOCKED);
4756	rv = FALSE;
4757	sched_pin();
4758	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4759		pmap = PV_PMAP(pv);
4760		PMAP_LOCK(pmap);
4761		pte = pmap_pte_quick(pmap, pv->pv_va);
4762		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4763		PMAP_UNLOCK(pmap);
4764		if (rv)
4765			break;
4766	}
4767	sched_unpin();
4768	return (rv);
4769}
4770
4771/*
4772 * Clear the write and modified bits in each of the given page's mappings.
4773 */
4774void
4775pmap_remove_write(vm_page_t m)
4776{
4777	struct md_page *pvh;
4778	pv_entry_t next_pv, pv;
4779	pmap_t pmap;
4780	pd_entry_t *pde;
4781	pt_entry_t oldpte, *pte;
4782	vm_offset_t va;
4783
4784	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4785	    ("pmap_remove_write: page %p is not managed", m));
4786
4787	/*
4788	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4789	 * set by another thread while the object is locked.  Thus,
4790	 * if PGA_WRITEABLE is clear, no page table entries need updating.
4791	 */
4792	VM_OBJECT_ASSERT_WLOCKED(m->object);
4793	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4794		return;
4795	rw_wlock(&pvh_global_lock);
4796	sched_pin();
4797	if ((m->flags & PG_FICTITIOUS) != 0)
4798		goto small_mappings;
4799	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4800	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4801		va = pv->pv_va;
4802		pmap = PV_PMAP(pv);
4803		PMAP_LOCK(pmap);
4804		pde = pmap_pde(pmap, va);
4805		if ((*pde & PG_RW) != 0)
4806			(void)pmap_demote_pde(pmap, pde, va);
4807		PMAP_UNLOCK(pmap);
4808	}
4809small_mappings:
4810	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4811		pmap = PV_PMAP(pv);
4812		PMAP_LOCK(pmap);
4813		pde = pmap_pde(pmap, pv->pv_va);
4814		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4815		    " a 4mpage in page %p's pv list", m));
4816		pte = pmap_pte_quick(pmap, pv->pv_va);
4817retry:
4818		oldpte = *pte;
4819		if ((oldpte & PG_RW) != 0) {
4820			/*
4821			 * Regardless of whether a pte is 32 or 64 bits
4822			 * in size, PG_RW and PG_M are among the least
4823			 * significant 32 bits.
4824			 */
4825			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4826			    oldpte & ~(PG_RW | PG_M)))
4827				goto retry;
4828			if ((oldpte & PG_M) != 0)
4829				vm_page_dirty(m);
4830			pmap_invalidate_page(pmap, pv->pv_va);
4831		}
4832		PMAP_UNLOCK(pmap);
4833	}
4834	vm_page_aflag_clear(m, PGA_WRITEABLE);
4835	sched_unpin();
4836	rw_wunlock(&pvh_global_lock);
4837}
4838
4839/*
4840 *	pmap_ts_referenced:
4841 *
4842 *	Return a count of reference bits for a page, clearing those bits.
4843 *	It is not necessary for every reference bit to be cleared, but it
4844 *	is necessary that 0 only be returned when there are truly no
4845 *	reference bits set.
4846 *
4847 *	As an optimization, update the page's dirty field if a modified bit is
4848 *	found while counting reference bits.  This opportunistic update can be
4849 *	performed at low cost and can eliminate the need for some future calls
4850 *	to pmap_is_modified().  However, since this function stops after
4851 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4852 *	dirty pages.  Those dirty pages will only be detected by a future call
4853 *	to pmap_is_modified().
4854 */
4855int
4856pmap_ts_referenced(vm_page_t m)
4857{
4858	struct md_page *pvh;
4859	pv_entry_t pv, pvf;
4860	pmap_t pmap;
4861	pd_entry_t *pde;
4862	pt_entry_t *pte;
4863	vm_paddr_t pa;
4864	int rtval = 0;
4865
4866	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4867	    ("pmap_ts_referenced: page %p is not managed", m));
4868	pa = VM_PAGE_TO_PHYS(m);
4869	pvh = pa_to_pvh(pa);
4870	rw_wlock(&pvh_global_lock);
4871	sched_pin();
4872	if ((m->flags & PG_FICTITIOUS) != 0 ||
4873	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4874		goto small_mappings;
4875	pv = pvf;
4876	do {
4877		pmap = PV_PMAP(pv);
4878		PMAP_LOCK(pmap);
4879		pde = pmap_pde(pmap, pv->pv_va);
4880		if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4881			/*
4882			 * Although "*pde" is mapping a 2/4MB page, because
4883			 * this function is called at a 4KB page granularity,
4884			 * we only update the 4KB page under test.
4885			 */
4886			vm_page_dirty(m);
4887		}
4888		if ((*pde & PG_A) != 0) {
4889			/*
4890			 * Since this reference bit is shared by either 1024
4891			 * or 512 4KB pages, it should not be cleared every
4892			 * time it is tested.  Apply a simple "hash" function
4893			 * on the physical page number, the virtual superpage
4894			 * number, and the pmap address to select one 4KB page
4895			 * out of the 1024 or 512 on which testing the
4896			 * reference bit will result in clearing that bit.
4897			 * This function is designed to avoid the selection of
4898			 * the same 4KB page for every 2- or 4MB page mapping.
4899			 *
4900			 * On demotion, a mapping that hasn't been referenced
4901			 * is simply destroyed.  To avoid the possibility of a
4902			 * subsequent page fault on a demoted wired mapping,
4903			 * always leave its reference bit set.  Moreover,
4904			 * since the superpage is wired, the current state of
4905			 * its reference bit won't affect page replacement.
4906			 */
4907			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
4908			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
4909			    (*pde & PG_W) == 0) {
4910				atomic_clear_int((u_int *)pde, PG_A);
4911				pmap_invalidate_page(pmap, pv->pv_va);
4912			}
4913			rtval++;
4914		}
4915		PMAP_UNLOCK(pmap);
4916		/* Rotate the PV list if it has more than one entry. */
4917		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4918			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4919			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4920		}
4921		if (rtval >= PMAP_TS_REFERENCED_MAX)
4922			goto out;
4923	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4924small_mappings:
4925	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4926		goto out;
4927	pv = pvf;
4928	do {
4929		pmap = PV_PMAP(pv);
4930		PMAP_LOCK(pmap);
4931		pde = pmap_pde(pmap, pv->pv_va);
4932		KASSERT((*pde & PG_PS) == 0,
4933		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
4934		    m));
4935		pte = pmap_pte_quick(pmap, pv->pv_va);
4936		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4937			vm_page_dirty(m);
4938		if ((*pte & PG_A) != 0) {
4939			atomic_clear_int((u_int *)pte, PG_A);
4940			pmap_invalidate_page(pmap, pv->pv_va);
4941			rtval++;
4942		}
4943		PMAP_UNLOCK(pmap);
4944		/* Rotate the PV list if it has more than one entry. */
4945		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4946			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4947			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4948		}
4949	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
4950	    PMAP_TS_REFERENCED_MAX);
4951out:
4952	sched_unpin();
4953	rw_wunlock(&pvh_global_lock);
4954	return (rtval);
4955}
4956
4957/*
4958 *	Apply the given advice to the specified range of addresses within the
4959 *	given pmap.  Depending on the advice, clear the referenced and/or
4960 *	modified flags in each mapping and set the mapped page's dirty field.
4961 */
4962void
4963pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4964{
4965	pd_entry_t oldpde, *pde;
4966	pt_entry_t *pte;
4967	vm_offset_t va, pdnxt;
4968	vm_page_t m;
4969	boolean_t anychanged, pv_lists_locked;
4970
4971	if (advice != MADV_DONTNEED && advice != MADV_FREE)
4972		return;
4973	if (pmap_is_current(pmap))
4974		pv_lists_locked = FALSE;
4975	else {
4976		pv_lists_locked = TRUE;
4977resume:
4978		rw_wlock(&pvh_global_lock);
4979		sched_pin();
4980	}
4981	anychanged = FALSE;
4982	PMAP_LOCK(pmap);
4983	for (; sva < eva; sva = pdnxt) {
4984		pdnxt = (sva + NBPDR) & ~PDRMASK;
4985		if (pdnxt < sva)
4986			pdnxt = eva;
4987		pde = pmap_pde(pmap, sva);
4988		oldpde = *pde;
4989		if ((oldpde & PG_V) == 0)
4990			continue;
4991		else if ((oldpde & PG_PS) != 0) {
4992			if ((oldpde & PG_MANAGED) == 0)
4993				continue;
4994			if (!pv_lists_locked) {
4995				pv_lists_locked = TRUE;
4996				if (!rw_try_wlock(&pvh_global_lock)) {
4997					if (anychanged)
4998						pmap_invalidate_all(pmap);
4999					PMAP_UNLOCK(pmap);
5000					goto resume;
5001				}
5002				sched_pin();
5003			}
5004			if (!pmap_demote_pde(pmap, pde, sva)) {
5005				/*
5006				 * The large page mapping was destroyed.
5007				 */
5008				continue;
5009			}
5010
5011			/*
5012			 * Unless the page mappings are wired, remove the
5013			 * mapping to a single page so that a subsequent
5014			 * access may repromote.  Since the underlying page
5015			 * table page is fully populated, this removal never
5016			 * frees a page table page.
5017			 */
5018			if ((oldpde & PG_W) == 0) {
5019				pte = pmap_pte_quick(pmap, sva);
5020				KASSERT((*pte & PG_V) != 0,
5021				    ("pmap_advise: invalid PTE"));
5022				pmap_remove_pte(pmap, pte, sva, NULL);
5023				anychanged = TRUE;
5024			}
5025		}
5026		if (pdnxt > eva)
5027			pdnxt = eva;
5028		va = pdnxt;
5029		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
5030		    sva += PAGE_SIZE) {
5031			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
5032				goto maybe_invlrng;
5033			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5034				if (advice == MADV_DONTNEED) {
5035					/*
5036					 * Future calls to pmap_is_modified()
5037					 * can be avoided by making the page
5038					 * dirty now.
5039					 */
5040					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5041					vm_page_dirty(m);
5042				}
5043				atomic_clear_int((u_int *)pte, PG_M | PG_A);
5044			} else if ((*pte & PG_A) != 0)
5045				atomic_clear_int((u_int *)pte, PG_A);
5046			else
5047				goto maybe_invlrng;
5048			if ((*pte & PG_G) != 0) {
5049				if (va == pdnxt)
5050					va = sva;
5051			} else
5052				anychanged = TRUE;
5053			continue;
5054maybe_invlrng:
5055			if (va != pdnxt) {
5056				pmap_invalidate_range(pmap, va, sva);
5057				va = pdnxt;
5058			}
5059		}
5060		if (va != pdnxt)
5061			pmap_invalidate_range(pmap, va, sva);
5062	}
5063	if (anychanged)
5064		pmap_invalidate_all(pmap);
5065	if (pv_lists_locked) {
5066		sched_unpin();
5067		rw_wunlock(&pvh_global_lock);
5068	}
5069	PMAP_UNLOCK(pmap);
5070}
5071
5072/*
5073 *	Clear the modify bits on the specified physical page.
5074 */
5075void
5076pmap_clear_modify(vm_page_t m)
5077{
5078	struct md_page *pvh;
5079	pv_entry_t next_pv, pv;
5080	pmap_t pmap;
5081	pd_entry_t oldpde, *pde;
5082	pt_entry_t oldpte, *pte;
5083	vm_offset_t va;
5084
5085	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5086	    ("pmap_clear_modify: page %p is not managed", m));
5087	VM_OBJECT_ASSERT_WLOCKED(m->object);
5088	KASSERT(!vm_page_xbusied(m),
5089	    ("pmap_clear_modify: page %p is exclusive busied", m));
5090
5091	/*
5092	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
5093	 * If the object containing the page is locked and the page is not
5094	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
5095	 */
5096	if ((m->aflags & PGA_WRITEABLE) == 0)
5097		return;
5098	rw_wlock(&pvh_global_lock);
5099	sched_pin();
5100	if ((m->flags & PG_FICTITIOUS) != 0)
5101		goto small_mappings;
5102	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5103	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5104		va = pv->pv_va;
5105		pmap = PV_PMAP(pv);
5106		PMAP_LOCK(pmap);
5107		pde = pmap_pde(pmap, va);
5108		oldpde = *pde;
5109		if ((oldpde & PG_RW) != 0) {
5110			if (pmap_demote_pde(pmap, pde, va)) {
5111				if ((oldpde & PG_W) == 0) {
5112					/*
5113					 * Write protect the mapping to a
5114					 * single page so that a subsequent
5115					 * write access may repromote.
5116					 */
5117					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5118					    PG_PS_FRAME);
5119					pte = pmap_pte_quick(pmap, va);
5120					oldpte = *pte;
5121					if ((oldpte & PG_V) != 0) {
5122						/*
5123						 * Regardless of whether a pte is 32 or 64 bits
5124						 * in size, PG_RW and PG_M are among the least
5125						 * significant 32 bits.
5126						 */
5127						while (!atomic_cmpset_int((u_int *)pte,
5128						    oldpte,
5129						    oldpte & ~(PG_M | PG_RW)))
5130							oldpte = *pte;
5131						vm_page_dirty(m);
5132						pmap_invalidate_page(pmap, va);
5133					}
5134				}
5135			}
5136		}
5137		PMAP_UNLOCK(pmap);
5138	}
5139small_mappings:
5140	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5141		pmap = PV_PMAP(pv);
5142		PMAP_LOCK(pmap);
5143		pde = pmap_pde(pmap, pv->pv_va);
5144		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
5145		    " a 4mpage in page %p's pv list", m));
5146		pte = pmap_pte_quick(pmap, pv->pv_va);
5147		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5148			/*
5149			 * Regardless of whether a pte is 32 or 64 bits
5150			 * in size, PG_M is among the least significant
5151			 * 32 bits.
5152			 */
5153			atomic_clear_int((u_int *)pte, PG_M);
5154			pmap_invalidate_page(pmap, pv->pv_va);
5155		}
5156		PMAP_UNLOCK(pmap);
5157	}
5158	sched_unpin();
5159	rw_wunlock(&pvh_global_lock);
5160}
5161
5162/*
5163 * Miscellaneous support routines follow
5164 */
5165
5166/* Adjust the cache mode for a 4KB page mapped via a PTE. */
5167static __inline void
5168pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5169{
5170	u_int opte, npte;
5171
5172	/*
5173	 * The cache mode bits are all in the low 32-bits of the
5174	 * PTE, so we can just spin on updating the low 32-bits.
5175	 */
5176	do {
5177		opte = *(u_int *)pte;
5178		npte = opte & ~PG_PTE_CACHE;
5179		npte |= cache_bits;
5180	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5181}
5182
5183/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5184static __inline void
5185pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5186{
5187	u_int opde, npde;
5188
5189	/*
5190	 * The cache mode bits are all in the low 32-bits of the
5191	 * PDE, so we can just spin on updating the low 32-bits.
5192	 */
5193	do {
5194		opde = *(u_int *)pde;
5195		npde = opde & ~PG_PDE_CACHE;
5196		npde |= cache_bits;
5197	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5198}
5199
5200/*
5201 * Map a set of physical memory pages into the kernel virtual
5202 * address space. Return a pointer to where it is mapped. This
5203 * routine is intended to be used for mapping device memory,
5204 * NOT real memory.
5205 */
5206void *
5207pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5208{
5209	struct pmap_preinit_mapping *ppim;
5210	vm_offset_t va, offset;
5211	vm_size_t tmpsize;
5212	int i;
5213
5214	offset = pa & PAGE_MASK;
5215	size = round_page(offset + size);
5216	pa = pa & PG_FRAME;
5217
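	/*
	 * Physical addresses below KERNLOAD are permanently mapped at
	 * KERNBASE + pa by the initial kernel mappings, so such a request
	 * needs no new KVA.
	 */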
5218	if (pa < KERNLOAD && pa + size <= KERNLOAD)
5219		va = KERNBASE + pa;
5220	else if (!pmap_initialized) {
5221		va = 0;
5222		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5223			ppim = pmap_preinit_mapping + i;
5224			if (ppim->va == 0) {
5225				ppim->pa = pa;
5226				ppim->sz = size;
5227				ppim->mode = mode;
5228				ppim->va = virtual_avail;
5229				virtual_avail += size;
5230				va = ppim->va;
5231				break;
5232			}
5233		}
5234		if (va == 0)
5235			panic("%s: too many preinit mappings", __func__);
5236	} else {
5237		/*
5238		 * If we have a preinit mapping, re-use it.
5239		 */
5240		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5241			ppim = pmap_preinit_mapping + i;
5242			if (ppim->pa == pa && ppim->sz == size &&
5243			    ppim->mode == mode)
5244				return ((void *)(ppim->va + offset));
5245		}
5246		va = kva_alloc(size);
5247		if (va == 0)
5248			panic("%s: Couldn't allocate KVA", __func__);
5249	}
5250	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5251		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5252	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5253	pmap_invalidate_cache_range(va, va + size, FALSE);
5254	return ((void *)(va + offset));
5255}
5256
5257void *
5258pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5259{
5260
5261	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5262}
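/*
 * Illustrative sketch of typical use (actual callers vary): a driver
 * might pair these as
 *
 *	regs = pmap_mapdev(pa, size);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, size);
 */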
5263
5264void *
5265pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5266{
5267
5268	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5269}
5270
5271void
5272pmap_unmapdev(vm_offset_t va, vm_size_t size)
5273{
5274	struct pmap_preinit_mapping *ppim;
5275	vm_offset_t offset;
5276	int i;
5277
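	/* Mappings within the permanent KERNBASE window were never allocated. */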
5278	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5279		return;
5280	offset = va & PAGE_MASK;
5281	size = round_page(offset + size);
5282	va = trunc_page(va);
5283	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5284		ppim = pmap_preinit_mapping + i;
5285		if (ppim->va == va && ppim->sz == size) {
5286			if (pmap_initialized)
5287				return;
5288			ppim->pa = 0;
5289			ppim->va = 0;
5290			ppim->sz = 0;
5291			ppim->mode = 0;
5292			if (va + size == virtual_avail)
5293				virtual_avail = va;
5294			return;
5295		}
5296	}
5297	if (pmap_initialized)
5298		kva_free(va, size);
5299}
5300
5301/*
5302 * Sets the memory attribute for the specified page.
5303 */
5304void
5305pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5306{
5307
5308	m->md.pat_mode = ma;
5309	if ((m->flags & PG_FICTITIOUS) != 0)
5310		return;
5311
5312	/*
5313	 * If "m" is a normal page, flush it from the cache.
5314	 * See pmap_invalidate_cache_range().
5315	 *
5316	 * First, try to find an existing mapping of the page by an sf
5317	 * buffer.  sf_buf_invalidate_cache() modifies the mapping and
5318	 * flushes the cache.
5319	 */
5320	if (sf_buf_invalidate_cache(m))
5321		return;
5322
5323	/*
5324	 * If the page is not mapped by an sf buffer and the CPU does not
5325	 * support self-snoop, map the page transiently and invalidate it.
5326	 * In the worst case, the whole cache is flushed by
5327	 * pmap_invalidate_cache_range().
5328	 */
5329	if ((cpu_feature & CPUID_SS) == 0)
5330		pmap_flush_page(m);
5331}
5332
5333static void
5334pmap_flush_page(vm_page_t m)
5335{
5336	pt_entry_t *cmap_pte2;
5337	struct pcpu *pc;
5338	vm_offset_t sva, eva;
5339	bool useclflushopt;
5340
5341	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
5342	if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) {
5343		sched_pin();
5344		pc = get_pcpu();
5345		cmap_pte2 = pc->pc_cmap_pte2;
5346		mtx_lock(&pc->pc_cmap_lock);
5347		if (*cmap_pte2)
5348			panic("pmap_flush_page: CMAP2 busy");
5349		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5350		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5351		invlcaddr(pc->pc_cmap_addr2);
5352		sva = (vm_offset_t)pc->pc_cmap_addr2;
5353		eva = sva + PAGE_SIZE;
5354
5355		/*
5356		 * Use mfence or sfence despite the ordering implied by
5357		 * mtx_{un,}lock() because clflush on non-Intel CPUs
5358		 * and clflushopt are not guaranteed to be ordered by
5359		 * any other instruction.
5360		 */
5361		if (useclflushopt)
5362			sfence();
5363		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
5364			mfence();
5365		for (; sva < eva; sva += cpu_clflush_line_size) {
5366			if (useclflushopt)
5367				clflushopt(sva);
5368			else
5369				clflush(sva);
5370		}
5371		if (useclflushopt)
5372			sfence();
5373		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
5374			mfence();
5375		*cmap_pte2 = 0;
5376		sched_unpin();
5377		mtx_unlock(&pc->pc_cmap_lock);
5378	} else
5379		pmap_invalidate_cache();
5380}
5381
5382/*
5383 * Changes the specified virtual address range's memory type to that given by
5384 * the parameter "mode".  The specified virtual address range must be
5385 * completely contained within the kernel map.
5386 *
5387 * Returns zero if the change completed successfully, and either EINVAL or
5388 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5389 * of the virtual address range was not mapped, and ENOMEM is returned if
5390 * there was insufficient memory available to complete the change.
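 *
 * For example, a driver might request PAT_WRITE_COMBINING for a frame
 * buffer mapping.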
5391 */
5392int
5393pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5394{
5395	vm_offset_t base, offset, tmpva;
5396	pd_entry_t *pde;
5397	pt_entry_t *pte;
5398	int cache_bits_pte, cache_bits_pde;
5399	boolean_t changed;
5400
5401	base = trunc_page(va);
5402	offset = va & PAGE_MASK;
5403	size = round_page(offset + size);
5404
5405	/*
5406	 * Only supported on kernel virtual addresses above the recursive map.
5407	 */
5408	if (base < VM_MIN_KERNEL_ADDRESS)
5409		return (EINVAL);
5410
5411	cache_bits_pde = pmap_cache_bits(mode, 1);
5412	cache_bits_pte = pmap_cache_bits(mode, 0);
5413	changed = FALSE;
5414
5415	/*
5416	 * Pages that aren't mapped aren't supported.  Also break down
5417	 * 2/4MB pages into 4KB pages if required.
5418	 */
5419	PMAP_LOCK(kernel_pmap);
5420	for (tmpva = base; tmpva < base + size; ) {
5421		pde = pmap_pde(kernel_pmap, tmpva);
5422		if (*pde == 0) {
5423			PMAP_UNLOCK(kernel_pmap);
5424			return (EINVAL);
5425		}
5426		if (*pde & PG_PS) {
5427			/*
5428			 * If the current 2/4MB page already has
5429			 * the required memory type, then we need not
5430			 * demote this page.  Just increment tmpva to
5431			 * the next 2/4MB page frame.
5432			 */
5433			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5434				tmpva = trunc_4mpage(tmpva) + NBPDR;
5435				continue;
5436			}
5437
5438			/*
5439			 * If the current offset aligns with a 2/4MB
5440			 * page frame and there is at least 2/4MB left
5441			 * within the range, then we need not break
5442			 * down this page into 4KB pages.
5443			 */
5444			if ((tmpva & PDRMASK) == 0 &&
5445			    tmpva + PDRMASK < base + size) {
5446				tmpva += NBPDR;
5447				continue;
5448			}
5449			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5450				PMAP_UNLOCK(kernel_pmap);
5451				return (ENOMEM);
5452			}
5453		}
5454		pte = vtopte(tmpva);
5455		if (*pte == 0) {
5456			PMAP_UNLOCK(kernel_pmap);
5457			return (EINVAL);
5458		}
5459		tmpva += PAGE_SIZE;
5460	}
5461	PMAP_UNLOCK(kernel_pmap);
5462
5463	/*
5464	 * Ok, all the pages exist, so run through them updating their
5465	 * cache mode if required.
5466	 */
5467	for (tmpva = base; tmpva < base + size; ) {
5468		pde = pmap_pde(kernel_pmap, tmpva);
5469		if (*pde & PG_PS) {
5470			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5471				pmap_pde_attr(pde, cache_bits_pde);
5472				changed = TRUE;
5473			}
5474			tmpva = trunc_4mpage(tmpva) + NBPDR;
5475		} else {
5476			pte = vtopte(tmpva);
5477			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5478				pmap_pte_attr(pte, cache_bits_pte);
5479				changed = TRUE;
5480			}
5481			tmpva += PAGE_SIZE;
5482		}
5483	}
5484
5485	/*
5486	 * Flush the CPU caches to ensure that no data remains cached with
5487	 * the old, now-incorrect memory attributes.
5488	 */
5489	if (changed) {
5490		pmap_invalidate_range(kernel_pmap, base, tmpva);
5491		pmap_invalidate_cache_range(base, tmpva, FALSE);
5492	}
5493	return (0);
5494}
5495
5496/*
5497 * perform the pmap work for mincore
5498 */
5499int
5500pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5501{
5502	pd_entry_t *pdep;
5503	pt_entry_t *ptep, pte;
5504	vm_paddr_t pa;
5505	int val;
5506
5507	PMAP_LOCK(pmap);
5508retry:
5509	pdep = pmap_pde(pmap, addr);
5510	if (*pdep != 0) {
5511		if (*pdep & PG_PS) {
5512			pte = *pdep;
5513			/* Compute the physical address of the 4KB page. */
5514			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5515			    PG_FRAME;
5516			val = MINCORE_SUPER;
5517		} else {
5518			ptep = pmap_pte(pmap, addr);
5519			pte = *ptep;
5520			pmap_pte_release(ptep);
5521			pa = pte & PG_FRAME;
5522			val = 0;
5523		}
5524	} else {
5525		pte = 0;
5526		pa = 0;
5527		val = 0;
5528	}
5529	if ((pte & PG_V) != 0) {
5530		val |= MINCORE_INCORE;
5531		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5532			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5533		if ((pte & PG_A) != 0)
5534			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5535	}
5536	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5537	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5538	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5539		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5540		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5541			goto retry;
5542	} else
5543		PA_UNLOCK_COND(*locked_pa);
5544	PMAP_UNLOCK(pmap);
5545	return (val);
5546}
5547
5548void
5549pmap_activate(struct thread *td)
5550{
5551	pmap_t	pmap, oldpmap;
5552	u_int	cpuid;
5553	u_int32_t  cr3;
5554
5555	critical_enter();
5556	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5557	oldpmap = PCPU_GET(curpmap);
5558	cpuid = PCPU_GET(cpuid);
5559#if defined(SMP)
5560	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5561	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5562#else
5563	CPU_CLR(cpuid, &oldpmap->pm_active);
5564	CPU_SET(cpuid, &pmap->pm_active);
5565#endif
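	/*
	 * Under PAE, %cr3 points at the four-entry page-directory-pointer
	 * table; otherwise it points at the page directory itself.
	 */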
5566#if defined(PAE) || defined(PAE_TABLES)
5567	cr3 = vtophys(pmap->pm_pdpt);
5568#else
5569	cr3 = vtophys(pmap->pm_pdir);
5570#endif
5571	/*
5572	 * pmap_activate is for the current thread on the current cpu
5573	 */
5574	td->td_pcb->pcb_cr3 = cr3;
5575	load_cr3(cr3);
5576	PCPU_SET(curpmap, pmap);
5577	critical_exit();
5578}
5579
5580void
5581pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5582{
5583}
5584
5585/*
5586 *	Increase the starting virtual address of the given mapping if a
5587 *	different alignment might result in more superpage mappings.
5588 */
5589void
5590pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5591    vm_offset_t *addr, vm_size_t size)
5592{
5593	vm_offset_t superpage_offset;
5594
5595	if (size < NBPDR)
5596		return;
5597	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5598		offset += ptoa(object->pg_color);
5599	superpage_offset = offset & PDRMASK;
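	/*
	 * Superpage mappings require the virtual address and the object
	 * offset to be congruent modulo NBPDR; adjust "*addr" so that its
	 * offset within a 2/4MB page matches "superpage_offset".
	 */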
5600	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5601	    (*addr & PDRMASK) == superpage_offset)
5602		return;
5603	if ((*addr & PDRMASK) < superpage_offset)
5604		*addr = (*addr & ~PDRMASK) + superpage_offset;
5605	else
5606		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5607}
5608
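/*
 * Map a single page at this CPU's private quick-map address.  The caller
 * runs in a critical section from pmap_quick_enter_page() until the
 * matching pmap_quick_remove_page(), so the window must only be used
 * briefly, for example (illustrative sketch):
 *
 *	va = pmap_quick_enter_page(m);
 *	bcopy(buf, (void *)va, PAGE_SIZE);
 *	pmap_quick_remove_page(va);
 */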
5609vm_offset_t
5610pmap_quick_enter_page(vm_page_t m)
5611{
5612	vm_offset_t qaddr;
5613	pt_entry_t *pte;
5614
5615	critical_enter();
5616	qaddr = PCPU_GET(qmap_addr);
5617	pte = vtopte(qaddr);
5618
5619	KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
5620	*pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
5621	    pmap_cache_bits(pmap_page_get_memattr(m), 0);
5622	invlpg(qaddr);
5623
5624	return (qaddr);
5625}
5626
5627void
5628pmap_quick_remove_page(vm_offset_t addr)
5629{
5630	vm_offset_t qaddr;
5631	pt_entry_t *pte;
5632
5633	qaddr = PCPU_GET(qmap_addr);
5634	pte = vtopte(qaddr);
5635
5636	KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
5637	KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));
5638
5639	*pte = 0;
5640	critical_exit();
5641}
5642
5643#if defined(PMAP_DEBUG)
5644int
pmap_pid_dump(int pid)
5645{
5646	pmap_t pmap;
5647	struct proc *p;
5648	int npte = 0;
5649	int index;
5650
5651	sx_slock(&allproc_lock);
5652	FOREACH_PROC_IN_SYSTEM(p) {
5653		if (p->p_pid != pid)
5654			continue;
5655
5656		if (p->p_vmspace) {
5657			int i,j;
5658			index = 0;
5659			pmap = vmspace_pmap(p->p_vmspace);
5660			for (i = 0; i < NPDEPTD; i++) {
5661				pd_entry_t *pde;
5662				pt_entry_t *pte;
5663				vm_offset_t base = i << PDRSHIFT;
5664
5665				pde = &pmap->pm_pdir[i];
5666				if (pde && pmap_pde_v(pde)) {
5667					for (j = 0; j < NPTEPG; j++) {
5668						vm_offset_t va = base + (j << PAGE_SHIFT);
5669						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5670							if (index) {
5671								index = 0;
5672								printf("\n");
5673							}
5674							sx_sunlock(&allproc_lock);
5675							return (npte);
5676						}
5677						pte = pmap_pte(pmap, va);
5678						if (pte && pmap_pte_v(pte)) {
5679							pt_entry_t pa;
5680							vm_page_t m;
5681							pa = *pte;
5682							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5683							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5684								va, pa, m->hold_count, m->wire_count, m->flags);
5685							npte++;
5686							index++;
5687							if (index >= 2) {
5688								index = 0;
5689								printf("\n");
5690							} else {
5691								printf(" ");
5692							}
5693						}
5694					}
5695				}
5696			}
5697		}
5698	}
5699	sx_sunlock(&allproc_lock);
5700	return (npte);
5701}
5702#endif
5703