1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: stable/10/sys/i386/i386/pmap.c 276546 2015-01-02 17:45:52Z alc $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	Since the information managed by this module is
84 *	also stored by the logical address mapping module,
85 *	this module may throw away valid virtual-to-physical
86 *	mappings at almost any time.  However, invalidations
87 *	of virtual-to-physical mappings must be done as
88 *	requested.
89 *
90 *	In order to cope with hardware architectures which
91 *	make virtual-to-physical map invalidates expensive,
92 * this module may delay invalidation or reduced-protection
93 *	operations until such time as they are actually
94 *	necessary.  This module is given full information as
95 *	to which processors are currently using which maps,
96 *	and to when physical maps must be made correct.
97 */
98
99#include "opt_apic.h"
100#include "opt_cpu.h"
101#include "opt_pmap.h"
102#include "opt_smp.h"
103#include "opt_xbox.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/kernel.h>
108#include <sys/ktr.h>
109#include <sys/lock.h>
110#include <sys/malloc.h>
111#include <sys/mman.h>
112#include <sys/msgbuf.h>
113#include <sys/mutex.h>
114#include <sys/proc.h>
115#include <sys/rwlock.h>
116#include <sys/sf_buf.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#ifdef SMP
122#include <sys/smp.h>
123#else
124#include <sys/cpuset.h>
125#endif
126
127#include <vm/vm.h>
128#include <vm/vm_param.h>
129#include <vm/vm_kern.h>
130#include <vm/vm_page.h>
131#include <vm/vm_map.h>
132#include <vm/vm_object.h>
133#include <vm/vm_extern.h>
134#include <vm/vm_pageout.h>
135#include <vm/vm_pager.h>
136#include <vm/vm_phys.h>
137#include <vm/vm_radix.h>
138#include <vm/vm_reserv.h>
139#include <vm/uma.h>
140
141#ifdef DEV_APIC
142#include <sys/bus.h>
143#include <machine/intr_machdep.h>
144#include <machine/apicvar.h>
145#endif
146#include <machine/cpu.h>
147#include <machine/cputypes.h>
148#include <machine/md_var.h>
149#include <machine/pcb.h>
150#include <machine/specialreg.h>
151#ifdef SMP
152#include <machine/smp.h>
153#endif
154
155#ifdef XBOX
156#include <machine/xbox.h>
157#endif
158
159#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
160#define CPU_ENABLE_SSE
161#endif
162
163#ifndef PMAP_SHPGPERPROC
164#define PMAP_SHPGPERPROC 200
165#endif
166
167#if !defined(DIAGNOSTIC)
168#ifdef __GNUC_GNU_INLINE__
169#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
170#else
171#define PMAP_INLINE	extern inline
172#endif
173#else
174#define PMAP_INLINE
175#endif
176
177#ifdef PV_STATS
178#define PV_STAT(x)	do { x ; } while (0)
179#else
180#define PV_STAT(x)	do { } while (0)
181#endif
182
183#define	pa_index(pa)	((pa) >> PDRSHIFT)
184#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
185
186/*
187 * Get PDEs and PTEs for user/kernel address space
188 */
189#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
190#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
191
192#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
193#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
194#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
195#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
196#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
197
198#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
199    atomic_clear_int((u_int *)(pte), PG_W))
200#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
201
202struct pmap kernel_pmap_store;
203LIST_HEAD(pmaplist, pmap);
204static struct pmaplist allpmaps;
205static struct mtx allpmaps_lock;
206
207vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
208vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
209int pgeflag = 0;		/* PG_G or-in */
210int pseflag = 0;		/* PG_PS or-in */
211
212static int nkpt = NKPT;
213vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
214extern u_int32_t KERNend;
215extern u_int32_t KPTphys;
216
217#ifdef PAE
218pt_entry_t pg_nx;
219static uma_zone_t pdptzone;
220#endif
221
222static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
223
224static int pat_works = 1;
225SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
226    "Is page attribute table fully functional?");
227
228static int pg_ps_enabled = 1;
229SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
230    "Are large page mappings enabled?");
231
232#define	PAT_INDEX_SIZE	8
233static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
234
235static struct rwlock_padalign pvh_global_lock;
236
237/*
238 * Data for the pv entry allocation mechanism
239 */
240static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
241static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
242static struct md_page *pv_table;
243static int shpgperproc = PMAP_SHPGPERPROC;
244
245struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
246int pv_maxchunks;			/* How many chunks we have KVA for */
247vm_offset_t pv_vafree;			/* freelist stored in the PTE */
248
249/*
250 * All those kernel PT submaps that BSD is so fond of
251 */
252struct sysmaps {
253	struct	mtx lock;
254	pt_entry_t *CMAP1;
255	pt_entry_t *CMAP2;
256	caddr_t	CADDR1;
257	caddr_t	CADDR2;
258};
259static struct sysmaps sysmaps_pcpu[MAXCPU];
260pt_entry_t *CMAP3;
261static pd_entry_t *KPTD;
262caddr_t ptvmmap = 0;
263caddr_t CADDR3;
264struct msgbuf *msgbufp = 0;
265
266/*
267 * Crashdump maps.
268 */
269static caddr_t crashdumpmap;
270
271static pt_entry_t *PMAP1 = 0, *PMAP2;
272static pt_entry_t *PADDR1 = 0, *PADDR2;
273#ifdef SMP
274static int PMAP1cpu;
275static int PMAP1changedcpu;
276SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
277	   &PMAP1changedcpu, 0,
278	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
279#endif
280static int PMAP1changed;
281SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
282	   &PMAP1changed, 0,
283	   "Number of times pmap_pte_quick changed PMAP1");
284static int PMAP1unchanged;
285SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
286	   &PMAP1unchanged, 0,
287	   "Number of times pmap_pte_quick didn't change PMAP1");
288static struct mtx PMAP2mutex;
289
290static void	free_pv_chunk(struct pv_chunk *pc);
291static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
292static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
293static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
294static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
295static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
296static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
297static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
298		    vm_offset_t va);
299static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
300
301static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
302static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
303    vm_prot_t prot);
304static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
305    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
306static void pmap_flush_page(vm_page_t m);
307static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
308static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
309static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
310static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
311static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
312static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
313static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
314static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
315static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
316static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
317    vm_prot_t prot);
318static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
319static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
320    struct spglist *free);
321static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
322    struct spglist *free);
323static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
324static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
325    struct spglist *free);
326static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
327					vm_offset_t va);
328static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
329static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
330    vm_page_t m);
331static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
332    pd_entry_t newpde);
333static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
334
335static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
336
337static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
338static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
339static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
340static void pmap_pte_release(pt_entry_t *pte);
341static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
342#ifdef PAE
343static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
344#endif
345static void pmap_set_pg(void);
346
347static __inline void pagezero(void *page);
348
349CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
350CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
351
352/*
353 * If you get an error here, then you set KVA_PAGES wrong! See the
354 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
355 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
356 */
357CTASSERT(KERNBASE % (1 << 24) == 0);
358
359/*
360 *	Bootstrap the system enough to run with virtual memory.
361 *
362 *	On the i386 this is called after mapping has already been enabled
363 *	and just syncs the pmap module with what has already been done.
364 *	[We can't call it easily with mapping off since the kernel is not
365 *	mapped with PA == VA, hence we would have to relocate every address
366 *	from the linked base (virtual) address "KERNBASE" to the actual
367 *	(physical) address starting relative to 0]
368 */
369void
370pmap_bootstrap(vm_paddr_t firstaddr)
371{
372	vm_offset_t va;
373	pt_entry_t *pte, *unused;
374	struct sysmaps *sysmaps;
375	int i;
376
377	/*
378	 * Add a physical memory segment (vm_phys_seg) corresponding to the
379	 * preallocated kernel page table pages so that vm_page structures
380	 * representing these pages will be created.  The vm_page structures
381	 * are required for promotion of the corresponding kernel virtual
382	 * addresses to superpage mappings.
383	 */
384	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
385
386	/*
387	 * Initialize the first available kernel virtual address.  However,
388	 * using "firstaddr" may waste a few pages of the kernel virtual
389	 * address space, because locore may not have mapped every physical
390	 * page that it allocated.  Preferably, locore would provide a first
391	 * unused virtual address in addition to "firstaddr".
392	 */
393	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
394
395	virtual_end = VM_MAX_KERNEL_ADDRESS;
396
397	/*
398	 * Initialize the kernel pmap (which is statically allocated).
399	 */
400	PMAP_LOCK_INIT(kernel_pmap);
401	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
402#ifdef PAE
403	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
404#endif
405	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
406	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
407
408 	/*
409	 * Initialize the global pv list lock.
410	 */
411	rw_init(&pvh_global_lock, "pmap pv global");
412
413	LIST_INIT(&allpmaps);
414
415	/*
416	 * Request a spin mutex so that changes to allpmaps cannot be
417	 * preempted by smp_rendezvous_cpus().  Otherwise,
418	 * pmap_update_pde_kernel() could access allpmaps while it is
419	 * being changed.
420	 */
421	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
422	mtx_lock_spin(&allpmaps_lock);
423	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
424	mtx_unlock_spin(&allpmaps_lock);
425
426	/*
427	 * Reserve some special page table entries/VA space for temporary
428	 * mapping of pages.
429	 */
430#define	SYSMAP(c, p, v, n)	\
431	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
432
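	/*
	 * For illustration: the invocation "SYSMAP(caddr_t, CMAP3, CADDR3, 1)"
	 * used below expands to roughly
	 *
	 *	CADDR3 = (caddr_t)va; va += 1 * PAGE_SIZE;
	 *	CMAP3 = pte; pte += 1;
	 *
	 * that is, it hands out one page of KVA and records the kernel PTE
	 * that maps it, so the page can later be pointed at arbitrary
	 * physical memory.
	 */
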
433	va = virtual_avail;
434	pte = vtopte(va);
435
436	/*
437	 * CMAP1/CMAP2 are used for zeroing and copying pages.
438	 * CMAP3 is used for the idle process page zeroing.
439	 */
440	for (i = 0; i < MAXCPU; i++) {
441		sysmaps = &sysmaps_pcpu[i];
442		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
443		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
444		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
445	}
446	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
447
448	/*
449	 * Crashdump maps.
450	 */
451	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
452
453	/*
454	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
455	 */
456	SYSMAP(caddr_t, unused, ptvmmap, 1)
457
458	/*
459	 * msgbufp is used to map the system message buffer.
460	 */
461	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
462
463	/*
464	 * KPTmap is used by pmap_kextract().
465	 *
466	 * KPTmap is first initialized by locore.  However, that initial
467	 * KPTmap can only support NKPT page table pages.  Here, a larger
468	 * KPTmap is created that can support KVA_PAGES page table pages.
469	 */
470	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
471
472	for (i = 0; i < NKPT; i++)
473		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
474
475	/*
476	 * Adjust the start of the KPTD and KPTmap so that the implementation
477	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
478	 */
479	KPTD -= KPTDI;
480	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
481
482	/*
483	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
484	 * respectively.
485	 */
486	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
487	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
488
489	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
490
491	virtual_avail = va;
492
493	/*
494	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
495	 * physical memory region that is used by the ACPI wakeup code.  This
496	 * mapping must not have PG_G set.
497	 */
498#ifdef XBOX
499	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
500	 * an early stage, we cannot yet neatly map video memory ... :-(
501	 * Better fixes are very welcome! */
502	if (!arch_i386_is_xbox)
503#endif
504	for (i = 1; i < NKPT; i++)
505		PTD[i] = 0;
506
507	/* Initialize the PAT MSR if present. */
508	pmap_init_pat();
509
510	/* Turn on PG_G on kernel page(s) */
511	pmap_set_pg();
512}
513
514/*
515 * Setup the PAT MSR.
516 */
517void
518pmap_init_pat(void)
519{
520	int pat_table[PAT_INDEX_SIZE];
521	uint64_t pat_msr;
522	u_long cr0, cr4;
523	int i;
524
525	/* Set default PAT index table. */
526	for (i = 0; i < PAT_INDEX_SIZE; i++)
527		pat_table[i] = -1;
528	pat_table[PAT_WRITE_BACK] = 0;
529	pat_table[PAT_WRITE_THROUGH] = 1;
530	pat_table[PAT_UNCACHEABLE] = 3;
531	pat_table[PAT_WRITE_COMBINING] = 3;
532	pat_table[PAT_WRITE_PROTECTED] = 3;
533	pat_table[PAT_UNCACHED] = 3;
534
535	/* Bail if this CPU doesn't implement PAT. */
536	if ((cpu_feature & CPUID_PAT) == 0) {
537		for (i = 0; i < PAT_INDEX_SIZE; i++)
538			pat_index[i] = pat_table[i];
539		pat_works = 0;
540		return;
541	}
542
543	/*
544	 * Due to some Intel errata, we can only safely use the lower 4
545	 * PAT entries.
546	 *
547	 *   Intel Pentium III Processor Specification Update
548	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
549	 * or Mode C Paging)
550	 *
551	 *   Intel Pentium IV  Processor Specification Update
552	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
553	 */
554	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
555	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
556		pat_works = 0;
557
558	/* Initialize default PAT entries. */
559	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
560	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
561	    PAT_VALUE(2, PAT_UNCACHED) |
562	    PAT_VALUE(3, PAT_UNCACHEABLE) |
563	    PAT_VALUE(4, PAT_WRITE_BACK) |
564	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
565	    PAT_VALUE(6, PAT_UNCACHED) |
566	    PAT_VALUE(7, PAT_UNCACHEABLE);
567
568	if (pat_works) {
569		/*
570		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
571		 * Program 5 and 6 as WP and WC.
572		 * Leave 4 and 7 as WB and UC.
573		 */
574		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
575		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
576		    PAT_VALUE(6, PAT_WRITE_COMBINING);
577		pat_table[PAT_UNCACHED] = 2;
578		pat_table[PAT_WRITE_PROTECTED] = 5;
579		pat_table[PAT_WRITE_COMBINING] = 6;
580	} else {
581		/*
582		 * Just replace PAT Index 2 with WC instead of UC-.
583		 */
584		pat_msr &= ~PAT_MASK(2);
585		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
586		pat_table[PAT_WRITE_COMBINING] = 2;
587	}
588
589	/* Disable PGE. */
590	cr4 = rcr4();
591	load_cr4(cr4 & ~CR4_PGE);
592
593	/* Disable caches (CD = 1, NW = 0). */
594	cr0 = rcr0();
595	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
596
597	/* Flushes caches and TLBs. */
598	wbinvd();
599	invltlb();
600
601	/* Update PAT and index table. */
602	wrmsr(MSR_PAT, pat_msr);
603	for (i = 0; i < PAT_INDEX_SIZE; i++)
604		pat_index[i] = pat_table[i];
605
606	/* Flush caches and TLBs again. */
607	wbinvd();
608	invltlb();
609
610	/* Restore caches and PGE. */
611	load_cr0(cr0);
612	load_cr4(cr4);
613}
614
615/*
616 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
617 */
618static void
619pmap_set_pg(void)
620{
621	pt_entry_t *pte;
622	vm_offset_t va, endva;
623
624	if (pgeflag == 0)
625		return;
626
627	endva = KERNBASE + KERNend;
628
629	if (pseflag) {
630		va = KERNBASE + KERNLOAD;
631		while (va  < endva) {
632			pdir_pde(PTD, va) |= pgeflag;
633			invltlb();	/* Play it safe, invltlb() every time */
634			va += NBPDR;
635		}
636	} else {
637		va = (vm_offset_t)btext;
638		while (va < endva) {
639			pte = vtopte(va);
640			if (*pte)
641				*pte |= pgeflag;
642			invltlb();	/* Play it safe, invltlb() every time */
643			va += PAGE_SIZE;
644		}
645	}
646}
647
648/*
649 * Initialize a vm_page's machine-dependent fields.
650 */
651void
652pmap_page_init(vm_page_t m)
653{
654
655	TAILQ_INIT(&m->md.pv_list);
656	m->md.pat_mode = PAT_WRITE_BACK;
657}
658
659#ifdef PAE
660static void *
661pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
662{
663
664	/* Inform UMA that this allocator uses kernel_map/object. */
665	*flags = UMA_SLAB_KERNEL;
666	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
667	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
668}
669#endif
670
671/*
672 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
673 * Requirements:
674 *  - Must deal with pages in order to ensure that none of the PG_* bits
675 *    are ever set, PG_V in particular.
676 *  - Assumes we can write to ptes without pte_store() atomic ops, even
677 *    on PAE systems.  This should be ok.
678 *  - Assumes nothing will ever test these addresses for 0 to indicate
679 *    no mapping instead of correctly checking PG_V.
680 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
681 * Because PG_V is never set, there can be no mappings to invalidate.
682 */
683static vm_offset_t
684pmap_ptelist_alloc(vm_offset_t *head)
685{
686	pt_entry_t *pte;
687	vm_offset_t va;
688
689	va = *head;
690	if (va == 0)
691		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
692	pte = vtopte(va);
693	*head = *pte;
694	if (*head & PG_V)
695		panic("pmap_ptelist_alloc: va with PG_V set!");
696	*pte = 0;
697	return (va);
698}
699
700static void
701pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
702{
703	pt_entry_t *pte;
704
705	if (va & PG_V)
706		panic("pmap_ptelist_free: freeing va with PG_V set!");
707	pte = vtopte(va);
708	*pte = *head;		/* virtual! PG_V is 0 though */
709	*head = va;
710}
711
712static void
713pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
714{
715	int i;
716	vm_offset_t va;
717
718	*head = 0;
719	for (i = npages - 1; i >= 0; i--) {
720		va = (vm_offset_t)base + i * PAGE_SIZE;
721		pmap_ptelist_free(head, va);
722	}
723}
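
#if 0
/*
 * Minimal usage sketch of the ptelist allocator above (the "example_*"
 * names are hypothetical).  Mirroring what pmap_init() does for pv chunks,
 * a caller reserves a block of unmapped KVA, threads the freelist through
 * its invalid PTEs, and then hands out and returns pages one at a time:
 */
static vm_offset_t example_freelist;

static void
example_ptelist_usage(void)
{
	vm_offset_t example_va;
	void *example_base;

	example_base = (void *)kva_alloc(4 * PAGE_SIZE);
	pmap_ptelist_init(&example_freelist, example_base, 4);
	example_va = pmap_ptelist_alloc(&example_freelist);
	/* ... map something at example_va, use it, then unmap it ... */
	pmap_ptelist_free(&example_freelist, example_va);
}
#endif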
724
725
726/*
727 *	Initialize the pmap module.
728 *	Called by vm_init, to initialize any structures that the pmap
729 *	system needs to map virtual memory.
730 */
731void
732pmap_init(void)
733{
734	vm_page_t mpte;
735	vm_size_t s;
736	int i, pv_npg;
737
738	/*
739	 * Initialize the vm page array entries for the kernel pmap's
740	 * page table pages.
741	 */
742	for (i = 0; i < NKPT; i++) {
743		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
744		KASSERT(mpte >= vm_page_array &&
745		    mpte < &vm_page_array[vm_page_array_size],
746		    ("pmap_init: page table page is out of range"));
747		mpte->pindex = i + KPTDI;
748		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
749	}
750
751	/*
752	 * Initialize the address space (zone) for the pv entries.  Set a
753	 * high water mark so that the system can recover from excessive
754	 * numbers of pv entries.
755	 */
756	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
757	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
758	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
759	pv_entry_max = roundup(pv_entry_max, _NPCPV);
760	pv_entry_high_water = 9 * (pv_entry_max / 10);
761
762	/*
763	 * If the kernel is running on a virtual machine, then it must assume
764	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
765	 * be prepared for the hypervisor changing the vendor and family that
766	 * are reported by CPUID.  Consequently, the workaround for AMD Family
767	 * 10h Erratum 383 is enabled if the processor's feature set does not
768	 * include at least one feature that is only supported by older Intel
769	 * or newer AMD processors.
770	 */
771	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
772	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
773	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
774	    AMDID2_FMA4)) == 0)
775		workaround_erratum383 = 1;
776
777	/*
778	 * Are large page mappings supported and enabled?
779	 */
780	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
781	if (pseflag == 0)
782		pg_ps_enabled = 0;
783	else if (pg_ps_enabled) {
784		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
785		    ("pmap_init: can't assign to pagesizes[1]"));
786		pagesizes[1] = NBPDR;
787	}
788
789	/*
790	 * Calculate the size of the pv head table for superpages.
791	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
792	 */
793	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
794	    PAGE_SIZE) / NBPDR + 1;
795
796	/*
797	 * Allocate memory for the pv head table for superpages.
798	 */
799	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
800	s = round_page(s);
801	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
802	    M_WAITOK | M_ZERO);
803	for (i = 0; i < pv_npg; i++)
804		TAILQ_INIT(&pv_table[i].pv_list);
805
806	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
807	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
808	if (pv_chunkbase == NULL)
809		panic("pmap_init: not enough kvm for pv chunks");
810	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
811#ifdef PAE
812	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
813	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
814	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
815	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
816#endif
817}
818
819
820SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
821	"Max number of PV entries");
822SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
823	"Page share factor per proc");
824
825static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
826    "2/4MB page mapping counters");
827
828static u_long pmap_pde_demotions;
829SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
830    &pmap_pde_demotions, 0, "2/4MB page demotions");
831
832static u_long pmap_pde_mappings;
833SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
834    &pmap_pde_mappings, 0, "2/4MB page mappings");
835
836static u_long pmap_pde_p_failures;
837SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
838    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
839
840static u_long pmap_pde_promotions;
841SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
842    &pmap_pde_promotions, 0, "2/4MB page promotions");
843
844/***************************************************
845 * Low level helper routines.....
846 ***************************************************/
847
848/*
849 * Determine the appropriate bits to set in a PTE or PDE for a specified
850 * caching mode.
851 */
852int
853pmap_cache_bits(int mode, boolean_t is_pde)
854{
855	int cache_bits, pat_flag, pat_idx;
856
857	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
858		panic("Unknown caching mode %d\n", mode);
859
860	/* The PAT bit is different for PTE's and PDE's. */
861	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
862
863	/* Map the caching mode to a PAT index. */
864	pat_idx = pat_index[mode];
865
866	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
867	cache_bits = 0;
868	if (pat_idx & 0x4)
869		cache_bits |= pat_flag;
870	if (pat_idx & 0x2)
871		cache_bits |= PG_NC_PCD;
872	if (pat_idx & 0x1)
873		cache_bits |= PG_NC_PWT;
874	return (cache_bits);
875}
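
#if 0
/*
 * Illustrative sketch (the "example_*" names are hypothetical): building a
 * write-combining kernel PTE by combining a physical address with the
 * PAT/PCD/PWT bits computed by pmap_cache_bits().  With the PAT programming
 * done in pmap_init_pat(), PAT_WRITE_COMBINING maps to index 6 when
 * "pat_works" is set and to index 2 otherwise.
 */
static pt_entry_t
example_wc_pte(vm_paddr_t example_pa)
{

	return (example_pa | PG_RW | PG_V |
	    pmap_cache_bits(PAT_WRITE_COMBINING, 0));
}
#endif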
876
877/*
878 * The caller is responsible for maintaining TLB consistency.
879 */
880static void
881pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
882{
883	pd_entry_t *pde;
884	pmap_t pmap;
885	boolean_t PTD_updated;
886
887	PTD_updated = FALSE;
888	mtx_lock_spin(&allpmaps_lock);
889	LIST_FOREACH(pmap, &allpmaps, pm_list) {
890		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
891		    PG_FRAME))
892			PTD_updated = TRUE;
893		pde = pmap_pde(pmap, va);
894		pde_store(pde, newpde);
895	}
896	mtx_unlock_spin(&allpmaps_lock);
897	KASSERT(PTD_updated,
898	    ("pmap_kenter_pde: current page table is not in allpmaps"));
899}
900
901/*
902 * After changing the page size for the specified virtual address in the page
903 * table, flush the corresponding entries from the processor's TLB.  Only the
904 * calling processor's TLB is affected.
905 *
906 * The calling thread must be pinned to a processor.
907 */
908static void
909pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
910{
911	u_long cr4;
912
913	if ((newpde & PG_PS) == 0)
914		/* Demotion: flush a specific 2MB page mapping. */
915		invlpg(va);
916	else if ((newpde & PG_G) == 0)
917		/*
918		 * Promotion: flush every 4KB page mapping from the TLB
919		 * because there are too many to flush individually.
920		 */
921		invltlb();
922	else {
923		/*
924		 * Promotion: flush every 4KB page mapping from the TLB,
925		 * including any global (PG_G) mappings.
926		 */
927		cr4 = rcr4();
928		load_cr4(cr4 & ~CR4_PGE);
929		/*
930		 * Although preemption at this point could be detrimental to
931		 * performance, it would not lead to an error.  PG_G is simply
932		 * ignored if CR4.PGE is clear.  Moreover, in case this block
933		 * is re-entered, the load_cr4() either above or below will
934		 * modify CR4.PGE flushing the TLB.
935		 */
936		load_cr4(cr4 | CR4_PGE);
937	}
938}
939#ifdef SMP
940/*
941 * For SMP, these functions have to use the IPI mechanism for coherence.
942 *
943 * N.B.: Before calling any of the following TLB invalidation functions,
944 * the calling processor must ensure that all stores updating a non-
945 * kernel page table are globally performed.  Otherwise, another
946 * processor could cache an old, pre-update entry without being
947 * invalidated.  This can happen one of two ways: (1) The pmap becomes
948 * active on another processor after its pm_active field is checked by
949 * one of the following functions but before a store updating the page
950 * table is globally performed. (2) The pmap becomes active on another
951 * processor before its pm_active field is checked but due to
952 * speculative loads one of the following functions still reads the
953 * pmap as inactive on the other processor.
954 *
955 * The kernel page table is exempt because its pm_active field is
956 * immutable.  The kernel page table is always active on every
957 * processor.
958 */
959void
960pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
961{
962	cpuset_t other_cpus;
963	u_int cpuid;
964
965	sched_pin();
966	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
967		invlpg(va);
968		smp_invlpg(va);
969	} else {
970		cpuid = PCPU_GET(cpuid);
971		other_cpus = all_cpus;
972		CPU_CLR(cpuid, &other_cpus);
973		if (CPU_ISSET(cpuid, &pmap->pm_active))
974			invlpg(va);
975		CPU_AND(&other_cpus, &pmap->pm_active);
976		if (!CPU_EMPTY(&other_cpus))
977			smp_masked_invlpg(other_cpus, va);
978	}
979	sched_unpin();
980}
981
982void
983pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
984{
985	cpuset_t other_cpus;
986	vm_offset_t addr;
987	u_int cpuid;
988
989	sched_pin();
990	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
991		for (addr = sva; addr < eva; addr += PAGE_SIZE)
992			invlpg(addr);
993		smp_invlpg_range(sva, eva);
994	} else {
995		cpuid = PCPU_GET(cpuid);
996		other_cpus = all_cpus;
997		CPU_CLR(cpuid, &other_cpus);
998		if (CPU_ISSET(cpuid, &pmap->pm_active))
999			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1000				invlpg(addr);
1001		CPU_AND(&other_cpus, &pmap->pm_active);
1002		if (!CPU_EMPTY(&other_cpus))
1003			smp_masked_invlpg_range(other_cpus, sva, eva);
1004	}
1005	sched_unpin();
1006}
1007
1008void
1009pmap_invalidate_all(pmap_t pmap)
1010{
1011	cpuset_t other_cpus;
1012	u_int cpuid;
1013
1014	sched_pin();
1015	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1016		invltlb();
1017		smp_invltlb();
1018	} else {
1019		cpuid = PCPU_GET(cpuid);
1020		other_cpus = all_cpus;
1021		CPU_CLR(cpuid, &other_cpus);
1022		if (CPU_ISSET(cpuid, &pmap->pm_active))
1023			invltlb();
1024		CPU_AND(&other_cpus, &pmap->pm_active);
1025		if (!CPU_EMPTY(&other_cpus))
1026			smp_masked_invltlb(other_cpus);
1027	}
1028	sched_unpin();
1029}
1030
1031void
1032pmap_invalidate_cache(void)
1033{
1034
1035	sched_pin();
1036	wbinvd();
1037	smp_cache_flush();
1038	sched_unpin();
1039}
1040
1041struct pde_action {
1042	cpuset_t invalidate;	/* processors that invalidate their TLB */
1043	vm_offset_t va;
1044	pd_entry_t *pde;
1045	pd_entry_t newpde;
1046	u_int store;		/* processor that updates the PDE */
1047};
1048
1049static void
1050pmap_update_pde_kernel(void *arg)
1051{
1052	struct pde_action *act = arg;
1053	pd_entry_t *pde;
1054	pmap_t pmap;
1055
1056	if (act->store == PCPU_GET(cpuid)) {
1057
1058		/*
1059		 * Elsewhere, this operation requires allpmaps_lock for
1060		 * synchronization.  Here, it does not because it is being
1061		 * performed in the context of an all_cpus rendezvous.
1062		 */
1063		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1064			pde = pmap_pde(pmap, act->va);
1065			pde_store(pde, act->newpde);
1066		}
1067	}
1068}
1069
1070static void
1071pmap_update_pde_user(void *arg)
1072{
1073	struct pde_action *act = arg;
1074
1075	if (act->store == PCPU_GET(cpuid))
1076		pde_store(act->pde, act->newpde);
1077}
1078
1079static void
1080pmap_update_pde_teardown(void *arg)
1081{
1082	struct pde_action *act = arg;
1083
1084	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1085		pmap_update_pde_invalidate(act->va, act->newpde);
1086}
1087
1088/*
1089 * Change the page size for the specified virtual address in a way that
1090 * prevents any possibility of the TLB ever having two entries that map the
1091 * same virtual address using different page sizes.  This is the recommended
1092 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1093 * machine check exception for a TLB state that is improperly diagnosed as a
1094 * hardware error.
1095 */
1096static void
1097pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1098{
1099	struct pde_action act;
1100	cpuset_t active, other_cpus;
1101	u_int cpuid;
1102
1103	sched_pin();
1104	cpuid = PCPU_GET(cpuid);
1105	other_cpus = all_cpus;
1106	CPU_CLR(cpuid, &other_cpus);
1107	if (pmap == kernel_pmap)
1108		active = all_cpus;
1109	else
1110		active = pmap->pm_active;
1111	if (CPU_OVERLAP(&active, &other_cpus)) {
1112		act.store = cpuid;
1113		act.invalidate = active;
1114		act.va = va;
1115		act.pde = pde;
1116		act.newpde = newpde;
1117		CPU_SET(cpuid, &active);
1118		smp_rendezvous_cpus(active,
1119		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1120		    pmap_update_pde_kernel : pmap_update_pde_user,
1121		    pmap_update_pde_teardown, &act);
1122	} else {
1123		if (pmap == kernel_pmap)
1124			pmap_kenter_pde(va, newpde);
1125		else
1126			pde_store(pde, newpde);
1127		if (CPU_ISSET(cpuid, &active))
1128			pmap_update_pde_invalidate(va, newpde);
1129	}
1130	sched_unpin();
1131}
1132#else /* !SMP */
1133/*
1134 * Normal, non-SMP, 486+ invalidation functions.
1135 * We inline these within pmap.c for speed.
1136 */
1137PMAP_INLINE void
1138pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1139{
1140
1141	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1142		invlpg(va);
1143}
1144
1145PMAP_INLINE void
1146pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1147{
1148	vm_offset_t addr;
1149
1150	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1151		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1152			invlpg(addr);
1153}
1154
1155PMAP_INLINE void
1156pmap_invalidate_all(pmap_t pmap)
1157{
1158
1159	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1160		invltlb();
1161}
1162
1163PMAP_INLINE void
1164pmap_invalidate_cache(void)
1165{
1166
1167	wbinvd();
1168}
1169
1170static void
1171pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1172{
1173
1174	if (pmap == kernel_pmap)
1175		pmap_kenter_pde(va, newpde);
1176	else
1177		pde_store(pde, newpde);
1178	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1179		pmap_update_pde_invalidate(va, newpde);
1180}
1181#endif /* !SMP */
1182
1183#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1184
1185void
1186pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1187{
1188
1189	if (force) {
1190		sva &= ~(vm_offset_t)cpu_clflush_line_size;
1191	} else {
1192		KASSERT((sva & PAGE_MASK) == 0,
1193		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1194		KASSERT((eva & PAGE_MASK) == 0,
1195		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1196	}
1197
1198	if ((cpu_feature & CPUID_SS) != 0 && !force)
1199		; /* If "Self Snoop" is supported and allowed, do nothing. */
1200	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1201	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1202
1203#ifdef DEV_APIC
1204		/*
1205		 * XXX: Some CPUs fault, hang, or trash the local APIC
1206		 * registers if we use CLFLUSH on the local APIC
1207		 * range.  The local APIC is always uncached, so we
1208		 * don't need to flush for that range anyway.
1209		 */
1210		if (pmap_kextract(sva) == lapic_paddr)
1211			return;
1212#endif
1213		/*
1214		 * Otherwise, do per-cache line flush.  Use the mfence
1215		 * instruction to ensure that previous stores are
1216		 * included in the write-back.  The processor
1217		 * propagates flush to other processors in the cache
1218		 * coherence domain.
1219		 */
1220		mfence();
1221		for (; sva < eva; sva += cpu_clflush_line_size)
1222			clflush(sva);
1223		mfence();
1224	} else {
1225
1226		/*
1227		 * No targeted cache flush methods are supported by the CPU,
1228		 * or the supplied range is bigger than 2MB.
1229		 * Globally invalidate cache.
1230		 */
1231		pmap_invalidate_cache();
1232	}
1233}
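
#if 0
/*
 * Usage sketch (hypothetical name): flush one page-aligned range after its
 * contents or memory type change, letting pmap_invalidate_cache_range()
 * choose between self-snoop, per-line CLFLUSH, and a full cache flush.
 */
static void
example_flush_one_page(vm_offset_t example_va)
{

	pmap_invalidate_cache_range(example_va, example_va + PAGE_SIZE, FALSE);
}
#endif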
1234
1235void
1236pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1237{
1238	int i;
1239
1240	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1241	    (cpu_feature & CPUID_CLFSH) == 0) {
1242		pmap_invalidate_cache();
1243	} else {
1244		for (i = 0; i < count; i++)
1245			pmap_flush_page(pages[i]);
1246	}
1247}
1248
1249/*
1250 * Are we current address space or kernel?  N.B. We return FALSE when
1251 * a pmap's page table is in use because a kernel thread is borrowing
1252 * it.  The borrowed page table can change spontaneously, making any
1253 * dependence on its continued use subject to a race condition.
1254 */
1255static __inline int
1256pmap_is_current(pmap_t pmap)
1257{
1258
1259	return (pmap == kernel_pmap ||
1260	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1261	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1262}
1263
1264/*
1265 * If the given pmap is not the current or kernel pmap, the returned pte must
1266 * be released by passing it to pmap_pte_release().
1267 */
1268pt_entry_t *
1269pmap_pte(pmap_t pmap, vm_offset_t va)
1270{
1271	pd_entry_t newpf;
1272	pd_entry_t *pde;
1273
1274	pde = pmap_pde(pmap, va);
1275	if (*pde & PG_PS)
1276		return (pde);
1277	if (*pde != 0) {
1278		/* are we current address space or kernel? */
1279		if (pmap_is_current(pmap))
1280			return (vtopte(va));
1281		mtx_lock(&PMAP2mutex);
1282		newpf = *pde & PG_FRAME;
1283		if ((*PMAP2 & PG_FRAME) != newpf) {
1284			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1285			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1286		}
1287		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1288	}
1289	return (NULL);
1290}
1291
1292/*
1293 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1294 * being NULL.
1295 */
1296static __inline void
1297pmap_pte_release(pt_entry_t *pte)
1298{
1299
1300	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1301		mtx_unlock(&PMAP2mutex);
1302}
1303
1304/*
1305 * NB:  The sequence of updating a page table followed by accesses to the
1306 * corresponding pages is subject to the situation described in the "AMD64
1307 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
1308 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
1309 * right after modifying the PTE bits is crucial.
1310 */
1311static __inline void
1312invlcaddr(void *caddr)
1313{
1314
1315	invlpg((u_int)caddr);
1316}
1317
1318/*
1319 * Super fast pmap_pte routine best used when scanning
1320 * the pv lists.  This eliminates many coarse-grained
1321 * invltlb calls.  Note that many of the pv list
1322 * scans are across different pmaps.  It is very wasteful
1323 * to do an entire invltlb for checking a single mapping.
1324 *
1325 * If the given pmap is not the current pmap, pvh_global_lock
1326 * must be held and curthread pinned to a CPU.
1327 */
1328static pt_entry_t *
1329pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1330{
1331	pd_entry_t newpf;
1332	pd_entry_t *pde;
1333
1334	pde = pmap_pde(pmap, va);
1335	if (*pde & PG_PS)
1336		return (pde);
1337	if (*pde != 0) {
1338		/* are we current address space or kernel? */
1339		if (pmap_is_current(pmap))
1340			return (vtopte(va));
1341		rw_assert(&pvh_global_lock, RA_WLOCKED);
1342		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1343		newpf = *pde & PG_FRAME;
1344		if ((*PMAP1 & PG_FRAME) != newpf) {
1345			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1346#ifdef SMP
1347			PMAP1cpu = PCPU_GET(cpuid);
1348#endif
1349			invlcaddr(PADDR1);
1350			PMAP1changed++;
1351		} else
1352#ifdef SMP
1353		if (PMAP1cpu != PCPU_GET(cpuid)) {
1354			PMAP1cpu = PCPU_GET(cpuid);
1355			invlcaddr(PADDR1);
1356			PMAP1changedcpu++;
1357		} else
1358#endif
1359			PMAP1unchanged++;
1360		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1361	}
1362	return (0);
1363}
1364
1365/*
1366 *	Routine:	pmap_extract
1367 *	Function:
1368 *		Extract the physical page address associated
1369 *		with the given map/virtual_address pair.
1370 */
1371vm_paddr_t
1372pmap_extract(pmap_t pmap, vm_offset_t va)
1373{
1374	vm_paddr_t rtval;
1375	pt_entry_t *pte;
1376	pd_entry_t pde;
1377
1378	rtval = 0;
1379	PMAP_LOCK(pmap);
1380	pde = pmap->pm_pdir[va >> PDRSHIFT];
1381	if (pde != 0) {
1382		if ((pde & PG_PS) != 0)
1383			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1384		else {
1385			pte = pmap_pte(pmap, va);
1386			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1387			pmap_pte_release(pte);
1388		}
1389	}
1390	PMAP_UNLOCK(pmap);
1391	return (rtval);
1392}
1393
1394/*
1395 *	Routine:	pmap_extract_and_hold
1396 *	Function:
1397 *		Atomically extract and hold the physical page
1398 *		with the given pmap and virtual address pair
1399 *		if that mapping permits the given protection.
1400 */
1401vm_page_t
1402pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1403{
1404	pd_entry_t pde;
1405	pt_entry_t pte, *ptep;
1406	vm_page_t m;
1407	vm_paddr_t pa;
1408
1409	pa = 0;
1410	m = NULL;
1411	PMAP_LOCK(pmap);
1412retry:
1413	pde = *pmap_pde(pmap, va);
1414	if (pde != 0) {
1415		if (pde & PG_PS) {
1416			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1417				if (vm_page_pa_tryrelock(pmap, (pde &
1418				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1419					goto retry;
1420				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1421				    (va & PDRMASK));
1422				vm_page_hold(m);
1423			}
1424		} else {
1425			ptep = pmap_pte(pmap, va);
1426			pte = *ptep;
1427			pmap_pte_release(ptep);
1428			if (pte != 0 &&
1429			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1430				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1431				    &pa))
1432					goto retry;
1433				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1434				vm_page_hold(m);
1435			}
1436		}
1437	}
1438	PA_UNLOCK_COND(pa);
1439	PMAP_UNLOCK(pmap);
1440	return (m);
1441}
1442
1443/***************************************************
1444 * Low level mapping routines.....
1445 ***************************************************/
1446
1447/*
1448 * Add a wired page to the kva.
1449 * Note: not SMP coherent.
1450 *
1451 * This function may be used before pmap_bootstrap() is called.
1452 */
1453PMAP_INLINE void
1454pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1455{
1456	pt_entry_t *pte;
1457
1458	pte = vtopte(va);
1459	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1460}
1461
1462static __inline void
1463pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1464{
1465	pt_entry_t *pte;
1466
1467	pte = vtopte(va);
1468	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1469}
1470
1471/*
1472 * Remove a page from the kernel pagetables.
1473 * Note: not SMP coherent.
1474 *
1475 * This function may be used before pmap_bootstrap() is called.
1476 */
1477PMAP_INLINE void
1478pmap_kremove(vm_offset_t va)
1479{
1480	pt_entry_t *pte;
1481
1482	pte = vtopte(va);
1483	pte_clear(pte);
1484}
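
#if 0
/*
 * Usage sketch (hypothetical names): because pmap_kenter() and
 * pmap_kremove() are not SMP coherent, a caller that needs coherence
 * performs the TLB invalidation itself:
 */
static void
example_temporary_kva_mapping(vm_offset_t example_va, vm_paddr_t example_pa)
{

	pmap_kenter(example_va, example_pa);
	pmap_invalidate_page(kernel_pmap, example_va);
	/* ... access the frame through example_va ... */
	pmap_kremove(example_va);
	pmap_invalidate_page(kernel_pmap, example_va);
}
#endif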
1485
1486/*
1487 *	Used to map a range of physical addresses into kernel
1488 *	virtual address space.
1489 *
1490 *	The value passed in '*virt' is a suggested virtual address for
1491 *	the mapping. Architectures which can support a direct-mapped
1492 *	physical to virtual region can return the appropriate address
1493 *	within that region, leaving '*virt' unchanged. Other
1494 *	architectures should map the pages starting at '*virt' and
1495 *	update '*virt' with the first usable address after the mapped
1496 *	region.
1497 */
1498vm_offset_t
1499pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1500{
1501	vm_offset_t va, sva;
1502	vm_paddr_t superpage_offset;
1503	pd_entry_t newpde;
1504
1505	va = *virt;
1506	/*
1507	 * Does the physical address range's size and alignment permit at
1508	 * least one superpage mapping to be created?
1509	 */
1510	superpage_offset = start & PDRMASK;
1511	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1512		/*
1513		 * Increase the starting virtual address so that its alignment
1514		 * does not preclude the use of superpage mappings.
1515		 */
1516		if ((va & PDRMASK) < superpage_offset)
1517			va = (va & ~PDRMASK) + superpage_offset;
1518		else if ((va & PDRMASK) > superpage_offset)
1519			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1520	}
1521	sva = va;
1522	while (start < end) {
1523		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1524		    pseflag) {
1525			KASSERT((va & PDRMASK) == 0,
1526			    ("pmap_map: misaligned va %#x", va));
1527			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1528			pmap_kenter_pde(va, newpde);
1529			va += NBPDR;
1530			start += NBPDR;
1531		} else {
1532			pmap_kenter(va, start);
1533			va += PAGE_SIZE;
1534			start += PAGE_SIZE;
1535		}
1536	}
1537	pmap_invalidate_range(kernel_pmap, sva, va);
1538	*virt = va;
1539	return (sva);
1540}
1541
1542
1543/*
1544 * Add a list of wired pages to the kva.  This
1545 * routine is only used for temporary
1546 * kernel mappings that do not need to have
1547 * page modification or references recorded.
1548 * Note that old mappings are simply written
1549 * over.  The page *must* be wired.
1550 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1551 */
1552void
1553pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1554{
1555	pt_entry_t *endpte, oldpte, pa, *pte;
1556	vm_page_t m;
1557
1558	oldpte = 0;
1559	pte = vtopte(sva);
1560	endpte = pte + count;
1561	while (pte < endpte) {
1562		m = *ma++;
1563		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1564		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1565			oldpte |= *pte;
1566			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1567		}
1568		pte++;
1569	}
1570	if (__predict_false((oldpte & PG_V) != 0))
1571		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1572		    PAGE_SIZE);
1573}
1574
1575/*
1576 * This routine tears out page mappings from the
1577 * kernel -- it is meant only for temporary mappings.
1578 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1579 */
1580void
1581pmap_qremove(vm_offset_t sva, int count)
1582{
1583	vm_offset_t va;
1584
1585	va = sva;
1586	while (count-- > 0) {
1587		pmap_kremove(va);
1588		va += PAGE_SIZE;
1589	}
1590	pmap_invalidate_range(kernel_pmap, sva, va);
1591}
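
#if 0
/*
 * Usage sketch (hypothetical names): a pmap_qenter()/pmap_qremove() pair
 * for a temporary, wired, multi-page kernel mapping.  Unlike pmap_kenter(),
 * these issue the ranged TLB shootdown themselves (pmap_qenter() only when
 * it overwrites a valid mapping).
 */
static void
example_temporary_window(vm_offset_t example_va, vm_page_t *example_pages,
    int example_count)
{

	pmap_qenter(example_va, example_pages, example_count);
	/* ... copy data in or out through example_va ... */
	pmap_qremove(example_va, example_count);
}
#endif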
1592
1593/***************************************************
1594 * Page table page management routines.....
1595 ***************************************************/
1596static __inline void
1597pmap_free_zero_pages(struct spglist *free)
1598{
1599	vm_page_t m;
1600
1601	while ((m = SLIST_FIRST(free)) != NULL) {
1602		SLIST_REMOVE_HEAD(free, plinks.s.ss);
1603		/* Preserve the page's PG_ZERO setting. */
1604		vm_page_free_toq(m);
1605	}
1606}
1607
1608/*
1609 * Schedule the specified unused page table page to be freed.  Specifically,
1610 * add the page to the specified list of pages that will be released to the
1611 * physical memory manager after the TLB has been updated.
1612 */
1613static __inline void
1614pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1615    boolean_t set_PG_ZERO)
1616{
1617
1618	if (set_PG_ZERO)
1619		m->flags |= PG_ZERO;
1620	else
1621		m->flags &= ~PG_ZERO;
1622	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1623}
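
#if 0
/*
 * Usage sketch (hypothetical name) of the delayed-free pattern used
 * throughout this file: page table pages released while a range is torn
 * down are collected on an spglist, the TLB shootdown is issued, and only
 * then are the pages handed back to the physical memory allocator.
 */
static void
example_delayed_free(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct spglist example_free;

	SLIST_INIT(&example_free);
	/* ... remove mappings, passing &example_free to pmap_unuse_pt() ... */
	pmap_invalidate_range(pmap, sva, eva);
	pmap_free_zero_pages(&example_free);
}
#endif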
1624
1625/*
1626 * Inserts the specified page table page into the specified pmap's collection
1627 * of idle page table pages.  Each of a pmap's page table pages is responsible
1628 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1629 * ordered by this virtual address range.
1630 */
1631static __inline int
1632pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1633{
1634
1635	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1636	return (vm_radix_insert(&pmap->pm_root, mpte));
1637}
1638
1639/*
1640 * Looks for a page table page mapping the specified virtual address in the
1641 * specified pmap's collection of idle page table pages.  Returns NULL if there
1642 * is no page table page corresponding to the specified virtual address.
1643 */
1644static __inline vm_page_t
1645pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1646{
1647
1648	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1649	return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT));
1650}
1651
1652/*
1653 * Removes the specified page table page from the specified pmap's collection
1654 * of idle page table pages.  The specified page table page must be a member of
1655 * the pmap's collection.
1656 */
1657static __inline void
1658pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1659{
1660
1661	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1662	vm_radix_remove(&pmap->pm_root, mpte->pindex);
1663}
1664
1665/*
1666 * Decrements a page table page's wire count, which is used to record the
1667 * number of valid page table entries within the page.  If the wire count
1668 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1669 * page table page was unmapped and FALSE otherwise.
1670 */
1671static inline boolean_t
1672pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1673{
1674
1675	--m->wire_count;
1676	if (m->wire_count == 0) {
1677		_pmap_unwire_ptp(pmap, m, free);
1678		return (TRUE);
1679	} else
1680		return (FALSE);
1681}
1682
1683static void
1684_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1685{
1686	vm_offset_t pteva;
1687
1688	/*
1689	 * unmap the page table page
1690	 */
1691	pmap->pm_pdir[m->pindex] = 0;
1692	--pmap->pm_stats.resident_count;
1693
1694	/*
1695	 * This is a release store so that the ordinary store unmapping
1696	 * the page table page is globally performed before TLB shoot-
1697	 * down is begun.
1698	 */
1699	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1700
1701	/*
1702	 * Do an invltlb to make the invalidated mapping
1703	 * take effect immediately.
1704	 */
1705	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1706	pmap_invalidate_page(pmap, pteva);
1707
1708	/*
1709	 * Put page on a list so that it is released after
1710	 * *ALL* TLB shootdown is done
1711	 */
1712	pmap_add_delayed_free_list(m, free, TRUE);
1713}
1714
1715/*
1716 * After removing a page table entry, this routine is used to
1717 * conditionally free the page, and manage the hold/wire counts.
1718 */
1719static int
1720pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1721{
1722	pd_entry_t ptepde;
1723	vm_page_t mpte;
1724
1725	if (va >= VM_MAXUSER_ADDRESS)
1726		return (0);
1727	ptepde = *pmap_pde(pmap, va);
1728	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1729	return (pmap_unwire_ptp(pmap, mpte, free));
1730}
1731
1732/*
1733 * Initialize the pmap for the swapper process.
1734 */
1735void
1736pmap_pinit0(pmap_t pmap)
1737{
1738
1739	PMAP_LOCK_INIT(pmap);
1740	/*
1741	 * Since the page table directory is shared with the kernel pmap,
1742	 * which is already included in the list "allpmaps", this pmap does
1743	 * not need to be inserted into that list.
1744	 */
1745	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1746#ifdef PAE
1747	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1748#endif
1749	pmap->pm_root.rt_root = 0;
1750	CPU_ZERO(&pmap->pm_active);
1751	PCPU_SET(curpmap, pmap);
1752	TAILQ_INIT(&pmap->pm_pvchunk);
1753	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1754}
1755
1756/*
1757 * Initialize a preallocated and zeroed pmap structure,
1758 * such as one in a vmspace structure.
1759 */
1760int
1761pmap_pinit(pmap_t pmap)
1762{
1763	vm_page_t m, ptdpg[NPGPTD];
1764	vm_paddr_t pa;
1765	int i;
1766
1767	/*
1768	 * No need to allocate page table space yet but we do need a valid
1769	 * page directory table.
1770	 */
1771	if (pmap->pm_pdir == NULL) {
1772		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
1773		if (pmap->pm_pdir == NULL)
1774			return (0);
1775#ifdef PAE
1776		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1777		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1778		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1779		    ("pmap_pinit: pdpt misaligned"));
1780		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1781		    ("pmap_pinit: pdpt above 4g"));
1782#endif
1783		pmap->pm_root.rt_root = 0;
1784	}
1785	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1786	    ("pmap_pinit: pmap has reserved page table page(s)"));
1787
1788	/*
1789	 * allocate the page directory page(s)
1790	 */
1791	for (i = 0; i < NPGPTD;) {
1792		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1793		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1794		if (m == NULL)
1795			VM_WAIT;
1796		else {
1797			ptdpg[i++] = m;
1798		}
1799	}
1800
1801	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1802
1803	for (i = 0; i < NPGPTD; i++)
1804		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1805			pagezero(pmap->pm_pdir + (i * NPDEPG));
1806
1807	mtx_lock_spin(&allpmaps_lock);
1808	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1809	/* Copy the kernel page table directory entries. */
1810	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1811	mtx_unlock_spin(&allpmaps_lock);
1812
1813	/* Install the self-referential address mapping entries. */
1814	for (i = 0; i < NPGPTD; i++) {
1815		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1816		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1817#ifdef PAE
1818		pmap->pm_pdpt[i] = pa | PG_V;
1819#endif
1820	}
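
	/*
	 * With these self-referential entries in place, the pmap's page
	 * table pages appear as a linear array of PTEs starting at the
	 * recursive-mapping base (VM_MAXUSER_ADDRESS).  This is what
	 * vtopte() relies on when this pmap is active, and what the pteva
	 * computation in _pmap_unwire_ptp() uses.
	 */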
1821
1822	CPU_ZERO(&pmap->pm_active);
1823	TAILQ_INIT(&pmap->pm_pvchunk);
1824	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1825
1826	return (1);
1827}
1828
1829/*
1830 * This routine is called when the needed page table page is not mapped;
1831 * it allocates a new page table page and installs it in the page directory.
1832 */
1833static vm_page_t
1834_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
1835{
1836	vm_paddr_t ptepa;
1837	vm_page_t m;
1838
1839	/*
1840	 * Allocate a page table page.
1841	 */
1842	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1843	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1844		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
1845			PMAP_UNLOCK(pmap);
1846			rw_wunlock(&pvh_global_lock);
1847			VM_WAIT;
1848			rw_wlock(&pvh_global_lock);
1849			PMAP_LOCK(pmap);
1850		}
1851
1852		/*
1853		 * Indicate the need to retry.  While waiting, the page table
1854		 * page may have been allocated.
1855		 */
1856		return (NULL);
1857	}
1858	if ((m->flags & PG_ZERO) == 0)
1859		pmap_zero_page(m);
1860
1861	/*
1862	 * Map the page table page into the process address space, if
1863	 * it isn't already there.
1864	 */
1865
1866	pmap->pm_stats.resident_count++;
1867
1868	ptepa = VM_PAGE_TO_PHYS(m);
1869	pmap->pm_pdir[ptepindex] =
1870		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1871
1872	return (m);
1873}
1874
1875static vm_page_t
1876pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
1877{
1878	u_int ptepindex;
1879	pd_entry_t ptepa;
1880	vm_page_t m;
1881
1882	/*
1883	 * Calculate the page table page index.
1884	 */
1885	ptepindex = va >> PDRSHIFT;
1886retry:
1887	/*
1888	 * Get the page directory entry
1889	 */
1890	ptepa = pmap->pm_pdir[ptepindex];
1891
1892	/*
1893	 * This supports switching from a 4MB page to a
1894	 * normal 4K page.
1895	 */
1896	if (ptepa & PG_PS) {
1897		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1898		ptepa = pmap->pm_pdir[ptepindex];
1899	}
1900
1901	/*
1902	 * If the page table page is already mapped, just increment
1903	 * its wire count.
1904	 */
1905	if (ptepa) {
1906		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1907		m->wire_count++;
1908	} else {
1909		/*
1910		 * The page table page is either not mapped or has been
1911		 * deallocated; allocate a new one.
1912		 */
1913		m = _pmap_allocpte(pmap, ptepindex, flags);
1914		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
1915			goto retry;
1916	}
1917	return (m);
1918}
1919
1920
1921/***************************************************
1922 * Pmap allocation/deallocation routines.
1923 ***************************************************/
1924
1925#ifdef SMP
1926/*
1927 * Deal with an SMP shootdown of other users of the pmap that we are
1928 * trying to dispose of.  This can be a bit hairy.
1929 */
1930static cpuset_t *lazymask;
1931static u_int lazyptd;
1932static volatile u_int lazywait;
1933
1934void pmap_lazyfix_action(void);
1935
1936void
1937pmap_lazyfix_action(void)
1938{
1939
1940#ifdef COUNT_IPIS
1941	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1942#endif
1943	if (rcr3() == lazyptd)
1944		load_cr3(curpcb->pcb_cr3);
1945	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1946	atomic_store_rel_int(&lazywait, 1);
1947}
1948
1949static void
1950pmap_lazyfix_self(u_int cpuid)
1951{
1952
1953	if (rcr3() == lazyptd)
1954		load_cr3(curpcb->pcb_cr3);
1955	CPU_CLR_ATOMIC(cpuid, lazymask);
1956}
1957
1958
1959static void
1960pmap_lazyfix(pmap_t pmap)
1961{
1962	cpuset_t mymask, mask;
1963	u_int cpuid, spins;
1964	int lsb;
1965
1966	mask = pmap->pm_active;
1967	while (!CPU_EMPTY(&mask)) {
1968		spins = 50000000;
1969
1970		/* Find least significant set bit. */
1971		lsb = CPU_FFS(&mask);
1972		MPASS(lsb != 0);
1973		lsb--;
1974		CPU_SETOF(lsb, &mask);
1975		mtx_lock_spin(&smp_ipi_mtx);
1976#ifdef PAE
1977		lazyptd = vtophys(pmap->pm_pdpt);
1978#else
1979		lazyptd = vtophys(pmap->pm_pdir);
1980#endif
1981		cpuid = PCPU_GET(cpuid);
1982
1983		/* Use a cpuset just for having an easy check. */
1984		CPU_SETOF(cpuid, &mymask);
1985		if (!CPU_CMP(&mask, &mymask)) {
1986			lazymask = &pmap->pm_active;
1987			pmap_lazyfix_self(cpuid);
1988		} else {
1989			atomic_store_rel_int((u_int *)&lazymask,
1990			    (u_int)&pmap->pm_active);
1991			atomic_store_rel_int(&lazywait, 0);
1992			ipi_selected(mask, IPI_LAZYPMAP);
1993			while (lazywait == 0) {
1994				ia32_pause();
1995				if (--spins == 0)
1996					break;
1997			}
1998		}
1999		mtx_unlock_spin(&smp_ipi_mtx);
2000		if (spins == 0)
2001			printf("pmap_lazyfix: spun for 50000000\n");
2002		mask = pmap->pm_active;
2003	}
2004}
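
/*
 * In short, pmap_lazyfix() forces every CPU that is still lazily running
 * on this pmap's page tables (as recorded in pm_active) to reload its own
 * pcb_cr3 and clear itself from pm_active, either directly or via an
 * IPI_LAZYPMAP, before the page directory can be torn down.
 */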
2005
2006#else	/* SMP */
2007
2008/*
2009 * Cleaning up on a uniprocessor is easy.  We are unlikely to even
2010 * execute this code, in part because the cleanup is deferred until
2011 * the parent does a wait(2), by which time another userland process
2012 * has run.
2013 */
2014static void
2015pmap_lazyfix(pmap_t pmap)
2016{
2017	u_int cr3;
2018
2019	cr3 = vtophys(pmap->pm_pdir);
2020	if (cr3 == rcr3()) {
2021		load_cr3(curpcb->pcb_cr3);
2022		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
2023	}
2024}
2025#endif	/* SMP */
2026
2027/*
2028 * Release any resources held by the given physical map.
2029 * Called when a pmap initialized by pmap_pinit is being released.
2030 * Should only be called if the map contains no valid mappings.
2031 */
2032void
2033pmap_release(pmap_t pmap)
2034{
2035	vm_page_t m, ptdpg[NPGPTD];
2036	int i;
2037
2038	KASSERT(pmap->pm_stats.resident_count == 0,
2039	    ("pmap_release: pmap resident count %ld != 0",
2040	    pmap->pm_stats.resident_count));
2041	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2042	    ("pmap_release: pmap has reserved page table page(s)"));
2043
2044	pmap_lazyfix(pmap);
2045	mtx_lock_spin(&allpmaps_lock);
2046	LIST_REMOVE(pmap, pm_list);
2047	mtx_unlock_spin(&allpmaps_lock);
2048
2049	for (i = 0; i < NPGPTD; i++)
2050		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2051		    PG_FRAME);
2052
2053	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2054	    sizeof(*pmap->pm_pdir));
2055
2056	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2057
2058	for (i = 0; i < NPGPTD; i++) {
2059		m = ptdpg[i];
2060#ifdef PAE
2061		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2062		    ("pmap_release: got wrong ptd page"));
2063#endif
2064		m->wire_count--;
2065		atomic_subtract_int(&cnt.v_wire_count, 1);
2066		vm_page_free_zero(m);
2067	}
2068}
2069
2070static int
2071kvm_size(SYSCTL_HANDLER_ARGS)
2072{
2073	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2074
2075	return (sysctl_handle_long(oidp, &ksize, 0, req));
2076}
2077SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2078    0, 0, kvm_size, "IU", "Size of KVM");
2079
2080static int
2081kvm_free(SYSCTL_HANDLER_ARGS)
2082{
2083	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2084
2085	return (sysctl_handle_long(oidp, &kfree, 0, req));
2086}
2087SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2088    0, 0, kvm_free, "IU", "Amount of KVM free");
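
/*
 * Both values are exported read-only and can be inspected from userland,
 * e.g. with "sysctl vm.kvm_size vm.kvm_free".
 */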
2089
2090/*
2091 * Grow the kernel page table, if needed.
2092 */
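/*
 * kernel_vm_end is advanced in NBPDR steps.  For each kernel PDE that is
 * still missing, a zero-filled page table page is allocated (panicking on
 * failure), nkpt is bumped, and the new PDE is entered into KPTD and
 * passed to pmap_kenter_pde(); as the comment above pmap_promote_pde()
 * notes, kernel PDEs are replicated in each pmap.
 */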
2093void
2094pmap_growkernel(vm_offset_t addr)
2095{
2096	vm_paddr_t ptppaddr;
2097	vm_page_t nkpg;
2098	pd_entry_t newpdir;
2099
2100	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2101	addr = roundup2(addr, NBPDR);
2102	if (addr - 1 >= kernel_map->max_offset)
2103		addr = kernel_map->max_offset;
2104	while (kernel_vm_end < addr) {
2105		if (pdir_pde(PTD, kernel_vm_end)) {
2106			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2107			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2108				kernel_vm_end = kernel_map->max_offset;
2109				break;
2110			}
2111			continue;
2112		}
2113
2114		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2115		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2116		    VM_ALLOC_ZERO);
2117		if (nkpg == NULL)
2118			panic("pmap_growkernel: no memory to grow kernel");
2119
2120		nkpt++;
2121
2122		if ((nkpg->flags & PG_ZERO) == 0)
2123			pmap_zero_page(nkpg);
2124		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2125		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2126		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2127
2128		pmap_kenter_pde(kernel_vm_end, newpdir);
2129		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2130		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2131			kernel_vm_end = kernel_map->max_offset;
2132			break;
2133		}
2134	}
2135}
2136
2137
2138/***************************************************
2139 * Page management routines.
2140 ***************************************************/
2141
2142CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2143CTASSERT(_NPCM == 11);
2144CTASSERT(_NPCPV == 336);
2145
2146static __inline struct pv_chunk *
2147pv_to_chunk(pv_entry_t pv)
2148{
2149
2150	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2151}
2152
2153#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2154
2155#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2156#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2157
2158static const uint32_t pc_freemask[_NPCM] = {
2159	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2160	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2161	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2162	PC_FREE0_9, PC_FREE10
2163};
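
/*
 * Layout check: a chunk's bitmap has _NPCM (11) 32-bit words, i.e. 352
 * bits, but only _NPCPV (336) of them correspond to pv entries: words 0
 * through 9 are fully used (10 * 32 = 320) and word 10 contributes its
 * low 16 bits (320 + 16 = 336), exactly as PC_FREE0_9 and PC_FREE10
 * encode above.
 */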
2164
2165SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2166	"Current number of pv entries");
2167
2168#ifdef PV_STATS
2169static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2170
2171SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2172	"Current number of pv entry chunks");
2173SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2174	"Current number of pv entry chunks allocated");
2175SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2176	"Number of pv entry chunk frees");
2177SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2178	"Number of times tried to get a chunk page but failed.");
2179
2180static long pv_entry_frees, pv_entry_allocs;
2181static int pv_entry_spare;
2182
2183SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2184	"Current number of pv entry frees");
2185SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2186	"Current number of pv entry allocs");
2187SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2188	"Current number of spare pv entries");
2189#endif
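
/*
 * These counters are exported under the vm.pmap sysctl tree (e.g.
 * "sysctl vm.pmap.pv_entry_count"); the PV_STATS counters above exist
 * only in kernels built with PV_STATS defined.
 */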
2190
2191/*
2192 * We are in a serious low memory condition.  Resort to
2193 * drastic measures to free some pages so we can allocate
2194 * another pv entry chunk.
2195 */
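/*
 * In outline: walk the global pv_chunks list in LRU order, locking each
 * chunk's pmap (using a trylock when needed to respect the lock order
 * against locked_pmap), destroy every unwired 4 KB mapping described by
 * the chunk, and free any chunk that empties completely, returning its
 * backing page.  Failing that, a page table page freed through
 * pmap_unuse_pt() may be recycled instead.
 */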
2196static vm_page_t
2197pmap_pv_reclaim(pmap_t locked_pmap)
2198{
2199	struct pch newtail;
2200	struct pv_chunk *pc;
2201	struct md_page *pvh;
2202	pd_entry_t *pde;
2203	pmap_t pmap;
2204	pt_entry_t *pte, tpte;
2205	pv_entry_t pv;
2206	vm_offset_t va;
2207	vm_page_t m, m_pc;
2208	struct spglist free;
2209	uint32_t inuse;
2210	int bit, field, freed;
2211
2212	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2213	pmap = NULL;
2214	m_pc = NULL;
2215	SLIST_INIT(&free);
2216	TAILQ_INIT(&newtail);
2217	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2218	    SLIST_EMPTY(&free))) {
2219		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2220		if (pmap != pc->pc_pmap) {
2221			if (pmap != NULL) {
2222				pmap_invalidate_all(pmap);
2223				if (pmap != locked_pmap)
2224					PMAP_UNLOCK(pmap);
2225			}
2226			pmap = pc->pc_pmap;
2227			/* Avoid deadlock and lock recursion. */
2228			if (pmap > locked_pmap)
2229				PMAP_LOCK(pmap);
2230			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2231				pmap = NULL;
2232				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2233				continue;
2234			}
2235		}
2236
2237		/*
2238		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2239		 */
2240		freed = 0;
2241		for (field = 0; field < _NPCM; field++) {
2242			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2243			    inuse != 0; inuse &= ~(1UL << bit)) {
2244				bit = bsfl(inuse);
2245				pv = &pc->pc_pventry[field * 32 + bit];
2246				va = pv->pv_va;
2247				pde = pmap_pde(pmap, va);
2248				if ((*pde & PG_PS) != 0)
2249					continue;
2250				pte = pmap_pte(pmap, va);
2251				tpte = *pte;
2252				if ((tpte & PG_W) == 0)
2253					tpte = pte_load_clear(pte);
2254				pmap_pte_release(pte);
2255				if ((tpte & PG_W) != 0)
2256					continue;
2257				KASSERT(tpte != 0,
2258				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2259				    pmap, va));
2260				if ((tpte & PG_G) != 0)
2261					pmap_invalidate_page(pmap, va);
2262				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2263				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2264					vm_page_dirty(m);
2265				if ((tpte & PG_A) != 0)
2266					vm_page_aflag_set(m, PGA_REFERENCED);
2267				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2268				if (TAILQ_EMPTY(&m->md.pv_list) &&
2269				    (m->flags & PG_FICTITIOUS) == 0) {
2270					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2271					if (TAILQ_EMPTY(&pvh->pv_list)) {
2272						vm_page_aflag_clear(m,
2273						    PGA_WRITEABLE);
2274					}
2275				}
2276				pc->pc_map[field] |= 1UL << bit;
2277				pmap_unuse_pt(pmap, va, &free);
2278				freed++;
2279			}
2280		}
2281		if (freed == 0) {
2282			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2283			continue;
2284		}
2285		/* Every freed mapping is for a 4 KB page. */
2286		pmap->pm_stats.resident_count -= freed;
2287		PV_STAT(pv_entry_frees += freed);
2288		PV_STAT(pv_entry_spare += freed);
2289		pv_entry_count -= freed;
2290		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2291		for (field = 0; field < _NPCM; field++)
2292			if (pc->pc_map[field] != pc_freemask[field]) {
2293				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2294				    pc_list);
2295				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2296
2297				/*
2298				 * One freed pv entry in locked_pmap is
2299				 * sufficient.
2300				 */
2301				if (pmap == locked_pmap)
2302					goto out;
2303				break;
2304			}
2305		if (field == _NPCM) {
2306			PV_STAT(pv_entry_spare -= _NPCPV);
2307			PV_STAT(pc_chunk_count--);
2308			PV_STAT(pc_chunk_frees++);
2309			/* Entire chunk is free; return it. */
2310			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2311			pmap_qremove((vm_offset_t)pc, 1);
2312			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2313			break;
2314		}
2315	}
2316out:
2317	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2318	if (pmap != NULL) {
2319		pmap_invalidate_all(pmap);
2320		if (pmap != locked_pmap)
2321			PMAP_UNLOCK(pmap);
2322	}
2323	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
2324		m_pc = SLIST_FIRST(&free);
2325		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2326		/* Recycle a freed page table page. */
2327		m_pc->wire_count = 1;
2328		atomic_add_int(&cnt.v_wire_count, 1);
2329	}
2330	pmap_free_zero_pages(&free);
2331	return (m_pc);
2332}
2333
2334/*
2335 * Free the pv_entry back to the free list.
2336 */
2337static void
2338free_pv_entry(pmap_t pmap, pv_entry_t pv)
2339{
2340	struct pv_chunk *pc;
2341	int idx, field, bit;
2342
2343	rw_assert(&pvh_global_lock, RA_WLOCKED);
2344	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2345	PV_STAT(pv_entry_frees++);
2346	PV_STAT(pv_entry_spare++);
2347	pv_entry_count--;
2348	pc = pv_to_chunk(pv);
2349	idx = pv - &pc->pc_pventry[0];
2350	field = idx / 32;
2351	bit = idx % 32;
2352	pc->pc_map[field] |= 1ul << bit;
2353	for (idx = 0; idx < _NPCM; idx++)
2354		if (pc->pc_map[idx] != pc_freemask[idx]) {
2355			/*
2356			 * 98% of the time, pc is already at the head of the
2357			 * list.  If it isn't already, move it to the head.
2358			 */
2359			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2360			    pc)) {
2361				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2362				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2363				    pc_list);
2364			}
2365			return;
2366		}
2367	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2368	free_pv_chunk(pc);
2369}
2370
2371static void
2372free_pv_chunk(struct pv_chunk *pc)
2373{
2374	vm_page_t m;
2375
2376 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2377	PV_STAT(pv_entry_spare -= _NPCPV);
2378	PV_STAT(pc_chunk_count--);
2379	PV_STAT(pc_chunk_frees++);
2380	/* The entire chunk is free; return it. */
2381	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2382	pmap_qremove((vm_offset_t)pc, 1);
2383	vm_page_unwire(m, 0);
2384	vm_page_free(m);
2385	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2386}
2387
2388/*
2389 * Get a new pv_entry, allocating a new pv chunk from the system
2390 * when needed.
2391 */
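/*
 * The allocation strategy: first take a free slot from one of the pmap's
 * existing chunks on pm_pvchunk; otherwise create a new chunk by
 * allocating a page and a KVA slot from pv_vafree, falling back to
 * pmap_pv_reclaim() (or failing immediately when "try" is TRUE) if
 * either of those is unavailable.
 */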
2392static pv_entry_t
2393get_pv_entry(pmap_t pmap, boolean_t try)
2394{
2395	static const struct timeval printinterval = { 60, 0 };
2396	static struct timeval lastprint;
2397	int bit, field;
2398	pv_entry_t pv;
2399	struct pv_chunk *pc;
2400	vm_page_t m;
2401
2402	rw_assert(&pvh_global_lock, RA_WLOCKED);
2403	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2404	PV_STAT(pv_entry_allocs++);
2405	pv_entry_count++;
2406	if (pv_entry_count > pv_entry_high_water)
2407		if (ratecheck(&lastprint, &printinterval))
2408			printf("Approaching the limit on PV entries, consider "
2409			    "increasing either the vm.pmap.shpgperproc or the "
2410			    "vm.pmap.pv_entry_max tunable.\n");
2411retry:
2412	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2413	if (pc != NULL) {
2414		for (field = 0; field < _NPCM; field++) {
2415			if (pc->pc_map[field]) {
2416				bit = bsfl(pc->pc_map[field]);
2417				break;
2418			}
2419		}
2420		if (field < _NPCM) {
2421			pv = &pc->pc_pventry[field * 32 + bit];
2422			pc->pc_map[field] &= ~(1ul << bit);
2423			/* If this was the last free entry, move the chunk to the tail. */
2424			for (field = 0; field < _NPCM; field++)
2425				if (pc->pc_map[field] != 0) {
2426					PV_STAT(pv_entry_spare--);
2427					return (pv);	/* not full, return */
2428				}
2429			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2430			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2431			PV_STAT(pv_entry_spare--);
2432			return (pv);
2433		}
2434	}
2435	/*
2436	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2437	 * global lock.  If "pv_vafree" is currently non-empty, it will
2438	 * remain non-empty until pmap_ptelist_alloc() completes.
2439	 */
2440	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2441	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2442		if (try) {
2443			pv_entry_count--;
2444			PV_STAT(pc_chunk_tryfail++);
2445			return (NULL);
2446		}
2447		m = pmap_pv_reclaim(pmap);
2448		if (m == NULL)
2449			goto retry;
2450	}
2451	PV_STAT(pc_chunk_count++);
2452	PV_STAT(pc_chunk_allocs++);
2453	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2454	pmap_qenter((vm_offset_t)pc, &m, 1);
2455	pc->pc_pmap = pmap;
2456	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2457	for (field = 1; field < _NPCM; field++)
2458		pc->pc_map[field] = pc_freemask[field];
2459	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2460	pv = &pc->pc_pventry[0];
2461	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2462	PV_STAT(pv_entry_spare += _NPCPV - 1);
2463	return (pv);
2464}
2465
2466static __inline pv_entry_t
2467pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2468{
2469	pv_entry_t pv;
2470
2471	rw_assert(&pvh_global_lock, RA_WLOCKED);
2472	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2473		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2474			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2475			break;
2476		}
2477	}
2478	return (pv);
2479}
2480
2481static void
2482pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2483{
2484	struct md_page *pvh;
2485	pv_entry_t pv;
2486	vm_offset_t va_last;
2487	vm_page_t m;
2488
2489	rw_assert(&pvh_global_lock, RA_WLOCKED);
2490	KASSERT((pa & PDRMASK) == 0,
2491	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2492
2493	/*
2494	 * Transfer the 4mpage's pv entry for this mapping to the first
2495	 * page's pv list.
2496	 */
2497	pvh = pa_to_pvh(pa);
2498	va = trunc_4mpage(va);
2499	pv = pmap_pvh_remove(pvh, pmap, va);
2500	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2501	m = PHYS_TO_VM_PAGE(pa);
2502	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2503	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2504	va_last = va + NBPDR - PAGE_SIZE;
2505	do {
2506		m++;
2507		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2508		    ("pmap_pv_demote_pde: page %p is not managed", m));
2509		va += PAGE_SIZE;
2510		pmap_insert_entry(pmap, va, m);
2511	} while (va < va_last);
2512}
2513
2514static void
2515pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2516{
2517	struct md_page *pvh;
2518	pv_entry_t pv;
2519	vm_offset_t va_last;
2520	vm_page_t m;
2521
2522	rw_assert(&pvh_global_lock, RA_WLOCKED);
2523	KASSERT((pa & PDRMASK) == 0,
2524	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2525
2526	/*
2527	 * Transfer the first page's pv entry for this mapping to the
2528	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2529	 * to get_pv_entry(), a transfer avoids the possibility that
2530	 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
2531	 * removes one of the mappings that is being promoted.
2532	 */
2533	m = PHYS_TO_VM_PAGE(pa);
2534	va = trunc_4mpage(va);
2535	pv = pmap_pvh_remove(&m->md, pmap, va);
2536	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2537	pvh = pa_to_pvh(pa);
2538	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2539	/* Free the remaining NPTEPG - 1 pv entries. */
2540	va_last = va + NBPDR - PAGE_SIZE;
2541	do {
2542		m++;
2543		va += PAGE_SIZE;
2544		pmap_pvh_free(&m->md, pmap, va);
2545	} while (va < va_last);
2546}
2547
2548static void
2549pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2550{
2551	pv_entry_t pv;
2552
2553	pv = pmap_pvh_remove(pvh, pmap, va);
2554	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2555	free_pv_entry(pmap, pv);
2556}
2557
2558static void
2559pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2560{
2561	struct md_page *pvh;
2562
2563	rw_assert(&pvh_global_lock, RA_WLOCKED);
2564	pmap_pvh_free(&m->md, pmap, va);
2565	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2566		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2567		if (TAILQ_EMPTY(&pvh->pv_list))
2568			vm_page_aflag_clear(m, PGA_WRITEABLE);
2569	}
2570}
2571
2572/*
2573 * Create a pv entry for the page at pa, recording the mapping
2574 * (pmap, va).
2575 */
2576static void
2577pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2578{
2579	pv_entry_t pv;
2580
2581	rw_assert(&pvh_global_lock, RA_WLOCKED);
2582	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2583	pv = get_pv_entry(pmap, FALSE);
2584	pv->pv_va = va;
2585	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2586}
2587
2588/*
2589 * Conditionally create a pv entry.
2590 */
2591static boolean_t
2592pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2593{
2594	pv_entry_t pv;
2595
2596	rw_assert(&pvh_global_lock, RA_WLOCKED);
2597	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2598	if (pv_entry_count < pv_entry_high_water &&
2599	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2600		pv->pv_va = va;
2601		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2602		return (TRUE);
2603	} else
2604		return (FALSE);
2605}
2606
2607/*
2608 * Conditionally create a pv entry for a 2- or 4MB page mapping.
2609 */
2610static boolean_t
2611pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2612{
2613	struct md_page *pvh;
2614	pv_entry_t pv;
2615
2616	rw_assert(&pvh_global_lock, RA_WLOCKED);
2617	if (pv_entry_count < pv_entry_high_water &&
2618	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2619		pv->pv_va = va;
2620		pvh = pa_to_pvh(pa);
2621		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2622		return (TRUE);
2623	} else
2624		return (FALSE);
2625}
2626
2627/*
2628 * Fills a page table page with mappings to consecutive physical pages.
2629 */
2630static void
2631pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2632{
2633	pt_entry_t *pte;
2634
2635	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2636		*pte = newpte;
2637		newpte += PAGE_SIZE;
2638	}
2639}
2640
2641/*
2642 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2643 * 2- or 4MB page mapping is invalidated.
2644 */
2645static boolean_t
2646pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2647{
2648	pd_entry_t newpde, oldpde;
2649	pt_entry_t *firstpte, newpte;
2650	vm_paddr_t mptepa;
2651	vm_page_t mpte;
2652	struct spglist free;
2653
2654	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2655	oldpde = *pde;
2656	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2657	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2658	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
2659	    NULL)
2660		pmap_remove_pt_page(pmap, mpte);
2661	else {
2662		KASSERT((oldpde & PG_W) == 0,
2663		    ("pmap_demote_pde: page table page for a wired mapping"
2664		    " is missing"));
2665
2666		/*
2667		 * Invalidate the 2- or 4MB page mapping and return
2668		 * "failure" if the mapping was never accessed or the
2669		 * allocation of the new page table page fails.
2670		 */
2671		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2672		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2673		    VM_ALLOC_WIRED)) == NULL) {
2674			SLIST_INIT(&free);
2675			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2676			pmap_invalidate_page(pmap, trunc_4mpage(va));
2677			pmap_free_zero_pages(&free);
2678			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2679			    " in pmap %p", va, pmap);
2680			return (FALSE);
2681		}
2682		if (va < VM_MAXUSER_ADDRESS)
2683			pmap->pm_stats.resident_count++;
2684	}
2685	mptepa = VM_PAGE_TO_PHYS(mpte);
2686
2687	/*
2688	 * If the page mapping is in the kernel's address space, then the
2689	 * KPTmap can provide access to the page table page.  Otherwise,
2690	 * temporarily map the page table page (mpte) into the kernel's
2691	 * address space at either PADDR1 or PADDR2.
2692	 */
2693	if (va >= KERNBASE)
2694		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2695	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2696		if ((*PMAP1 & PG_FRAME) != mptepa) {
2697			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2698#ifdef SMP
2699			PMAP1cpu = PCPU_GET(cpuid);
2700#endif
2701			invlcaddr(PADDR1);
2702			PMAP1changed++;
2703		} else
2704#ifdef SMP
2705		if (PMAP1cpu != PCPU_GET(cpuid)) {
2706			PMAP1cpu = PCPU_GET(cpuid);
2707			invlcaddr(PADDR1);
2708			PMAP1changedcpu++;
2709		} else
2710#endif
2711			PMAP1unchanged++;
2712		firstpte = PADDR1;
2713	} else {
2714		mtx_lock(&PMAP2mutex);
2715		if ((*PMAP2 & PG_FRAME) != mptepa) {
2716			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2717			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2718		}
2719		firstpte = PADDR2;
2720	}
2721	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2722	KASSERT((oldpde & PG_A) != 0,
2723	    ("pmap_demote_pde: oldpde is missing PG_A"));
2724	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2725	    ("pmap_demote_pde: oldpde is missing PG_M"));
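
	/*
	 * Build the PTE template from the old PDE.  The 4KB and 2/4MB
	 * formats keep the PAT index bit in different positions
	 * (PG_PTE_PAT vs. PG_PDE_PAT), so move it over if it was set.
	 */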
2726	newpte = oldpde & ~PG_PS;
2727	if ((newpte & PG_PDE_PAT) != 0)
2728		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2729
2730	/*
2731	 * If the page table page is new, initialize it.
2732	 */
2733	if (mpte->wire_count == 1) {
2734		mpte->wire_count = NPTEPG;
2735		pmap_fill_ptp(firstpte, newpte);
2736	}
2737	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2738	    ("pmap_demote_pde: firstpte and newpte map different physical"
2739	    " addresses"));
2740
2741	/*
2742	 * If the mapping has changed attributes, update the page table
2743	 * entries.
2744	 */
2745	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2746		pmap_fill_ptp(firstpte, newpte);
2747
2748	/*
2749	 * Demote the mapping.  This pmap is locked.  The old PDE has
2750	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2751	 * set.  Thus, there is no danger of a race with another
2752	 * processor changing the setting of PG_A and/or PG_M between
2753	 * the read above and the store below.
2754	 */
2755	if (workaround_erratum383)
2756		pmap_update_pde(pmap, va, pde, newpde);
2757	else if (pmap == kernel_pmap)
2758		pmap_kenter_pde(va, newpde);
2759	else
2760		pde_store(pde, newpde);
2761	if (firstpte == PADDR2)
2762		mtx_unlock(&PMAP2mutex);
2763
2764	/*
2765	 * Invalidate the recursive mapping of the page table page.
2766	 */
2767	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2768
2769	/*
2770	 * Demote the pv entry.  This depends on the earlier demotion
2771	 * of the mapping.  Specifically, the (re)creation of a per-
2772	 * page pv entry might trigger the execution of pmap_pv_reclaim(),
2773	 * which might reclaim a newly (re)created per-page pv entry
2774	 * and destroy the associated mapping.  In order to destroy
2775	 * the mapping, the PDE must have already changed from mapping
2776	 * the 2- or 4MB page to referencing the page table page.
2777	 */
2778	if ((oldpde & PG_MANAGED) != 0)
2779		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2780
2781	pmap_pde_demotions++;
2782	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2783	    " in pmap %p", va, pmap);
2784	return (TRUE);
2785}
2786
2787/*
2788 * Removes a 2- or 4MB page mapping from the kernel pmap.
2789 */
2790static void
2791pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2792{
2793	pd_entry_t newpde;
2794	vm_paddr_t mptepa;
2795	vm_page_t mpte;
2796
2797	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2798	mpte = pmap_lookup_pt_page(pmap, va);
2799	if (mpte == NULL)
2800		panic("pmap_remove_kernel_pde: Missing pt page.");
2801
2802	pmap_remove_pt_page(pmap, mpte);
2803	mptepa = VM_PAGE_TO_PHYS(mpte);
2804	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
2805
2806	/*
2807	 * Initialize the page table page.
2808	 */
2809	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
2810
2811	/*
2812	 * Remove the mapping.
2813	 */
2814	if (workaround_erratum383)
2815		pmap_update_pde(pmap, va, pde, newpde);
2816	else
2817		pmap_kenter_pde(va, newpde);
2818
2819	/*
2820	 * Invalidate the recursive mapping of the page table page.
2821	 */
2822	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2823}
2824
2825/*
2826 * pmap_remove_pde: unmap a superpage from a process's address space.
2827 */
2828static void
2829pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2830    struct spglist *free)
2831{
2832	struct md_page *pvh;
2833	pd_entry_t oldpde;
2834	vm_offset_t eva, va;
2835	vm_page_t m, mpte;
2836
2837	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2838	KASSERT((sva & PDRMASK) == 0,
2839	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2840	oldpde = pte_load_clear(pdq);
2841	if (oldpde & PG_W)
2842		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2843
2844	/*
2845	 * Machines that don't support invlpg also don't support
2846	 * PG_G.
2847	 */
2848	if (oldpde & PG_G)
2849		pmap_invalidate_page(kernel_pmap, sva);
2850	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2851	if (oldpde & PG_MANAGED) {
2852		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2853		pmap_pvh_free(pvh, pmap, sva);
2854		eva = sva + NBPDR;
2855		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2856		    va < eva; va += PAGE_SIZE, m++) {
2857			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2858				vm_page_dirty(m);
2859			if (oldpde & PG_A)
2860				vm_page_aflag_set(m, PGA_REFERENCED);
2861			if (TAILQ_EMPTY(&m->md.pv_list) &&
2862			    TAILQ_EMPTY(&pvh->pv_list))
2863				vm_page_aflag_clear(m, PGA_WRITEABLE);
2864		}
2865	}
2866	if (pmap == kernel_pmap) {
2867		pmap_remove_kernel_pde(pmap, pdq, sva);
2868	} else {
2869		mpte = pmap_lookup_pt_page(pmap, sva);
2870		if (mpte != NULL) {
2871			pmap_remove_pt_page(pmap, mpte);
2872			pmap->pm_stats.resident_count--;
2873			KASSERT(mpte->wire_count == NPTEPG,
2874			    ("pmap_remove_pde: pte page wire count error"));
2875			mpte->wire_count = 0;
2876			pmap_add_delayed_free_list(mpte, free, FALSE);
2877			atomic_subtract_int(&cnt.v_wire_count, 1);
2878		}
2879	}
2880}
2881
2882/*
2883 * pmap_remove_pte: unmap a single page from a process's address space.
2884 */
2885static int
2886pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2887    struct spglist *free)
2888{
2889	pt_entry_t oldpte;
2890	vm_page_t m;
2891
2892	rw_assert(&pvh_global_lock, RA_WLOCKED);
2893	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2894	oldpte = pte_load_clear(ptq);
2895	KASSERT(oldpte != 0,
2896	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2897	if (oldpte & PG_W)
2898		pmap->pm_stats.wired_count -= 1;
2899	/*
2900	 * Machines that don't support invlpg also don't support
2901	 * PG_G.
2902	 */
2903	if (oldpte & PG_G)
2904		pmap_invalidate_page(kernel_pmap, va);
2905	pmap->pm_stats.resident_count -= 1;
2906	if (oldpte & PG_MANAGED) {
2907		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2908		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2909			vm_page_dirty(m);
2910		if (oldpte & PG_A)
2911			vm_page_aflag_set(m, PGA_REFERENCED);
2912		pmap_remove_entry(pmap, m, va);
2913	}
2914	return (pmap_unuse_pt(pmap, va, free));
2915}
2916
2917/*
2918 * Remove a single page from a process's address space.
2919 */
2920static void
2921pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
2922{
2923	pt_entry_t *pte;
2924
2925	rw_assert(&pvh_global_lock, RA_WLOCKED);
2926	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2927	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2928	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2929		return;
2930	pmap_remove_pte(pmap, pte, va, free);
2931	pmap_invalidate_page(pmap, va);
2932}
2933
2934/*
2935 *	Remove the given range of addresses from the specified map.
2936 *
2937 *	It is assumed that the start and end are properly
2938 *	rounded to the page size.
2939 */
2940void
2941pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2942{
2943	vm_offset_t pdnxt;
2944	pd_entry_t ptpaddr;
2945	pt_entry_t *pte;
2946	struct spglist free;
2947	int anyvalid;
2948
2949	/*
2950	 * Perform an unsynchronized read.  This is, however, safe.
2951	 */
2952	if (pmap->pm_stats.resident_count == 0)
2953		return;
2954
2955	anyvalid = 0;
2956	SLIST_INIT(&free);
2957
2958	rw_wlock(&pvh_global_lock);
2959	sched_pin();
2960	PMAP_LOCK(pmap);
2961
2962	/*
2963	 * Special handling for removing a single page.  This is a very
2964	 * common operation, and short-circuiting it here avoids a fair
2965	 * amount of work.
2966	 */
2967	if ((sva + PAGE_SIZE == eva) &&
2968	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2969		pmap_remove_page(pmap, sva, &free);
2970		goto out;
2971	}
2972
2973	for (; sva < eva; sva = pdnxt) {
2974		u_int pdirindex;
2975
2976		/*
2977		 * Calculate index for next page table.
2978		 */
2979		pdnxt = (sva + NBPDR) & ~PDRMASK;
2980		if (pdnxt < sva)
2981			pdnxt = eva;
2982		if (pmap->pm_stats.resident_count == 0)
2983			break;
2984
2985		pdirindex = sva >> PDRSHIFT;
2986		ptpaddr = pmap->pm_pdir[pdirindex];
2987
2988		/*
2989		 * Weed out invalid mappings. Note: we assume that the page
2990		 * directory table is always allocated, and in kernel virtual.
2991		 */
2992		if (ptpaddr == 0)
2993			continue;
2994
2995		/*
2996		 * Check for large page.
2997		 */
2998		if ((ptpaddr & PG_PS) != 0) {
2999			/*
3000			 * Are we removing the entire large page?  If not,
3001			 * demote the mapping and fall through.
3002			 */
3003			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3004				/*
3005				 * The TLB entry for a PG_G mapping is
3006				 * invalidated by pmap_remove_pde().
3007				 */
3008				if ((ptpaddr & PG_G) == 0)
3009					anyvalid = 1;
3010				pmap_remove_pde(pmap,
3011				    &pmap->pm_pdir[pdirindex], sva, &free);
3012				continue;
3013			} else if (!pmap_demote_pde(pmap,
3014			    &pmap->pm_pdir[pdirindex], sva)) {
3015				/* The large page mapping was destroyed. */
3016				continue;
3017			}
3018		}
3019
3020		/*
3021		 * Limit our scan to either the end of the va represented
3022		 * by the current page table page, or to the end of the
3023		 * range being removed.
3024		 */
3025		if (pdnxt > eva)
3026			pdnxt = eva;
3027
3028		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3029		    sva += PAGE_SIZE) {
3030			if (*pte == 0)
3031				continue;
3032
3033			/*
3034			 * The TLB entry for a PG_G mapping is invalidated
3035			 * by pmap_remove_pte().
3036			 */
3037			if ((*pte & PG_G) == 0)
3038				anyvalid = 1;
3039			if (pmap_remove_pte(pmap, pte, sva, &free))
3040				break;
3041		}
3042	}
3043out:
3044	sched_unpin();
3045	if (anyvalid)
3046		pmap_invalidate_all(pmap);
3047	rw_wunlock(&pvh_global_lock);
3048	PMAP_UNLOCK(pmap);
3049	pmap_free_zero_pages(&free);
3050}
3051
3052/*
3053 *	Routine:	pmap_remove_all
3054 *	Function:
3055 *		Removes this physical page from
3056 *		all physical maps in which it resides.
3057 *		Reflects back modify bits to the pager.
3058 *
3059 *	Notes:
3060 *		Original versions of this routine were very
3061 *		pmap_remove, which is slow.
3062 *		pmap_remove (slow...)
3063 */
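/*
 * The removal proceeds in two passes: any 2- or 4MB mappings on the
 * 4mpage's pv list are first demoted to 4KB mappings, and then every
 * remaining 4KB mapping on the page's own pv list is torn down.
 */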
3064
3065void
3066pmap_remove_all(vm_page_t m)
3067{
3068	struct md_page *pvh;
3069	pv_entry_t pv;
3070	pmap_t pmap;
3071	pt_entry_t *pte, tpte;
3072	pd_entry_t *pde;
3073	vm_offset_t va;
3074	struct spglist free;
3075
3076	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3077	    ("pmap_remove_all: page %p is not managed", m));
3078	SLIST_INIT(&free);
3079	rw_wlock(&pvh_global_lock);
3080	sched_pin();
3081	if ((m->flags & PG_FICTITIOUS) != 0)
3082		goto small_mappings;
3083	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3084	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3085		va = pv->pv_va;
3086		pmap = PV_PMAP(pv);
3087		PMAP_LOCK(pmap);
3088		pde = pmap_pde(pmap, va);
3089		(void)pmap_demote_pde(pmap, pde, va);
3090		PMAP_UNLOCK(pmap);
3091	}
3092small_mappings:
3093	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3094		pmap = PV_PMAP(pv);
3095		PMAP_LOCK(pmap);
3096		pmap->pm_stats.resident_count--;
3097		pde = pmap_pde(pmap, pv->pv_va);
3098		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3099		    " a 4mpage in page %p's pv list", m));
3100		pte = pmap_pte_quick(pmap, pv->pv_va);
3101		tpte = pte_load_clear(pte);
3102		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3103		    pmap, pv->pv_va));
3104		if (tpte & PG_W)
3105			pmap->pm_stats.wired_count--;
3106		if (tpte & PG_A)
3107			vm_page_aflag_set(m, PGA_REFERENCED);
3108
3109		/*
3110		 * Update the vm_page_t clean and reference bits.
3111		 */
3112		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3113			vm_page_dirty(m);
3114		pmap_unuse_pt(pmap, pv->pv_va, &free);
3115		pmap_invalidate_page(pmap, pv->pv_va);
3116		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3117		free_pv_entry(pmap, pv);
3118		PMAP_UNLOCK(pmap);
3119	}
3120	vm_page_aflag_clear(m, PGA_WRITEABLE);
3121	sched_unpin();
3122	rw_wunlock(&pvh_global_lock);
3123	pmap_free_zero_pages(&free);
3124}
3125
3126/*
3127 * pmap_protect_pde: apply the requested protection to a 4mpage mapping.
3128 */
3129static boolean_t
3130pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3131{
3132	pd_entry_t newpde, oldpde;
3133	vm_offset_t eva, va;
3134	vm_page_t m;
3135	boolean_t anychanged;
3136
3137	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3138	KASSERT((sva & PDRMASK) == 0,
3139	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3140	anychanged = FALSE;
3141retry:
3142	oldpde = newpde = *pde;
3143	if (oldpde & PG_MANAGED) {
3144		eva = sva + NBPDR;
3145		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3146		    va < eva; va += PAGE_SIZE, m++)
3147			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3148				vm_page_dirty(m);
3149	}
3150	if ((prot & VM_PROT_WRITE) == 0)
3151		newpde &= ~(PG_RW | PG_M);
3152#ifdef PAE
3153	if ((prot & VM_PROT_EXECUTE) == 0)
3154		newpde |= pg_nx;
3155#endif
3156	if (newpde != oldpde) {
3157		if (!pde_cmpset(pde, oldpde, newpde))
3158			goto retry;
3159		if (oldpde & PG_G)
3160			pmap_invalidate_page(pmap, sva);
3161		else
3162			anychanged = TRUE;
3163	}
3164	return (anychanged);
3165}
3166
3167/*
3168 *	Set the physical protection on the
3169 *	specified range of this map as requested.
3170 */
3171void
3172pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3173{
3174	vm_offset_t pdnxt;
3175	pd_entry_t ptpaddr;
3176	pt_entry_t *pte;
3177	boolean_t anychanged, pv_lists_locked;
3178
3179	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3180	if (prot == VM_PROT_NONE) {
3181		pmap_remove(pmap, sva, eva);
3182		return;
3183	}
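	/*
	 * If the request does not revoke any permission that this pmap can
	 * enforce (write always, execute only with PAE/NX), there is
	 * nothing to do.
	 */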
3184
3185#ifdef PAE
3186	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3187	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3188		return;
3189#else
3190	if (prot & VM_PROT_WRITE)
3191		return;
3192#endif
3193
3194	if (pmap_is_current(pmap))
3195		pv_lists_locked = FALSE;
3196	else {
3197		pv_lists_locked = TRUE;
3198resume:
3199		rw_wlock(&pvh_global_lock);
3200		sched_pin();
3201	}
3202	anychanged = FALSE;
3203
3204	PMAP_LOCK(pmap);
3205	for (; sva < eva; sva = pdnxt) {
3206		pt_entry_t obits, pbits;
3207		u_int pdirindex;
3208
3209		pdnxt = (sva + NBPDR) & ~PDRMASK;
3210		if (pdnxt < sva)
3211			pdnxt = eva;
3212
3213		pdirindex = sva >> PDRSHIFT;
3214		ptpaddr = pmap->pm_pdir[pdirindex];
3215
3216		/*
3217		 * Weed out invalid mappings. Note: we assume that the page
3218		 * directory table is always allocated, and in kernel virtual.
3219		 */
3220		if (ptpaddr == 0)
3221			continue;
3222
3223		/*
3224		 * Check for large page.
3225		 */
3226		if ((ptpaddr & PG_PS) != 0) {
3227			/*
3228			 * Are we protecting the entire large page?  If not,
3229			 * demote the mapping and fall through.
3230			 */
3231			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3232				/*
3233				 * The TLB entry for a PG_G mapping is
3234				 * invalidated by pmap_protect_pde().
3235				 */
3236				if (pmap_protect_pde(pmap,
3237				    &pmap->pm_pdir[pdirindex], sva, prot))
3238					anychanged = TRUE;
3239				continue;
3240			} else {
3241				if (!pv_lists_locked) {
3242					pv_lists_locked = TRUE;
3243					if (!rw_try_wlock(&pvh_global_lock)) {
3244						if (anychanged)
3245							pmap_invalidate_all(
3246							    pmap);
3247						PMAP_UNLOCK(pmap);
3248						goto resume;
3249					}
3250					sched_pin();
3251				}
3252				if (!pmap_demote_pde(pmap,
3253				    &pmap->pm_pdir[pdirindex], sva)) {
3254					/*
3255					 * The large page mapping was
3256					 * destroyed.
3257					 */
3258					continue;
3259				}
3260			}
3261		}
3262
3263		if (pdnxt > eva)
3264			pdnxt = eva;
3265
3266		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3267		    sva += PAGE_SIZE) {
3268			vm_page_t m;
3269
3270retry:
3271			/*
3272			 * Regardless of whether a pte is 32 or 64 bits in
3273			 * size, PG_RW, PG_A, and PG_M are among the least
3274			 * significant 32 bits.
3275			 */
3276			obits = pbits = *pte;
3277			if ((pbits & PG_V) == 0)
3278				continue;
3279
3280			if ((prot & VM_PROT_WRITE) == 0) {
3281				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3282				    (PG_MANAGED | PG_M | PG_RW)) {
3283					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3284					vm_page_dirty(m);
3285				}
3286				pbits &= ~(PG_RW | PG_M);
3287			}
3288#ifdef PAE
3289			if ((prot & VM_PROT_EXECUTE) == 0)
3290				pbits |= pg_nx;
3291#endif
3292
3293			if (pbits != obits) {
3294#ifdef PAE
3295				if (!atomic_cmpset_64(pte, obits, pbits))
3296					goto retry;
3297#else
3298				if (!atomic_cmpset_int((u_int *)pte, obits,
3299				    pbits))
3300					goto retry;
3301#endif
3302				if (obits & PG_G)
3303					pmap_invalidate_page(pmap, sva);
3304				else
3305					anychanged = TRUE;
3306			}
3307		}
3308	}
3309	if (anychanged)
3310		pmap_invalidate_all(pmap);
3311	if (pv_lists_locked) {
3312		sched_unpin();
3313		rw_wunlock(&pvh_global_lock);
3314	}
3315	PMAP_UNLOCK(pmap);
3316}
3317
3318/*
3319 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3320 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3321 * For promotion to occur, two conditions must be met: (1) the 4KB page
3322 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3323 * mappings must have identical characteristics.
3324 *
3325 * Managed (PG_MANAGED) mappings within the kernel address space are not
3326 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3327 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3328 * pmap.
3329 */
3330static void
3331pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3332{
3333	pd_entry_t newpde;
3334	pt_entry_t *firstpte, oldpte, pa, *pte;
3335	vm_offset_t oldpteva;
3336	vm_page_t mpte;
3337
3338	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3339
3340	/*
3341	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3342	 * either invalid, unused, or does not map the first 4KB physical page
3343	 * within a 2- or 4MB page.
3344	 */
3345	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3346setpde:
3347	newpde = *firstpte;
3348	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3349		pmap_pde_p_failures++;
3350		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3351		    " in pmap %p", va, pmap);
3352		return;
3353	}
3354	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3355		pmap_pde_p_failures++;
3356		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3357		    " in pmap %p", va, pmap);
3358		return;
3359	}
3360	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3361		/*
3362		 * When PG_M is already clear, PG_RW can be cleared without
3363		 * a TLB invalidation.
3364		 */
3365		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3366		    ~PG_RW))
3367			goto setpde;
3368		newpde &= ~PG_RW;
3369	}
3370
3371	/*
3372	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3373	 * PTE maps an unexpected 4KB physical page or does not have identical
3374	 * characteristics to the first PTE.
3375	 */
3376	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3377	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3378setpte:
3379		oldpte = *pte;
3380		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3381			pmap_pde_p_failures++;
3382			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3383			    " in pmap %p", va, pmap);
3384			return;
3385		}
3386		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3387			/*
3388			 * When PG_M is already clear, PG_RW can be cleared
3389			 * without a TLB invalidation.
3390			 */
3391			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3392			    oldpte & ~PG_RW))
3393				goto setpte;
3394			oldpte &= ~PG_RW;
3395			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3396			    (va & ~PDRMASK);
3397			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3398			    " in pmap %p", oldpteva, pmap);
3399		}
3400		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3401			pmap_pde_p_failures++;
3402			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3403			    " in pmap %p", va, pmap);
3404			return;
3405		}
3406		pa -= PAGE_SIZE;
3407	}
3408
3409	/*
3410	 * Save the page table page in its current state until the PDE
3411	 * mapping the superpage is demoted by pmap_demote_pde() or
3412	 * destroyed by pmap_remove_pde().
3413	 */
3414	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3415	KASSERT(mpte >= vm_page_array &&
3416	    mpte < &vm_page_array[vm_page_array_size],
3417	    ("pmap_promote_pde: page table page is out of range"));
3418	KASSERT(mpte->pindex == va >> PDRSHIFT,
3419	    ("pmap_promote_pde: page table page's pindex is wrong"));
3420	if (pmap_insert_pt_page(pmap, mpte)) {
3421		pmap_pde_p_failures++;
3422		CTR2(KTR_PMAP,
3423		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
3424		    pmap);
3425		return;
3426	}
3427
3428	/*
3429	 * Promote the pv entries.
3430	 */
3431	if ((newpde & PG_MANAGED) != 0)
3432		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3433
3434	/*
3435	 * Propagate the PAT index to its proper position.
3436	 */
3437	if ((newpde & PG_PTE_PAT) != 0)
3438		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3439
3440	/*
3441	 * Map the superpage.
3442	 */
3443	if (workaround_erratum383)
3444		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3445	else if (pmap == kernel_pmap)
3446		pmap_kenter_pde(va, PG_PS | newpde);
3447	else
3448		pde_store(pde, PG_PS | newpde);
3449
3450	pmap_pde_promotions++;
3451	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3452	    " in pmap %p", va, pmap);
3453}
3454
3455/*
3456 *	Insert the given physical page (p) at
3457 *	the specified virtual address (v) in the
3458 *	target physical map with the protection requested.
3459 *
3460 *	If specified, the page will be wired down, meaning
3461 *	that the related pte cannot be reclaimed.
3462 *
3463 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3464 *	or lose information.  That is, this routine must actually
3465 *	insert this page into the given map NOW.
3466 */
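/*
 * In outline: for user addresses the page table page is allocated (or its
 * wire count bumped) first; then either the existing PTE is updated in
 * place when only the protection or wiring changes, or the old mapping is
 * torn down, the page is entered on its pv list if managed, and a new PTE
 * is constructed and stored.  Finally, promotion is attempted when both
 * the page table page and the reservation are fully populated.
 */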
3467int
3468pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3469    u_int flags, int8_t psind)
3470{
3471	pd_entry_t *pde;
3472	pt_entry_t *pte;
3473	pt_entry_t newpte, origpte;
3474	pv_entry_t pv;
3475	vm_paddr_t opa, pa;
3476	vm_page_t mpte, om;
3477	boolean_t invlva, wired;
3478
3479	va = trunc_page(va);
3480	mpte = NULL;
3481	wired = (flags & PMAP_ENTER_WIRED) != 0;
3482
3483	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3484	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3485	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3486	    va));
3487	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
3488		VM_OBJECT_ASSERT_LOCKED(m->object);
3489
3490	rw_wlock(&pvh_global_lock);
3491	PMAP_LOCK(pmap);
3492	sched_pin();
3493
3494	/*
3495	 * In the case that a page table page is not
3496	 * resident, we are creating it here.
3497	 */
3498	if (va < VM_MAXUSER_ADDRESS) {
3499		mpte = pmap_allocpte(pmap, va, flags);
3500		if (mpte == NULL) {
3501			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
3502			    ("pmap_allocpte failed with sleep allowed"));
3503			sched_unpin();
3504			rw_wunlock(&pvh_global_lock);
3505			PMAP_UNLOCK(pmap);
3506			return (KERN_RESOURCE_SHORTAGE);
3507		}
3508	}
3509
3510	pde = pmap_pde(pmap, va);
3511	if ((*pde & PG_PS) != 0)
3512		panic("pmap_enter: attempted pmap_enter on 4MB page");
3513	pte = pmap_pte_quick(pmap, va);
3514
3515	/*
3516	 * The page directory entry is not valid; we need a new PT page.
3517	 */
3518	if (pte == NULL) {
3519		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3520			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3521	}
3522
3523	pa = VM_PAGE_TO_PHYS(m);
3524	om = NULL;
3525	origpte = *pte;
3526	opa = origpte & PG_FRAME;
3527
3528	/*
3529	 * The mapping has not changed; it must be a protection or wiring change.
3530	 */
3531	if (origpte && (opa == pa)) {
3532		/*
3533		 * Wiring change, just update stats. We don't worry about
3534		 * wiring PT pages as they remain resident as long as there
3535		 * are valid mappings in them. Hence, if a user page is wired,
3536		 * the PT page will be also.
3537		 */
3538		if (wired && ((origpte & PG_W) == 0))
3539			pmap->pm_stats.wired_count++;
3540		else if (!wired && (origpte & PG_W))
3541			pmap->pm_stats.wired_count--;
3542
3543		/*
3544		 * Remove extra pte reference
3545		 */
3546		if (mpte)
3547			mpte->wire_count--;
3548
3549		if (origpte & PG_MANAGED) {
3550			om = m;
3551			pa |= PG_MANAGED;
3552		}
3553		goto validate;
3554	}
3555
3556	pv = NULL;
3557
3558	/*
3559	 * The mapping has changed; invalidate the old mapping and fall
3560	 * through to validate the new one.
3561	 */
3562	if (opa) {
3563		if (origpte & PG_W)
3564			pmap->pm_stats.wired_count--;
3565		if (origpte & PG_MANAGED) {
3566			om = PHYS_TO_VM_PAGE(opa);
3567			pv = pmap_pvh_remove(&om->md, pmap, va);
3568		}
3569		if (mpte != NULL) {
3570			mpte->wire_count--;
3571			KASSERT(mpte->wire_count > 0,
3572			    ("pmap_enter: missing reference to page table page,"
3573			     " va: 0x%x", va));
3574		}
3575	} else
3576		pmap->pm_stats.resident_count++;
3577
3578	/*
3579	 * Enter on the PV list if part of our managed memory.
3580	 */
3581	if ((m->oflags & VPO_UNMANAGED) == 0) {
3582		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3583		    ("pmap_enter: managed mapping within the clean submap"));
3584		if (pv == NULL)
3585			pv = get_pv_entry(pmap, FALSE);
3586		pv->pv_va = va;
3587		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3588		pa |= PG_MANAGED;
3589	} else if (pv != NULL)
3590		free_pv_entry(pmap, pv);
3591
3592	/*
3593	 * Increment counters
3594	 */
3595	if (wired)
3596		pmap->pm_stats.wired_count++;
3597
3598validate:
3599	/*
3600	 * Now validate mapping with desired protection/wiring.
3601	 */
3602	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3603	if ((prot & VM_PROT_WRITE) != 0) {
3604		newpte |= PG_RW;
3605		if ((newpte & PG_MANAGED) != 0)
3606			vm_page_aflag_set(m, PGA_WRITEABLE);
3607	}
3608#ifdef PAE
3609	if ((prot & VM_PROT_EXECUTE) == 0)
3610		newpte |= pg_nx;
3611#endif
3612	if (wired)
3613		newpte |= PG_W;
3614	if (va < VM_MAXUSER_ADDRESS)
3615		newpte |= PG_U;
3616	if (pmap == kernel_pmap)
3617		newpte |= pgeflag;
3618
3619	/*
3620	 * If the mapping or permission bits are different, we need
3621	 * to update the pte.
3622	 */
3623	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3624		newpte |= PG_A;
3625		if ((flags & VM_PROT_WRITE) != 0)
3626			newpte |= PG_M;
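		/*
		 * If a valid mapping is being replaced, swap in the new PTE
		 * and use the old contents to decide whether the TLB entry
		 * must be invalidated (changed physical address, newly set
		 * NX, or write permission revoked from a dirty mapping).
		 */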
3627		if (origpte & PG_V) {
3628			invlva = FALSE;
3629			origpte = pte_load_store(pte, newpte);
3630			if (origpte & PG_A) {
3631				if (origpte & PG_MANAGED)
3632					vm_page_aflag_set(om, PGA_REFERENCED);
3633				if (opa != VM_PAGE_TO_PHYS(m))
3634					invlva = TRUE;
3635#ifdef PAE
3636				if ((origpte & PG_NX) == 0 &&
3637				    (newpte & PG_NX) != 0)
3638					invlva = TRUE;
3639#endif
3640			}
3641			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3642				if ((origpte & PG_MANAGED) != 0)
3643					vm_page_dirty(om);
3644				if ((prot & VM_PROT_WRITE) == 0)
3645					invlva = TRUE;
3646			}
3647			if ((origpte & PG_MANAGED) != 0 &&
3648			    TAILQ_EMPTY(&om->md.pv_list) &&
3649			    ((om->flags & PG_FICTITIOUS) != 0 ||
3650			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3651				vm_page_aflag_clear(om, PGA_WRITEABLE);
3652			if (invlva)
3653				pmap_invalidate_page(pmap, va);
3654		} else
3655			pte_store(pte, newpte);
3656	}
3657
3658	/*
3659	 * If both the page table page and the reservation are fully
3660	 * populated, then attempt promotion.
3661	 */
3662	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3663	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3664	    vm_reserv_level_iffullpop(m) == 0)
3665		pmap_promote_pde(pmap, pde, va);
3666
3667	sched_unpin();
3668	rw_wunlock(&pvh_global_lock);
3669	PMAP_UNLOCK(pmap);
3670	return (KERN_SUCCESS);
3671}
3672
3673/*
3674 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3675 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3676 * blocking, (2) a mapping already exists at the specified virtual address, or
3677 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3678 */
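/*
 * This path is used by pmap_enter_object() below for superpage-aligned
 * runs of resident pages (m->psind == 1) when pg_ps_enabled is set.
 */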
3679static boolean_t
3680pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3681{
3682	pd_entry_t *pde, newpde;
3683
3684	rw_assert(&pvh_global_lock, RA_WLOCKED);
3685	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3686	pde = pmap_pde(pmap, va);
3687	if (*pde != 0) {
3688		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3689		    " in pmap %p", va, pmap);
3690		return (FALSE);
3691	}
3692	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3693	    PG_PS | PG_V;
3694	if ((m->oflags & VPO_UNMANAGED) == 0) {
3695		newpde |= PG_MANAGED;
3696
3697		/*
3698		 * Abort this mapping if its PV entry could not be created.
3699		 */
3700		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3701			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3702			    " in pmap %p", va, pmap);
3703			return (FALSE);
3704		}
3705	}
3706#ifdef PAE
3707	if ((prot & VM_PROT_EXECUTE) == 0)
3708		newpde |= pg_nx;
3709#endif
3710	if (va < VM_MAXUSER_ADDRESS)
3711		newpde |= PG_U;
3712
3713	/*
3714	 * Increment counters.
3715	 */
3716	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3717
3718	/*
3719	 * Map the superpage.
3720	 */
3721	pde_store(pde, newpde);
3722
3723	pmap_pde_mappings++;
3724	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3725	    " in pmap %p", va, pmap);
3726	return (TRUE);
3727}
3728
3729/*
3730 * Maps a sequence of resident pages belonging to the same object.
3731 * The sequence begins with the given page m_start.  This page is
3732 * mapped at the given virtual address start.  Each subsequent page is
3733 * mapped at a virtual address that is offset from start by the same
3734 * amount as the page is offset from m_start within the object.  The
3735 * last page in the sequence is the page with the largest offset from
3736 * m_start that can be mapped at a virtual address less than the given
3737 * virtual address end.  Not every virtual page between start and end
3738 * is mapped; only those for which a resident page exists with the
3739 * corresponding offset from m_start are mapped.
3740 */
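/*
 * For example, a page whose pindex is m_start->pindex + 3 is mapped at
 * start + 3 * PAGE_SIZE, provided that address is still below "end".
 */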
3741void
3742pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3743    vm_page_t m_start, vm_prot_t prot)
3744{
3745	vm_offset_t va;
3746	vm_page_t m, mpte;
3747	vm_pindex_t diff, psize;
3748
3749	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3750
3751	psize = atop(end - start);
3752	mpte = NULL;
3753	m = m_start;
3754	rw_wlock(&pvh_global_lock);
3755	PMAP_LOCK(pmap);
3756	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3757		va = start + ptoa(diff);
3758		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3759		    m->psind == 1 && pg_ps_enabled &&
3760		    pmap_enter_pde(pmap, va, m, prot))
3761			m = &m[NBPDR / PAGE_SIZE - 1];
3762		else
3763			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3764			    mpte);
3765		m = TAILQ_NEXT(m, listq);
3766	}
3767	rw_wunlock(&pvh_global_lock);
3768	PMAP_UNLOCK(pmap);
3769}
3770
3771/*
3772 * This code makes some *MAJOR* assumptions:
3773 * 1. The pmap is the current pmap and it exists.
3774 * 2. The mapping is not wired.
3775 * 3. Only read access is needed.
3776 * 4. No page table pages.
3777 * but it is *MUCH* faster than pmap_enter...
3778 */
3779
3780void
3781pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3782{
3783
3784	rw_wlock(&pvh_global_lock);
3785	PMAP_LOCK(pmap);
3786	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3787	rw_wunlock(&pvh_global_lock);
3788	PMAP_UNLOCK(pmap);
3789}
3790
3791static vm_page_t
3792pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3793    vm_prot_t prot, vm_page_t mpte)
3794{
3795	pt_entry_t *pte;
3796	vm_paddr_t pa;
3797	struct spglist free;
3798
3799	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3800	    (m->oflags & VPO_UNMANAGED) != 0,
3801	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3802	rw_assert(&pvh_global_lock, RA_WLOCKED);
3803	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3804
3805	/*
3806	 * In the case that a page table page is not
3807	 * resident, we are creating it here.
3808	 */
3809	if (va < VM_MAXUSER_ADDRESS) {
3810		u_int ptepindex;
3811		pd_entry_t ptepa;
3812
3813		/*
3814		 * Calculate pagetable page index
3815		 */
3816		ptepindex = va >> PDRSHIFT;
3817		if (mpte && (mpte->pindex == ptepindex)) {
3818			mpte->wire_count++;
3819		} else {
3820			/*
3821			 * Get the page directory entry
3822			 */
3823			ptepa = pmap->pm_pdir[ptepindex];
3824
3825			/*
3826			 * If the page table page is mapped, we just increment
3827			 * its wire count.
3828			 */
3829			if (ptepa) {
3830				if (ptepa & PG_PS)
3831					return (NULL);
3832				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3833				mpte->wire_count++;
3834			} else {
3835				mpte = _pmap_allocpte(pmap, ptepindex,
3836				    PMAP_ENTER_NOSLEEP);
3837				if (mpte == NULL)
3838					return (mpte);
3839			}
3840		}
3841	} else {
3842		mpte = NULL;
3843	}
3844
3845	/*
3846	 * This call to vtopte makes the assumption that we are
3847	 * entering the page into the current pmap.  In order to support
3848	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3849	 * But that isn't as quick as vtopte.
3850	 */
3851	pte = vtopte(va);
3852	if (*pte) {
3853		if (mpte != NULL) {
3854			mpte->wire_count--;
3855			mpte = NULL;
3856		}
3857		return (mpte);
3858	}
3859
3860	/*
3861	 * Enter on the PV list if part of our managed memory.
3862	 */
3863	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3864	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3865		if (mpte != NULL) {
3866			SLIST_INIT(&free);
3867			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3868				pmap_invalidate_page(pmap, va);
3869				pmap_free_zero_pages(&free);
3870			}
3871
3872			mpte = NULL;
3873		}
3874		return (mpte);
3875	}
3876
3877	/*
3878	 * Increment counters
3879	 */
3880	pmap->pm_stats.resident_count++;
3881
3882	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3883#ifdef PAE
3884	if ((prot & VM_PROT_EXECUTE) == 0)
3885		pa |= pg_nx;
3886#endif
3887
3888	/*
3889	 * Now validate mapping with RO protection
3890	 */
3891	if ((m->oflags & VPO_UNMANAGED) != 0)
3892		pte_store(pte, pa | PG_V | PG_U);
3893	else
3894		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3895	return (mpte);
3896}
3897
3898/*
3899 * Make a temporary mapping for a physical address.  This is only intended
3900 * to be used for panic dumps.
3901 */
3902void *
3903pmap_kenter_temporary(vm_paddr_t pa, int i)
3904{
3905	vm_offset_t va;
3906
3907	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3908	pmap_kenter(va, pa);
3909	invlpg(va);
3910	return ((void *)crashdumpmap);
3911}
3912
3913/*
3914 * This code maps large physical mmap regions into the
3915 * processor address space.  Some shortcuts are taken: only device memory
3916 * that is 2/4MB-aligned and physically contiguous is mapped, using superpages.
3917 */
3918void
3919pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3920    vm_pindex_t pindex, vm_size_t size)
3921{
3922	pd_entry_t *pde;
3923	vm_paddr_t pa, ptepa;
3924	vm_page_t p;
3925	int pat_mode;
3926
3927	VM_OBJECT_ASSERT_WLOCKED(object);
3928	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3929	    ("pmap_object_init_pt: non-device object"));
3930	if (pseflag &&
3931	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3932		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3933			return;
3934		p = vm_page_lookup(object, pindex);
3935		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3936		    ("pmap_object_init_pt: invalid page %p", p));
3937		pat_mode = p->md.pat_mode;
3938
3939		/*
3940		 * Abort the mapping if the first page is not physically
3941		 * aligned to a 2/4MB page boundary.
3942		 */
3943		ptepa = VM_PAGE_TO_PHYS(p);
3944		if (ptepa & (NBPDR - 1))
3945			return;
3946
3947		/*
3948		 * Skip the first page.  Abort the mapping if the rest of
3949		 * the pages are not physically contiguous or have differing
3950		 * memory attributes.
3951		 */
3952		p = TAILQ_NEXT(p, listq);
3953		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3954		    pa += PAGE_SIZE) {
3955			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3956			    ("pmap_object_init_pt: invalid page %p", p));
3957			if (pa != VM_PAGE_TO_PHYS(p) ||
3958			    pat_mode != p->md.pat_mode)
3959				return;
3960			p = TAILQ_NEXT(p, listq);
3961		}
3962
3963		/*
3964		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3965		 * "size" is a multiple of 2/4M, adding the PAT setting to
3966		 * "pa" will not affect the termination of this loop.
3967		 */
3968		PMAP_LOCK(pmap);
3969		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3970		    size; pa += NBPDR) {
3971			pde = pmap_pde(pmap, addr);
3972			if (*pde == 0) {
3973				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3974				    PG_U | PG_RW | PG_V);
3975				pmap->pm_stats.resident_count += NBPDR /
3976				    PAGE_SIZE;
3977				pmap_pde_mappings++;
3978			}
3979			/* Else continue on if the PDE is already valid. */
3980			addr += NBPDR;
3981		}
3982		PMAP_UNLOCK(pmap);
3983	}
3984}
3985
3986/*
3987 *	Clear the wired attribute from the mappings for the specified range of
3988 *	addresses in the given pmap.  Every valid mapping within that range
3989 *	must have the wired attribute set.  In contrast, invalid mappings
3990 *	cannot have the wired attribute set, so they are ignored.
3991 *
3992 *	The wired attribute of the page table entry is not a hardware feature,
3993 *	so there is no need to invalidate any TLB entries.
3994 */
3995void
3996pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3997{
3998	vm_offset_t pdnxt;
3999	pd_entry_t *pde;
4000	pt_entry_t *pte;
4001	boolean_t pv_lists_locked;
4002
4003	if (pmap_is_current(pmap))
4004		pv_lists_locked = FALSE;
4005	else {
4006		pv_lists_locked = TRUE;
4007resume:
4008		rw_wlock(&pvh_global_lock);
4009		sched_pin();
4010	}
4011	PMAP_LOCK(pmap);
4012	for (; sva < eva; sva = pdnxt) {
4013		pdnxt = (sva + NBPDR) & ~PDRMASK;
4014		if (pdnxt < sva)
4015			pdnxt = eva;
4016		pde = pmap_pde(pmap, sva);
4017		if ((*pde & PG_V) == 0)
4018			continue;
4019		if ((*pde & PG_PS) != 0) {
4020			if ((*pde & PG_W) == 0)
4021				panic("pmap_unwire: pde %#jx is missing PG_W",
4022				    (uintmax_t)*pde);
4023
4024			/*
4025			 * Are we unwiring the entire large page?  If not,
4026			 * demote the mapping and fall through.
4027			 */
4028			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
4029				/*
4030				 * Regardless of whether a pde (or pte) is 32
4031				 * or 64 bits in size, PG_W is among the least
4032				 * significant 32 bits.
4033				 */
4034				atomic_clear_int((u_int *)pde, PG_W);
4035				pmap->pm_stats.wired_count -= NBPDR /
4036				    PAGE_SIZE;
4037				continue;
4038			} else {
4039				if (!pv_lists_locked) {
4040					pv_lists_locked = TRUE;
4041					if (!rw_try_wlock(&pvh_global_lock)) {
4042						PMAP_UNLOCK(pmap);
4043						/* Repeat sva. */
4044						goto resume;
4045					}
4046					sched_pin();
4047				}
4048				if (!pmap_demote_pde(pmap, pde, sva))
4049					panic("pmap_unwire: demotion failed");
4050			}
4051		}
4052		if (pdnxt > eva)
4053			pdnxt = eva;
4054		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
4055		    sva += PAGE_SIZE) {
4056			if ((*pte & PG_V) == 0)
4057				continue;
4058			if ((*pte & PG_W) == 0)
4059				panic("pmap_unwire: pte %#jx is missing PG_W",
4060				    (uintmax_t)*pte);
4061
4062			/*
4063			 * PG_W must be cleared atomically.  Although the pmap
4064			 * lock synchronizes access to PG_W, another processor
4065			 * could be setting PG_M and/or PG_A concurrently.
4066			 *
4067			 * PG_W is among the least significant 32 bits.
4068			 */
4069			atomic_clear_int((u_int *)pte, PG_W);
4070			pmap->pm_stats.wired_count--;
4071		}
4072	}
4073	if (pv_lists_locked) {
4074		sched_unpin();
4075		rw_wunlock(&pvh_global_lock);
4076	}
4077	PMAP_UNLOCK(pmap);
4078}
4079
4080
4081/*
4082 *	Copy the range specified by src_addr/len
4083 *	from the source map to the range dst_addr/len
4084 *	in the destination map.
4085 *
4086 *	This routine is only advisory and need not do anything.
4087 */
4088
4089void
4090pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4091    vm_offset_t src_addr)
4092{
4093	struct spglist free;
4094	vm_offset_t addr;
4095	vm_offset_t end_addr = src_addr + len;
4096	vm_offset_t pdnxt;
4097
4098	if (dst_addr != src_addr)
4099		return;
4100
4101	if (!pmap_is_current(src_pmap))
4102		return;
4103
4104	rw_wlock(&pvh_global_lock);
4105	if (dst_pmap < src_pmap) {
4106		PMAP_LOCK(dst_pmap);
4107		PMAP_LOCK(src_pmap);
4108	} else {
4109		PMAP_LOCK(src_pmap);
4110		PMAP_LOCK(dst_pmap);
4111	}
4112	sched_pin();
4113	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4114		pt_entry_t *src_pte, *dst_pte;
4115		vm_page_t dstmpte, srcmpte;
4116		pd_entry_t srcptepaddr;
4117		u_int ptepindex;
4118
4119		KASSERT(addr < UPT_MIN_ADDRESS,
4120		    ("pmap_copy: invalid to pmap_copy page tables"));
4121
4122		pdnxt = (addr + NBPDR) & ~PDRMASK;
4123		if (pdnxt < addr)
4124			pdnxt = end_addr;
4125		ptepindex = addr >> PDRSHIFT;
4126
4127		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4128		if (srcptepaddr == 0)
4129			continue;
4130
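		/*
		 * A superpage mapping in the source is copied wholesale: it
		 * is entered into the destination only when the entire 2/4MB
		 * range lies within [src_addr, end_addr), the destination PDE
		 * is free, and, for managed superpages, a pv entry can be
		 * allocated.  The wired bit is not copied.
		 */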
4131		if (srcptepaddr & PG_PS) {
4132			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4133				continue;
4134			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4135			    ((srcptepaddr & PG_MANAGED) == 0 ||
4136			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4137			    PG_PS_FRAME))) {
4138				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4139				    ~PG_W;
4140				dst_pmap->pm_stats.resident_count +=
4141				    NBPDR / PAGE_SIZE;
4142			}
4143			continue;
4144		}
4145
4146		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4147		KASSERT(srcmpte->wire_count > 0,
4148		    ("pmap_copy: source page table page is unused"));
4149
4150		if (pdnxt > end_addr)
4151			pdnxt = end_addr;
4152
4153		src_pte = vtopte(addr);
4154		while (addr < pdnxt) {
4155			pt_entry_t ptetemp;
4156			ptetemp = *src_pte;
4157			/*
4158			 * we only virtual copy managed pages
4159			 * We only virtual-copy managed pages.
4160			if ((ptetemp & PG_MANAGED) != 0) {
4161				dstmpte = pmap_allocpte(dst_pmap, addr,
4162				    PMAP_ENTER_NOSLEEP);
4163				if (dstmpte == NULL)
4164					goto out;
4165				dst_pte = pmap_pte_quick(dst_pmap, addr);
4166				if (*dst_pte == 0 &&
4167				    pmap_try_insert_pv_entry(dst_pmap, addr,
4168				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4169					/*
4170					 * Clear the wired, modified, and
4171					 * accessed (referenced) bits
4172					 * during the copy.
4173					 */
4174					*dst_pte = ptetemp & ~(PG_W | PG_M |
4175					    PG_A);
4176					dst_pmap->pm_stats.resident_count++;
4177	 			} else {
4178					SLIST_INIT(&free);
4179					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4180					    &free)) {
4181						pmap_invalidate_page(dst_pmap,
4182						    addr);
4183						pmap_free_zero_pages(&free);
4184					}
4185					goto out;
4186				}
4187				if (dstmpte->wire_count >= srcmpte->wire_count)
4188					break;
4189			}
4190			addr += PAGE_SIZE;
4191			src_pte++;
4192		}
4193	}
4194out:
4195	sched_unpin();
4196	rw_wunlock(&pvh_global_lock);
4197	PMAP_UNLOCK(src_pmap);
4198	PMAP_UNLOCK(dst_pmap);
4199}
4200
4201static __inline void
4202pagezero(void *page)
4203{
4204#if defined(I686_CPU)
4205	if (cpu_class == CPUCLASS_686) {
4206#if defined(CPU_ENABLE_SSE)
4207		if (cpu_feature & CPUID_SSE2)
4208			sse2_pagezero(page);
4209		else
4210#endif
4211			i686_pagezero(page);
4212	} else
4213#endif
4214		bzero(page, PAGE_SIZE);
4215}
4216
4217/*
4218 *	pmap_zero_page zeros the specified hardware page by mapping
4219 *	the page into KVM and using bzero to clear its contents.
4220 */
4221void
4222pmap_zero_page(vm_page_t m)
4223{
4224	struct sysmaps *sysmaps;
4225
4226	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4227	mtx_lock(&sysmaps->lock);
4228	if (*sysmaps->CMAP2)
4229		panic("pmap_zero_page: CMAP2 busy");
4230	sched_pin();
4231	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4232	    pmap_cache_bits(m->md.pat_mode, 0);
4233	invlcaddr(sysmaps->CADDR2);
4234	pagezero(sysmaps->CADDR2);
4235	*sysmaps->CMAP2 = 0;
4236	sched_unpin();
4237	mtx_unlock(&sysmaps->lock);
4238}
4239
4240/*
4241 *	pmap_zero_page_area zeros the specified hardware page by mapping
4242 *	the page into KVM and using bzero to clear its contents.
4243 *
4244 *	off and size may not cover an area beyond a single hardware page.
4245 */
4246void
4247pmap_zero_page_area(vm_page_t m, int off, int size)
4248{
4249	struct sysmaps *sysmaps;
4250
4251	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4252	mtx_lock(&sysmaps->lock);
4253	if (*sysmaps->CMAP2)
4254		panic("pmap_zero_page_area: CMAP2 busy");
4255	sched_pin();
4256	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4257	    pmap_cache_bits(m->md.pat_mode, 0);
4258	invlcaddr(sysmaps->CADDR2);
4259	if (off == 0 && size == PAGE_SIZE)
4260		pagezero(sysmaps->CADDR2);
4261	else
4262		bzero((char *)sysmaps->CADDR2 + off, size);
4263	*sysmaps->CMAP2 = 0;
4264	sched_unpin();
4265	mtx_unlock(&sysmaps->lock);
4266}
4267
4268/*
4269 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4270 *	the page into KVM and using bzero to clear its contents.  This
4271 *	is intended to be called from the vm_pagezero process only and
4272 *	outside of Giant.
4273 */
4274void
4275pmap_zero_page_idle(vm_page_t m)
4276{
4277
4278	if (*CMAP3)
4279		panic("pmap_zero_page_idle: CMAP3 busy");
4280	sched_pin();
4281	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4282	    pmap_cache_bits(m->md.pat_mode, 0);
4283	invlcaddr(CADDR3);
4284	pagezero(CADDR3);
4285	*CMAP3 = 0;
4286	sched_unpin();
4287}
4288
4289/*
4290 *	pmap_copy_page copies the specified (machine independent)
4291 *	page by mapping the page into virtual memory and using
4292 *	bcopy to copy the page, one machine dependent page at a
4293 *	time.
4294 */
4295void
4296pmap_copy_page(vm_page_t src, vm_page_t dst)
4297{
4298	struct sysmaps *sysmaps;
4299
4300	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4301	mtx_lock(&sysmaps->lock);
4302	if (*sysmaps->CMAP1)
4303		panic("pmap_copy_page: CMAP1 busy");
4304	if (*sysmaps->CMAP2)
4305		panic("pmap_copy_page: CMAP2 busy");
4306	sched_pin();
4307	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4308	    pmap_cache_bits(src->md.pat_mode, 0);
4309	invlcaddr(sysmaps->CADDR1);
4310	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4311	    pmap_cache_bits(dst->md.pat_mode, 0);
4312	invlcaddr(sysmaps->CADDR2);
4313	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4314	*sysmaps->CMAP1 = 0;
4315	*sysmaps->CMAP2 = 0;
4316	sched_unpin();
4317	mtx_unlock(&sysmaps->lock);
4318}
4319
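/*
 * Non-zero to advertise that this pmap implementation can copy data between
 * pages that have no kernel virtual mapping: pmap_copy_pages() below performs
 * such copies through the per-CPU CMAP windows, which is what allows unmapped
 * buffers to be used on this platform.
 */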
4320int unmapped_buf_allowed = 1;
4321
4322void
4323pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4324    vm_offset_t b_offset, int xfersize)
4325{
4326	struct sysmaps *sysmaps;
4327	vm_page_t a_pg, b_pg;
4328	char *a_cp, *b_cp;
4329	vm_offset_t a_pg_offset, b_pg_offset;
4330	int cnt;
4331
4332	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4333	mtx_lock(&sysmaps->lock);
4334	if (*sysmaps->CMAP1 != 0)
4335		panic("pmap_copy_pages: CMAP1 busy");
4336	if (*sysmaps->CMAP2 != 0)
4337		panic("pmap_copy_pages: CMAP2 busy");
4338	sched_pin();
4339	while (xfersize > 0) {
4340		a_pg = ma[a_offset >> PAGE_SHIFT];
4341		a_pg_offset = a_offset & PAGE_MASK;
4342		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4343		b_pg = mb[b_offset >> PAGE_SHIFT];
4344		b_pg_offset = b_offset & PAGE_MASK;
4345		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4346		*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4347		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4348		invlcaddr(sysmaps->CADDR1);
4349		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4350		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4351		invlcaddr(sysmaps->CADDR2);
4352		a_cp = sysmaps->CADDR1 + a_pg_offset;
4353		b_cp = sysmaps->CADDR2 + b_pg_offset;
4354		bcopy(a_cp, b_cp, cnt);
4355		a_offset += cnt;
4356		b_offset += cnt;
4357		xfersize -= cnt;
4358	}
4359	*sysmaps->CMAP1 = 0;
4360	*sysmaps->CMAP2 = 0;
4361	sched_unpin();
4362	mtx_unlock(&sysmaps->lock);
4363}
4364
4365/*
4366 * Returns true if the pmap's pv is one of the first
4367 * 16 pvs linked to from this page.  This count may
4368 * be changed upwards or downwards in the future; it
4369 * is only necessary that true be returned for a small
4370 * subset of pmaps for proper page aging.
4371 */
4372boolean_t
4373pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4374{
4375	struct md_page *pvh;
4376	pv_entry_t pv;
4377	int loops = 0;
4378	boolean_t rv;
4379
4380	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4381	    ("pmap_page_exists_quick: page %p is not managed", m));
4382	rv = FALSE;
4383	rw_wlock(&pvh_global_lock);
4384	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4385		if (PV_PMAP(pv) == pmap) {
4386			rv = TRUE;
4387			break;
4388		}
4389		loops++;
4390		if (loops >= 16)
4391			break;
4392	}
4393	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4394		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4395		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4396			if (PV_PMAP(pv) == pmap) {
4397				rv = TRUE;
4398				break;
4399			}
4400			loops++;
4401			if (loops >= 16)
4402				break;
4403		}
4404	}
4405	rw_wunlock(&pvh_global_lock);
4406	return (rv);
4407}
4408
4409/*
4410 *	pmap_page_wired_mappings:
4411 *
4412 *	Return the number of managed mappings to the given physical page
4413 *	that are wired.
4414 */
4415int
4416pmap_page_wired_mappings(vm_page_t m)
4417{
4418	int count;
4419
4420	count = 0;
4421	if ((m->oflags & VPO_UNMANAGED) != 0)
4422		return (count);
4423	rw_wlock(&pvh_global_lock);
4424	count = pmap_pvh_wired_mappings(&m->md, count);
4425	if ((m->flags & PG_FICTITIOUS) == 0) {
4426		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4427		    count);
4428	}
4429	rw_wunlock(&pvh_global_lock);
4430	return (count);
4431}
4432
4433/*
4434 *	pmap_pvh_wired_mappings:
4435 *
4436 *	Return the updated number "count" of managed mappings that are wired.
4437 */
4438static int
4439pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4440{
4441	pmap_t pmap;
4442	pt_entry_t *pte;
4443	pv_entry_t pv;
4444
4445	rw_assert(&pvh_global_lock, RA_WLOCKED);
4446	sched_pin();
4447	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4448		pmap = PV_PMAP(pv);
4449		PMAP_LOCK(pmap);
4450		pte = pmap_pte_quick(pmap, pv->pv_va);
4451		if ((*pte & PG_W) != 0)
4452			count++;
4453		PMAP_UNLOCK(pmap);
4454	}
4455	sched_unpin();
4456	return (count);
4457}
4458
4459/*
4460 * Returns TRUE if the given page is mapped individually or as part of
4461 * a 4mpage.  Otherwise, returns FALSE.
4462 */
4463boolean_t
4464pmap_page_is_mapped(vm_page_t m)
4465{
4466	boolean_t rv;
4467
4468	if ((m->oflags & VPO_UNMANAGED) != 0)
4469		return (FALSE);
4470	rw_wlock(&pvh_global_lock);
4471	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4472	    ((m->flags & PG_FICTITIOUS) == 0 &&
4473	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4474	rw_wunlock(&pvh_global_lock);
4475	return (rv);
4476}
4477
4478/*
4479 * Remove all pages from the specified address space;
4480 * this aids process exit speeds.  Also, this code is
4481 * special-cased for the current process only, but it
4482 * can have the more generic (and slightly slower)
4483 * mode enabled.  This is much faster than pmap_remove
4484 * in the case of running down an entire address space.
4485 */
4486void
4487pmap_remove_pages(pmap_t pmap)
4488{
4489	pt_entry_t *pte, tpte;
4490	vm_page_t m, mpte, mt;
4491	pv_entry_t pv;
4492	struct md_page *pvh;
4493	struct pv_chunk *pc, *npc;
4494	struct spglist free;
4495	int field, idx;
4496	int32_t bit;
4497	uint32_t inuse, bitmask;
4498	int allfree;
4499
4500	if (pmap != PCPU_GET(curpmap)) {
4501		printf("warning: pmap_remove_pages called with non-current pmap\n");
4502		return;
4503	}
4504	SLIST_INIT(&free);
4505	rw_wlock(&pvh_global_lock);
4506	PMAP_LOCK(pmap);
4507	sched_pin();
4508	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4509		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4510		    pc->pc_pmap));
4511		allfree = 1;
4512		for (field = 0; field < _NPCM; field++) {
4513			inuse = ~pc->pc_map[field] & pc_freemask[field];
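			/*
			 * Each set bit in "inuse" identifies an allocated pv
			 * entry in this chunk; bsfl() extracts them from
			 * least significant to most significant.
			 */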
4514			while (inuse != 0) {
4515				bit = bsfl(inuse);
4516				bitmask = 1UL << bit;
4517				idx = field * 32 + bit;
4518				pv = &pc->pc_pventry[idx];
4519				inuse &= ~bitmask;
4520
4521				pte = pmap_pde(pmap, pv->pv_va);
4522				tpte = *pte;
4523				if ((tpte & PG_PS) == 0) {
4524					pte = vtopte(pv->pv_va);
4525					tpte = *pte & ~PG_PTE_PAT;
4526				}
4527
4528				if (tpte == 0) {
4529					printf(
4530					    "TPTE at %p  IS ZERO @ VA %08x\n",
4531					    pte, pv->pv_va);
4532					panic("bad pte");
4533				}
4534
4535/*
4536 * We cannot remove wired pages from a process' mapping at this time
4537 */
4538				if (tpte & PG_W) {
4539					allfree = 0;
4540					continue;
4541				}
4542
4543				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4544				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4545				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4546				    m, (uintmax_t)m->phys_addr,
4547				    (uintmax_t)tpte));
4548
4549				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4550				    m < &vm_page_array[vm_page_array_size],
4551				    ("pmap_remove_pages: bad tpte %#jx",
4552				    (uintmax_t)tpte));
4553
4554				pte_clear(pte);
4555
4556				/*
4557				 * Update the vm_page_t clean/reference bits.
4558				 */
4559				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4560					if ((tpte & PG_PS) != 0) {
4561						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4562							vm_page_dirty(mt);
4563					} else
4564						vm_page_dirty(m);
4565				}
4566
4567				/* Mark free */
4568				PV_STAT(pv_entry_frees++);
4569				PV_STAT(pv_entry_spare++);
4570				pv_entry_count--;
4571				pc->pc_map[field] |= bitmask;
4572				if ((tpte & PG_PS) != 0) {
4573					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4574					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4575					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4576					if (TAILQ_EMPTY(&pvh->pv_list)) {
4577						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4578							if (TAILQ_EMPTY(&mt->md.pv_list))
4579								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4580					}
4581					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4582					if (mpte != NULL) {
4583						pmap_remove_pt_page(pmap, mpte);
4584						pmap->pm_stats.resident_count--;
4585						KASSERT(mpte->wire_count == NPTEPG,
4586						    ("pmap_remove_pages: pte page wire count error"));
4587						mpte->wire_count = 0;
4588						pmap_add_delayed_free_list(mpte, &free, FALSE);
4589						atomic_subtract_int(&cnt.v_wire_count, 1);
4590					}
4591				} else {
4592					pmap->pm_stats.resident_count--;
4593					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4594					if (TAILQ_EMPTY(&m->md.pv_list) &&
4595					    (m->flags & PG_FICTITIOUS) == 0) {
4596						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4597						if (TAILQ_EMPTY(&pvh->pv_list))
4598							vm_page_aflag_clear(m, PGA_WRITEABLE);
4599					}
4600					pmap_unuse_pt(pmap, pv->pv_va, &free);
4601				}
4602			}
4603		}
4604		if (allfree) {
4605			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4606			free_pv_chunk(pc);
4607		}
4608	}
4609	sched_unpin();
4610	pmap_invalidate_all(pmap);
4611	rw_wunlock(&pvh_global_lock);
4612	PMAP_UNLOCK(pmap);
4613	pmap_free_zero_pages(&free);
4614}
4615
4616/*
4617 *	pmap_is_modified:
4618 *
4619 *	Return whether or not the specified physical page was modified
4620 *	in any physical maps.
4621 */
4622boolean_t
4623pmap_is_modified(vm_page_t m)
4624{
4625	boolean_t rv;
4626
4627	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4628	    ("pmap_is_modified: page %p is not managed", m));
4629
4630	/*
4631	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4632	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4633	 * is clear, no PTEs can have PG_M set.
4634	 */
4635	VM_OBJECT_ASSERT_WLOCKED(m->object);
4636	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4637		return (FALSE);
4638	rw_wlock(&pvh_global_lock);
4639	rv = pmap_is_modified_pvh(&m->md) ||
4640	    ((m->flags & PG_FICTITIOUS) == 0 &&
4641	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4642	rw_wunlock(&pvh_global_lock);
4643	return (rv);
4644}
4645
4646/*
4647 * Returns TRUE if any of the given mappings were used to modify
4648 * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
4649 * mappings are supported.
4650 */
4651static boolean_t
4652pmap_is_modified_pvh(struct md_page *pvh)
4653{
4654	pv_entry_t pv;
4655	pt_entry_t *pte;
4656	pmap_t pmap;
4657	boolean_t rv;
4658
4659	rw_assert(&pvh_global_lock, RA_WLOCKED);
4660	rv = FALSE;
4661	sched_pin();
4662	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4663		pmap = PV_PMAP(pv);
4664		PMAP_LOCK(pmap);
4665		pte = pmap_pte_quick(pmap, pv->pv_va);
4666		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4667		PMAP_UNLOCK(pmap);
4668		if (rv)
4669			break;
4670	}
4671	sched_unpin();
4672	return (rv);
4673}
4674
4675/*
4676 *	pmap_is_prefaultable:
4677 *
4678 *	Return whether or not the specified virtual address is eligible
4679 *	for prefault.
4680 */
4681boolean_t
4682pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4683{
4684	pd_entry_t *pde;
4685	pt_entry_t *pte;
4686	boolean_t rv;
4687
4688	rv = FALSE;
4689	PMAP_LOCK(pmap);
4690	pde = pmap_pde(pmap, addr);
4691	if (*pde != 0 && (*pde & PG_PS) == 0) {
4692		pte = vtopte(addr);
4693		rv = *pte == 0;
4694	}
4695	PMAP_UNLOCK(pmap);
4696	return (rv);
4697}
4698
4699/*
4700 *	pmap_is_referenced:
4701 *
4702 *	Return whether or not the specified physical page was referenced
4703 *	in any physical maps.
4704 */
4705boolean_t
4706pmap_is_referenced(vm_page_t m)
4707{
4708	boolean_t rv;
4709
4710	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4711	    ("pmap_is_referenced: page %p is not managed", m));
4712	rw_wlock(&pvh_global_lock);
4713	rv = pmap_is_referenced_pvh(&m->md) ||
4714	    ((m->flags & PG_FICTITIOUS) == 0 &&
4715	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4716	rw_wunlock(&pvh_global_lock);
4717	return (rv);
4718}
4719
4720/*
4721 * Returns TRUE if any of the given mappings were referenced and FALSE
4722 * otherwise.  Both page and 4mpage mappings are supported.
4723 */
4724static boolean_t
4725pmap_is_referenced_pvh(struct md_page *pvh)
4726{
4727	pv_entry_t pv;
4728	pt_entry_t *pte;
4729	pmap_t pmap;
4730	boolean_t rv;
4731
4732	rw_assert(&pvh_global_lock, RA_WLOCKED);
4733	rv = FALSE;
4734	sched_pin();
4735	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4736		pmap = PV_PMAP(pv);
4737		PMAP_LOCK(pmap);
4738		pte = pmap_pte_quick(pmap, pv->pv_va);
4739		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4740		PMAP_UNLOCK(pmap);
4741		if (rv)
4742			break;
4743	}
4744	sched_unpin();
4745	return (rv);
4746}
4747
4748/*
4749 * Clear the write and modified bits in each of the given page's mappings.
4750 */
4751void
4752pmap_remove_write(vm_page_t m)
4753{
4754	struct md_page *pvh;
4755	pv_entry_t next_pv, pv;
4756	pmap_t pmap;
4757	pd_entry_t *pde;
4758	pt_entry_t oldpte, *pte;
4759	vm_offset_t va;
4760
4761	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4762	    ("pmap_remove_write: page %p is not managed", m));
4763
4764	/*
4765	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4766	 * set by another thread while the object is locked.  Thus,
4767	 * if PGA_WRITEABLE is clear, no page table entries need updating.
4768	 */
4769	VM_OBJECT_ASSERT_WLOCKED(m->object);
4770	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4771		return;
4772	rw_wlock(&pvh_global_lock);
4773	sched_pin();
4774	if ((m->flags & PG_FICTITIOUS) != 0)
4775		goto small_mappings;
4776	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4777	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4778		va = pv->pv_va;
4779		pmap = PV_PMAP(pv);
4780		PMAP_LOCK(pmap);
4781		pde = pmap_pde(pmap, va);
4782		if ((*pde & PG_RW) != 0)
4783			(void)pmap_demote_pde(pmap, pde, va);
4784		PMAP_UNLOCK(pmap);
4785	}
4786small_mappings:
4787	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4788		pmap = PV_PMAP(pv);
4789		PMAP_LOCK(pmap);
4790		pde = pmap_pde(pmap, pv->pv_va);
4791		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4792		    " a 4mpage in page %p's pv list", m));
4793		pte = pmap_pte_quick(pmap, pv->pv_va);
4794retry:
4795		oldpte = *pte;
4796		if ((oldpte & PG_RW) != 0) {
4797			/*
4798			 * Regardless of whether a pte is 32 or 64 bits
4799			 * in size, PG_RW and PG_M are among the least
4800			 * significant 32 bits.
4801			 */
4802			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4803			    oldpte & ~(PG_RW | PG_M)))
4804				goto retry;
4805			if ((oldpte & PG_M) != 0)
4806				vm_page_dirty(m);
4807			pmap_invalidate_page(pmap, pv->pv_va);
4808		}
4809		PMAP_UNLOCK(pmap);
4810	}
4811	vm_page_aflag_clear(m, PGA_WRITEABLE);
4812	sched_unpin();
4813	rw_wunlock(&pvh_global_lock);
4814}
4815
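/*
 * Upper bound on the count returned by pmap_ts_referenced(); once this many
 * referenced mappings have been found, the scan of the page's pv lists stops.
 */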
4816#define	PMAP_TS_REFERENCED_MAX	5
4817
4818/*
4819 *	pmap_ts_referenced:
4820 *
4821 *	Return a count of reference bits for a page, clearing those bits.
4822 *	It is not necessary for every reference bit to be cleared, but it
4823 *	is necessary that 0 only be returned when there are truly no
4824 *	reference bits set.
4825 *
4826 *	XXX: The exact number of bits to check and clear is a matter that
4827 *	should be tested and standardized at some point in the future for
4828 *	optimal aging of shared pages.
4829 */
4830int
4831pmap_ts_referenced(vm_page_t m)
4832{
4833	struct md_page *pvh;
4834	pv_entry_t pv, pvf;
4835	pmap_t pmap;
4836	pd_entry_t *pde;
4837	pt_entry_t *pte;
4838	vm_paddr_t pa;
4839	int rtval = 0;
4840
4841	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4842	    ("pmap_ts_referenced: page %p is not managed", m));
4843	pa = VM_PAGE_TO_PHYS(m);
4844	pvh = pa_to_pvh(pa);
4845	rw_wlock(&pvh_global_lock);
4846	sched_pin();
4847	if ((m->flags & PG_FICTITIOUS) != 0 ||
4848	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4849		goto small_mappings;
4850	pv = pvf;
4851	do {
4852		pmap = PV_PMAP(pv);
4853		PMAP_LOCK(pmap);
4854		pde = pmap_pde(pmap, pv->pv_va);
4855		if ((*pde & PG_A) != 0) {
4856			/*
4857			 * Since this reference bit is shared by either 1024
4858			 * or 512 4KB pages, it should not be cleared every
4859			 * time it is tested.  Apply a simple "hash" function
4860			 * on the physical page number, the virtual superpage
4861			 * number, and the pmap address to select one 4KB page
4862			 * out of the 1024 or 512 on which testing the
4863			 * reference bit will result in clearing that bit.
4864			 * This function is designed to avoid the selection of
4865			 * the same 4KB page for every 2- or 4MB page mapping.
4866			 *
4867			 * On demotion, a mapping that hasn't been referenced
4868			 * is simply destroyed.  To avoid the possibility of a
4869			 * subsequent page fault on a demoted wired mapping,
4870			 * always leave its reference bit set.  Moreover,
4871			 * since the superpage is wired, the current state of
4872			 * its reference bit won't affect page replacement.
4873			 */
4874			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
4875			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
4876			    (*pde & PG_W) == 0) {
4877				atomic_clear_int((u_int *)pde, PG_A);
4878				pmap_invalidate_page(pmap, pv->pv_va);
4879			}
4880			rtval++;
4881		}
4882		PMAP_UNLOCK(pmap);
4883		/* Rotate the PV list if it has more than one entry. */
4884		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4885			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4886			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4887		}
4888		if (rtval >= PMAP_TS_REFERENCED_MAX)
4889			goto out;
4890	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4891small_mappings:
4892	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4893		goto out;
4894	pv = pvf;
4895	do {
4896		pmap = PV_PMAP(pv);
4897		PMAP_LOCK(pmap);
4898		pde = pmap_pde(pmap, pv->pv_va);
4899		KASSERT((*pde & PG_PS) == 0,
4900		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
4901		    m));
4902		pte = pmap_pte_quick(pmap, pv->pv_va);
4903		if ((*pte & PG_A) != 0) {
4904			atomic_clear_int((u_int *)pte, PG_A);
4905			pmap_invalidate_page(pmap, pv->pv_va);
4906			rtval++;
4907		}
4908		PMAP_UNLOCK(pmap);
4909		/* Rotate the PV list if it has more than one entry. */
4910		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4911			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4912			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4913		}
4914	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
4915	    PMAP_TS_REFERENCED_MAX);
4916out:
4917	sched_unpin();
4918	rw_wunlock(&pvh_global_lock);
4919	return (rtval);
4920}
4921
4922/*
4923 *	Apply the given advice to the specified range of addresses within the
4924 *	given pmap.  Depending on the advice, clear the referenced and/or
4925 *	modified flags in each mapping and set the mapped page's dirty field.
4926 */
4927void
4928pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4929{
4930	pd_entry_t oldpde, *pde;
4931	pt_entry_t *pte;
4932	vm_offset_t pdnxt;
4933	vm_page_t m;
4934	boolean_t anychanged, pv_lists_locked;
4935
4936	if (advice != MADV_DONTNEED && advice != MADV_FREE)
4937		return;
4938	if (pmap_is_current(pmap))
4939		pv_lists_locked = FALSE;
4940	else {
4941		pv_lists_locked = TRUE;
4942resume:
4943		rw_wlock(&pvh_global_lock);
4944		sched_pin();
4945	}
4946	anychanged = FALSE;
4947	PMAP_LOCK(pmap);
4948	for (; sva < eva; sva = pdnxt) {
4949		pdnxt = (sva + NBPDR) & ~PDRMASK;
4950		if (pdnxt < sva)
4951			pdnxt = eva;
4952		pde = pmap_pde(pmap, sva);
4953		oldpde = *pde;
4954		if ((oldpde & PG_V) == 0)
4955			continue;
4956		else if ((oldpde & PG_PS) != 0) {
4957			if ((oldpde & PG_MANAGED) == 0)
4958				continue;
4959			if (!pv_lists_locked) {
4960				pv_lists_locked = TRUE;
4961				if (!rw_try_wlock(&pvh_global_lock)) {
4962					if (anychanged)
4963						pmap_invalidate_all(pmap);
4964					PMAP_UNLOCK(pmap);
4965					goto resume;
4966				}
4967				sched_pin();
4968			}
4969			if (!pmap_demote_pde(pmap, pde, sva)) {
4970				/*
4971				 * The large page mapping was destroyed.
4972				 */
4973				continue;
4974			}
4975
4976			/*
4977			 * Unless the page mappings are wired, remove the
4978			 * mapping to a single page so that a subsequent
4979			 * access may repromote.  Since the underlying page
4980			 * table page is fully populated, this removal never
4981			 * frees a page table page.
4982			 */
4983			if ((oldpde & PG_W) == 0) {
4984				pte = pmap_pte_quick(pmap, sva);
4985				KASSERT((*pte & PG_V) != 0,
4986				    ("pmap_advise: invalid PTE"));
4987				pmap_remove_pte(pmap, pte, sva, NULL);
4988				anychanged = TRUE;
4989			}
4990		}
4991		if (pdnxt > eva)
4992			pdnxt = eva;
4993		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
4994		    sva += PAGE_SIZE) {
4995			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
4996			    PG_V))
4997				continue;
4998			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4999				if (advice == MADV_DONTNEED) {
5000					/*
5001					 * Future calls to pmap_is_modified()
5002					 * can be avoided by making the page
5003					 * dirty now.
5004					 */
5005					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5006					vm_page_dirty(m);
5007				}
5008				atomic_clear_int((u_int *)pte, PG_M | PG_A);
5009			} else if ((*pte & PG_A) != 0)
5010				atomic_clear_int((u_int *)pte, PG_A);
5011			else
5012				continue;
5013			if ((*pte & PG_G) != 0)
5014				pmap_invalidate_page(pmap, sva);
5015			else
5016				anychanged = TRUE;
5017		}
5018	}
5019	if (anychanged)
5020		pmap_invalidate_all(pmap);
5021	if (pv_lists_locked) {
5022		sched_unpin();
5023		rw_wunlock(&pvh_global_lock);
5024	}
5025	PMAP_UNLOCK(pmap);
5026}
5027
5028/*
5029 *	Clear the modify bits on the specified physical page.
5030 */
5031void
5032pmap_clear_modify(vm_page_t m)
5033{
5034	struct md_page *pvh;
5035	pv_entry_t next_pv, pv;
5036	pmap_t pmap;
5037	pd_entry_t oldpde, *pde;
5038	pt_entry_t oldpte, *pte;
5039	vm_offset_t va;
5040
5041	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5042	    ("pmap_clear_modify: page %p is not managed", m));
5043	VM_OBJECT_ASSERT_WLOCKED(m->object);
5044	KASSERT(!vm_page_xbusied(m),
5045	    ("pmap_clear_modify: page %p is exclusive busied", m));
5046
5047	/*
5048	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
5049	 * If the object containing the page is locked and the page is not
5050	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
5051	 */
5052	if ((m->aflags & PGA_WRITEABLE) == 0)
5053		return;
5054	rw_wlock(&pvh_global_lock);
5055	sched_pin();
5056	if ((m->flags & PG_FICTITIOUS) != 0)
5057		goto small_mappings;
5058	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5059	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5060		va = pv->pv_va;
5061		pmap = PV_PMAP(pv);
5062		PMAP_LOCK(pmap);
5063		pde = pmap_pde(pmap, va);
5064		oldpde = *pde;
5065		if ((oldpde & PG_RW) != 0) {
5066			if (pmap_demote_pde(pmap, pde, va)) {
5067				if ((oldpde & PG_W) == 0) {
5068					/*
5069					 * Write protect the mapping to a
5070					 * single page so that a subsequent
5071					 * write access may repromote.
5072					 */
5073					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5074					    PG_PS_FRAME);
5075					pte = pmap_pte_quick(pmap, va);
5076					oldpte = *pte;
5077					if ((oldpte & PG_V) != 0) {
5078						/*
5079						 * Regardless of whether a pte is 32 or 64 bits
5080						 * in size, PG_RW and PG_M are among the least
5081						 * significant 32 bits.
5082						 */
5083						while (!atomic_cmpset_int((u_int *)pte,
5084						    oldpte,
5085						    oldpte & ~(PG_M | PG_RW)))
5086							oldpte = *pte;
5087						vm_page_dirty(m);
5088						pmap_invalidate_page(pmap, va);
5089					}
5090				}
5091			}
5092		}
5093		PMAP_UNLOCK(pmap);
5094	}
5095small_mappings:
5096	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5097		pmap = PV_PMAP(pv);
5098		PMAP_LOCK(pmap);
5099		pde = pmap_pde(pmap, pv->pv_va);
5100		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
5101		    " a 4mpage in page %p's pv list", m));
5102		pte = pmap_pte_quick(pmap, pv->pv_va);
5103		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5104			/*
5105			 * Regardless of whether a pte is 32 or 64 bits
5106			 * in size, PG_M is among the least significant
5107			 * 32 bits.
5108			 */
5109			atomic_clear_int((u_int *)pte, PG_M);
5110			pmap_invalidate_page(pmap, pv->pv_va);
5111		}
5112		PMAP_UNLOCK(pmap);
5113	}
5114	sched_unpin();
5115	rw_wunlock(&pvh_global_lock);
5116}
5117
5118/*
5119 * Miscellaneous support routines follow
5120 */
5121
5122/* Adjust the cache mode for a 4KB page mapped via a PTE. */
5123static __inline void
5124pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5125{
5126	u_int opte, npte;
5127
5128	/*
5129	 * The cache mode bits are all in the low 32-bits of the
5130	 * PTE, so we can just spin on updating the low 32-bits.
5131	 */
5132	do {
5133		opte = *(u_int *)pte;
5134		npte = opte & ~PG_PTE_CACHE;
5135		npte |= cache_bits;
5136	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5137}
5138
5139/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5140static __inline void
5141pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5142{
5143	u_int opde, npde;
5144
5145	/*
5146	 * The cache mode bits are all in the low 32-bits of the
5147	 * PDE, so we can just spin on updating the low 32-bits.
5148	 */
5149	do {
5150		opde = *(u_int *)pde;
5151		npde = opde & ~PG_PDE_CACHE;
5152		npde |= cache_bits;
5153	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5154}
5155
5156/*
5157 * Map a set of physical memory pages into the kernel virtual
5158 * address space. Return a pointer to where it is mapped. This
5159 * routine is intended to be used for mapping device memory,
5160 * NOT real memory.
5161 */
5162void *
5163pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5164{
5165	vm_offset_t va, offset;
5166	vm_size_t tmpsize;
5167
5168	offset = pa & PAGE_MASK;
5169	size = round_page(offset + size);
5170	pa = pa & PG_FRAME;
5171
5172	if (pa < KERNLOAD && pa + size <= KERNLOAD)
5173		va = KERNBASE + pa;
5174	else
5175		va = kva_alloc(size);
5176	if (!va)
5177		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
5178
5179	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5180		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5181	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5182	pmap_invalidate_cache_range(va, va + size, FALSE);
5183	return ((void *)(va + offset));
5184}
5185
5186void *
5187pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5188{
5189
5190	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5191}
5192
5193void *
5194pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5195{
5196
5197	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5198}
5199
5200void
5201pmap_unmapdev(vm_offset_t va, vm_size_t size)
5202{
5203	vm_offset_t base, offset;
5204
5205	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5206		return;
5207	base = trunc_page(va);
5208	offset = va & PAGE_MASK;
5209	size = round_page(offset + size);
5210	kva_free(base, size);
5211}
5212
5213/*
5214 * Sets the memory attribute for the specified page.
5215 */
5216void
5217pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5218{
5219
5220	m->md.pat_mode = ma;
5221	if ((m->flags & PG_FICTITIOUS) != 0)
5222		return;
5223
5224	/*
5225	 * If "m" is a normal page, flush it from the cache.
5226	 * See pmap_invalidate_cache_range().
5227	 *
5228	 * First, try to find an existing sf buffer mapping of the
5229	 * page.  sf_buf_invalidate_cache() modifies the mapping and
5230	 * flushes the cache.
5231	 */
5232	if (sf_buf_invalidate_cache(m))
5233		return;
5234
5235	/*
5236	 * If the page is not mapped by an sf buffer, but the CPU does not
5237	 * support self-snoop, map the page transiently and perform the
5238	 * invalidation.  In the worst case, the whole cache is flushed by
5239	 * pmap_invalidate_cache_range().
5240	 */
5241	if ((cpu_feature & CPUID_SS) == 0)
5242		pmap_flush_page(m);
5243}
5244
5245static void
5246pmap_flush_page(vm_page_t m)
5247{
5248	struct sysmaps *sysmaps;
5249	vm_offset_t sva, eva;
5250
5251	if ((cpu_feature & CPUID_CLFSH) != 0) {
5252		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
5253		mtx_lock(&sysmaps->lock);
5254		if (*sysmaps->CMAP2)
5255			panic("pmap_flush_page: CMAP2 busy");
5256		sched_pin();
5257		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5258		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5259		invlcaddr(sysmaps->CADDR2);
5260		sva = (vm_offset_t)sysmaps->CADDR2;
5261		eva = sva + PAGE_SIZE;
5262
5263		/*
5264		 * Use mfence despite the ordering implied by
5265		 * mtx_{un,}lock() because clflush is not guaranteed
5266		 * to be ordered by any other instruction.
5267		 */
5268		mfence();
5269		for (; sva < eva; sva += cpu_clflush_line_size)
5270			clflush(sva);
5271		mfence();
5272		*sysmaps->CMAP2 = 0;
5273		sched_unpin();
5274		mtx_unlock(&sysmaps->lock);
5275	} else
5276		pmap_invalidate_cache();
5277}
5278
5279/*
5280 * Changes the specified virtual address range's memory type to that given by
5281 * the parameter "mode".  The specified virtual address range must be
5282 * completely contained within the kernel map.
5283 *
5284 * Returns zero if the change completed successfully, and either EINVAL or
5285 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5286 * of the virtual address range was not mapped, and ENOMEM is returned if
5287 * there was insufficient memory available to complete the change.
5288 */
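/*
 * Illustrative use (a sketch, not taken from any in-tree caller): a driver
 * that has mapped a framebuffer could request write-combining with
 *
 *	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
 *
 * where "fb_va" and "fb_size" are hypothetical names and PAT_WRITE_COMBINING
 * is one of the PAT_* modes, like the PAT_UNCACHEABLE and PAT_WRITE_BACK
 * values used by pmap_mapdev() and pmap_mapbios() above.
 */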
5289int
5290pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5291{
5292	vm_offset_t base, offset, tmpva;
5293	pd_entry_t *pde;
5294	pt_entry_t *pte;
5295	int cache_bits_pte, cache_bits_pde;
5296	boolean_t changed;
5297
5298	base = trunc_page(va);
5299	offset = va & PAGE_MASK;
5300	size = round_page(offset + size);
5301
5302	/*
5303	 * Only supported on kernel virtual addresses above the recursive map.
5304	 */
5305	if (base < VM_MIN_KERNEL_ADDRESS)
5306		return (EINVAL);
5307
5308	cache_bits_pde = pmap_cache_bits(mode, 1);
5309	cache_bits_pte = pmap_cache_bits(mode, 0);
5310	changed = FALSE;
5311
5312	/*
5313	 * Pages that aren't mapped aren't supported.  Also break down
5314	 * 2/4MB pages into 4KB pages if required.
5315	 */
5316	PMAP_LOCK(kernel_pmap);
5317	for (tmpva = base; tmpva < base + size; ) {
5318		pde = pmap_pde(kernel_pmap, tmpva);
5319		if (*pde == 0) {
5320			PMAP_UNLOCK(kernel_pmap);
5321			return (EINVAL);
5322		}
5323		if (*pde & PG_PS) {
5324			/*
5325			 * If the current 2/4MB page already has
5326			 * the required memory type, then we need not
5327			 * demote this page.  Just increment tmpva to
5328			 * the next 2/4MB page frame.
5329			 */
5330			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5331				tmpva = trunc_4mpage(tmpva) + NBPDR;
5332				continue;
5333			}
5334
5335			/*
5336			 * If the current offset aligns with a 2/4MB
5337			 * page frame and there is at least 2/4MB left
5338			 * within the range, then we need not break
5339			 * down this page into 4KB pages.
5340			 */
5341			if ((tmpva & PDRMASK) == 0 &&
5342			    tmpva + PDRMASK < base + size) {
5343				tmpva += NBPDR;
5344				continue;
5345			}
5346			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5347				PMAP_UNLOCK(kernel_pmap);
5348				return (ENOMEM);
5349			}
5350		}
5351		pte = vtopte(tmpva);
5352		if (*pte == 0) {
5353			PMAP_UNLOCK(kernel_pmap);
5354			return (EINVAL);
5355		}
5356		tmpva += PAGE_SIZE;
5357	}
5358	PMAP_UNLOCK(kernel_pmap);
5359
5360	/*
5361	 * Ok, all the pages exist, so run through them updating their
5362	 * cache mode if required.
5363	 */
5364	for (tmpva = base; tmpva < base + size; ) {
5365		pde = pmap_pde(kernel_pmap, tmpva);
5366		if (*pde & PG_PS) {
5367			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5368				pmap_pde_attr(pde, cache_bits_pde);
5369				changed = TRUE;
5370			}
5371			tmpva = trunc_4mpage(tmpva) + NBPDR;
5372		} else {
5373			pte = vtopte(tmpva);
5374			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5375				pmap_pte_attr(pte, cache_bits_pte);
5376				changed = TRUE;
5377			}
5378			tmpva += PAGE_SIZE;
5379		}
5380	}
5381
5382	/*
5383	 * Flush CPU caches to make sure any data isn't cached that
5384	 * shouldn't be, etc.
5385	 */
5386	if (changed) {
5387		pmap_invalidate_range(kernel_pmap, base, tmpva);
5388		pmap_invalidate_cache_range(base, tmpva, FALSE);
5389	}
5390	return (0);
5391}
5392
5393/*
5394 * perform the pmap work for mincore
5395 */
5396int
5397pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5398{
5399	pd_entry_t *pdep;
5400	pt_entry_t *ptep, pte;
5401	vm_paddr_t pa;
5402	int val;
5403
5404	PMAP_LOCK(pmap);
5405retry:
5406	pdep = pmap_pde(pmap, addr);
5407	if (*pdep != 0) {
5408		if (*pdep & PG_PS) {
5409			pte = *pdep;
5410			/* Compute the physical address of the 4KB page. */
5411			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5412			    PG_FRAME;
5413			val = MINCORE_SUPER;
5414		} else {
5415			ptep = pmap_pte(pmap, addr);
5416			pte = *ptep;
5417			pmap_pte_release(ptep);
5418			pa = pte & PG_FRAME;
5419			val = 0;
5420		}
5421	} else {
5422		pte = 0;
5423		pa = 0;
5424		val = 0;
5425	}
5426	if ((pte & PG_V) != 0) {
5427		val |= MINCORE_INCORE;
5428		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5429			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5430		if ((pte & PG_A) != 0)
5431			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5432	}
5433	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5434	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5435	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5436		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5437		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5438			goto retry;
5439	} else
5440		PA_UNLOCK_COND(*locked_pa);
5441	PMAP_UNLOCK(pmap);
5442	return (val);
5443}
5444
5445void
5446pmap_activate(struct thread *td)
5447{
5448	pmap_t	pmap, oldpmap;
5449	u_int	cpuid;
5450	u_int32_t  cr3;
5451
5452	critical_enter();
5453	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5454	oldpmap = PCPU_GET(curpmap);
5455	cpuid = PCPU_GET(cpuid);
5456#if defined(SMP)
5457	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5458	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5459#else
5460	CPU_CLR(cpuid, &oldpmap->pm_active);
5461	CPU_SET(cpuid, &pmap->pm_active);
5462#endif
5463#ifdef PAE
5464	cr3 = vtophys(pmap->pm_pdpt);
5465#else
5466	cr3 = vtophys(pmap->pm_pdir);
5467#endif
5468	/*
5469	 * pmap_activate is for the current thread on the current cpu
5470	 */
5471	td->td_pcb->pcb_cr3 = cr3;
5472	load_cr3(cr3);
5473	PCPU_SET(curpmap, pmap);
5474	critical_exit();
5475}
5476
5477void
5478pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5479{
5480}
5481
5482/*
5483 *	Increase the starting virtual address of the given mapping if a
5484 *	different alignment might result in more superpage mappings.
5485 */
5486void
5487pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5488    vm_offset_t *addr, vm_size_t size)
5489{
5490	vm_offset_t superpage_offset;
5491
5492	if (size < NBPDR)
5493		return;
5494	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5495		offset += ptoa(object->pg_color);
5496	superpage_offset = offset & PDRMASK;
5497	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5498	    (*addr & PDRMASK) == superpage_offset)
5499		return;
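	/*
	 * Illustrative example (assuming 4MB superpages, so PDRMASK is
	 * 0x3fffff): with offset 0x300000 and *addr 0x20100000,
	 * superpage_offset is 0x300000 and *addr is advanced to 0x20300000,
	 * giving the virtual address the same superpage-relative alignment
	 * as the object offset.
	 */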
5500	if ((*addr & PDRMASK) < superpage_offset)
5501		*addr = (*addr & ~PDRMASK) + superpage_offset;
5502	else
5503		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5504}
5505
5506
5507#if defined(PMAP_DEBUG)
5508int pmap_pid_dump(int pid)
5509{
5510	pmap_t pmap;
5511	struct proc *p;
5512	int npte = 0;
5513	int index;
5514
5515	sx_slock(&allproc_lock);
5516	FOREACH_PROC_IN_SYSTEM(p) {
5517		if (p->p_pid != pid)
5518			continue;
5519
5520		if (p->p_vmspace) {
5521			int i,j;
5522			index = 0;
5523			pmap = vmspace_pmap(p->p_vmspace);
5524			for (i = 0; i < NPDEPTD; i++) {
5525				pd_entry_t *pde;
5526				pt_entry_t *pte;
5527				vm_offset_t base = i << PDRSHIFT;
5528
5529				pde = &pmap->pm_pdir[i];
5530				if (pde && pmap_pde_v(pde)) {
5531					for (j = 0; j < NPTEPG; j++) {
5532						vm_offset_t va = base + (j << PAGE_SHIFT);
5533						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5534							if (index) {
5535								index = 0;
5536								printf("\n");
5537							}
5538							sx_sunlock(&allproc_lock);
5539							return (npte);
5540						}
5541						pte = pmap_pte(pmap, va);
5542						if (pte && pmap_pte_v(pte)) {
5543							pt_entry_t pa;
5544							vm_page_t m;
5545							pa = *pte;
5546							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5547							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5548								va, pa, m->hold_count, m->wire_count, m->flags);
5549							npte++;
5550							index++;
5551							if (index >= 2) {
5552								index = 0;
5553								printf("\n");
5554							} else {
5555								printf(" ");
5556							}
5557						}
5558					}
5559				}
5560			}
5561		}
5562	}
5563	sx_sunlock(&allproc_lock);
5564	return (npte);
5565}
5566#endif
5567
5568#if defined(DEBUG)
5569
5570static void	pads(pmap_t pm);
5571void		pmap_pvdump(vm_paddr_t pa);
5572
5573/* print address space of pmap*/
5574static void
5575pads(pmap_t pm)
5576{
5577	int i, j;
5578	vm_offset_t va;
5579	pt_entry_t *ptep;
5580
5581	if (pm == kernel_pmap)
5582		return;
5583	for (i = 0; i < NPDEPTD; i++)
5584		if (pm->pm_pdir[i])
5585			for (j = 0; j < NPTEPG; j++) {
5586				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5587				if (pm == kernel_pmap && va < KERNBASE)
5588					continue;
5589				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5590					continue;
5591				ptep = pmap_pte(pm, va);
5592				if (pmap_pte_v(ptep))
5593					printf("%x:%jx ", va, (uintmax_t)*ptep);
5594			}
5595
5596}
5597
5598void
5599pmap_pvdump(vm_paddr_t pa)
5600{
5601	pv_entry_t pv;
5602	pmap_t pmap;
5603	vm_page_t m;
5604
5605	printf("pa %jx", (uintmax_t)pa);
5606	m = PHYS_TO_VM_PAGE(pa);
5607	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5608		pmap = PV_PMAP(pv);
5609		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5610		pads(pmap);
5611	}
5612	printf(" ");
5613}
5614#endif
5615