pmap.c revision 287126
1219019Sgabor/*-
2219019Sgabor * Copyright (c) 1991 Regents of the University of California.
3219019Sgabor * All rights reserved.
4219019Sgabor * Copyright (c) 1994 John S. Dyson
5219019Sgabor * All rights reserved.
6219019Sgabor * Copyright (c) 1994 David Greenman
7219019Sgabor * All rights reserved.
8219019Sgabor * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9219019Sgabor * All rights reserved.
10219019Sgabor *
11219019Sgabor * This code is derived from software contributed to Berkeley by
12219019Sgabor * the Systems Programming Group of the University of Utah Computer
13219019Sgabor * Science Department and William Jolitz of UUNET Technologies Inc.
14219019Sgabor *
15219019Sgabor * Redistribution and use in source and binary forms, with or without
16219019Sgabor * modification, are permitted provided that the following conditions
17219019Sgabor * are met:
18219019Sgabor * 1. Redistributions of source code must retain the above copyright
19219019Sgabor *    notice, this list of conditions and the following disclaimer.
20219019Sgabor * 2. Redistributions in binary form must reproduce the above copyright
21219019Sgabor *    notice, this list of conditions and the following disclaimer in the
22219019Sgabor *    documentation and/or other materials provided with the distribution.
23219019Sgabor * 3. All advertising materials mentioning features or use of this software
24219019Sgabor *    must display the following acknowledgement:
25219019Sgabor *	This product includes software developed by the University of
26219019Sgabor *	California, Berkeley and its contributors.
27219019Sgabor * 4. Neither the name of the University nor the names of its contributors
28219019Sgabor *    may be used to endorse or promote products derived from this software
29219019Sgabor *    without specific prior written permission.
30219019Sgabor *
31219019Sgabor * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32219019Sgabor * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33219019Sgabor * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34219019Sgabor * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35219019Sgabor * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36219019Sgabor * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37219019Sgabor * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38219019Sgabor * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39219019Sgabor * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40219019Sgabor * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41219019Sgabor * SUCH DAMAGE.
42219019Sgabor *
43219019Sgabor *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44219019Sgabor */
45219019Sgabor/*-
46219019Sgabor * Copyright (c) 2003 Networks Associates Technology, Inc.
47219019Sgabor * All rights reserved.
48219019Sgabor *
49219019Sgabor * This software was developed for the FreeBSD Project by Jake Burkholder,
50219019Sgabor * Safeport Network Services, and Network Associates Laboratories, the
51219019Sgabor * Security Research Division of Network Associates, Inc. under
52219019Sgabor * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53219019Sgabor * CHATS research program.
54219019Sgabor *
55219019Sgabor * Redistribution and use in source and binary forms, with or without
56219019Sgabor * modification, are permitted provided that the following conditions
57219019Sgabor * are met:
58219019Sgabor * 1. Redistributions of source code must retain the above copyright
59219019Sgabor *    notice, this list of conditions and the following disclaimer.
60219019Sgabor * 2. Redistributions in binary form must reproduce the above copyright
61219019Sgabor *    notice, this list of conditions and the following disclaimer in the
62219019Sgabor *    documentation and/or other materials provided with the distribution.
63219019Sgabor *
64219019Sgabor * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65219019Sgabor * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66219019Sgabor * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67219019Sgabor * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68219019Sgabor * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69219019Sgabor * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70219019Sgabor * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71219019Sgabor * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72219019Sgabor * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73219019Sgabor * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74219019Sgabor * SUCH DAMAGE.
75219019Sgabor */
76219019Sgabor
77219019Sgabor#include <sys/cdefs.h>
78219019Sgabor__FBSDID("$FreeBSD: stable/10/sys/i386/i386/pmap.c 287126 2015-08-25 14:39:40Z marcel $");
79219019Sgabor
80219019Sgabor/*
81219019Sgabor *	Manages physical address maps.
82219019Sgabor *
83219019Sgabor *	Since the information managed by this module is
84219019Sgabor *	also stored by the logical address mapping module,
85219019Sgabor *	this module may throw away valid virtual-to-physical
86219019Sgabor *	mappings at almost any time.  However, invalidations
87219019Sgabor *	of virtual-to-physical mappings must be done as
88219019Sgabor *	requested.
89219019Sgabor *
90219019Sgabor *	In order to cope with hardware architectures which
91219019Sgabor *	make virtual-to-physical map invalidates expensive,
92219019Sgabor *	this module may delay invalidate or reduced protection
93219019Sgabor *	operations until such time as they are actually
94219019Sgabor *	necessary.  This module is given full information as
95219019Sgabor *	to which processors are currently using which maps,
96219019Sgabor *	and to when physical maps must be made correct.
97219019Sgabor */
98219019Sgabor
99219019Sgabor#include "opt_apic.h"
100219019Sgabor#include "opt_cpu.h"
101219019Sgabor#include "opt_pmap.h"
102219019Sgabor#include "opt_smp.h"
103219019Sgabor#include "opt_xbox.h"
104219019Sgabor
105219019Sgabor#include <sys/param.h>
106219019Sgabor#include <sys/systm.h>
107219019Sgabor#include <sys/kernel.h>
108219019Sgabor#include <sys/ktr.h>
109219019Sgabor#include <sys/lock.h>
110219019Sgabor#include <sys/malloc.h>
111219019Sgabor#include <sys/mman.h>
112219019Sgabor#include <sys/msgbuf.h>
113219019Sgabor#include <sys/mutex.h>
114219019Sgabor#include <sys/proc.h>
115219019Sgabor#include <sys/rwlock.h>
116219019Sgabor#include <sys/sf_buf.h>
117219019Sgabor#include <sys/sx.h>
118219019Sgabor#include <sys/vmmeter.h>
119219019Sgabor#include <sys/sched.h>
120219019Sgabor#include <sys/sysctl.h>
121219019Sgabor#ifdef SMP
122219019Sgabor#include <sys/smp.h>
123219019Sgabor#else
124219019Sgabor#include <sys/cpuset.h>
125219019Sgabor#endif
126219019Sgabor
127219019Sgabor#include <vm/vm.h>
128219019Sgabor#include <vm/vm_param.h>
129219019Sgabor#include <vm/vm_kern.h>
130219019Sgabor#include <vm/vm_page.h>
131219019Sgabor#include <vm/vm_map.h>
132219019Sgabor#include <vm/vm_object.h>
133219019Sgabor#include <vm/vm_extern.h>
134219019Sgabor#include <vm/vm_pageout.h>
135219019Sgabor#include <vm/vm_pager.h>
136219019Sgabor#include <vm/vm_phys.h>
137219019Sgabor#include <vm/vm_radix.h>
138219019Sgabor#include <vm/vm_reserv.h>
139219019Sgabor#include <vm/uma.h>
140219019Sgabor
141219019Sgabor#ifdef DEV_APIC
142219019Sgabor#include <sys/bus.h>
143219019Sgabor#include <machine/intr_machdep.h>
144219019Sgabor#include <machine/apicvar.h>
145219019Sgabor#endif
146219019Sgabor#include <machine/cpu.h>
147219019Sgabor#include <machine/cputypes.h>
148219019Sgabor#include <machine/md_var.h>
149219019Sgabor#include <machine/pcb.h>
150219019Sgabor#include <machine/specialreg.h>
151219019Sgabor#ifdef SMP
152219019Sgabor#include <machine/smp.h>
153219019Sgabor#endif
154219019Sgabor
155219019Sgabor#ifdef XBOX
156219019Sgabor#include <machine/xbox.h>
157219019Sgabor#endif
158219019Sgabor
159219019Sgabor#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
160219019Sgabor#define CPU_ENABLE_SSE
161219019Sgabor#endif
162219019Sgabor
163219019Sgabor#ifndef PMAP_SHPGPERPROC
164219019Sgabor#define PMAP_SHPGPERPROC 200
165219019Sgabor#endif
166219019Sgabor
167219019Sgabor#if !defined(DIAGNOSTIC)
168219019Sgabor#ifdef __GNUC_GNU_INLINE__
169219019Sgabor#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
170219019Sgabor#else
171219019Sgabor#define PMAP_INLINE	extern inline
172219019Sgabor#endif
173219019Sgabor#else
174219019Sgabor#define PMAP_INLINE
175219019Sgabor#endif
176219019Sgabor
177219019Sgabor#ifdef PV_STATS
178219019Sgabor#define PV_STAT(x)	do { x ; } while (0)
179219019Sgabor#else
180219019Sgabor#define PV_STAT(x)	do { } while (0)
181219019Sgabor#endif
182219019Sgabor
183219019Sgabor#define	pa_index(pa)	((pa) >> PDRSHIFT)
184219019Sgabor#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
185219019Sgabor
186219019Sgabor/*
187219019Sgabor * Get PDEs and PTEs for user/kernel address space
188219019Sgabor */
189219019Sgabor#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
190219019Sgabor#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
191219019Sgabor
192219019Sgabor#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
193219019Sgabor#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
194219019Sgabor#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
195219019Sgabor#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
196219019Sgabor#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
197219019Sgabor
198219019Sgabor#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
199219019Sgabor    atomic_clear_int((u_int *)(pte), PG_W))
200219019Sgabor#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
201219019Sgabor
202219019Sgaborstruct pmap kernel_pmap_store;
203219019SgaborLIST_HEAD(pmaplist, pmap);
204219019Sgaborstatic struct pmaplist allpmaps;
205219019Sgaborstatic struct mtx allpmaps_lock;
206219019Sgabor
207219019Sgaborvm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
208219019Sgaborvm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
209219019Sgaborint pgeflag = 0;		/* PG_G or-in */
210219019Sgaborint pseflag = 0;		/* PG_PS or-in */
211219019Sgabor
212219019Sgaborstatic int nkpt = NKPT;
213219019Sgaborvm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
214219019Sgaborextern u_int32_t KERNend;
215219019Sgaborextern u_int32_t KPTphys;
216219019Sgabor
217219019Sgabor#if defined(PAE) || defined(PAE_TABLES)
218219019Sgaborpt_entry_t pg_nx;
219219019Sgaborstatic uma_zone_t pdptzone;
220219019Sgabor#endif
221219019Sgabor
222219019Sgaborstatic SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
223219019Sgabor
224219019Sgaborstatic int pat_works = 1;
225219019SgaborSYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
226    "Is page attribute table fully functional?");
227
228static int pg_ps_enabled = 1;
229SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
230    "Are large page mappings enabled?");
231
232#define	PAT_INDEX_SIZE	8
233static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
234
235/*
236 * pmap_mapdev support pre initialization (i.e. console)
237 */
238#define	PMAP_PREINIT_MAPPING_COUNT	8
239static struct pmap_preinit_mapping {
240	vm_paddr_t	pa;
241	vm_offset_t	va;
242	vm_size_t	sz;
243	int		mode;
244} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
245static int pmap_initialized;
246
247static struct rwlock_padalign pvh_global_lock;
248
249/*
250 * Data for the pv entry allocation mechanism
251 */
252static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
253static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
254static struct md_page *pv_table;
255static int shpgperproc = PMAP_SHPGPERPROC;
256
257struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
258int pv_maxchunks;			/* How many chunks we have KVA for */
259vm_offset_t pv_vafree;			/* freelist stored in the PTE */
260
261/*
262 * All those kernel PT submaps that BSD is so fond of
263 */
264struct sysmaps {
265	struct	mtx lock;
266	pt_entry_t *CMAP1;
267	pt_entry_t *CMAP2;
268	caddr_t	CADDR1;
269	caddr_t	CADDR2;
270};
271static struct sysmaps sysmaps_pcpu[MAXCPU];
272pt_entry_t *CMAP3;
273static pd_entry_t *KPTD;
274caddr_t ptvmmap = 0;
275caddr_t CADDR3;
276struct msgbuf *msgbufp = 0;
277
278/*
279 * Crashdump maps.
280 */
281static caddr_t crashdumpmap;
282
283static pt_entry_t *PMAP1 = 0, *PMAP2;
284static pt_entry_t *PADDR1 = 0, *PADDR2;
285#ifdef SMP
286static int PMAP1cpu;
287static int PMAP1changedcpu;
288SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
289	   &PMAP1changedcpu, 0,
290	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
291#endif
292static int PMAP1changed;
293SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
294	   &PMAP1changed, 0,
295	   "Number of times pmap_pte_quick changed PMAP1");
296static int PMAP1unchanged;
297SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
298	   &PMAP1unchanged, 0,
299	   "Number of times pmap_pte_quick didn't change PMAP1");
300static struct mtx PMAP2mutex;
301
302static void	free_pv_chunk(struct pv_chunk *pc);
303static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
304static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
305static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
306static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
307static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
308static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
309static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
310		    vm_offset_t va);
311static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
312
313static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
314static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
315    vm_prot_t prot);
316static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
317    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
318static void pmap_flush_page(vm_page_t m);
319static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
320static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
321static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
322static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
323static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
324static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
325static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
326static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
327static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
328static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
329    vm_prot_t prot);
330static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
331static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
332    struct spglist *free);
333static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
334    struct spglist *free);
335static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
336static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
337    struct spglist *free);
338static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
339					vm_offset_t va);
340static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
341static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
342    vm_page_t m);
343static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
344    pd_entry_t newpde);
345static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
346
347static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
348
349static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
350static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
351static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
352static void pmap_pte_release(pt_entry_t *pte);
353static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
354#if defined(PAE) || defined(PAE_TABLES)
355static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
356#endif
357static void pmap_set_pg(void);
358
359static __inline void pagezero(void *page);
360
361CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
362CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
363
364/*
365 * If you get an error here, then you set KVA_PAGES wrong! See the
366 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
367 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
368 */
369CTASSERT(KERNBASE % (1 << 24) == 0);
370
371/*
372 *	Bootstrap the system enough to run with virtual memory.
373 *
374 *	On the i386 this is called after mapping has already been enabled
375 *	and just syncs the pmap module with what has already been done.
376 *	[We can't call it easily with mapping off since the kernel is not
377 *	mapped with PA == VA, hence we would have to relocate every address
378 *	from the linked base (virtual) address "KERNBASE" to the actual
379 *	(physical) address starting relative to 0]
380 */
381void
382pmap_bootstrap(vm_paddr_t firstaddr)
383{
384	vm_offset_t va;
385	pt_entry_t *pte, *unused;
386	struct sysmaps *sysmaps;
387	int i;
388
389	/*
390	 * Add a physical memory segment (vm_phys_seg) corresponding to the
391	 * preallocated kernel page table pages so that vm_page structures
392	 * representing these pages will be created.  The vm_page structures
393	 * are required for promotion of the corresponding kernel virtual
394	 * addresses to superpage mappings.
395	 */
396	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
397
398	/*
399	 * Initialize the first available kernel virtual address.  However,
400	 * using "firstaddr" may waste a few pages of the kernel virtual
401	 * address space, because locore may not have mapped every physical
402	 * page that it allocated.  Preferably, locore would provide a first
403	 * unused virtual address in addition to "firstaddr".
404	 */
405	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
406
407	virtual_end = VM_MAX_KERNEL_ADDRESS;
408
409	/*
410	 * Initialize the kernel pmap (which is statically allocated).
411	 */
412	PMAP_LOCK_INIT(kernel_pmap);
413	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
414#if defined(PAE) || defined(PAE_TABLES)
415	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
416#endif
417	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
418	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
419
420 	/*
421	 * Initialize the global pv list lock.
422	 */
423	rw_init(&pvh_global_lock, "pmap pv global");
424
425	LIST_INIT(&allpmaps);
426
427	/*
428	 * Request a spin mutex so that changes to allpmaps cannot be
429	 * preempted by smp_rendezvous_cpus().  Otherwise,
430	 * pmap_update_pde_kernel() could access allpmaps while it is
431	 * being changed.
432	 */
433	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
434	mtx_lock_spin(&allpmaps_lock);
435	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
436	mtx_unlock_spin(&allpmaps_lock);
437
438	/*
439	 * Reserve some special page table entries/VA space for temporary
440	 * mapping of pages.
441	 */
442#define	SYSMAP(c, p, v, n)	\
443	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
444
445	va = virtual_avail;
446	pte = vtopte(va);
447
448	/*
449	 * CMAP1/CMAP2 are used for zeroing and copying pages.
450	 * CMAP3 is used for the idle process page zeroing.
451	 */
452	for (i = 0; i < MAXCPU; i++) {
453		sysmaps = &sysmaps_pcpu[i];
454		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
455		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
456		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
457	}
458	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
459
460	/*
461	 * Crashdump maps.
462	 */
463	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
464
465	/*
466	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
467	 */
468	SYSMAP(caddr_t, unused, ptvmmap, 1)
469
470	/*
471	 * msgbufp is used to map the system message buffer.
472	 */
473	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
474
475	/*
476	 * KPTmap is used by pmap_kextract().
477	 *
478	 * KPTmap is first initialized by locore.  However, that initial
479	 * KPTmap can only support NKPT page table pages.  Here, a larger
480	 * KPTmap is created that can support KVA_PAGES page table pages.
481	 */
482	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
483
484	for (i = 0; i < NKPT; i++)
485		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
486
487	/*
488	 * Adjust the start of the KPTD and KPTmap so that the implementation
489	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
490	 */
491	KPTD -= KPTDI;
492	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
493
494	/*
495	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
496	 * respectively.
497	 */
498	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
499	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
500
501	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
502
503	virtual_avail = va;
504
505	/*
506	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
507	 * physical memory region that is used by the ACPI wakeup code.  This
508	 * mapping must not have PG_G set.
509	 */
510#ifdef XBOX
511	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
512	 * an early stadium, we cannot yet neatly map video memory ... :-(
513	 * Better fixes are very welcome! */
514	if (!arch_i386_is_xbox)
515#endif
516	for (i = 1; i < NKPT; i++)
517		PTD[i] = 0;
518
519	/* Initialize the PAT MSR if present. */
520	pmap_init_pat();
521
522	/* Turn on PG_G on kernel page(s) */
523	pmap_set_pg();
524}
525
526/*
527 * Setup the PAT MSR.
528 */
529void
530pmap_init_pat(void)
531{
532	int pat_table[PAT_INDEX_SIZE];
533	uint64_t pat_msr;
534	u_long cr0, cr4;
535	int i;
536
537	/* Set default PAT index table. */
538	for (i = 0; i < PAT_INDEX_SIZE; i++)
539		pat_table[i] = -1;
540	pat_table[PAT_WRITE_BACK] = 0;
541	pat_table[PAT_WRITE_THROUGH] = 1;
542	pat_table[PAT_UNCACHEABLE] = 3;
543	pat_table[PAT_WRITE_COMBINING] = 3;
544	pat_table[PAT_WRITE_PROTECTED] = 3;
545	pat_table[PAT_UNCACHED] = 3;
546
547	/* Bail if this CPU doesn't implement PAT. */
548	if ((cpu_feature & CPUID_PAT) == 0) {
549		for (i = 0; i < PAT_INDEX_SIZE; i++)
550			pat_index[i] = pat_table[i];
551		pat_works = 0;
552		return;
553	}
554
555	/*
556	 * Due to some Intel errata, we can only safely use the lower 4
557	 * PAT entries.
558	 *
559	 *   Intel Pentium III Processor Specification Update
560	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
561	 * or Mode C Paging)
562	 *
563	 *   Intel Pentium IV  Processor Specification Update
564	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
565	 */
566	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
567	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
568		pat_works = 0;
569
570	/* Initialize default PAT entries. */
571	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
572	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
573	    PAT_VALUE(2, PAT_UNCACHED) |
574	    PAT_VALUE(3, PAT_UNCACHEABLE) |
575	    PAT_VALUE(4, PAT_WRITE_BACK) |
576	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
577	    PAT_VALUE(6, PAT_UNCACHED) |
578	    PAT_VALUE(7, PAT_UNCACHEABLE);
579
580	if (pat_works) {
581		/*
582		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
583		 * Program 5 and 6 as WP and WC.
584		 * Leave 4 and 7 as WB and UC.
585		 */
586		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
587		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
588		    PAT_VALUE(6, PAT_WRITE_COMBINING);
589		pat_table[PAT_UNCACHED] = 2;
590		pat_table[PAT_WRITE_PROTECTED] = 5;
591		pat_table[PAT_WRITE_COMBINING] = 6;
592	} else {
593		/*
594		 * Just replace PAT Index 2 with WC instead of UC-.
595		 */
596		pat_msr &= ~PAT_MASK(2);
597		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
598		pat_table[PAT_WRITE_COMBINING] = 2;
599	}
600
601	/* Disable PGE. */
602	cr4 = rcr4();
603	load_cr4(cr4 & ~CR4_PGE);
604
605	/* Disable caches (CD = 1, NW = 0). */
606	cr0 = rcr0();
607	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
608
609	/* Flushes caches and TLBs. */
610	wbinvd();
611	invltlb();
612
613	/* Update PAT and index table. */
614	wrmsr(MSR_PAT, pat_msr);
615	for (i = 0; i < PAT_INDEX_SIZE; i++)
616		pat_index[i] = pat_table[i];
617
618	/* Flush caches and TLBs again. */
619	wbinvd();
620	invltlb();
621
622	/* Restore caches and PGE. */
623	load_cr0(cr0);
624	load_cr4(cr4);
625}
626
627/*
628 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
629 */
630static void
631pmap_set_pg(void)
632{
633	pt_entry_t *pte;
634	vm_offset_t va, endva;
635
636	if (pgeflag == 0)
637		return;
638
639	endva = KERNBASE + KERNend;
640
641	if (pseflag) {
642		va = KERNBASE + KERNLOAD;
643		while (va  < endva) {
644			pdir_pde(PTD, va) |= pgeflag;
645			invltlb();	/* Play it safe, invltlb() every time */
646			va += NBPDR;
647		}
648	} else {
649		va = (vm_offset_t)btext;
650		while (va < endva) {
651			pte = vtopte(va);
652			if (*pte)
653				*pte |= pgeflag;
654			invltlb();	/* Play it safe, invltlb() every time */
655			va += PAGE_SIZE;
656		}
657	}
658}
659
660/*
661 * Initialize a vm_page's machine-dependent fields.
662 */
663void
664pmap_page_init(vm_page_t m)
665{
666
667	TAILQ_INIT(&m->md.pv_list);
668	m->md.pat_mode = PAT_WRITE_BACK;
669}
670
671#if defined(PAE) || defined(PAE_TABLES)
672static void *
673pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
674{
675
676	/* Inform UMA that this allocator uses kernel_map/object. */
677	*flags = UMA_SLAB_KERNEL;
678	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
679	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
680}
681#endif
682
683/*
684 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
685 * Requirements:
686 *  - Must deal with pages in order to ensure that none of the PG_* bits
687 *    are ever set, PG_V in particular.
688 *  - Assumes we can write to ptes without pte_store() atomic ops, even
689 *    on PAE systems.  This should be ok.
690 *  - Assumes nothing will ever test these addresses for 0 to indicate
691 *    no mapping instead of correctly checking PG_V.
692 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
693 * Because PG_V is never set, there can be no mappings to invalidate.
694 */
695static vm_offset_t
696pmap_ptelist_alloc(vm_offset_t *head)
697{
698	pt_entry_t *pte;
699	vm_offset_t va;
700
701	va = *head;
702	if (va == 0)
703		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
704	pte = vtopte(va);
705	*head = *pte;
706	if (*head & PG_V)
707		panic("pmap_ptelist_alloc: va with PG_V set!");
708	*pte = 0;
709	return (va);
710}
711
712static void
713pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
714{
715	pt_entry_t *pte;
716
717	if (va & PG_V)
718		panic("pmap_ptelist_free: freeing va with PG_V set!");
719	pte = vtopte(va);
720	*pte = *head;		/* virtual! PG_V is 0 though */
721	*head = va;
722}
723
724static void
725pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
726{
727	int i;
728	vm_offset_t va;
729
730	*head = 0;
731	for (i = npages - 1; i >= 0; i--) {
732		va = (vm_offset_t)base + i * PAGE_SIZE;
733		pmap_ptelist_free(head, va);
734	}
735}
736
737
738/*
739 *	Initialize the pmap module.
740 *	Called by vm_init, to initialize any structures that the pmap
741 *	system needs to map virtual memory.
742 */
743void
744pmap_init(void)
745{
746	struct pmap_preinit_mapping *ppim;
747	vm_page_t mpte;
748	vm_size_t s;
749	int i, pv_npg;
750
751	/*
752	 * Initialize the vm page array entries for the kernel pmap's
753	 * page table pages.
754	 */
755	for (i = 0; i < NKPT; i++) {
756		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
757		KASSERT(mpte >= vm_page_array &&
758		    mpte < &vm_page_array[vm_page_array_size],
759		    ("pmap_init: page table page is out of range"));
760		mpte->pindex = i + KPTDI;
761		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
762	}
763
764	/*
765	 * Initialize the address space (zone) for the pv entries.  Set a
766	 * high water mark so that the system can recover from excessive
767	 * numbers of pv entries.
768	 */
769	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
770	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
771	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
772	pv_entry_max = roundup(pv_entry_max, _NPCPV);
773	pv_entry_high_water = 9 * (pv_entry_max / 10);
774
775	/*
776	 * If the kernel is running on a virtual machine, then it must assume
777	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
778	 * be prepared for the hypervisor changing the vendor and family that
779	 * are reported by CPUID.  Consequently, the workaround for AMD Family
780	 * 10h Erratum 383 is enabled if the processor's feature set does not
781	 * include at least one feature that is only supported by older Intel
782	 * or newer AMD processors.
783	 */
784	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
785	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
786	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
787	    AMDID2_FMA4)) == 0)
788		workaround_erratum383 = 1;
789
790	/*
791	 * Are large page mappings supported and enabled?
792	 */
793	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
794	if (pseflag == 0)
795		pg_ps_enabled = 0;
796	else if (pg_ps_enabled) {
797		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
798		    ("pmap_init: can't assign to pagesizes[1]"));
799		pagesizes[1] = NBPDR;
800	}
801
802	/*
803	 * Calculate the size of the pv head table for superpages.
804	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
805	 */
806	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
807	    PAGE_SIZE) / NBPDR + 1;
808
809	/*
810	 * Allocate memory for the pv head table for superpages.
811	 */
812	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
813	s = round_page(s);
814	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
815	    M_WAITOK | M_ZERO);
816	for (i = 0; i < pv_npg; i++)
817		TAILQ_INIT(&pv_table[i].pv_list);
818
819	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
820	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
821	if (pv_chunkbase == NULL)
822		panic("pmap_init: not enough kvm for pv chunks");
823	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
824#if defined(PAE) || defined(PAE_TABLES)
825	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
826	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
827	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
828	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
829#endif
830
831	pmap_initialized = 1;
832	if (!bootverbose)
833		return;
834	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
835		ppim = pmap_preinit_mapping + i;
836		if (ppim->va == 0)
837			continue;
838		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
839		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
840	}
841}
842
843
844SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
845	"Max number of PV entries");
846SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
847	"Page share factor per proc");
848
849static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
850    "2/4MB page mapping counters");
851
852static u_long pmap_pde_demotions;
853SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
854    &pmap_pde_demotions, 0, "2/4MB page demotions");
855
856static u_long pmap_pde_mappings;
857SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
858    &pmap_pde_mappings, 0, "2/4MB page mappings");
859
860static u_long pmap_pde_p_failures;
861SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
862    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
863
864static u_long pmap_pde_promotions;
865SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
866    &pmap_pde_promotions, 0, "2/4MB page promotions");
867
868/***************************************************
869 * Low level helper routines.....
870 ***************************************************/
871
872/*
873 * Determine the appropriate bits to set in a PTE or PDE for a specified
874 * caching mode.
875 */
876int
877pmap_cache_bits(int mode, boolean_t is_pde)
878{
879	int cache_bits, pat_flag, pat_idx;
880
881	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
882		panic("Unknown caching mode %d\n", mode);
883
884	/* The PAT bit is different for PTE's and PDE's. */
885	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
886
887	/* Map the caching mode to a PAT index. */
888	pat_idx = pat_index[mode];
889
890	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
891	cache_bits = 0;
892	if (pat_idx & 0x4)
893		cache_bits |= pat_flag;
894	if (pat_idx & 0x2)
895		cache_bits |= PG_NC_PCD;
896	if (pat_idx & 0x1)
897		cache_bits |= PG_NC_PWT;
898	return (cache_bits);
899}
900
901/*
902 * The caller is responsible for maintaining TLB consistency.
903 */
904static void
905pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
906{
907	pd_entry_t *pde;
908	pmap_t pmap;
909	boolean_t PTD_updated;
910
911	PTD_updated = FALSE;
912	mtx_lock_spin(&allpmaps_lock);
913	LIST_FOREACH(pmap, &allpmaps, pm_list) {
914		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
915		    PG_FRAME))
916			PTD_updated = TRUE;
917		pde = pmap_pde(pmap, va);
918		pde_store(pde, newpde);
919	}
920	mtx_unlock_spin(&allpmaps_lock);
921	KASSERT(PTD_updated,
922	    ("pmap_kenter_pde: current page table is not in allpmaps"));
923}
924
925/*
926 * After changing the page size for the specified virtual address in the page
927 * table, flush the corresponding entries from the processor's TLB.  Only the
928 * calling processor's TLB is affected.
929 *
930 * The calling thread must be pinned to a processor.
931 */
932static void
933pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
934{
935	u_long cr4;
936
937	if ((newpde & PG_PS) == 0)
938		/* Demotion: flush a specific 2MB page mapping. */
939		invlpg(va);
940	else if ((newpde & PG_G) == 0)
941		/*
942		 * Promotion: flush every 4KB page mapping from the TLB
943		 * because there are too many to flush individually.
944		 */
945		invltlb();
946	else {
947		/*
948		 * Promotion: flush every 4KB page mapping from the TLB,
949		 * including any global (PG_G) mappings.
950		 */
951		cr4 = rcr4();
952		load_cr4(cr4 & ~CR4_PGE);
953		/*
954		 * Although preemption at this point could be detrimental to
955		 * performance, it would not lead to an error.  PG_G is simply
956		 * ignored if CR4.PGE is clear.  Moreover, in case this block
957		 * is re-entered, the load_cr4() either above or below will
958		 * modify CR4.PGE flushing the TLB.
959		 */
960		load_cr4(cr4 | CR4_PGE);
961	}
962}
963#ifdef SMP
964/*
965 * For SMP, these functions have to use the IPI mechanism for coherence.
966 *
967 * N.B.: Before calling any of the following TLB invalidation functions,
968 * the calling processor must ensure that all stores updating a non-
969 * kernel page table are globally performed.  Otherwise, another
970 * processor could cache an old, pre-update entry without being
971 * invalidated.  This can happen one of two ways: (1) The pmap becomes
972 * active on another processor after its pm_active field is checked by
973 * one of the following functions but before a store updating the page
974 * table is globally performed. (2) The pmap becomes active on another
975 * processor before its pm_active field is checked but due to
976 * speculative loads one of the following functions stills reads the
977 * pmap as inactive on the other processor.
978 *
979 * The kernel page table is exempt because its pm_active field is
980 * immutable.  The kernel page table is always active on every
981 * processor.
982 */
983void
984pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
985{
986	cpuset_t other_cpus;
987	u_int cpuid;
988
989	sched_pin();
990	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
991		invlpg(va);
992		smp_invlpg(va);
993	} else {
994		cpuid = PCPU_GET(cpuid);
995		other_cpus = all_cpus;
996		CPU_CLR(cpuid, &other_cpus);
997		if (CPU_ISSET(cpuid, &pmap->pm_active))
998			invlpg(va);
999		CPU_AND(&other_cpus, &pmap->pm_active);
1000		if (!CPU_EMPTY(&other_cpus))
1001			smp_masked_invlpg(other_cpus, va);
1002	}
1003	sched_unpin();
1004}
1005
1006void
1007pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1008{
1009	cpuset_t other_cpus;
1010	vm_offset_t addr;
1011	u_int cpuid;
1012
1013	sched_pin();
1014	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1015		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1016			invlpg(addr);
1017		smp_invlpg_range(sva, eva);
1018	} else {
1019		cpuid = PCPU_GET(cpuid);
1020		other_cpus = all_cpus;
1021		CPU_CLR(cpuid, &other_cpus);
1022		if (CPU_ISSET(cpuid, &pmap->pm_active))
1023			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1024				invlpg(addr);
1025		CPU_AND(&other_cpus, &pmap->pm_active);
1026		if (!CPU_EMPTY(&other_cpus))
1027			smp_masked_invlpg_range(other_cpus, sva, eva);
1028	}
1029	sched_unpin();
1030}
1031
1032void
1033pmap_invalidate_all(pmap_t pmap)
1034{
1035	cpuset_t other_cpus;
1036	u_int cpuid;
1037
1038	sched_pin();
1039	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1040		invltlb();
1041		smp_invltlb();
1042	} else {
1043		cpuid = PCPU_GET(cpuid);
1044		other_cpus = all_cpus;
1045		CPU_CLR(cpuid, &other_cpus);
1046		if (CPU_ISSET(cpuid, &pmap->pm_active))
1047			invltlb();
1048		CPU_AND(&other_cpus, &pmap->pm_active);
1049		if (!CPU_EMPTY(&other_cpus))
1050			smp_masked_invltlb(other_cpus);
1051	}
1052	sched_unpin();
1053}
1054
1055void
1056pmap_invalidate_cache(void)
1057{
1058
1059	sched_pin();
1060	wbinvd();
1061	smp_cache_flush();
1062	sched_unpin();
1063}
1064
1065struct pde_action {
1066	cpuset_t invalidate;	/* processors that invalidate their TLB */
1067	vm_offset_t va;
1068	pd_entry_t *pde;
1069	pd_entry_t newpde;
1070	u_int store;		/* processor that updates the PDE */
1071};
1072
1073static void
1074pmap_update_pde_kernel(void *arg)
1075{
1076	struct pde_action *act = arg;
1077	pd_entry_t *pde;
1078	pmap_t pmap;
1079
1080	if (act->store == PCPU_GET(cpuid)) {
1081
1082		/*
1083		 * Elsewhere, this operation requires allpmaps_lock for
1084		 * synchronization.  Here, it does not because it is being
1085		 * performed in the context of an all_cpus rendezvous.
1086		 */
1087		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1088			pde = pmap_pde(pmap, act->va);
1089			pde_store(pde, act->newpde);
1090		}
1091	}
1092}
1093
1094static void
1095pmap_update_pde_user(void *arg)
1096{
1097	struct pde_action *act = arg;
1098
1099	if (act->store == PCPU_GET(cpuid))
1100		pde_store(act->pde, act->newpde);
1101}
1102
1103static void
1104pmap_update_pde_teardown(void *arg)
1105{
1106	struct pde_action *act = arg;
1107
1108	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1109		pmap_update_pde_invalidate(act->va, act->newpde);
1110}
1111
1112/*
1113 * Change the page size for the specified virtual address in a way that
1114 * prevents any possibility of the TLB ever having two entries that map the
1115 * same virtual address using different page sizes.  This is the recommended
1116 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1117 * machine check exception for a TLB state that is improperly diagnosed as a
1118 * hardware error.
1119 */
1120static void
1121pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1122{
1123	struct pde_action act;
1124	cpuset_t active, other_cpus;
1125	u_int cpuid;
1126
1127	sched_pin();
1128	cpuid = PCPU_GET(cpuid);
1129	other_cpus = all_cpus;
1130	CPU_CLR(cpuid, &other_cpus);
1131	if (pmap == kernel_pmap)
1132		active = all_cpus;
1133	else
1134		active = pmap->pm_active;
1135	if (CPU_OVERLAP(&active, &other_cpus)) {
1136		act.store = cpuid;
1137		act.invalidate = active;
1138		act.va = va;
1139		act.pde = pde;
1140		act.newpde = newpde;
1141		CPU_SET(cpuid, &active);
1142		smp_rendezvous_cpus(active,
1143		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1144		    pmap_update_pde_kernel : pmap_update_pde_user,
1145		    pmap_update_pde_teardown, &act);
1146	} else {
1147		if (pmap == kernel_pmap)
1148			pmap_kenter_pde(va, newpde);
1149		else
1150			pde_store(pde, newpde);
1151		if (CPU_ISSET(cpuid, &active))
1152			pmap_update_pde_invalidate(va, newpde);
1153	}
1154	sched_unpin();
1155}
1156#else /* !SMP */
1157/*
1158 * Normal, non-SMP, 486+ invalidation functions.
1159 * We inline these within pmap.c for speed.
1160 */
1161PMAP_INLINE void
1162pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1163{
1164
1165	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1166		invlpg(va);
1167}
1168
1169PMAP_INLINE void
1170pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1171{
1172	vm_offset_t addr;
1173
1174	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1175		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1176			invlpg(addr);
1177}
1178
1179PMAP_INLINE void
1180pmap_invalidate_all(pmap_t pmap)
1181{
1182
1183	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1184		invltlb();
1185}
1186
1187PMAP_INLINE void
1188pmap_invalidate_cache(void)
1189{
1190
1191	wbinvd();
1192}
1193
1194static void
1195pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1196{
1197
1198	if (pmap == kernel_pmap)
1199		pmap_kenter_pde(va, newpde);
1200	else
1201		pde_store(pde, newpde);
1202	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1203		pmap_update_pde_invalidate(va, newpde);
1204}
1205#endif /* !SMP */
1206
1207#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1208
1209void
1210pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1211{
1212
1213	if (force) {
1214		sva &= ~(vm_offset_t)cpu_clflush_line_size;
1215	} else {
1216		KASSERT((sva & PAGE_MASK) == 0,
1217		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1218		KASSERT((eva & PAGE_MASK) == 0,
1219		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1220	}
1221
1222	if ((cpu_feature & CPUID_SS) != 0 && !force)
1223		; /* If "Self Snoop" is supported and allowed, do nothing. */
1224	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1225	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1226
1227#ifdef DEV_APIC
1228		/*
1229		 * XXX: Some CPUs fault, hang, or trash the local APIC
1230		 * registers if we use CLFLUSH on the local APIC
1231		 * range.  The local APIC is always uncached, so we
1232		 * don't need to flush for that range anyway.
1233		 */
1234		if (pmap_kextract(sva) == lapic_paddr)
1235			return;
1236#endif
1237		/*
1238		 * Otherwise, do per-cache line flush.  Use the mfence
1239		 * instruction to insure that previous stores are
1240		 * included in the write-back.  The processor
1241		 * propagates flush to other processors in the cache
1242		 * coherence domain.
1243		 */
1244		mfence();
1245		for (; sva < eva; sva += cpu_clflush_line_size)
1246			clflush(sva);
1247		mfence();
1248	} else {
1249
1250		/*
1251		 * No targeted cache flush methods are supported by CPU,
1252		 * or the supplied range is bigger than 2MB.
1253		 * Globally invalidate cache.
1254		 */
1255		pmap_invalidate_cache();
1256	}
1257}
1258
1259void
1260pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1261{
1262	int i;
1263
1264	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1265	    (cpu_feature & CPUID_CLFSH) == 0) {
1266		pmap_invalidate_cache();
1267	} else {
1268		for (i = 0; i < count; i++)
1269			pmap_flush_page(pages[i]);
1270	}
1271}
1272
1273/*
1274 * Are we current address space or kernel?  N.B. We return FALSE when
1275 * a pmap's page table is in use because a kernel thread is borrowing
1276 * it.  The borrowed page table can change spontaneously, making any
1277 * dependence on its continued use subject to a race condition.
1278 */
1279static __inline int
1280pmap_is_current(pmap_t pmap)
1281{
1282
1283	return (pmap == kernel_pmap ||
1284	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1285	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1286}
1287
1288/*
1289 * If the given pmap is not the current or kernel pmap, the returned pte must
1290 * be released by passing it to pmap_pte_release().
1291 */
1292pt_entry_t *
1293pmap_pte(pmap_t pmap, vm_offset_t va)
1294{
1295	pd_entry_t newpf;
1296	pd_entry_t *pde;
1297
1298	pde = pmap_pde(pmap, va);
1299	if (*pde & PG_PS)
1300		return (pde);
1301	if (*pde != 0) {
1302		/* are we current address space or kernel? */
1303		if (pmap_is_current(pmap))
1304			return (vtopte(va));
1305		mtx_lock(&PMAP2mutex);
1306		newpf = *pde & PG_FRAME;
1307		if ((*PMAP2 & PG_FRAME) != newpf) {
1308			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1309			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1310		}
1311		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1312	}
1313	return (NULL);
1314}
1315
1316/*
1317 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1318 * being NULL.
1319 */
1320static __inline void
1321pmap_pte_release(pt_entry_t *pte)
1322{
1323
1324	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1325		mtx_unlock(&PMAP2mutex);
1326}
1327
1328/*
1329 * NB:  The sequence of updating a page table followed by accesses to the
1330 * corresponding pages is subject to the situation described in the "AMD64
1331 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
1332 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
1333 * right after modifying the PTE bits is crucial.
1334 */
1335static __inline void
1336invlcaddr(void *caddr)
1337{
1338
1339	invlpg((u_int)caddr);
1340}
1341
1342/*
1343 * Super fast pmap_pte routine best used when scanning
1344 * the pv lists.  This eliminates many coarse-grained
1345 * invltlb calls.  Note that many of the pv list
1346 * scans are across different pmaps.  It is very wasteful
1347 * to do an entire invltlb for checking a single mapping.
1348 *
1349 * If the given pmap is not the current pmap, pvh_global_lock
1350 * must be held and curthread pinned to a CPU.
1351 */
1352static pt_entry_t *
1353pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1354{
1355	pd_entry_t newpf;
1356	pd_entry_t *pde;
1357
1358	pde = pmap_pde(pmap, va);
1359	if (*pde & PG_PS)
1360		return (pde);
1361	if (*pde != 0) {
1362		/* are we current address space or kernel? */
1363		if (pmap_is_current(pmap))
1364			return (vtopte(va));
1365		rw_assert(&pvh_global_lock, RA_WLOCKED);
1366		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1367		newpf = *pde & PG_FRAME;
1368		if ((*PMAP1 & PG_FRAME) != newpf) {
1369			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1370#ifdef SMP
1371			PMAP1cpu = PCPU_GET(cpuid);
1372#endif
1373			invlcaddr(PADDR1);
1374			PMAP1changed++;
1375		} else
1376#ifdef SMP
1377		if (PMAP1cpu != PCPU_GET(cpuid)) {
1378			PMAP1cpu = PCPU_GET(cpuid);
1379			invlcaddr(PADDR1);
1380			PMAP1changedcpu++;
1381		} else
1382#endif
1383			PMAP1unchanged++;
1384		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1385	}
1386	return (0);
1387}
1388
1389/*
1390 *	Routine:	pmap_extract
1391 *	Function:
1392 *		Extract the physical page address associated
1393 *		with the given map/virtual_address pair.
1394 */
1395vm_paddr_t
1396pmap_extract(pmap_t pmap, vm_offset_t va)
1397{
1398	vm_paddr_t rtval;
1399	pt_entry_t *pte;
1400	pd_entry_t pde;
1401
1402	rtval = 0;
1403	PMAP_LOCK(pmap);
1404	pde = pmap->pm_pdir[va >> PDRSHIFT];
1405	if (pde != 0) {
1406		if ((pde & PG_PS) != 0)
1407			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1408		else {
1409			pte = pmap_pte(pmap, va);
1410			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1411			pmap_pte_release(pte);
1412		}
1413	}
1414	PMAP_UNLOCK(pmap);
1415	return (rtval);
1416}
1417
1418/*
1419 *	Routine:	pmap_extract_and_hold
1420 *	Function:
1421 *		Atomically extract and hold the physical page
1422 *		with the given pmap and virtual address pair
1423 *		if that mapping permits the given protection.
1424 */
1425vm_page_t
1426pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1427{
1428	pd_entry_t pde;
1429	pt_entry_t pte, *ptep;
1430	vm_page_t m;
1431	vm_paddr_t pa;
1432
1433	pa = 0;
1434	m = NULL;
1435	PMAP_LOCK(pmap);
1436retry:
1437	pde = *pmap_pde(pmap, va);
1438	if (pde != 0) {
1439		if (pde & PG_PS) {
1440			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1441				if (vm_page_pa_tryrelock(pmap, (pde &
1442				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1443					goto retry;
1444				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1445				    (va & PDRMASK));
1446				vm_page_hold(m);
1447			}
1448		} else {
1449			ptep = pmap_pte(pmap, va);
1450			pte = *ptep;
1451			pmap_pte_release(ptep);
1452			if (pte != 0 &&
1453			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1454				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1455				    &pa))
1456					goto retry;
1457				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1458				vm_page_hold(m);
1459			}
1460		}
1461	}
1462	PA_UNLOCK_COND(pa);
1463	PMAP_UNLOCK(pmap);
1464	return (m);
1465}
1466
1467/***************************************************
1468 * Low level mapping routines.....
1469 ***************************************************/
1470
1471/*
1472 * Add a wired page to the kva.
1473 * Note: not SMP coherent.
1474 *
1475 * This function may be used before pmap_bootstrap() is called.
1476 */
1477PMAP_INLINE void
1478pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1479{
1480	pt_entry_t *pte;
1481
1482	pte = vtopte(va);
1483	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1484}
1485
1486static __inline void
1487pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1488{
1489	pt_entry_t *pte;
1490
1491	pte = vtopte(va);
1492	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1493}
1494
1495/*
1496 * Remove a page from the kernel pagetables.
1497 * Note: not SMP coherent.
1498 *
1499 * This function may be used before pmap_bootstrap() is called.
1500 */
1501PMAP_INLINE void
1502pmap_kremove(vm_offset_t va)
1503{
1504	pt_entry_t *pte;
1505
1506	pte = vtopte(va);
1507	pte_clear(pte);
1508}
1509
1510/*
1511 *	Used to map a range of physical addresses into kernel
1512 *	virtual address space.
1513 *
1514 *	The value passed in '*virt' is a suggested virtual address for
1515 *	the mapping. Architectures which can support a direct-mapped
1516 *	physical to virtual region can return the appropriate address
1517 *	within that region, leaving '*virt' unchanged. Other
1518 *	architectures should map the pages starting at '*virt' and
1519 *	update '*virt' with the first usable address after the mapped
1520 *	region.
1521 */
1522vm_offset_t
1523pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1524{
1525	vm_offset_t va, sva;
1526	vm_paddr_t superpage_offset;
1527	pd_entry_t newpde;
1528
1529	va = *virt;
1530	/*
1531	 * Does the physical address range's size and alignment permit at
1532	 * least one superpage mapping to be created?
1533	 */
1534	superpage_offset = start & PDRMASK;
1535	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1536		/*
1537		 * Increase the starting virtual address so that its alignment
1538		 * does not preclude the use of superpage mappings.
1539		 */
1540		if ((va & PDRMASK) < superpage_offset)
1541			va = (va & ~PDRMASK) + superpage_offset;
1542		else if ((va & PDRMASK) > superpage_offset)
1543			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1544	}
1545	sva = va;
1546	while (start < end) {
1547		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1548		    pseflag) {
1549			KASSERT((va & PDRMASK) == 0,
1550			    ("pmap_map: misaligned va %#x", va));
1551			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1552			pmap_kenter_pde(va, newpde);
1553			va += NBPDR;
1554			start += NBPDR;
1555		} else {
1556			pmap_kenter(va, start);
1557			va += PAGE_SIZE;
1558			start += PAGE_SIZE;
1559		}
1560	}
1561	pmap_invalidate_range(kernel_pmap, sva, va);
1562	*virt = va;
1563	return (sva);
1564}
1565
1566
1567/*
1568 * Add a list of wired pages to the kva
1569 * this routine is only used for temporary
1570 * kernel mappings that do not need to have
1571 * page modification or references recorded.
1572 * Note that old mappings are simply written
1573 * over.  The page *must* be wired.
1574 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1575 */
1576void
1577pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1578{
1579	pt_entry_t *endpte, oldpte, pa, *pte;
1580	vm_page_t m;
1581
1582	oldpte = 0;
1583	pte = vtopte(sva);
1584	endpte = pte + count;
1585	while (pte < endpte) {
1586		m = *ma++;
1587		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1588		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1589			oldpte |= *pte;
1590			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1591		}
1592		pte++;
1593	}
1594	if (__predict_false((oldpte & PG_V) != 0))
1595		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1596		    PAGE_SIZE);
1597}
1598
1599/*
1600 * This routine tears out page mappings from the
1601 * kernel -- it is meant only for temporary mappings.
1602 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1603 */
1604void
1605pmap_qremove(vm_offset_t sva, int count)
1606{
1607	vm_offset_t va;
1608
1609	va = sva;
1610	while (count-- > 0) {
1611		pmap_kremove(va);
1612		va += PAGE_SIZE;
1613	}
1614	pmap_invalidate_range(kernel_pmap, sva, va);
1615}
1616
1617/***************************************************
1618 * Page table page management routines.....
1619 ***************************************************/
1620static __inline void
1621pmap_free_zero_pages(struct spglist *free)
1622{
1623	vm_page_t m;
1624
1625	while ((m = SLIST_FIRST(free)) != NULL) {
1626		SLIST_REMOVE_HEAD(free, plinks.s.ss);
1627		/* Preserve the page's PG_ZERO setting. */
1628		vm_page_free_toq(m);
1629	}
1630}
1631
1632/*
1633 * Schedule the specified unused page table page to be freed.  Specifically,
1634 * add the page to the specified list of pages that will be released to the
1635 * physical memory manager after the TLB has been updated.
1636 */
1637static __inline void
1638pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1639    boolean_t set_PG_ZERO)
1640{
1641
1642	if (set_PG_ZERO)
1643		m->flags |= PG_ZERO;
1644	else
1645		m->flags &= ~PG_ZERO;
1646	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1647}
1648
1649/*
1650 * Inserts the specified page table page into the specified pmap's collection
1651 * of idle page table pages.  Each of a pmap's page table pages is responsible
1652 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1653 * ordered by this virtual address range.
1654 */
1655static __inline int
1656pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1657{
1658
1659	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1660	return (vm_radix_insert(&pmap->pm_root, mpte));
1661}
1662
1663/*
1664 * Looks for a page table page mapping the specified virtual address in the
1665 * specified pmap's collection of idle page table pages.  Returns NULL if there
1666 * is no page table page corresponding to the specified virtual address.
1667 */
1668static __inline vm_page_t
1669pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1670{
1671
1672	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1673	return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT));
1674}
1675
1676/*
1677 * Removes the specified page table page from the specified pmap's collection
1678 * of idle page table pages.  The specified page table page must be a member of
1679 * the pmap's collection.
1680 */
1681static __inline void
1682pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1683{
1684
1685	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1686	vm_radix_remove(&pmap->pm_root, mpte->pindex);
1687}
1688
1689/*
1690 * Decrements a page table page's wire count, which is used to record the
1691 * number of valid page table entries within the page.  If the wire count
1692 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1693 * page table page was unmapped and FALSE otherwise.
1694 */
1695static inline boolean_t
1696pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1697{
1698
1699	--m->wire_count;
1700	if (m->wire_count == 0) {
1701		_pmap_unwire_ptp(pmap, m, free);
1702		return (TRUE);
1703	} else
1704		return (FALSE);
1705}
1706
1707static void
1708_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1709{
1710	vm_offset_t pteva;
1711
1712	/*
1713	 * unmap the page table page
1714	 */
1715	pmap->pm_pdir[m->pindex] = 0;
1716	--pmap->pm_stats.resident_count;
1717
1718	/*
1719	 * This is a release store so that the ordinary store unmapping
1720	 * the page table page is globally performed before TLB shoot-
1721	 * down is begun.
1722	 */
1723	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1724
1725	/*
1726	 * Do an invltlb to make the invalidated mapping
1727	 * take effect immediately.
1728	 */
1729	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1730	pmap_invalidate_page(pmap, pteva);
1731
1732	/*
1733	 * Put page on a list so that it is released after
1734	 * *ALL* TLB shootdown is done
1735	 */
1736	pmap_add_delayed_free_list(m, free, TRUE);
1737}
1738
1739/*
1740 * After removing a page table entry, this routine is used to
1741 * conditionally free the page, and manage the hold/wire counts.
1742 */
1743static int
1744pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1745{
1746	pd_entry_t ptepde;
1747	vm_page_t mpte;
1748
1749	if (va >= VM_MAXUSER_ADDRESS)
1750		return (0);
1751	ptepde = *pmap_pde(pmap, va);
1752	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1753	return (pmap_unwire_ptp(pmap, mpte, free));
1754}
1755
1756/*
1757 * Initialize the pmap for the swapper process.
1758 */
1759void
1760pmap_pinit0(pmap_t pmap)
1761{
1762
1763	PMAP_LOCK_INIT(pmap);
1764	/*
1765	 * Since the page table directory is shared with the kernel pmap,
1766	 * which is already included in the list "allpmaps", this pmap does
1767	 * not need to be inserted into that list.
1768	 */
1769	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1770#if defined(PAE) || defined(PAE_TABLES)
1771	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1772#endif
1773	pmap->pm_root.rt_root = 0;
1774	CPU_ZERO(&pmap->pm_active);
1775	PCPU_SET(curpmap, pmap);
1776	TAILQ_INIT(&pmap->pm_pvchunk);
1777	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1778}
1779
1780/*
1781 * Initialize a preallocated and zeroed pmap structure,
1782 * such as one in a vmspace structure.
1783 */
1784int
1785pmap_pinit(pmap_t pmap)
1786{
1787	vm_page_t m, ptdpg[NPGPTD];
1788	vm_paddr_t pa;
1789	int i;
1790
1791	/*
1792	 * No need to allocate page table space yet but we do need a valid
1793	 * page directory table.
1794	 */
1795	if (pmap->pm_pdir == NULL) {
1796		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
1797		if (pmap->pm_pdir == NULL)
1798			return (0);
1799#if defined(PAE) || defined(PAE_TABLES)
1800		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1801		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1802		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1803		    ("pmap_pinit: pdpt misaligned"));
1804		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1805		    ("pmap_pinit: pdpt above 4g"));
1806#endif
1807		pmap->pm_root.rt_root = 0;
1808	}
1809	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1810	    ("pmap_pinit: pmap has reserved page table page(s)"));
1811
1812	/*
1813	 * allocate the page directory page(s)
1814	 */
1815	for (i = 0; i < NPGPTD;) {
1816		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1817		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1818		if (m == NULL)
1819			VM_WAIT;
1820		else {
1821			ptdpg[i++] = m;
1822		}
1823	}
1824
1825	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1826
1827	for (i = 0; i < NPGPTD; i++)
1828		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1829			pagezero(pmap->pm_pdir + (i * NPDEPG));
1830
1831	mtx_lock_spin(&allpmaps_lock);
1832	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1833	/* Copy the kernel page table directory entries. */
1834	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1835	mtx_unlock_spin(&allpmaps_lock);
1836
1837	/* install self-referential address mapping entry(s) */
1838	for (i = 0; i < NPGPTD; i++) {
1839		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1840		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1841#if defined(PAE) || defined(PAE_TABLES)
1842		pmap->pm_pdpt[i] = pa | PG_V;
1843#endif
1844	}
1845
1846	CPU_ZERO(&pmap->pm_active);
1847	TAILQ_INIT(&pmap->pm_pvchunk);
1848	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1849
1850	return (1);
1851}
1852
1853/*
1854 * this routine is called if the page table page is not
1855 * mapped correctly.
1856 */
1857static vm_page_t
1858_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
1859{
1860	vm_paddr_t ptepa;
1861	vm_page_t m;
1862
1863	/*
1864	 * Allocate a page table page.
1865	 */
1866	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1867	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1868		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
1869			PMAP_UNLOCK(pmap);
1870			rw_wunlock(&pvh_global_lock);
1871			VM_WAIT;
1872			rw_wlock(&pvh_global_lock);
1873			PMAP_LOCK(pmap);
1874		}
1875
1876		/*
1877		 * Indicate the need to retry.  While waiting, the page table
1878		 * page may have been allocated.
1879		 */
1880		return (NULL);
1881	}
1882	if ((m->flags & PG_ZERO) == 0)
1883		pmap_zero_page(m);
1884
1885	/*
1886	 * Map the pagetable page into the process address space, if
1887	 * it isn't already there.
1888	 */
1889
1890	pmap->pm_stats.resident_count++;
1891
1892	ptepa = VM_PAGE_TO_PHYS(m);
1893	pmap->pm_pdir[ptepindex] =
1894		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1895
1896	return (m);
1897}
1898
1899static vm_page_t
1900pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
1901{
1902	u_int ptepindex;
1903	pd_entry_t ptepa;
1904	vm_page_t m;
1905
1906	/*
1907	 * Calculate pagetable page index
1908	 */
1909	ptepindex = va >> PDRSHIFT;
1910retry:
1911	/*
1912	 * Get the page directory entry
1913	 */
1914	ptepa = pmap->pm_pdir[ptepindex];
1915
1916	/*
1917	 * This supports switching from a 4MB page to a
1918	 * normal 4K page.
1919	 */
1920	if (ptepa & PG_PS) {
1921		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1922		ptepa = pmap->pm_pdir[ptepindex];
1923	}
1924
1925	/*
1926	 * If the page table page is mapped, we just increment the
1927	 * hold count, and activate it.
1928	 */
1929	if (ptepa) {
1930		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1931		m->wire_count++;
1932	} else {
1933		/*
1934		 * Here if the pte page isn't mapped, or if it has
1935		 * been deallocated.
1936		 */
1937		m = _pmap_allocpte(pmap, ptepindex, flags);
1938		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
1939			goto retry;
1940	}
1941	return (m);
1942}
1943
1944
1945/***************************************************
1946* Pmap allocation/deallocation routines.
1947 ***************************************************/
1948
1949#ifdef SMP
1950/*
1951 * Deal with a SMP shootdown of other users of the pmap that we are
1952 * trying to dispose of.  This can be a bit hairy.
1953 */
1954static cpuset_t *lazymask;
1955static u_int lazyptd;
1956static volatile u_int lazywait;
1957
1958void pmap_lazyfix_action(void);
1959
1960void
1961pmap_lazyfix_action(void)
1962{
1963
1964#ifdef COUNT_IPIS
1965	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1966#endif
1967	if (rcr3() == lazyptd)
1968		load_cr3(curpcb->pcb_cr3);
1969	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1970	atomic_store_rel_int(&lazywait, 1);
1971}
1972
1973static void
1974pmap_lazyfix_self(u_int cpuid)
1975{
1976
1977	if (rcr3() == lazyptd)
1978		load_cr3(curpcb->pcb_cr3);
1979	CPU_CLR_ATOMIC(cpuid, lazymask);
1980}
1981
1982
1983static void
1984pmap_lazyfix(pmap_t pmap)
1985{
1986	cpuset_t mymask, mask;
1987	u_int cpuid, spins;
1988	int lsb;
1989
1990	mask = pmap->pm_active;
1991	while (!CPU_EMPTY(&mask)) {
1992		spins = 50000000;
1993
1994		/* Find least significant set bit. */
1995		lsb = CPU_FFS(&mask);
1996		MPASS(lsb != 0);
1997		lsb--;
1998		CPU_SETOF(lsb, &mask);
1999		mtx_lock_spin(&smp_ipi_mtx);
2000#if defined(PAE) || defined(PAE_TABLES)
2001		lazyptd = vtophys(pmap->pm_pdpt);
2002#else
2003		lazyptd = vtophys(pmap->pm_pdir);
2004#endif
2005		cpuid = PCPU_GET(cpuid);
2006
2007		/* Use a cpuset just for having an easy check. */
2008		CPU_SETOF(cpuid, &mymask);
2009		if (!CPU_CMP(&mask, &mymask)) {
2010			lazymask = &pmap->pm_active;
2011			pmap_lazyfix_self(cpuid);
2012		} else {
2013			atomic_store_rel_int((u_int *)&lazymask,
2014			    (u_int)&pmap->pm_active);
2015			atomic_store_rel_int(&lazywait, 0);
2016			ipi_selected(mask, IPI_LAZYPMAP);
2017			while (lazywait == 0) {
2018				ia32_pause();
2019				if (--spins == 0)
2020					break;
2021			}
2022		}
2023		mtx_unlock_spin(&smp_ipi_mtx);
2024		if (spins == 0)
2025			printf("pmap_lazyfix: spun for 50000000\n");
2026		mask = pmap->pm_active;
2027	}
2028}
2029
2030#else	/* SMP */
2031
2032/*
2033 * Cleaning up on uniprocessor is easy.  For various reasons, we're
2034 * unlikely to have to even execute this code, including the fact
2035 * that the cleanup is deferred until the parent does a wait(2), which
2036 * means that another userland process has run.
2037 */
2038static void
2039pmap_lazyfix(pmap_t pmap)
2040{
2041	u_int cr3;
2042
2043	cr3 = vtophys(pmap->pm_pdir);
2044	if (cr3 == rcr3()) {
2045		load_cr3(curpcb->pcb_cr3);
2046		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
2047	}
2048}
2049#endif	/* SMP */
2050
2051/*
2052 * Release any resources held by the given physical map.
2053 * Called when a pmap initialized by pmap_pinit is being released.
2054 * Should only be called if the map contains no valid mappings.
2055 */
2056void
2057pmap_release(pmap_t pmap)
2058{
2059	vm_page_t m, ptdpg[NPGPTD];
2060	int i;
2061
2062	KASSERT(pmap->pm_stats.resident_count == 0,
2063	    ("pmap_release: pmap resident count %ld != 0",
2064	    pmap->pm_stats.resident_count));
2065	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2066	    ("pmap_release: pmap has reserved page table page(s)"));
2067
2068	pmap_lazyfix(pmap);
2069	mtx_lock_spin(&allpmaps_lock);
2070	LIST_REMOVE(pmap, pm_list);
2071	mtx_unlock_spin(&allpmaps_lock);
2072
2073	for (i = 0; i < NPGPTD; i++)
2074		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2075		    PG_FRAME);
2076
2077	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2078	    sizeof(*pmap->pm_pdir));
2079
2080	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2081
2082	for (i = 0; i < NPGPTD; i++) {
2083		m = ptdpg[i];
2084#if defined(PAE) || defined(PAE_TABLES)
2085		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2086		    ("pmap_release: got wrong ptd page"));
2087#endif
2088		m->wire_count--;
2089		atomic_subtract_int(&cnt.v_wire_count, 1);
2090		vm_page_free_zero(m);
2091	}
2092}
2093
2094static int
2095kvm_size(SYSCTL_HANDLER_ARGS)
2096{
2097	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2098
2099	return (sysctl_handle_long(oidp, &ksize, 0, req));
2100}
2101SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2102    0, 0, kvm_size, "IU", "Size of KVM");
2103
2104static int
2105kvm_free(SYSCTL_HANDLER_ARGS)
2106{
2107	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2108
2109	return (sysctl_handle_long(oidp, &kfree, 0, req));
2110}
2111SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2112    0, 0, kvm_free, "IU", "Amount of KVM free");
2113
2114/*
2115 * grow the number of kernel page table entries, if needed
2116 */
2117void
2118pmap_growkernel(vm_offset_t addr)
2119{
2120	vm_paddr_t ptppaddr;
2121	vm_page_t nkpg;
2122	pd_entry_t newpdir;
2123
2124	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2125	addr = roundup2(addr, NBPDR);
2126	if (addr - 1 >= kernel_map->max_offset)
2127		addr = kernel_map->max_offset;
2128	while (kernel_vm_end < addr) {
2129		if (pdir_pde(PTD, kernel_vm_end)) {
2130			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2131			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2132				kernel_vm_end = kernel_map->max_offset;
2133				break;
2134			}
2135			continue;
2136		}
2137
2138		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2139		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2140		    VM_ALLOC_ZERO);
2141		if (nkpg == NULL)
2142			panic("pmap_growkernel: no memory to grow kernel");
2143
2144		nkpt++;
2145
2146		if ((nkpg->flags & PG_ZERO) == 0)
2147			pmap_zero_page(nkpg);
2148		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2149		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2150		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2151
2152		pmap_kenter_pde(kernel_vm_end, newpdir);
2153		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2154		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2155			kernel_vm_end = kernel_map->max_offset;
2156			break;
2157		}
2158	}
2159}
2160
2161
2162/***************************************************
2163 * page management routines.
2164 ***************************************************/
2165
2166CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2167CTASSERT(_NPCM == 11);
2168CTASSERT(_NPCPV == 336);
2169
2170static __inline struct pv_chunk *
2171pv_to_chunk(pv_entry_t pv)
2172{
2173
2174	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2175}
2176
2177#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2178
2179#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2180#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2181
2182static const uint32_t pc_freemask[_NPCM] = {
2183	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2184	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2185	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2186	PC_FREE0_9, PC_FREE10
2187};
2188
2189SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2190	"Current number of pv entries");
2191
2192#ifdef PV_STATS
2193static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2194
2195SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2196	"Current number of pv entry chunks");
2197SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2198	"Current number of pv entry chunks allocated");
2199SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2200	"Current number of pv entry chunks frees");
2201SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2202	"Number of times tried to get a chunk page but failed.");
2203
2204static long pv_entry_frees, pv_entry_allocs;
2205static int pv_entry_spare;
2206
2207SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2208	"Current number of pv entry frees");
2209SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2210	"Current number of pv entry allocs");
2211SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2212	"Current number of spare pv entries");
2213#endif
2214
2215/*
2216 * We are in a serious low memory condition.  Resort to
2217 * drastic measures to free some pages so we can allocate
2218 * another pv entry chunk.
2219 */
2220static vm_page_t
2221pmap_pv_reclaim(pmap_t locked_pmap)
2222{
2223	struct pch newtail;
2224	struct pv_chunk *pc;
2225	struct md_page *pvh;
2226	pd_entry_t *pde;
2227	pmap_t pmap;
2228	pt_entry_t *pte, tpte;
2229	pv_entry_t pv;
2230	vm_offset_t va;
2231	vm_page_t m, m_pc;
2232	struct spglist free;
2233	uint32_t inuse;
2234	int bit, field, freed;
2235
2236	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2237	pmap = NULL;
2238	m_pc = NULL;
2239	SLIST_INIT(&free);
2240	TAILQ_INIT(&newtail);
2241	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2242	    SLIST_EMPTY(&free))) {
2243		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2244		if (pmap != pc->pc_pmap) {
2245			if (pmap != NULL) {
2246				pmap_invalidate_all(pmap);
2247				if (pmap != locked_pmap)
2248					PMAP_UNLOCK(pmap);
2249			}
2250			pmap = pc->pc_pmap;
2251			/* Avoid deadlock and lock recursion. */
2252			if (pmap > locked_pmap)
2253				PMAP_LOCK(pmap);
2254			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2255				pmap = NULL;
2256				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2257				continue;
2258			}
2259		}
2260
2261		/*
2262		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2263		 */
2264		freed = 0;
2265		for (field = 0; field < _NPCM; field++) {
2266			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2267			    inuse != 0; inuse &= ~(1UL << bit)) {
2268				bit = bsfl(inuse);
2269				pv = &pc->pc_pventry[field * 32 + bit];
2270				va = pv->pv_va;
2271				pde = pmap_pde(pmap, va);
2272				if ((*pde & PG_PS) != 0)
2273					continue;
2274				pte = pmap_pte(pmap, va);
2275				tpte = *pte;
2276				if ((tpte & PG_W) == 0)
2277					tpte = pte_load_clear(pte);
2278				pmap_pte_release(pte);
2279				if ((tpte & PG_W) != 0)
2280					continue;
2281				KASSERT(tpte != 0,
2282				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2283				    pmap, va));
2284				if ((tpte & PG_G) != 0)
2285					pmap_invalidate_page(pmap, va);
2286				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2287				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2288					vm_page_dirty(m);
2289				if ((tpte & PG_A) != 0)
2290					vm_page_aflag_set(m, PGA_REFERENCED);
2291				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2292				if (TAILQ_EMPTY(&m->md.pv_list) &&
2293				    (m->flags & PG_FICTITIOUS) == 0) {
2294					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2295					if (TAILQ_EMPTY(&pvh->pv_list)) {
2296						vm_page_aflag_clear(m,
2297						    PGA_WRITEABLE);
2298					}
2299				}
2300				pc->pc_map[field] |= 1UL << bit;
2301				pmap_unuse_pt(pmap, va, &free);
2302				freed++;
2303			}
2304		}
2305		if (freed == 0) {
2306			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2307			continue;
2308		}
2309		/* Every freed mapping is for a 4 KB page. */
2310		pmap->pm_stats.resident_count -= freed;
2311		PV_STAT(pv_entry_frees += freed);
2312		PV_STAT(pv_entry_spare += freed);
2313		pv_entry_count -= freed;
2314		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2315		for (field = 0; field < _NPCM; field++)
2316			if (pc->pc_map[field] != pc_freemask[field]) {
2317				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2318				    pc_list);
2319				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2320
2321				/*
2322				 * One freed pv entry in locked_pmap is
2323				 * sufficient.
2324				 */
2325				if (pmap == locked_pmap)
2326					goto out;
2327				break;
2328			}
2329		if (field == _NPCM) {
2330			PV_STAT(pv_entry_spare -= _NPCPV);
2331			PV_STAT(pc_chunk_count--);
2332			PV_STAT(pc_chunk_frees++);
2333			/* Entire chunk is free; return it. */
2334			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2335			pmap_qremove((vm_offset_t)pc, 1);
2336			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2337			break;
2338		}
2339	}
2340out:
2341	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2342	if (pmap != NULL) {
2343		pmap_invalidate_all(pmap);
2344		if (pmap != locked_pmap)
2345			PMAP_UNLOCK(pmap);
2346	}
2347	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
2348		m_pc = SLIST_FIRST(&free);
2349		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2350		/* Recycle a freed page table page. */
2351		m_pc->wire_count = 1;
2352		atomic_add_int(&cnt.v_wire_count, 1);
2353	}
2354	pmap_free_zero_pages(&free);
2355	return (m_pc);
2356}
2357
2358/*
2359 * free the pv_entry back to the free list
2360 */
2361static void
2362free_pv_entry(pmap_t pmap, pv_entry_t pv)
2363{
2364	struct pv_chunk *pc;
2365	int idx, field, bit;
2366
2367	rw_assert(&pvh_global_lock, RA_WLOCKED);
2368	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2369	PV_STAT(pv_entry_frees++);
2370	PV_STAT(pv_entry_spare++);
2371	pv_entry_count--;
2372	pc = pv_to_chunk(pv);
2373	idx = pv - &pc->pc_pventry[0];
2374	field = idx / 32;
2375	bit = idx % 32;
2376	pc->pc_map[field] |= 1ul << bit;
2377	for (idx = 0; idx < _NPCM; idx++)
2378		if (pc->pc_map[idx] != pc_freemask[idx]) {
2379			/*
2380			 * 98% of the time, pc is already at the head of the
2381			 * list.  If it isn't already, move it to the head.
2382			 */
2383			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2384			    pc)) {
2385				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2386				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2387				    pc_list);
2388			}
2389			return;
2390		}
2391	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2392	free_pv_chunk(pc);
2393}
2394
2395static void
2396free_pv_chunk(struct pv_chunk *pc)
2397{
2398	vm_page_t m;
2399
2400 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2401	PV_STAT(pv_entry_spare -= _NPCPV);
2402	PV_STAT(pc_chunk_count--);
2403	PV_STAT(pc_chunk_frees++);
2404	/* entire chunk is free, return it */
2405	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2406	pmap_qremove((vm_offset_t)pc, 1);
2407	vm_page_unwire(m, 0);
2408	vm_page_free(m);
2409	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2410}
2411
2412/*
2413 * get a new pv_entry, allocating a block from the system
2414 * when needed.
2415 */
2416static pv_entry_t
2417get_pv_entry(pmap_t pmap, boolean_t try)
2418{
2419	static const struct timeval printinterval = { 60, 0 };
2420	static struct timeval lastprint;
2421	int bit, field;
2422	pv_entry_t pv;
2423	struct pv_chunk *pc;
2424	vm_page_t m;
2425
2426	rw_assert(&pvh_global_lock, RA_WLOCKED);
2427	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2428	PV_STAT(pv_entry_allocs++);
2429	pv_entry_count++;
2430	if (pv_entry_count > pv_entry_high_water)
2431		if (ratecheck(&lastprint, &printinterval))
2432			printf("Approaching the limit on PV entries, consider "
2433			    "increasing either the vm.pmap.shpgperproc or the "
2434			    "vm.pmap.pv_entry_max tunable.\n");
2435retry:
2436	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2437	if (pc != NULL) {
2438		for (field = 0; field < _NPCM; field++) {
2439			if (pc->pc_map[field]) {
2440				bit = bsfl(pc->pc_map[field]);
2441				break;
2442			}
2443		}
2444		if (field < _NPCM) {
2445			pv = &pc->pc_pventry[field * 32 + bit];
2446			pc->pc_map[field] &= ~(1ul << bit);
2447			/* If this was the last item, move it to tail */
2448			for (field = 0; field < _NPCM; field++)
2449				if (pc->pc_map[field] != 0) {
2450					PV_STAT(pv_entry_spare--);
2451					return (pv);	/* not full, return */
2452				}
2453			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2454			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2455			PV_STAT(pv_entry_spare--);
2456			return (pv);
2457		}
2458	}
2459	/*
2460	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2461	 * global lock.  If "pv_vafree" is currently non-empty, it will
2462	 * remain non-empty until pmap_ptelist_alloc() completes.
2463	 */
2464	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2465	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2466		if (try) {
2467			pv_entry_count--;
2468			PV_STAT(pc_chunk_tryfail++);
2469			return (NULL);
2470		}
2471		m = pmap_pv_reclaim(pmap);
2472		if (m == NULL)
2473			goto retry;
2474	}
2475	PV_STAT(pc_chunk_count++);
2476	PV_STAT(pc_chunk_allocs++);
2477	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2478	pmap_qenter((vm_offset_t)pc, &m, 1);
2479	pc->pc_pmap = pmap;
2480	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2481	for (field = 1; field < _NPCM; field++)
2482		pc->pc_map[field] = pc_freemask[field];
2483	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2484	pv = &pc->pc_pventry[0];
2485	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2486	PV_STAT(pv_entry_spare += _NPCPV - 1);
2487	return (pv);
2488}
2489
2490static __inline pv_entry_t
2491pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2492{
2493	pv_entry_t pv;
2494
2495	rw_assert(&pvh_global_lock, RA_WLOCKED);
2496	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2497		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2498			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2499			break;
2500		}
2501	}
2502	return (pv);
2503}
2504
2505static void
2506pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2507{
2508	struct md_page *pvh;
2509	pv_entry_t pv;
2510	vm_offset_t va_last;
2511	vm_page_t m;
2512
2513	rw_assert(&pvh_global_lock, RA_WLOCKED);
2514	KASSERT((pa & PDRMASK) == 0,
2515	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2516
2517	/*
2518	 * Transfer the 4mpage's pv entry for this mapping to the first
2519	 * page's pv list.
2520	 */
2521	pvh = pa_to_pvh(pa);
2522	va = trunc_4mpage(va);
2523	pv = pmap_pvh_remove(pvh, pmap, va);
2524	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2525	m = PHYS_TO_VM_PAGE(pa);
2526	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2527	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2528	va_last = va + NBPDR - PAGE_SIZE;
2529	do {
2530		m++;
2531		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2532		    ("pmap_pv_demote_pde: page %p is not managed", m));
2533		va += PAGE_SIZE;
2534		pmap_insert_entry(pmap, va, m);
2535	} while (va < va_last);
2536}
2537
2538static void
2539pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2540{
2541	struct md_page *pvh;
2542	pv_entry_t pv;
2543	vm_offset_t va_last;
2544	vm_page_t m;
2545
2546	rw_assert(&pvh_global_lock, RA_WLOCKED);
2547	KASSERT((pa & PDRMASK) == 0,
2548	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2549
2550	/*
2551	 * Transfer the first page's pv entry for this mapping to the
2552	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2553	 * to get_pv_entry(), a transfer avoids the possibility that
2554	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2555	 * removes one of the mappings that is being promoted.
2556	 */
2557	m = PHYS_TO_VM_PAGE(pa);
2558	va = trunc_4mpage(va);
2559	pv = pmap_pvh_remove(&m->md, pmap, va);
2560	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2561	pvh = pa_to_pvh(pa);
2562	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2563	/* Free the remaining NPTEPG - 1 pv entries. */
2564	va_last = va + NBPDR - PAGE_SIZE;
2565	do {
2566		m++;
2567		va += PAGE_SIZE;
2568		pmap_pvh_free(&m->md, pmap, va);
2569	} while (va < va_last);
2570}
2571
2572static void
2573pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2574{
2575	pv_entry_t pv;
2576
2577	pv = pmap_pvh_remove(pvh, pmap, va);
2578	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2579	free_pv_entry(pmap, pv);
2580}
2581
2582static void
2583pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2584{
2585	struct md_page *pvh;
2586
2587	rw_assert(&pvh_global_lock, RA_WLOCKED);
2588	pmap_pvh_free(&m->md, pmap, va);
2589	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2590		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2591		if (TAILQ_EMPTY(&pvh->pv_list))
2592			vm_page_aflag_clear(m, PGA_WRITEABLE);
2593	}
2594}
2595
2596/*
2597 * Create a pv entry for page at pa for
2598 * (pmap, va).
2599 */
2600static void
2601pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2602{
2603	pv_entry_t pv;
2604
2605	rw_assert(&pvh_global_lock, RA_WLOCKED);
2606	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2607	pv = get_pv_entry(pmap, FALSE);
2608	pv->pv_va = va;
2609	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2610}
2611
2612/*
2613 * Conditionally create a pv entry.
2614 */
2615static boolean_t
2616pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2617{
2618	pv_entry_t pv;
2619
2620	rw_assert(&pvh_global_lock, RA_WLOCKED);
2621	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2622	if (pv_entry_count < pv_entry_high_water &&
2623	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2624		pv->pv_va = va;
2625		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2626		return (TRUE);
2627	} else
2628		return (FALSE);
2629}
2630
2631/*
2632 * Create the pv entries for each of the pages within a superpage.
2633 */
2634static boolean_t
2635pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2636{
2637	struct md_page *pvh;
2638	pv_entry_t pv;
2639
2640	rw_assert(&pvh_global_lock, RA_WLOCKED);
2641	if (pv_entry_count < pv_entry_high_water &&
2642	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2643		pv->pv_va = va;
2644		pvh = pa_to_pvh(pa);
2645		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2646		return (TRUE);
2647	} else
2648		return (FALSE);
2649}
2650
2651/*
2652 * Fills a page table page with mappings to consecutive physical pages.
2653 */
2654static void
2655pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2656{
2657	pt_entry_t *pte;
2658
2659	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2660		*pte = newpte;
2661		newpte += PAGE_SIZE;
2662	}
2663}
2664
2665/*
2666 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2667 * 2- or 4MB page mapping is invalidated.
2668 */
2669static boolean_t
2670pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2671{
2672	pd_entry_t newpde, oldpde;
2673	pt_entry_t *firstpte, newpte;
2674	vm_paddr_t mptepa;
2675	vm_page_t mpte;
2676	struct spglist free;
2677
2678	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2679	oldpde = *pde;
2680	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2681	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2682	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
2683	    NULL)
2684		pmap_remove_pt_page(pmap, mpte);
2685	else {
2686		KASSERT((oldpde & PG_W) == 0,
2687		    ("pmap_demote_pde: page table page for a wired mapping"
2688		    " is missing"));
2689
2690		/*
2691		 * Invalidate the 2- or 4MB page mapping and return
2692		 * "failure" if the mapping was never accessed or the
2693		 * allocation of the new page table page fails.
2694		 */
2695		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2696		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2697		    VM_ALLOC_WIRED)) == NULL) {
2698			SLIST_INIT(&free);
2699			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2700			pmap_invalidate_page(pmap, trunc_4mpage(va));
2701			pmap_free_zero_pages(&free);
2702			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2703			    " in pmap %p", va, pmap);
2704			return (FALSE);
2705		}
2706		if (va < VM_MAXUSER_ADDRESS)
2707			pmap->pm_stats.resident_count++;
2708	}
2709	mptepa = VM_PAGE_TO_PHYS(mpte);
2710
2711	/*
2712	 * If the page mapping is in the kernel's address space, then the
2713	 * KPTmap can provide access to the page table page.  Otherwise,
2714	 * temporarily map the page table page (mpte) into the kernel's
2715	 * address space at either PADDR1 or PADDR2.
2716	 */
2717	if (va >= KERNBASE)
2718		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2719	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2720		if ((*PMAP1 & PG_FRAME) != mptepa) {
2721			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2722#ifdef SMP
2723			PMAP1cpu = PCPU_GET(cpuid);
2724#endif
2725			invlcaddr(PADDR1);
2726			PMAP1changed++;
2727		} else
2728#ifdef SMP
2729		if (PMAP1cpu != PCPU_GET(cpuid)) {
2730			PMAP1cpu = PCPU_GET(cpuid);
2731			invlcaddr(PADDR1);
2732			PMAP1changedcpu++;
2733		} else
2734#endif
2735			PMAP1unchanged++;
2736		firstpte = PADDR1;
2737	} else {
2738		mtx_lock(&PMAP2mutex);
2739		if ((*PMAP2 & PG_FRAME) != mptepa) {
2740			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2741			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2742		}
2743		firstpte = PADDR2;
2744	}
2745	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2746	KASSERT((oldpde & PG_A) != 0,
2747	    ("pmap_demote_pde: oldpde is missing PG_A"));
2748	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2749	    ("pmap_demote_pde: oldpde is missing PG_M"));
2750	newpte = oldpde & ~PG_PS;
2751	if ((newpte & PG_PDE_PAT) != 0)
2752		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2753
2754	/*
2755	 * If the page table page is new, initialize it.
2756	 */
2757	if (mpte->wire_count == 1) {
2758		mpte->wire_count = NPTEPG;
2759		pmap_fill_ptp(firstpte, newpte);
2760	}
2761	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2762	    ("pmap_demote_pde: firstpte and newpte map different physical"
2763	    " addresses"));
2764
2765	/*
2766	 * If the mapping has changed attributes, update the page table
2767	 * entries.
2768	 */
2769	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2770		pmap_fill_ptp(firstpte, newpte);
2771
2772	/*
2773	 * Demote the mapping.  This pmap is locked.  The old PDE has
2774	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2775	 * set.  Thus, there is no danger of a race with another
2776	 * processor changing the setting of PG_A and/or PG_M between
2777	 * the read above and the store below.
2778	 */
2779	if (workaround_erratum383)
2780		pmap_update_pde(pmap, va, pde, newpde);
2781	else if (pmap == kernel_pmap)
2782		pmap_kenter_pde(va, newpde);
2783	else
2784		pde_store(pde, newpde);
2785	if (firstpte == PADDR2)
2786		mtx_unlock(&PMAP2mutex);
2787
2788	/*
2789	 * Invalidate the recursive mapping of the page table page.
2790	 */
2791	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2792
2793	/*
2794	 * Demote the pv entry.  This depends on the earlier demotion
2795	 * of the mapping.  Specifically, the (re)creation of a per-
2796	 * page pv entry might trigger the execution of pmap_collect(),
2797	 * which might reclaim a newly (re)created per-page pv entry
2798	 * and destroy the associated mapping.  In order to destroy
2799	 * the mapping, the PDE must have already changed from mapping
2800	 * the 2mpage to referencing the page table page.
2801	 */
2802	if ((oldpde & PG_MANAGED) != 0)
2803		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2804
2805	pmap_pde_demotions++;
2806	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2807	    " in pmap %p", va, pmap);
2808	return (TRUE);
2809}
2810
2811/*
2812 * Removes a 2- or 4MB page mapping from the kernel pmap.
2813 */
2814static void
2815pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2816{
2817	pd_entry_t newpde;
2818	vm_paddr_t mptepa;
2819	vm_page_t mpte;
2820
2821	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2822	mpte = pmap_lookup_pt_page(pmap, va);
2823	if (mpte == NULL)
2824		panic("pmap_remove_kernel_pde: Missing pt page.");
2825
2826	pmap_remove_pt_page(pmap, mpte);
2827	mptepa = VM_PAGE_TO_PHYS(mpte);
2828	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
2829
2830	/*
2831	 * Initialize the page table page.
2832	 */
2833	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
2834
2835	/*
2836	 * Remove the mapping.
2837	 */
2838	if (workaround_erratum383)
2839		pmap_update_pde(pmap, va, pde, newpde);
2840	else
2841		pmap_kenter_pde(va, newpde);
2842
2843	/*
2844	 * Invalidate the recursive mapping of the page table page.
2845	 */
2846	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2847}
2848
2849/*
2850 * pmap_remove_pde: do the things to unmap a superpage in a process
2851 */
2852static void
2853pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2854    struct spglist *free)
2855{
2856	struct md_page *pvh;
2857	pd_entry_t oldpde;
2858	vm_offset_t eva, va;
2859	vm_page_t m, mpte;
2860
2861	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2862	KASSERT((sva & PDRMASK) == 0,
2863	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2864	oldpde = pte_load_clear(pdq);
2865	if (oldpde & PG_W)
2866		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2867
2868	/*
2869	 * Machines that don't support invlpg, also don't support
2870	 * PG_G.
2871	 */
2872	if (oldpde & PG_G)
2873		pmap_invalidate_page(kernel_pmap, sva);
2874	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2875	if (oldpde & PG_MANAGED) {
2876		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2877		pmap_pvh_free(pvh, pmap, sva);
2878		eva = sva + NBPDR;
2879		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2880		    va < eva; va += PAGE_SIZE, m++) {
2881			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2882				vm_page_dirty(m);
2883			if (oldpde & PG_A)
2884				vm_page_aflag_set(m, PGA_REFERENCED);
2885			if (TAILQ_EMPTY(&m->md.pv_list) &&
2886			    TAILQ_EMPTY(&pvh->pv_list))
2887				vm_page_aflag_clear(m, PGA_WRITEABLE);
2888		}
2889	}
2890	if (pmap == kernel_pmap) {
2891		pmap_remove_kernel_pde(pmap, pdq, sva);
2892	} else {
2893		mpte = pmap_lookup_pt_page(pmap, sva);
2894		if (mpte != NULL) {
2895			pmap_remove_pt_page(pmap, mpte);
2896			pmap->pm_stats.resident_count--;
2897			KASSERT(mpte->wire_count == NPTEPG,
2898			    ("pmap_remove_pde: pte page wire count error"));
2899			mpte->wire_count = 0;
2900			pmap_add_delayed_free_list(mpte, free, FALSE);
2901			atomic_subtract_int(&cnt.v_wire_count, 1);
2902		}
2903	}
2904}
2905
2906/*
2907 * pmap_remove_pte: do the things to unmap a page in a process
2908 */
2909static int
2910pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2911    struct spglist *free)
2912{
2913	pt_entry_t oldpte;
2914	vm_page_t m;
2915
2916	rw_assert(&pvh_global_lock, RA_WLOCKED);
2917	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2918	oldpte = pte_load_clear(ptq);
2919	KASSERT(oldpte != 0,
2920	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2921	if (oldpte & PG_W)
2922		pmap->pm_stats.wired_count -= 1;
2923	/*
2924	 * Machines that don't support invlpg, also don't support
2925	 * PG_G.
2926	 */
2927	if (oldpte & PG_G)
2928		pmap_invalidate_page(kernel_pmap, va);
2929	pmap->pm_stats.resident_count -= 1;
2930	if (oldpte & PG_MANAGED) {
2931		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2932		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2933			vm_page_dirty(m);
2934		if (oldpte & PG_A)
2935			vm_page_aflag_set(m, PGA_REFERENCED);
2936		pmap_remove_entry(pmap, m, va);
2937	}
2938	return (pmap_unuse_pt(pmap, va, free));
2939}
2940
2941/*
2942 * Remove a single page from a process address space
2943 */
2944static void
2945pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
2946{
2947	pt_entry_t *pte;
2948
2949	rw_assert(&pvh_global_lock, RA_WLOCKED);
2950	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2951	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2952	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2953		return;
2954	pmap_remove_pte(pmap, pte, va, free);
2955	pmap_invalidate_page(pmap, va);
2956}
2957
2958/*
2959 *	Remove the given range of addresses from the specified map.
2960 *
2961 *	It is assumed that the start and end are properly
2962 *	rounded to the page size.
2963 */
2964void
2965pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2966{
2967	vm_offset_t pdnxt;
2968	pd_entry_t ptpaddr;
2969	pt_entry_t *pte;
2970	struct spglist free;
2971	int anyvalid;
2972
2973	/*
2974	 * Perform an unsynchronized read.  This is, however, safe.
2975	 */
2976	if (pmap->pm_stats.resident_count == 0)
2977		return;
2978
2979	anyvalid = 0;
2980	SLIST_INIT(&free);
2981
2982	rw_wlock(&pvh_global_lock);
2983	sched_pin();
2984	PMAP_LOCK(pmap);
2985
2986	/*
2987	 * special handling of removing one page.  a very
2988	 * common operation and easy to short circuit some
2989	 * code.
2990	 */
2991	if ((sva + PAGE_SIZE == eva) &&
2992	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2993		pmap_remove_page(pmap, sva, &free);
2994		goto out;
2995	}
2996
2997	for (; sva < eva; sva = pdnxt) {
2998		u_int pdirindex;
2999
3000		/*
3001		 * Calculate index for next page table.
3002		 */
3003		pdnxt = (sva + NBPDR) & ~PDRMASK;
3004		if (pdnxt < sva)
3005			pdnxt = eva;
3006		if (pmap->pm_stats.resident_count == 0)
3007			break;
3008
3009		pdirindex = sva >> PDRSHIFT;
3010		ptpaddr = pmap->pm_pdir[pdirindex];
3011
3012		/*
3013		 * Weed out invalid mappings. Note: we assume that the page
3014		 * directory table is always allocated, and in kernel virtual.
3015		 */
3016		if (ptpaddr == 0)
3017			continue;
3018
3019		/*
3020		 * Check for large page.
3021		 */
3022		if ((ptpaddr & PG_PS) != 0) {
3023			/*
3024			 * Are we removing the entire large page?  If not,
3025			 * demote the mapping and fall through.
3026			 */
3027			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3028				/*
3029				 * The TLB entry for a PG_G mapping is
3030				 * invalidated by pmap_remove_pde().
3031				 */
3032				if ((ptpaddr & PG_G) == 0)
3033					anyvalid = 1;
3034				pmap_remove_pde(pmap,
3035				    &pmap->pm_pdir[pdirindex], sva, &free);
3036				continue;
3037			} else if (!pmap_demote_pde(pmap,
3038			    &pmap->pm_pdir[pdirindex], sva)) {
3039				/* The large page mapping was destroyed. */
3040				continue;
3041			}
3042		}
3043
3044		/*
3045		 * Limit our scan to either the end of the va represented
3046		 * by the current page table page, or to the end of the
3047		 * range being removed.
3048		 */
3049		if (pdnxt > eva)
3050			pdnxt = eva;
3051
3052		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3053		    sva += PAGE_SIZE) {
3054			if (*pte == 0)
3055				continue;
3056
3057			/*
3058			 * The TLB entry for a PG_G mapping is invalidated
3059			 * by pmap_remove_pte().
3060			 */
3061			if ((*pte & PG_G) == 0)
3062				anyvalid = 1;
3063			if (pmap_remove_pte(pmap, pte, sva, &free))
3064				break;
3065		}
3066	}
3067out:
3068	sched_unpin();
3069	if (anyvalid)
3070		pmap_invalidate_all(pmap);
3071	rw_wunlock(&pvh_global_lock);
3072	PMAP_UNLOCK(pmap);
3073	pmap_free_zero_pages(&free);
3074}
3075
3076/*
3077 *	Routine:	pmap_remove_all
3078 *	Function:
3079 *		Removes this physical page from
3080 *		all physical maps in which it resides.
3081 *		Reflects back modify bits to the pager.
3082 *
3083 *	Notes:
3084 *		Original versions of this routine were very
3085 *		inefficient because they iteratively called
3086 *		pmap_remove (slow...)
3087 */
3088
3089void
3090pmap_remove_all(vm_page_t m)
3091{
3092	struct md_page *pvh;
3093	pv_entry_t pv;
3094	pmap_t pmap;
3095	pt_entry_t *pte, tpte;
3096	pd_entry_t *pde;
3097	vm_offset_t va;
3098	struct spglist free;
3099
3100	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3101	    ("pmap_remove_all: page %p is not managed", m));
3102	SLIST_INIT(&free);
3103	rw_wlock(&pvh_global_lock);
3104	sched_pin();
3105	if ((m->flags & PG_FICTITIOUS) != 0)
3106		goto small_mappings;
3107	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3108	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3109		va = pv->pv_va;
3110		pmap = PV_PMAP(pv);
3111		PMAP_LOCK(pmap);
3112		pde = pmap_pde(pmap, va);
3113		(void)pmap_demote_pde(pmap, pde, va);
3114		PMAP_UNLOCK(pmap);
3115	}
3116small_mappings:
3117	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3118		pmap = PV_PMAP(pv);
3119		PMAP_LOCK(pmap);
3120		pmap->pm_stats.resident_count--;
3121		pde = pmap_pde(pmap, pv->pv_va);
3122		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3123		    " a 4mpage in page %p's pv list", m));
3124		pte = pmap_pte_quick(pmap, pv->pv_va);
3125		tpte = pte_load_clear(pte);
3126		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3127		    pmap, pv->pv_va));
3128		if (tpte & PG_W)
3129			pmap->pm_stats.wired_count--;
3130		if (tpte & PG_A)
3131			vm_page_aflag_set(m, PGA_REFERENCED);
3132
3133		/*
3134		 * Update the vm_page_t clean and reference bits.
3135		 */
3136		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3137			vm_page_dirty(m);
3138		pmap_unuse_pt(pmap, pv->pv_va, &free);
3139		pmap_invalidate_page(pmap, pv->pv_va);
3140		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3141		free_pv_entry(pmap, pv);
3142		PMAP_UNLOCK(pmap);
3143	}
3144	vm_page_aflag_clear(m, PGA_WRITEABLE);
3145	sched_unpin();
3146	rw_wunlock(&pvh_global_lock);
3147	pmap_free_zero_pages(&free);
3148}
3149
3150/*
3151 * pmap_protect_pde: do the things to protect a 4mpage in a process
3152 */
3153static boolean_t
3154pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3155{
3156	pd_entry_t newpde, oldpde;
3157	vm_offset_t eva, va;
3158	vm_page_t m;
3159	boolean_t anychanged;
3160
3161	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3162	KASSERT((sva & PDRMASK) == 0,
3163	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3164	anychanged = FALSE;
3165retry:
3166	oldpde = newpde = *pde;
3167	if (oldpde & PG_MANAGED) {
3168		eva = sva + NBPDR;
3169		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3170		    va < eva; va += PAGE_SIZE, m++)
3171			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3172				vm_page_dirty(m);
3173	}
3174	if ((prot & VM_PROT_WRITE) == 0)
3175		newpde &= ~(PG_RW | PG_M);
3176#if defined(PAE) || defined(PAE_TABLES)
3177	if ((prot & VM_PROT_EXECUTE) == 0)
3178		newpde |= pg_nx;
3179#endif
3180	if (newpde != oldpde) {
3181		if (!pde_cmpset(pde, oldpde, newpde))
3182			goto retry;
3183		if (oldpde & PG_G)
3184			pmap_invalidate_page(pmap, sva);
3185		else
3186			anychanged = TRUE;
3187	}
3188	return (anychanged);
3189}
3190
3191/*
3192 *	Set the physical protection on the
3193 *	specified range of this map as requested.
3194 */
3195void
3196pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3197{
3198	vm_offset_t pdnxt;
3199	pd_entry_t ptpaddr;
3200	pt_entry_t *pte;
3201	boolean_t anychanged, pv_lists_locked;
3202
3203	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3204	if (prot == VM_PROT_NONE) {
3205		pmap_remove(pmap, sva, eva);
3206		return;
3207	}
3208
3209#if defined(PAE) || defined(PAE_TABLES)
3210	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3211	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3212		return;
3213#else
3214	if (prot & VM_PROT_WRITE)
3215		return;
3216#endif
3217
3218	if (pmap_is_current(pmap))
3219		pv_lists_locked = FALSE;
3220	else {
3221		pv_lists_locked = TRUE;
3222resume:
3223		rw_wlock(&pvh_global_lock);
3224		sched_pin();
3225	}
3226	anychanged = FALSE;
3227
3228	PMAP_LOCK(pmap);
3229	for (; sva < eva; sva = pdnxt) {
3230		pt_entry_t obits, pbits;
3231		u_int pdirindex;
3232
3233		pdnxt = (sva + NBPDR) & ~PDRMASK;
3234		if (pdnxt < sva)
3235			pdnxt = eva;
3236
3237		pdirindex = sva >> PDRSHIFT;
3238		ptpaddr = pmap->pm_pdir[pdirindex];
3239
3240		/*
3241		 * Weed out invalid mappings. Note: we assume that the page
3242		 * directory table is always allocated, and in kernel virtual.
3243		 */
3244		if (ptpaddr == 0)
3245			continue;
3246
3247		/*
3248		 * Check for large page.
3249		 */
3250		if ((ptpaddr & PG_PS) != 0) {
3251			/*
3252			 * Are we protecting the entire large page?  If not,
3253			 * demote the mapping and fall through.
3254			 */
3255			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3256				/*
3257				 * The TLB entry for a PG_G mapping is
3258				 * invalidated by pmap_protect_pde().
3259				 */
3260				if (pmap_protect_pde(pmap,
3261				    &pmap->pm_pdir[pdirindex], sva, prot))
3262					anychanged = TRUE;
3263				continue;
3264			} else {
3265				if (!pv_lists_locked) {
3266					pv_lists_locked = TRUE;
3267					if (!rw_try_wlock(&pvh_global_lock)) {
3268						if (anychanged)
3269							pmap_invalidate_all(
3270							    pmap);
3271						PMAP_UNLOCK(pmap);
3272						goto resume;
3273					}
3274					sched_pin();
3275				}
3276				if (!pmap_demote_pde(pmap,
3277				    &pmap->pm_pdir[pdirindex], sva)) {
3278					/*
3279					 * The large page mapping was
3280					 * destroyed.
3281					 */
3282					continue;
3283				}
3284			}
3285		}
3286
3287		if (pdnxt > eva)
3288			pdnxt = eva;
3289
3290		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3291		    sva += PAGE_SIZE) {
3292			vm_page_t m;
3293
3294retry:
3295			/*
3296			 * Regardless of whether a pte is 32 or 64 bits in
3297			 * size, PG_RW, PG_A, and PG_M are among the least
3298			 * significant 32 bits.
3299			 */
3300			obits = pbits = *pte;
3301			if ((pbits & PG_V) == 0)
3302				continue;
3303
3304			if ((prot & VM_PROT_WRITE) == 0) {
3305				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3306				    (PG_MANAGED | PG_M | PG_RW)) {
3307					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3308					vm_page_dirty(m);
3309				}
3310				pbits &= ~(PG_RW | PG_M);
3311			}
3312#if defined(PAE) || defined(PAE_TABLES)
3313			if ((prot & VM_PROT_EXECUTE) == 0)
3314				pbits |= pg_nx;
3315#endif
3316
3317			if (pbits != obits) {
3318#if defined(PAE) || defined(PAE_TABLES)
3319				if (!atomic_cmpset_64(pte, obits, pbits))
3320					goto retry;
3321#else
3322				if (!atomic_cmpset_int((u_int *)pte, obits,
3323				    pbits))
3324					goto retry;
3325#endif
3326				if (obits & PG_G)
3327					pmap_invalidate_page(pmap, sva);
3328				else
3329					anychanged = TRUE;
3330			}
3331		}
3332	}
3333	if (anychanged)
3334		pmap_invalidate_all(pmap);
3335	if (pv_lists_locked) {
3336		sched_unpin();
3337		rw_wunlock(&pvh_global_lock);
3338	}
3339	PMAP_UNLOCK(pmap);
3340}
3341
3342/*
3343 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are
3344 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3345 * For promotion to occur, two conditions must be met: (1) the 4KB page
3346 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3347 * mappings must have identical characteristics.
3348 *
3349 * Managed (PG_MANAGED) mappings within the kernel address space are not
3350 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3351 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3352 * pmap.
3353 */
3354static void
3355pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3356{
3357	pd_entry_t newpde;
3358	pt_entry_t *firstpte, oldpte, pa, *pte;
3359	vm_offset_t oldpteva;
3360	vm_page_t mpte;
3361
3362	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3363
3364	/*
3365	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3366	 * either invalid, unused, or does not map the first 4KB physical page
3367	 * within a 2- or 4MB page.
3368	 */
3369	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3370setpde:
3371	newpde = *firstpte;
3372	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3373		pmap_pde_p_failures++;
3374		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3375		    " in pmap %p", va, pmap);
3376		return;
3377	}
3378	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3379		pmap_pde_p_failures++;
3380		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3381		    " in pmap %p", va, pmap);
3382		return;
3383	}
3384	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3385		/*
3386		 * When PG_M is already clear, PG_RW can be cleared without
3387		 * a TLB invalidation.
3388		 */
3389		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3390		    ~PG_RW))
3391			goto setpde;
3392		newpde &= ~PG_RW;
3393	}
3394
3395	/*
3396	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3397	 * PTE maps an unexpected 4KB physical page or does not have identical
3398	 * characteristics to the first PTE.
3399	 */
3400	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3401	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3402setpte:
3403		oldpte = *pte;
3404		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3405			pmap_pde_p_failures++;
3406			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3407			    " in pmap %p", va, pmap);
3408			return;
3409		}
3410		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3411			/*
3412			 * When PG_M is already clear, PG_RW can be cleared
3413			 * without a TLB invalidation.
3414			 */
3415			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3416			    oldpte & ~PG_RW))
3417				goto setpte;
3418			oldpte &= ~PG_RW;
3419			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3420			    (va & ~PDRMASK);
3421			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3422			    " in pmap %p", oldpteva, pmap);
3423		}
3424		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3425			pmap_pde_p_failures++;
3426			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3427			    " in pmap %p", va, pmap);
3428			return;
3429		}
3430		pa -= PAGE_SIZE;
3431	}
3432
3433	/*
3434	 * Save the page table page in its current state until the PDE
3435	 * mapping the superpage is demoted by pmap_demote_pde() or
3436	 * destroyed by pmap_remove_pde().
3437	 */
3438	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3439	KASSERT(mpte >= vm_page_array &&
3440	    mpte < &vm_page_array[vm_page_array_size],
3441	    ("pmap_promote_pde: page table page is out of range"));
3442	KASSERT(mpte->pindex == va >> PDRSHIFT,
3443	    ("pmap_promote_pde: page table page's pindex is wrong"));
3444	if (pmap_insert_pt_page(pmap, mpte)) {
3445		pmap_pde_p_failures++;
3446		CTR2(KTR_PMAP,
3447		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
3448		    pmap);
3449		return;
3450	}
3451
3452	/*
3453	 * Promote the pv entries.
3454	 */
3455	if ((newpde & PG_MANAGED) != 0)
3456		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3457
3458	/*
3459	 * Propagate the PAT index to its proper position.
3460	 */
3461	if ((newpde & PG_PTE_PAT) != 0)
3462		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3463
3464	/*
3465	 * Map the superpage.
3466	 */
3467	if (workaround_erratum383)
3468		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3469	else if (pmap == kernel_pmap)
3470		pmap_kenter_pde(va, PG_PS | newpde);
3471	else
3472		pde_store(pde, PG_PS | newpde);
3473
3474	pmap_pde_promotions++;
3475	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3476	    " in pmap %p", va, pmap);
3477}
3478
3479/*
3480 *	Insert the given physical page (p) at
3481 *	the specified virtual address (v) in the
3482 *	target physical map with the protection requested.
3483 *
3484 *	If specified, the page will be wired down, meaning
3485 *	that the related pte can not be reclaimed.
3486 *
3487 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3488 *	or lose information.  That is, this routine must actually
3489 *	insert this page into the given map NOW.
3490 */
3491int
3492pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3493    u_int flags, int8_t psind)
3494{
3495	pd_entry_t *pde;
3496	pt_entry_t *pte;
3497	pt_entry_t newpte, origpte;
3498	pv_entry_t pv;
3499	vm_paddr_t opa, pa;
3500	vm_page_t mpte, om;
3501	boolean_t invlva, wired;
3502
3503	va = trunc_page(va);
3504	mpte = NULL;
3505	wired = (flags & PMAP_ENTER_WIRED) != 0;
3506
3507	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3508	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3509	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3510	    va));
3511	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
3512		VM_OBJECT_ASSERT_LOCKED(m->object);
3513
3514	rw_wlock(&pvh_global_lock);
3515	PMAP_LOCK(pmap);
3516	sched_pin();
3517
3518	/*
3519	 * In the case that a page table page is not
3520	 * resident, we are creating it here.
3521	 */
3522	if (va < VM_MAXUSER_ADDRESS) {
3523		mpte = pmap_allocpte(pmap, va, flags);
3524		if (mpte == NULL) {
3525			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
3526			    ("pmap_allocpte failed with sleep allowed"));
3527			sched_unpin();
3528			rw_wunlock(&pvh_global_lock);
3529			PMAP_UNLOCK(pmap);
3530			return (KERN_RESOURCE_SHORTAGE);
3531		}
3532	}
3533
3534	pde = pmap_pde(pmap, va);
3535	if ((*pde & PG_PS) != 0)
3536		panic("pmap_enter: attempted pmap_enter on 4MB page");
3537	pte = pmap_pte_quick(pmap, va);
3538
3539	/*
3540	 * Page Directory table entry not valid, we need a new PT page
3541	 */
3542	if (pte == NULL) {
3543		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3544			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3545	}
3546
3547	pa = VM_PAGE_TO_PHYS(m);
3548	om = NULL;
3549	origpte = *pte;
3550	opa = origpte & PG_FRAME;
3551
3552	/*
3553	 * Mapping has not changed, must be protection or wiring change.
3554	 */
3555	if (origpte && (opa == pa)) {
3556		/*
3557		 * Wiring change, just update stats. We don't worry about
3558		 * wiring PT pages as they remain resident as long as there
3559		 * are valid mappings in them. Hence, if a user page is wired,
3560		 * the PT page will be also.
3561		 */
3562		if (wired && ((origpte & PG_W) == 0))
3563			pmap->pm_stats.wired_count++;
3564		else if (!wired && (origpte & PG_W))
3565			pmap->pm_stats.wired_count--;
3566
3567		/*
3568		 * Remove extra pte reference
3569		 */
3570		if (mpte)
3571			mpte->wire_count--;
3572
3573		if (origpte & PG_MANAGED) {
3574			om = m;
3575			pa |= PG_MANAGED;
3576		}
3577		goto validate;
3578	}
3579
3580	pv = NULL;
3581
3582	/*
3583	 * Mapping has changed, invalidate old range and fall through to
3584	 * handle validating new mapping.
3585	 */
3586	if (opa) {
3587		if (origpte & PG_W)
3588			pmap->pm_stats.wired_count--;
3589		if (origpte & PG_MANAGED) {
3590			om = PHYS_TO_VM_PAGE(opa);
3591			pv = pmap_pvh_remove(&om->md, pmap, va);
3592		}
3593		if (mpte != NULL) {
3594			mpte->wire_count--;
3595			KASSERT(mpte->wire_count > 0,
3596			    ("pmap_enter: missing reference to page table page,"
3597			     " va: 0x%x", va));
3598		}
3599	} else
3600		pmap->pm_stats.resident_count++;
3601
3602	/*
3603	 * Enter on the PV list if part of our managed memory.
3604	 */
3605	if ((m->oflags & VPO_UNMANAGED) == 0) {
3606		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3607		    ("pmap_enter: managed mapping within the clean submap"));
3608		if (pv == NULL)
3609			pv = get_pv_entry(pmap, FALSE);
3610		pv->pv_va = va;
3611		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3612		pa |= PG_MANAGED;
3613	} else if (pv != NULL)
3614		free_pv_entry(pmap, pv);
3615
3616	/*
3617	 * Increment counters
3618	 */
3619	if (wired)
3620		pmap->pm_stats.wired_count++;
3621
3622validate:
3623	/*
3624	 * Now validate mapping with desired protection/wiring.
3625	 */
3626	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3627	if ((prot & VM_PROT_WRITE) != 0) {
3628		newpte |= PG_RW;
3629		if ((newpte & PG_MANAGED) != 0)
3630			vm_page_aflag_set(m, PGA_WRITEABLE);
3631	}
3632#if defined(PAE) || defined(PAE_TABLES)
3633	if ((prot & VM_PROT_EXECUTE) == 0)
3634		newpte |= pg_nx;
3635#endif
3636	if (wired)
3637		newpte |= PG_W;
3638	if (va < VM_MAXUSER_ADDRESS)
3639		newpte |= PG_U;
3640	if (pmap == kernel_pmap)
3641		newpte |= pgeflag;
3642
3643	/*
3644	 * if the mapping or permission bits are different, we need
3645	 * to update the pte.
3646	 */
3647	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3648		newpte |= PG_A;
3649		if ((flags & VM_PROT_WRITE) != 0)
3650			newpte |= PG_M;
3651		if (origpte & PG_V) {
3652			invlva = FALSE;
3653			origpte = pte_load_store(pte, newpte);
3654			if (origpte & PG_A) {
3655				if (origpte & PG_MANAGED)
3656					vm_page_aflag_set(om, PGA_REFERENCED);
3657				if (opa != VM_PAGE_TO_PHYS(m))
3658					invlva = TRUE;
3659#if defined(PAE) || defined(PAE_TABLES)
3660				if ((origpte & PG_NX) == 0 &&
3661				    (newpte & PG_NX) != 0)
3662					invlva = TRUE;
3663#endif
3664			}
3665			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3666				if ((origpte & PG_MANAGED) != 0)
3667					vm_page_dirty(om);
3668				if ((prot & VM_PROT_WRITE) == 0)
3669					invlva = TRUE;
3670			}
3671			if ((origpte & PG_MANAGED) != 0 &&
3672			    TAILQ_EMPTY(&om->md.pv_list) &&
3673			    ((om->flags & PG_FICTITIOUS) != 0 ||
3674			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3675				vm_page_aflag_clear(om, PGA_WRITEABLE);
3676			if (invlva)
3677				pmap_invalidate_page(pmap, va);
3678		} else
3679			pte_store(pte, newpte);
3680	}
3681
3682	/*
3683	 * If both the page table page and the reservation are fully
3684	 * populated, then attempt promotion.
3685	 */
3686	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3687	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3688	    vm_reserv_level_iffullpop(m) == 0)
3689		pmap_promote_pde(pmap, pde, va);
3690
3691	sched_unpin();
3692	rw_wunlock(&pvh_global_lock);
3693	PMAP_UNLOCK(pmap);
3694	return (KERN_SUCCESS);
3695}
3696
3697/*
3698 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3699 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3700 * blocking, (2) a mapping already exists at the specified virtual address, or
3701 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3702 */
3703static boolean_t
3704pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3705{
3706	pd_entry_t *pde, newpde;
3707
3708	rw_assert(&pvh_global_lock, RA_WLOCKED);
3709	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3710	pde = pmap_pde(pmap, va);
3711	if (*pde != 0) {
3712		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3713		    " in pmap %p", va, pmap);
3714		return (FALSE);
3715	}
3716	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3717	    PG_PS | PG_V;
3718	if ((m->oflags & VPO_UNMANAGED) == 0) {
3719		newpde |= PG_MANAGED;
3720
3721		/*
3722		 * Abort this mapping if its PV entry could not be created.
3723		 */
3724		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3725			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3726			    " in pmap %p", va, pmap);
3727			return (FALSE);
3728		}
3729	}
3730#if defined(PAE) || defined(PAE_TABLES)
3731	if ((prot & VM_PROT_EXECUTE) == 0)
3732		newpde |= pg_nx;
3733#endif
3734	if (va < VM_MAXUSER_ADDRESS)
3735		newpde |= PG_U;
3736
3737	/*
3738	 * Increment counters.
3739	 */
3740	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3741
3742	/*
3743	 * Map the superpage.
3744	 */
3745	pde_store(pde, newpde);
3746
3747	pmap_pde_mappings++;
3748	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3749	    " in pmap %p", va, pmap);
3750	return (TRUE);
3751}
3752
3753/*
3754 * Maps a sequence of resident pages belonging to the same object.
3755 * The sequence begins with the given page m_start.  This page is
3756 * mapped at the given virtual address start.  Each subsequent page is
3757 * mapped at a virtual address that is offset from start by the same
3758 * amount as the page is offset from m_start within the object.  The
3759 * last page in the sequence is the page with the largest offset from
3760 * m_start that can be mapped at a virtual address less than the given
3761 * virtual address end.  Not every virtual page between start and end
3762 * is mapped; only those for which a resident page exists with the
3763 * corresponding offset from m_start are mapped.
3764 */
3765void
3766pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3767    vm_page_t m_start, vm_prot_t prot)
3768{
3769	vm_offset_t va;
3770	vm_page_t m, mpte;
3771	vm_pindex_t diff, psize;
3772
3773	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3774
3775	psize = atop(end - start);
3776	mpte = NULL;
3777	m = m_start;
3778	rw_wlock(&pvh_global_lock);
3779	PMAP_LOCK(pmap);
3780	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3781		va = start + ptoa(diff);
3782		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3783		    m->psind == 1 && pg_ps_enabled &&
3784		    pmap_enter_pde(pmap, va, m, prot))
3785			m = &m[NBPDR / PAGE_SIZE - 1];
3786		else
3787			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3788			    mpte);
3789		m = TAILQ_NEXT(m, listq);
3790	}
3791	rw_wunlock(&pvh_global_lock);
3792	PMAP_UNLOCK(pmap);
3793}
3794
3795/*
3796 * this code makes some *MAJOR* assumptions:
3797 * 1. Current pmap & pmap exists.
3798 * 2. Not wired.
3799 * 3. Read access.
3800 * 4. No page table pages.
3801 * but is *MUCH* faster than pmap_enter...
3802 */
3803
3804void
3805pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3806{
3807
3808	rw_wlock(&pvh_global_lock);
3809	PMAP_LOCK(pmap);
3810	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3811	rw_wunlock(&pvh_global_lock);
3812	PMAP_UNLOCK(pmap);
3813}
3814
3815static vm_page_t
3816pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3817    vm_prot_t prot, vm_page_t mpte)
3818{
3819	pt_entry_t *pte;
3820	vm_paddr_t pa;
3821	struct spglist free;
3822
3823	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3824	    (m->oflags & VPO_UNMANAGED) != 0,
3825	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3826	rw_assert(&pvh_global_lock, RA_WLOCKED);
3827	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3828
3829	/*
3830	 * In the case that a page table page is not
3831	 * resident, we are creating it here.
3832	 */
3833	if (va < VM_MAXUSER_ADDRESS) {
3834		u_int ptepindex;
3835		pd_entry_t ptepa;
3836
3837		/*
3838		 * Calculate pagetable page index
3839		 */
3840		ptepindex = va >> PDRSHIFT;
3841		if (mpte && (mpte->pindex == ptepindex)) {
3842			mpte->wire_count++;
3843		} else {
3844			/*
3845			 * Get the page directory entry
3846			 */
3847			ptepa = pmap->pm_pdir[ptepindex];
3848
3849			/*
3850			 * If the page table page is mapped, we just increment
3851			 * the hold count, and activate it.
3852			 */
3853			if (ptepa) {
3854				if (ptepa & PG_PS)
3855					return (NULL);
3856				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3857				mpte->wire_count++;
3858			} else {
3859				mpte = _pmap_allocpte(pmap, ptepindex,
3860				    PMAP_ENTER_NOSLEEP);
3861				if (mpte == NULL)
3862					return (mpte);
3863			}
3864		}
3865	} else {
3866		mpte = NULL;
3867	}
3868
3869	/*
3870	 * This call to vtopte makes the assumption that we are
3871	 * entering the page into the current pmap.  In order to support
3872	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3873	 * But that isn't as quick as vtopte.
3874	 */
3875	pte = vtopte(va);
3876	if (*pte) {
3877		if (mpte != NULL) {
3878			mpte->wire_count--;
3879			mpte = NULL;
3880		}
3881		return (mpte);
3882	}
3883
3884	/*
3885	 * Enter on the PV list if part of our managed memory.
3886	 */
3887	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3888	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3889		if (mpte != NULL) {
3890			SLIST_INIT(&free);
3891			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3892				pmap_invalidate_page(pmap, va);
3893				pmap_free_zero_pages(&free);
3894			}
3895
3896			mpte = NULL;
3897		}
3898		return (mpte);
3899	}
3900
3901	/*
3902	 * Increment counters
3903	 */
3904	pmap->pm_stats.resident_count++;
3905
3906	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3907#if defined(PAE) || defined(PAE_TABLES)
3908	if ((prot & VM_PROT_EXECUTE) == 0)
3909		pa |= pg_nx;
3910#endif
3911
3912	/*
3913	 * Now validate mapping with RO protection
3914	 */
3915	if ((m->oflags & VPO_UNMANAGED) != 0)
3916		pte_store(pte, pa | PG_V | PG_U);
3917	else
3918		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3919	return (mpte);
3920}
3921
3922/*
3923 * Make a temporary mapping for a physical address.  This is only intended
3924 * to be used for panic dumps.
3925 */
3926void *
3927pmap_kenter_temporary(vm_paddr_t pa, int i)
3928{
3929	vm_offset_t va;
3930
3931	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3932	pmap_kenter(va, pa);
3933	invlpg(va);
3934	return ((void *)crashdumpmap);
3935}
3936
3937/*
3938 * This code maps large physical mmap regions into the
3939 * processor address space.  Note that some shortcuts
3940 * are taken, but the code works.
3941 */
3942void
3943pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3944    vm_pindex_t pindex, vm_size_t size)
3945{
3946	pd_entry_t *pde;
3947	vm_paddr_t pa, ptepa;
3948	vm_page_t p;
3949	int pat_mode;
3950
3951	VM_OBJECT_ASSERT_WLOCKED(object);
3952	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3953	    ("pmap_object_init_pt: non-device object"));
3954	if (pseflag &&
3955	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3956		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3957			return;
3958		p = vm_page_lookup(object, pindex);
3959		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3960		    ("pmap_object_init_pt: invalid page %p", p));
3961		pat_mode = p->md.pat_mode;
3962
3963		/*
3964		 * Abort the mapping if the first page is not physically
3965		 * aligned to a 2/4MB page boundary.
3966		 */
3967		ptepa = VM_PAGE_TO_PHYS(p);
3968		if (ptepa & (NBPDR - 1))
3969			return;
3970
3971		/*
3972		 * Skip the first page.  Abort the mapping if the rest of
3973		 * the pages are not physically contiguous or have differing
3974		 * memory attributes.
3975		 */
3976		p = TAILQ_NEXT(p, listq);
3977		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3978		    pa += PAGE_SIZE) {
3979			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3980			    ("pmap_object_init_pt: invalid page %p", p));
3981			if (pa != VM_PAGE_TO_PHYS(p) ||
3982			    pat_mode != p->md.pat_mode)
3983				return;
3984			p = TAILQ_NEXT(p, listq);
3985		}
3986
3987		/*
3988		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3989		 * "size" is a multiple of 2/4M, adding the PAT setting to
3990		 * "pa" will not affect the termination of this loop.
3991		 */
3992		PMAP_LOCK(pmap);
3993		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3994		    size; pa += NBPDR) {
3995			pde = pmap_pde(pmap, addr);
3996			if (*pde == 0) {
3997				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3998				    PG_U | PG_RW | PG_V);
3999				pmap->pm_stats.resident_count += NBPDR /
4000				    PAGE_SIZE;
4001				pmap_pde_mappings++;
4002			}
4003			/* Else continue on if the PDE is already valid. */
4004			addr += NBPDR;
4005		}
4006		PMAP_UNLOCK(pmap);
4007	}
4008}
4009
4010/*
4011 *	Clear the wired attribute from the mappings for the specified range of
4012 *	addresses in the given pmap.  Every valid mapping within that range
4013 *	must have the wired attribute set.  In contrast, invalid mappings
4014 *	cannot have the wired attribute set, so they are ignored.
4015 *
4016 *	The wired attribute of the page table entry is not a hardware feature,
4017 *	so there is no need to invalidate any TLB entries.
4018 */
4019void
4020pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4021{
4022	vm_offset_t pdnxt;
4023	pd_entry_t *pde;
4024	pt_entry_t *pte;
4025	boolean_t pv_lists_locked;
4026
4027	if (pmap_is_current(pmap))
4028		pv_lists_locked = FALSE;
4029	else {
4030		pv_lists_locked = TRUE;
4031resume:
4032		rw_wlock(&pvh_global_lock);
4033		sched_pin();
4034	}
4035	PMAP_LOCK(pmap);
4036	for (; sva < eva; sva = pdnxt) {
4037		pdnxt = (sva + NBPDR) & ~PDRMASK;
4038		if (pdnxt < sva)
4039			pdnxt = eva;
4040		pde = pmap_pde(pmap, sva);
4041		if ((*pde & PG_V) == 0)
4042			continue;
4043		if ((*pde & PG_PS) != 0) {
4044			if ((*pde & PG_W) == 0)
4045				panic("pmap_unwire: pde %#jx is missing PG_W",
4046				    (uintmax_t)*pde);
4047
4048			/*
4049			 * Are we unwiring the entire large page?  If not,
4050			 * demote the mapping and fall through.
4051			 */
4052			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
4053				/*
4054				 * Regardless of whether a pde (or pte) is 32
4055				 * or 64 bits in size, PG_W is among the least
4056				 * significant 32 bits.
4057				 */
4058				atomic_clear_int((u_int *)pde, PG_W);
4059				pmap->pm_stats.wired_count -= NBPDR /
4060				    PAGE_SIZE;
4061				continue;
4062			} else {
4063				if (!pv_lists_locked) {
4064					pv_lists_locked = TRUE;
4065					if (!rw_try_wlock(&pvh_global_lock)) {
4066						PMAP_UNLOCK(pmap);
4067						/* Repeat sva. */
4068						goto resume;
4069					}
4070					sched_pin();
4071				}
4072				if (!pmap_demote_pde(pmap, pde, sva))
4073					panic("pmap_unwire: demotion failed");
4074			}
4075		}
4076		if (pdnxt > eva)
4077			pdnxt = eva;
4078		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
4079		    sva += PAGE_SIZE) {
4080			if ((*pte & PG_V) == 0)
4081				continue;
4082			if ((*pte & PG_W) == 0)
4083				panic("pmap_unwire: pte %#jx is missing PG_W",
4084				    (uintmax_t)*pte);
4085
4086			/*
4087			 * PG_W must be cleared atomically.  Although the pmap
4088			 * lock synchronizes access to PG_W, another processor
4089			 * could be setting PG_M and/or PG_A concurrently.
4090			 *
4091			 * PG_W is among the least significant 32 bits.
4092			 */
4093			atomic_clear_int((u_int *)pte, PG_W);
4094			pmap->pm_stats.wired_count--;
4095		}
4096	}
4097	if (pv_lists_locked) {
4098		sched_unpin();
4099		rw_wunlock(&pvh_global_lock);
4100	}
4101	PMAP_UNLOCK(pmap);
4102}
4103
4104
4105/*
4106 *	Copy the range specified by src_addr/len
4107 *	from the source map to the range dst_addr/len
4108 *	in the destination map.
4109 *
4110 *	This routine is only advisory and need not do anything.
4111 */
4112
4113void
4114pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4115    vm_offset_t src_addr)
4116{
4117	struct spglist free;
4118	vm_offset_t addr;
4119	vm_offset_t end_addr = src_addr + len;
4120	vm_offset_t pdnxt;
4121
4122	if (dst_addr != src_addr)
4123		return;
4124
4125	if (!pmap_is_current(src_pmap))
4126		return;
4127
4128	rw_wlock(&pvh_global_lock);
4129	if (dst_pmap < src_pmap) {
4130		PMAP_LOCK(dst_pmap);
4131		PMAP_LOCK(src_pmap);
4132	} else {
4133		PMAP_LOCK(src_pmap);
4134		PMAP_LOCK(dst_pmap);
4135	}
4136	sched_pin();
4137	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4138		pt_entry_t *src_pte, *dst_pte;
4139		vm_page_t dstmpte, srcmpte;
4140		pd_entry_t srcptepaddr;
4141		u_int ptepindex;
4142
4143		KASSERT(addr < UPT_MIN_ADDRESS,
4144		    ("pmap_copy: invalid to pmap_copy page tables"));
4145
4146		pdnxt = (addr + NBPDR) & ~PDRMASK;
4147		if (pdnxt < addr)
4148			pdnxt = end_addr;
4149		ptepindex = addr >> PDRSHIFT;
4150
4151		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4152		if (srcptepaddr == 0)
4153			continue;
4154
4155		if (srcptepaddr & PG_PS) {
4156			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4157				continue;
4158			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4159			    ((srcptepaddr & PG_MANAGED) == 0 ||
4160			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4161			    PG_PS_FRAME))) {
4162				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4163				    ~PG_W;
4164				dst_pmap->pm_stats.resident_count +=
4165				    NBPDR / PAGE_SIZE;
4166			}
4167			continue;
4168		}
4169
4170		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4171		KASSERT(srcmpte->wire_count > 0,
4172		    ("pmap_copy: source page table page is unused"));
4173
4174		if (pdnxt > end_addr)
4175			pdnxt = end_addr;
4176
4177		src_pte = vtopte(addr);
4178		while (addr < pdnxt) {
4179			pt_entry_t ptetemp;
4180			ptetemp = *src_pte;
4181			/*
4182			 * we only virtual copy managed pages
4183			 */
4184			if ((ptetemp & PG_MANAGED) != 0) {
4185				dstmpte = pmap_allocpte(dst_pmap, addr,
4186				    PMAP_ENTER_NOSLEEP);
4187				if (dstmpte == NULL)
4188					goto out;
4189				dst_pte = pmap_pte_quick(dst_pmap, addr);
4190				if (*dst_pte == 0 &&
4191				    pmap_try_insert_pv_entry(dst_pmap, addr,
4192				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4193					/*
4194					 * Clear the wired, modified, and
4195					 * accessed (referenced) bits
4196					 * during the copy.
4197					 */
4198					*dst_pte = ptetemp & ~(PG_W | PG_M |
4199					    PG_A);
4200					dst_pmap->pm_stats.resident_count++;
4201	 			} else {
4202					SLIST_INIT(&free);
4203					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4204					    &free)) {
4205						pmap_invalidate_page(dst_pmap,
4206						    addr);
4207						pmap_free_zero_pages(&free);
4208					}
4209					goto out;
4210				}
4211				if (dstmpte->wire_count >= srcmpte->wire_count)
4212					break;
4213			}
4214			addr += PAGE_SIZE;
4215			src_pte++;
4216		}
4217	}
4218out:
4219	sched_unpin();
4220	rw_wunlock(&pvh_global_lock);
4221	PMAP_UNLOCK(src_pmap);
4222	PMAP_UNLOCK(dst_pmap);
4223}
4224
4225static __inline void
4226pagezero(void *page)
4227{
4228#if defined(I686_CPU)
4229	if (cpu_class == CPUCLASS_686) {
4230#if defined(CPU_ENABLE_SSE)
4231		if (cpu_feature & CPUID_SSE2)
4232			sse2_pagezero(page);
4233		else
4234#endif
4235			i686_pagezero(page);
4236	} else
4237#endif
4238		bzero(page, PAGE_SIZE);
4239}
4240
4241/*
4242 *	pmap_zero_page zeros the specified hardware page by mapping
4243 *	the page into KVM and using bzero to clear its contents.
4244 */
4245void
4246pmap_zero_page(vm_page_t m)
4247{
4248	struct sysmaps *sysmaps;
4249
4250	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4251	mtx_lock(&sysmaps->lock);
4252	if (*sysmaps->CMAP2)
4253		panic("pmap_zero_page: CMAP2 busy");
4254	sched_pin();
4255	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4256	    pmap_cache_bits(m->md.pat_mode, 0);
4257	invlcaddr(sysmaps->CADDR2);
4258	pagezero(sysmaps->CADDR2);
4259	*sysmaps->CMAP2 = 0;
4260	sched_unpin();
4261	mtx_unlock(&sysmaps->lock);
4262}
4263
4264/*
4265 *	pmap_zero_page_area zeros the specified hardware page by mapping
4266 *	the page into KVM and using bzero to clear its contents.
4267 *
4268 *	off and size may not cover an area beyond a single hardware page.
4269 */
4270void
4271pmap_zero_page_area(vm_page_t m, int off, int size)
4272{
4273	struct sysmaps *sysmaps;
4274
4275	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4276	mtx_lock(&sysmaps->lock);
4277	if (*sysmaps->CMAP2)
4278		panic("pmap_zero_page_area: CMAP2 busy");
4279	sched_pin();
4280	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4281	    pmap_cache_bits(m->md.pat_mode, 0);
4282	invlcaddr(sysmaps->CADDR2);
4283	if (off == 0 && size == PAGE_SIZE)
4284		pagezero(sysmaps->CADDR2);
4285	else
4286		bzero((char *)sysmaps->CADDR2 + off, size);
4287	*sysmaps->CMAP2 = 0;
4288	sched_unpin();
4289	mtx_unlock(&sysmaps->lock);
4290}
4291
4292/*
4293 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4294 *	the page into KVM and using bzero to clear its contents.  This
4295 *	is intended to be called from the vm_pagezero process only and
4296 *	outside of Giant.
4297 */
4298void
4299pmap_zero_page_idle(vm_page_t m)
4300{
4301
4302	if (*CMAP3)
4303		panic("pmap_zero_page_idle: CMAP3 busy");
4304	sched_pin();
4305	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4306	    pmap_cache_bits(m->md.pat_mode, 0);
4307	invlcaddr(CADDR3);
4308	pagezero(CADDR3);
4309	*CMAP3 = 0;
4310	sched_unpin();
4311}
4312
4313/*
4314 *	pmap_copy_page copies the specified (machine independent)
4315 *	page by mapping the page into virtual memory and using
4316 *	bcopy to copy the page, one machine dependent page at a
4317 *	time.
4318 */
4319void
4320pmap_copy_page(vm_page_t src, vm_page_t dst)
4321{
4322	struct sysmaps *sysmaps;
4323
4324	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4325	mtx_lock(&sysmaps->lock);
4326	if (*sysmaps->CMAP1)
4327		panic("pmap_copy_page: CMAP1 busy");
4328	if (*sysmaps->CMAP2)
4329		panic("pmap_copy_page: CMAP2 busy");
4330	sched_pin();
4331	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4332	    pmap_cache_bits(src->md.pat_mode, 0);
4333	invlcaddr(sysmaps->CADDR1);
4334	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4335	    pmap_cache_bits(dst->md.pat_mode, 0);
4336	invlcaddr(sysmaps->CADDR2);
4337	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4338	*sysmaps->CMAP1 = 0;
4339	*sysmaps->CMAP2 = 0;
4340	sched_unpin();
4341	mtx_unlock(&sysmaps->lock);
4342}
4343
4344int unmapped_buf_allowed = 1;
4345
4346void
4347pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4348    vm_offset_t b_offset, int xfersize)
4349{
4350	struct sysmaps *sysmaps;
4351	vm_page_t a_pg, b_pg;
4352	char *a_cp, *b_cp;
4353	vm_offset_t a_pg_offset, b_pg_offset;
4354	int cnt;
4355
4356	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4357	mtx_lock(&sysmaps->lock);
4358	if (*sysmaps->CMAP1 != 0)
4359		panic("pmap_copy_pages: CMAP1 busy");
4360	if (*sysmaps->CMAP2 != 0)
4361		panic("pmap_copy_pages: CMAP2 busy");
4362	sched_pin();
4363	while (xfersize > 0) {
4364		a_pg = ma[a_offset >> PAGE_SHIFT];
4365		a_pg_offset = a_offset & PAGE_MASK;
4366		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4367		b_pg = mb[b_offset >> PAGE_SHIFT];
4368		b_pg_offset = b_offset & PAGE_MASK;
4369		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4370		*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4371		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4372		invlcaddr(sysmaps->CADDR1);
4373		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4374		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4375		invlcaddr(sysmaps->CADDR2);
4376		a_cp = sysmaps->CADDR1 + a_pg_offset;
4377		b_cp = sysmaps->CADDR2 + b_pg_offset;
4378		bcopy(a_cp, b_cp, cnt);
4379		a_offset += cnt;
4380		b_offset += cnt;
4381		xfersize -= cnt;
4382	}
4383	*sysmaps->CMAP1 = 0;
4384	*sysmaps->CMAP2 = 0;
4385	sched_unpin();
4386	mtx_unlock(&sysmaps->lock);
4387}
4388
4389/*
4390 * Returns true if the pmap's pv is one of the first
4391 * 16 pvs linked to from this page.  This count may
4392 * be changed upwards or downwards in the future; it
4393 * is only necessary that true be returned for a small
4394 * subset of pmaps for proper page aging.
4395 */
4396boolean_t
4397pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4398{
4399	struct md_page *pvh;
4400	pv_entry_t pv;
4401	int loops = 0;
4402	boolean_t rv;
4403
4404	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4405	    ("pmap_page_exists_quick: page %p is not managed", m));
4406	rv = FALSE;
4407	rw_wlock(&pvh_global_lock);
4408	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4409		if (PV_PMAP(pv) == pmap) {
4410			rv = TRUE;
4411			break;
4412		}
4413		loops++;
4414		if (loops >= 16)
4415			break;
4416	}
4417	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4418		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4419		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4420			if (PV_PMAP(pv) == pmap) {
4421				rv = TRUE;
4422				break;
4423			}
4424			loops++;
4425			if (loops >= 16)
4426				break;
4427		}
4428	}
4429	rw_wunlock(&pvh_global_lock);
4430	return (rv);
4431}
4432
4433/*
4434 *	pmap_page_wired_mappings:
4435 *
4436 *	Return the number of managed mappings to the given physical page
4437 *	that are wired.
4438 */
4439int
4440pmap_page_wired_mappings(vm_page_t m)
4441{
4442	int count;
4443
4444	count = 0;
4445	if ((m->oflags & VPO_UNMANAGED) != 0)
4446		return (count);
4447	rw_wlock(&pvh_global_lock);
4448	count = pmap_pvh_wired_mappings(&m->md, count);
4449	if ((m->flags & PG_FICTITIOUS) == 0) {
4450	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4451	        count);
4452	}
4453	rw_wunlock(&pvh_global_lock);
4454	return (count);
4455}
4456
4457/*
4458 *	pmap_pvh_wired_mappings:
4459 *
4460 *	Return the updated number "count" of managed mappings that are wired.
4461 */
4462static int
4463pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4464{
4465	pmap_t pmap;
4466	pt_entry_t *pte;
4467	pv_entry_t pv;
4468
4469	rw_assert(&pvh_global_lock, RA_WLOCKED);
4470	sched_pin();
4471	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4472		pmap = PV_PMAP(pv);
4473		PMAP_LOCK(pmap);
4474		pte = pmap_pte_quick(pmap, pv->pv_va);
4475		if ((*pte & PG_W) != 0)
4476			count++;
4477		PMAP_UNLOCK(pmap);
4478	}
4479	sched_unpin();
4480	return (count);
4481}
4482
4483/*
4484 * Returns TRUE if the given page is mapped individually or as part of
4485 * a 4mpage.  Otherwise, returns FALSE.
4486 */
4487boolean_t
4488pmap_page_is_mapped(vm_page_t m)
4489{
4490	boolean_t rv;
4491
4492	if ((m->oflags & VPO_UNMANAGED) != 0)
4493		return (FALSE);
4494	rw_wlock(&pvh_global_lock);
4495	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4496	    ((m->flags & PG_FICTITIOUS) == 0 &&
4497	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4498	rw_wunlock(&pvh_global_lock);
4499	return (rv);
4500}
4501
4502/*
4503 * Remove all pages from specified address space
4504 * this aids process exit speeds.  Also, this code
4505 * is special cased for current process only, but
4506 * can have the more generic (and slightly slower)
4507 * mode enabled.  This is much faster than pmap_remove
4508 * in the case of running down an entire address space.
4509 */
4510void
4511pmap_remove_pages(pmap_t pmap)
4512{
4513	pt_entry_t *pte, tpte;
4514	vm_page_t m, mpte, mt;
4515	pv_entry_t pv;
4516	struct md_page *pvh;
4517	struct pv_chunk *pc, *npc;
4518	struct spglist free;
4519	int field, idx;
4520	int32_t bit;
4521	uint32_t inuse, bitmask;
4522	int allfree;
4523
4524	if (pmap != PCPU_GET(curpmap)) {
4525		printf("warning: pmap_remove_pages called with non-current pmap\n");
4526		return;
4527	}
4528	SLIST_INIT(&free);
4529	rw_wlock(&pvh_global_lock);
4530	PMAP_LOCK(pmap);
4531	sched_pin();
4532	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4533		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4534		    pc->pc_pmap));
4535		allfree = 1;
4536		for (field = 0; field < _NPCM; field++) {
4537			inuse = ~pc->pc_map[field] & pc_freemask[field];
4538			while (inuse != 0) {
4539				bit = bsfl(inuse);
4540				bitmask = 1UL << bit;
4541				idx = field * 32 + bit;
4542				pv = &pc->pc_pventry[idx];
4543				inuse &= ~bitmask;
4544
4545				pte = pmap_pde(pmap, pv->pv_va);
4546				tpte = *pte;
4547				if ((tpte & PG_PS) == 0) {
4548					pte = vtopte(pv->pv_va);
4549					tpte = *pte & ~PG_PTE_PAT;
4550				}
4551
4552				if (tpte == 0) {
4553					printf(
4554					    "TPTE at %p  IS ZERO @ VA %08x\n",
4555					    pte, pv->pv_va);
4556					panic("bad pte");
4557				}
4558
4559/*
4560 * We cannot remove wired pages from a process' mapping at this time
4561 */
4562				if (tpte & PG_W) {
4563					allfree = 0;
4564					continue;
4565				}
4566
4567				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4568				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4569				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4570				    m, (uintmax_t)m->phys_addr,
4571				    (uintmax_t)tpte));
4572
4573				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4574				    m < &vm_page_array[vm_page_array_size],
4575				    ("pmap_remove_pages: bad tpte %#jx",
4576				    (uintmax_t)tpte));
4577
4578				pte_clear(pte);
4579
4580				/*
4581				 * Update the vm_page_t clean/reference bits.
4582				 */
4583				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4584					if ((tpte & PG_PS) != 0) {
4585						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4586							vm_page_dirty(mt);
4587					} else
4588						vm_page_dirty(m);
4589				}
4590
4591				/* Mark free */
4592				PV_STAT(pv_entry_frees++);
4593				PV_STAT(pv_entry_spare++);
4594				pv_entry_count--;
4595				pc->pc_map[field] |= bitmask;
4596				if ((tpte & PG_PS) != 0) {
4597					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4598					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4599					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4600					if (TAILQ_EMPTY(&pvh->pv_list)) {
4601						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4602							if (TAILQ_EMPTY(&mt->md.pv_list))
4603								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4604					}
4605					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4606					if (mpte != NULL) {
4607						pmap_remove_pt_page(pmap, mpte);
4608						pmap->pm_stats.resident_count--;
4609						KASSERT(mpte->wire_count == NPTEPG,
4610						    ("pmap_remove_pages: pte page wire count error"));
4611						mpte->wire_count = 0;
4612						pmap_add_delayed_free_list(mpte, &free, FALSE);
4613						atomic_subtract_int(&cnt.v_wire_count, 1);
4614					}
4615				} else {
4616					pmap->pm_stats.resident_count--;
4617					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4618					if (TAILQ_EMPTY(&m->md.pv_list) &&
4619					    (m->flags & PG_FICTITIOUS) == 0) {
4620						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4621						if (TAILQ_EMPTY(&pvh->pv_list))
4622							vm_page_aflag_clear(m, PGA_WRITEABLE);
4623					}
4624					pmap_unuse_pt(pmap, pv->pv_va, &free);
4625				}
4626			}
4627		}
4628		if (allfree) {
4629			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4630			free_pv_chunk(pc);
4631		}
4632	}
4633	sched_unpin();
4634	pmap_invalidate_all(pmap);
4635	rw_wunlock(&pvh_global_lock);
4636	PMAP_UNLOCK(pmap);
4637	pmap_free_zero_pages(&free);
4638}
4639
4640/*
4641 *	pmap_is_modified:
4642 *
4643 *	Return whether or not the specified physical page was modified
4644 *	in any physical maps.
4645 */
4646boolean_t
4647pmap_is_modified(vm_page_t m)
4648{
4649	boolean_t rv;
4650
4651	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4652	    ("pmap_is_modified: page %p is not managed", m));
4653
4654	/*
4655	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4656	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4657	 * is clear, no PTEs can have PG_M set.
4658	 */
4659	VM_OBJECT_ASSERT_WLOCKED(m->object);
4660	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4661		return (FALSE);
4662	rw_wlock(&pvh_global_lock);
4663	rv = pmap_is_modified_pvh(&m->md) ||
4664	    ((m->flags & PG_FICTITIOUS) == 0 &&
4665	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4666	rw_wunlock(&pvh_global_lock);
4667	return (rv);
4668}
4669
4670/*
4671 * Returns TRUE if any of the given mappings were used to modify
4672 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4673 * mappings are supported.
4674 */
4675static boolean_t
4676pmap_is_modified_pvh(struct md_page *pvh)
4677{
4678	pv_entry_t pv;
4679	pt_entry_t *pte;
4680	pmap_t pmap;
4681	boolean_t rv;
4682
4683	rw_assert(&pvh_global_lock, RA_WLOCKED);
4684	rv = FALSE;
4685	sched_pin();
4686	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4687		pmap = PV_PMAP(pv);
4688		PMAP_LOCK(pmap);
4689		pte = pmap_pte_quick(pmap, pv->pv_va);
4690		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4691		PMAP_UNLOCK(pmap);
4692		if (rv)
4693			break;
4694	}
4695	sched_unpin();
4696	return (rv);
4697}
4698
4699/*
4700 *	pmap_is_prefaultable:
4701 *
4702 *	Return whether or not the specified virtual address is elgible
4703 *	for prefault.
4704 */
4705boolean_t
4706pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4707{
4708	pd_entry_t *pde;
4709	pt_entry_t *pte;
4710	boolean_t rv;
4711
4712	rv = FALSE;
4713	PMAP_LOCK(pmap);
4714	pde = pmap_pde(pmap, addr);
4715	if (*pde != 0 && (*pde & PG_PS) == 0) {
4716		pte = vtopte(addr);
4717		rv = *pte == 0;
4718	}
4719	PMAP_UNLOCK(pmap);
4720	return (rv);
4721}
4722
4723/*
4724 *	pmap_is_referenced:
4725 *
4726 *	Return whether or not the specified physical page was referenced
4727 *	in any physical maps.
4728 */
4729boolean_t
4730pmap_is_referenced(vm_page_t m)
4731{
4732	boolean_t rv;
4733
4734	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4735	    ("pmap_is_referenced: page %p is not managed", m));
4736	rw_wlock(&pvh_global_lock);
4737	rv = pmap_is_referenced_pvh(&m->md) ||
4738	    ((m->flags & PG_FICTITIOUS) == 0 &&
4739	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4740	rw_wunlock(&pvh_global_lock);
4741	return (rv);
4742}
4743
4744/*
4745 * Returns TRUE if any of the given mappings were referenced and FALSE
4746 * otherwise.  Both page and 4mpage mappings are supported.
4747 */
4748static boolean_t
4749pmap_is_referenced_pvh(struct md_page *pvh)
4750{
4751	pv_entry_t pv;
4752	pt_entry_t *pte;
4753	pmap_t pmap;
4754	boolean_t rv;
4755
4756	rw_assert(&pvh_global_lock, RA_WLOCKED);
4757	rv = FALSE;
4758	sched_pin();
4759	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4760		pmap = PV_PMAP(pv);
4761		PMAP_LOCK(pmap);
4762		pte = pmap_pte_quick(pmap, pv->pv_va);
4763		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4764		PMAP_UNLOCK(pmap);
4765		if (rv)
4766			break;
4767	}
4768	sched_unpin();
4769	return (rv);
4770}
4771
4772/*
4773 * Clear the write and modified bits in each of the given page's mappings.
4774 */
4775void
4776pmap_remove_write(vm_page_t m)
4777{
4778	struct md_page *pvh;
4779	pv_entry_t next_pv, pv;
4780	pmap_t pmap;
4781	pd_entry_t *pde;
4782	pt_entry_t oldpte, *pte;
4783	vm_offset_t va;
4784
4785	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4786	    ("pmap_remove_write: page %p is not managed", m));
4787
4788	/*
4789	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4790	 * set by another thread while the object is locked.  Thus,
4791	 * if PGA_WRITEABLE is clear, no page table entries need updating.
4792	 */
4793	VM_OBJECT_ASSERT_WLOCKED(m->object);
4794	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4795		return;
4796	rw_wlock(&pvh_global_lock);
4797	sched_pin();
4798	if ((m->flags & PG_FICTITIOUS) != 0)
4799		goto small_mappings;
4800	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4801	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4802		va = pv->pv_va;
4803		pmap = PV_PMAP(pv);
4804		PMAP_LOCK(pmap);
4805		pde = pmap_pde(pmap, va);
4806		if ((*pde & PG_RW) != 0)
4807			(void)pmap_demote_pde(pmap, pde, va);
4808		PMAP_UNLOCK(pmap);
4809	}
4810small_mappings:
4811	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4812		pmap = PV_PMAP(pv);
4813		PMAP_LOCK(pmap);
4814		pde = pmap_pde(pmap, pv->pv_va);
4815		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
4816		    " a 4mpage in page %p's pv list", m));
4817		pte = pmap_pte_quick(pmap, pv->pv_va);
4818retry:
4819		oldpte = *pte;
4820		if ((oldpte & PG_RW) != 0) {
4821			/*
4822			 * Regardless of whether a pte is 32 or 64 bits
4823			 * in size, PG_RW and PG_M are among the least
4824			 * significant 32 bits.
4825			 */
4826			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4827			    oldpte & ~(PG_RW | PG_M)))
4828				goto retry;
4829			if ((oldpte & PG_M) != 0)
4830				vm_page_dirty(m);
4831			pmap_invalidate_page(pmap, pv->pv_va);
4832		}
4833		PMAP_UNLOCK(pmap);
4834	}
4835	vm_page_aflag_clear(m, PGA_WRITEABLE);
4836	sched_unpin();
4837	rw_wunlock(&pvh_global_lock);
4838}
4839
4840#define	PMAP_TS_REFERENCED_MAX	5
4841
4842/*
4843 *	pmap_ts_referenced:
4844 *
4845 *	Return a count of reference bits for a page, clearing those bits.
4846 *	It is not necessary for every reference bit to be cleared, but it
4847 *	is necessary that 0 only be returned when there are truly no
4848 *	reference bits set.
4849 *
4850 *	XXX: The exact number of bits to check and clear is a matter that
4851 *	should be tested and standardized at some point in the future for
4852 *	optimal aging of shared pages.
4853 */
4854int
4855pmap_ts_referenced(vm_page_t m)
4856{
4857	struct md_page *pvh;
4858	pv_entry_t pv, pvf;
4859	pmap_t pmap;
4860	pd_entry_t *pde;
4861	pt_entry_t *pte;
4862	vm_paddr_t pa;
4863	int rtval = 0;
4864
4865	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4866	    ("pmap_ts_referenced: page %p is not managed", m));
4867	pa = VM_PAGE_TO_PHYS(m);
4868	pvh = pa_to_pvh(pa);
4869	rw_wlock(&pvh_global_lock);
4870	sched_pin();
4871	if ((m->flags & PG_FICTITIOUS) != 0 ||
4872	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4873		goto small_mappings;
4874	pv = pvf;
4875	do {
4876		pmap = PV_PMAP(pv);
4877		PMAP_LOCK(pmap);
4878		pde = pmap_pde(pmap, pv->pv_va);
4879		if ((*pde & PG_A) != 0) {
4880			/*
4881			 * Since this reference bit is shared by either 1024
4882			 * or 512 4KB pages, it should not be cleared every
4883			 * time it is tested.  Apply a simple "hash" function
4884			 * on the physical page number, the virtual superpage
4885			 * number, and the pmap address to select one 4KB page
4886			 * out of the 1024 or 512 on which testing the
4887			 * reference bit will result in clearing that bit.
4888			 * This function is designed to avoid the selection of
4889			 * the same 4KB page for every 2- or 4MB page mapping.
4890			 *
4891			 * On demotion, a mapping that hasn't been referenced
4892			 * is simply destroyed.  To avoid the possibility of a
4893			 * subsequent page fault on a demoted wired mapping,
4894			 * always leave its reference bit set.  Moreover,
4895			 * since the superpage is wired, the current state of
4896			 * its reference bit won't affect page replacement.
4897			 */
4898			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
4899			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
4900			    (*pde & PG_W) == 0) {
4901				atomic_clear_int((u_int *)pde, PG_A);
4902				pmap_invalidate_page(pmap, pv->pv_va);
4903			}
4904			rtval++;
4905		}
4906		PMAP_UNLOCK(pmap);
4907		/* Rotate the PV list if it has more than one entry. */
4908		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4909			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4910			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4911		}
4912		if (rtval >= PMAP_TS_REFERENCED_MAX)
4913			goto out;
4914	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4915small_mappings:
4916	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4917		goto out;
4918	pv = pvf;
4919	do {
4920		pmap = PV_PMAP(pv);
4921		PMAP_LOCK(pmap);
4922		pde = pmap_pde(pmap, pv->pv_va);
4923		KASSERT((*pde & PG_PS) == 0,
4924		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
4925		    m));
4926		pte = pmap_pte_quick(pmap, pv->pv_va);
4927		if ((*pte & PG_A) != 0) {
4928			atomic_clear_int((u_int *)pte, PG_A);
4929			pmap_invalidate_page(pmap, pv->pv_va);
4930			rtval++;
4931		}
4932		PMAP_UNLOCK(pmap);
4933		/* Rotate the PV list if it has more than one entry. */
4934		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4935			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4936			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4937		}
4938	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
4939	    PMAP_TS_REFERENCED_MAX);
4940out:
4941	sched_unpin();
4942	rw_wunlock(&pvh_global_lock);
4943	return (rtval);
4944}
4945
4946/*
4947 *	Apply the given advice to the specified range of addresses within the
4948 *	given pmap.  Depending on the advice, clear the referenced and/or
4949 *	modified flags in each mapping and set the mapped page's dirty field.
4950 */
4951void
4952pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4953{
4954	pd_entry_t oldpde, *pde;
4955	pt_entry_t *pte;
4956	vm_offset_t pdnxt;
4957	vm_page_t m;
4958	boolean_t anychanged, pv_lists_locked;
4959
4960	if (advice != MADV_DONTNEED && advice != MADV_FREE)
4961		return;
4962	if (pmap_is_current(pmap))
4963		pv_lists_locked = FALSE;
4964	else {
4965		pv_lists_locked = TRUE;
4966resume:
4967		rw_wlock(&pvh_global_lock);
4968		sched_pin();
4969	}
4970	anychanged = FALSE;
4971	PMAP_LOCK(pmap);
4972	for (; sva < eva; sva = pdnxt) {
4973		pdnxt = (sva + NBPDR) & ~PDRMASK;
4974		if (pdnxt < sva)
4975			pdnxt = eva;
4976		pde = pmap_pde(pmap, sva);
4977		oldpde = *pde;
4978		if ((oldpde & PG_V) == 0)
4979			continue;
4980		else if ((oldpde & PG_PS) != 0) {
4981			if ((oldpde & PG_MANAGED) == 0)
4982				continue;
4983			if (!pv_lists_locked) {
4984				pv_lists_locked = TRUE;
4985				if (!rw_try_wlock(&pvh_global_lock)) {
4986					if (anychanged)
4987						pmap_invalidate_all(pmap);
4988					PMAP_UNLOCK(pmap);
4989					goto resume;
4990				}
4991				sched_pin();
4992			}
4993			if (!pmap_demote_pde(pmap, pde, sva)) {
4994				/*
4995				 * The large page mapping was destroyed.
4996				 */
4997				continue;
4998			}
4999
5000			/*
5001			 * Unless the page mappings are wired, remove the
5002			 * mapping to a single page so that a subsequent
5003			 * access may repromote.  Since the underlying page
5004			 * table page is fully populated, this removal never
5005			 * frees a page table page.
5006			 */
5007			if ((oldpde & PG_W) == 0) {
5008				pte = pmap_pte_quick(pmap, sva);
5009				KASSERT((*pte & PG_V) != 0,
5010				    ("pmap_advise: invalid PTE"));
5011				pmap_remove_pte(pmap, pte, sva, NULL);
5012				anychanged = TRUE;
5013			}
5014		}
5015		if (pdnxt > eva)
5016			pdnxt = eva;
5017		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
5018		    sva += PAGE_SIZE) {
5019			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
5020			    PG_V))
5021				continue;
5022			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5023				if (advice == MADV_DONTNEED) {
5024					/*
5025					 * Future calls to pmap_is_modified()
5026					 * can be avoided by making the page
5027					 * dirty now.
5028					 */
5029					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5030					vm_page_dirty(m);
5031				}
5032				atomic_clear_int((u_int *)pte, PG_M | PG_A);
5033			} else if ((*pte & PG_A) != 0)
5034				atomic_clear_int((u_int *)pte, PG_A);
5035			else
5036				continue;
5037			if ((*pte & PG_G) != 0)
5038				pmap_invalidate_page(pmap, sva);
5039			else
5040				anychanged = TRUE;
5041		}
5042	}
5043	if (anychanged)
5044		pmap_invalidate_all(pmap);
5045	if (pv_lists_locked) {
5046		sched_unpin();
5047		rw_wunlock(&pvh_global_lock);
5048	}
5049	PMAP_UNLOCK(pmap);
5050}
5051
5052/*
5053 *	Clear the modify bits on the specified physical page.
5054 */
5055void
5056pmap_clear_modify(vm_page_t m)
5057{
5058	struct md_page *pvh;
5059	pv_entry_t next_pv, pv;
5060	pmap_t pmap;
5061	pd_entry_t oldpde, *pde;
5062	pt_entry_t oldpte, *pte;
5063	vm_offset_t va;
5064
5065	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5066	    ("pmap_clear_modify: page %p is not managed", m));
5067	VM_OBJECT_ASSERT_WLOCKED(m->object);
5068	KASSERT(!vm_page_xbusied(m),
5069	    ("pmap_clear_modify: page %p is exclusive busied", m));
5070
5071	/*
5072	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
5073	 * If the object containing the page is locked and the page is not
5074	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
5075	 */
5076	if ((m->aflags & PGA_WRITEABLE) == 0)
5077		return;
5078	rw_wlock(&pvh_global_lock);
5079	sched_pin();
5080	if ((m->flags & PG_FICTITIOUS) != 0)
5081		goto small_mappings;
5082	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5083	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5084		va = pv->pv_va;
5085		pmap = PV_PMAP(pv);
5086		PMAP_LOCK(pmap);
5087		pde = pmap_pde(pmap, va);
5088		oldpde = *pde;
5089		if ((oldpde & PG_RW) != 0) {
5090			if (pmap_demote_pde(pmap, pde, va)) {
5091				if ((oldpde & PG_W) == 0) {
5092					/*
5093					 * Write protect the mapping to a
5094					 * single page so that a subsequent
5095					 * write access may repromote.
5096					 */
5097					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5098					    PG_PS_FRAME);
5099					pte = pmap_pte_quick(pmap, va);
5100					oldpte = *pte;
5101					if ((oldpte & PG_V) != 0) {
5102						/*
5103						 * Regardless of whether a pte is 32 or 64 bits
5104						 * in size, PG_RW and PG_M are among the least
5105						 * significant 32 bits.
5106						 */
5107						while (!atomic_cmpset_int((u_int *)pte,
5108						    oldpte,
5109						    oldpte & ~(PG_M | PG_RW)))
5110							oldpte = *pte;
5111						vm_page_dirty(m);
5112						pmap_invalidate_page(pmap, va);
5113					}
5114				}
5115			}
5116		}
5117		PMAP_UNLOCK(pmap);
5118	}
5119small_mappings:
5120	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5121		pmap = PV_PMAP(pv);
5122		PMAP_LOCK(pmap);
5123		pde = pmap_pde(pmap, pv->pv_va);
5124		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
5125		    " a 4mpage in page %p's pv list", m));
5126		pte = pmap_pte_quick(pmap, pv->pv_va);
5127		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5128			/*
5129			 * Regardless of whether a pte is 32 or 64 bits
5130			 * in size, PG_M is among the least significant
5131			 * 32 bits.
5132			 */
5133			atomic_clear_int((u_int *)pte, PG_M);
5134			pmap_invalidate_page(pmap, pv->pv_va);
5135		}
5136		PMAP_UNLOCK(pmap);
5137	}
5138	sched_unpin();
5139	rw_wunlock(&pvh_global_lock);
5140}
5141
5142/*
5143 * Miscellaneous support routines follow
5144 */
5145
5146/* Adjust the cache mode for a 4KB page mapped via a PTE. */
5147static __inline void
5148pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5149{
5150	u_int opte, npte;
5151
5152	/*
5153	 * The cache mode bits are all in the low 32-bits of the
5154	 * PTE, so we can just spin on updating the low 32-bits.
5155	 */
5156	do {
5157		opte = *(u_int *)pte;
5158		npte = opte & ~PG_PTE_CACHE;
5159		npte |= cache_bits;
5160	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5161}
5162
5163/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5164static __inline void
5165pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5166{
5167	u_int opde, npde;
5168
5169	/*
5170	 * The cache mode bits are all in the low 32-bits of the
5171	 * PDE, so we can just spin on updating the low 32-bits.
5172	 */
5173	do {
5174		opde = *(u_int *)pde;
5175		npde = opde & ~PG_PDE_CACHE;
5176		npde |= cache_bits;
5177	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5178}
5179
5180/*
5181 * Map a set of physical memory pages into the kernel virtual
5182 * address space. Return a pointer to where it is mapped. This
5183 * routine is intended to be used for mapping device memory,
5184 * NOT real memory.
5185 */
5186void *
5187pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5188{
5189	struct pmap_preinit_mapping *ppim;
5190	vm_offset_t va, offset;
5191	vm_size_t tmpsize;
5192	int i;
5193
5194	offset = pa & PAGE_MASK;
5195	size = round_page(offset + size);
5196	pa = pa & PG_FRAME;
5197
5198	if (pa < KERNLOAD && pa + size <= KERNLOAD)
5199		va = KERNBASE + pa;
5200	else if (!pmap_initialized) {
5201		va = 0;
5202		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5203			ppim = pmap_preinit_mapping + i;
5204			if (ppim->va == 0) {
5205				ppim->pa = pa;
5206				ppim->sz = size;
5207				ppim->mode = mode;
5208				ppim->va = virtual_avail;
5209				virtual_avail += size;
5210				va = ppim->va;
5211				break;
5212			}
5213		}
5214		if (va == 0)
5215			panic("%s: too many preinit mappings", __func__);
5216	} else {
5217		/*
5218		 * If we have a preinit mapping, re-use it.
5219		 */
5220		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5221			ppim = pmap_preinit_mapping + i;
5222			if (ppim->pa == pa && ppim->sz == size &&
5223			    ppim->mode == mode)
5224				return ((void *)(ppim->va + offset));
5225		}
5226		va = kva_alloc(size);
5227		if (va == 0)
5228			panic("%s: Couldn't allocate KVA", __func__);
5229	}
5230	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5231		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5232	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5233	pmap_invalidate_cache_range(va, va + size, FALSE);
5234	return ((void *)(va + offset));
5235}
5236
5237void *
5238pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5239{
5240
5241	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5242}
5243
5244void *
5245pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5246{
5247
5248	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5249}
5250
5251void
5252pmap_unmapdev(vm_offset_t va, vm_size_t size)
5253{
5254	struct pmap_preinit_mapping *ppim;
5255	vm_offset_t offset;
5256	int i;
5257
5258	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5259		return;
5260	offset = va & PAGE_MASK;
5261	size = round_page(offset + size);
5262	va = trunc_page(va);
5263	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5264		ppim = pmap_preinit_mapping + i;
5265		if (ppim->va == va && ppim->sz == size) {
5266			if (pmap_initialized)
5267				return;
5268			ppim->pa = 0;
5269			ppim->va = 0;
5270			ppim->sz = 0;
5271			ppim->mode = 0;
5272			if (va + size == virtual_avail)
5273				virtual_avail = va;
5274			return;
5275		}
5276	}
5277	if (pmap_initialized)
5278		kva_free(va, size);
5279}
5280
5281/*
5282 * Sets the memory attribute for the specified page.
5283 */
5284void
5285pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5286{
5287
5288	m->md.pat_mode = ma;
5289	if ((m->flags & PG_FICTITIOUS) != 0)
5290		return;
5291
5292	/*
5293	 * If "m" is a normal page, flush it from the cache.
5294	 * See pmap_invalidate_cache_range().
5295	 *
5296	 * First, try to find an existing mapping of the page by sf
5297	 * buffer. sf_buf_invalidate_cache() modifies mapping and
5298	 * flushes the cache.
5299	 */
5300	if (sf_buf_invalidate_cache(m))
5301		return;
5302
5303	/*
5304	 * If page is not mapped by sf buffer, but CPU does not
5305	 * support self snoop, map the page transient and do
5306	 * invalidation. In the worst case, whole cache is flushed by
5307	 * pmap_invalidate_cache_range().
5308	 */
5309	if ((cpu_feature & CPUID_SS) == 0)
5310		pmap_flush_page(m);
5311}
5312
5313static void
5314pmap_flush_page(vm_page_t m)
5315{
5316	struct sysmaps *sysmaps;
5317	vm_offset_t sva, eva;
5318
5319	if ((cpu_feature & CPUID_CLFSH) != 0) {
5320		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
5321		mtx_lock(&sysmaps->lock);
5322		if (*sysmaps->CMAP2)
5323			panic("pmap_flush_page: CMAP2 busy");
5324		sched_pin();
5325		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5326		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5327		invlcaddr(sysmaps->CADDR2);
5328		sva = (vm_offset_t)sysmaps->CADDR2;
5329		eva = sva + PAGE_SIZE;
5330
5331		/*
5332		 * Use mfence despite the ordering implied by
5333		 * mtx_{un,}lock() because clflush is not guaranteed
5334		 * to be ordered by any other instruction.
5335		 */
5336		mfence();
5337		for (; sva < eva; sva += cpu_clflush_line_size)
5338			clflush(sva);
5339		mfence();
5340		*sysmaps->CMAP2 = 0;
5341		sched_unpin();
5342		mtx_unlock(&sysmaps->lock);
5343	} else
5344		pmap_invalidate_cache();
5345}
5346
5347/*
5348 * Changes the specified virtual address range's memory type to that given by
5349 * the parameter "mode".  The specified virtual address range must be
5350 * completely contained within either the kernel map.
5351 *
5352 * Returns zero if the change completed successfully, and either EINVAL or
5353 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5354 * of the virtual address range was not mapped, and ENOMEM is returned if
5355 * there was insufficient memory available to complete the change.
5356 */
5357int
5358pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5359{
5360	vm_offset_t base, offset, tmpva;
5361	pd_entry_t *pde;
5362	pt_entry_t *pte;
5363	int cache_bits_pte, cache_bits_pde;
5364	boolean_t changed;
5365
5366	base = trunc_page(va);
5367	offset = va & PAGE_MASK;
5368	size = round_page(offset + size);
5369
5370	/*
5371	 * Only supported on kernel virtual addresses above the recursive map.
5372	 */
5373	if (base < VM_MIN_KERNEL_ADDRESS)
5374		return (EINVAL);
5375
5376	cache_bits_pde = pmap_cache_bits(mode, 1);
5377	cache_bits_pte = pmap_cache_bits(mode, 0);
5378	changed = FALSE;
5379
5380	/*
5381	 * Pages that aren't mapped aren't supported.  Also break down
5382	 * 2/4MB pages into 4KB pages if required.
5383	 */
5384	PMAP_LOCK(kernel_pmap);
5385	for (tmpva = base; tmpva < base + size; ) {
5386		pde = pmap_pde(kernel_pmap, tmpva);
5387		if (*pde == 0) {
5388			PMAP_UNLOCK(kernel_pmap);
5389			return (EINVAL);
5390		}
5391		if (*pde & PG_PS) {
5392			/*
5393			 * If the current 2/4MB page already has
5394			 * the required memory type, then we need not
5395			 * demote this page.  Just increment tmpva to
5396			 * the next 2/4MB page frame.
5397			 */
5398			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5399				tmpva = trunc_4mpage(tmpva) + NBPDR;
5400				continue;
5401			}
5402
5403			/*
5404			 * If the current offset aligns with a 2/4MB
5405			 * page frame and there is at least 2/4MB left
5406			 * within the range, then we need not break
5407			 * down this page into 4KB pages.
5408			 */
5409			if ((tmpva & PDRMASK) == 0 &&
5410			    tmpva + PDRMASK < base + size) {
5411				tmpva += NBPDR;
5412				continue;
5413			}
5414			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5415				PMAP_UNLOCK(kernel_pmap);
5416				return (ENOMEM);
5417			}
5418		}
5419		pte = vtopte(tmpva);
5420		if (*pte == 0) {
5421			PMAP_UNLOCK(kernel_pmap);
5422			return (EINVAL);
5423		}
5424		tmpva += PAGE_SIZE;
5425	}
5426	PMAP_UNLOCK(kernel_pmap);
5427
5428	/*
5429	 * Ok, all the pages exist, so run through them updating their
5430	 * cache mode if required.
5431	 */
5432	for (tmpva = base; tmpva < base + size; ) {
5433		pde = pmap_pde(kernel_pmap, tmpva);
5434		if (*pde & PG_PS) {
5435			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5436				pmap_pde_attr(pde, cache_bits_pde);
5437				changed = TRUE;
5438			}
5439			tmpva = trunc_4mpage(tmpva) + NBPDR;
5440		} else {
5441			pte = vtopte(tmpva);
5442			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5443				pmap_pte_attr(pte, cache_bits_pte);
5444				changed = TRUE;
5445			}
5446			tmpva += PAGE_SIZE;
5447		}
5448	}
5449
5450	/*
5451	 * Flush CPU caches to make sure any data isn't cached that
5452	 * shouldn't be, etc.
5453	 */
5454	if (changed) {
5455		pmap_invalidate_range(kernel_pmap, base, tmpva);
5456		pmap_invalidate_cache_range(base, tmpva, FALSE);
5457	}
5458	return (0);
5459}
5460
5461/*
5462 * perform the pmap work for mincore
5463 */
5464int
5465pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5466{
5467	pd_entry_t *pdep;
5468	pt_entry_t *ptep, pte;
5469	vm_paddr_t pa;
5470	int val;
5471
5472	PMAP_LOCK(pmap);
5473retry:
5474	pdep = pmap_pde(pmap, addr);
5475	if (*pdep != 0) {
5476		if (*pdep & PG_PS) {
5477			pte = *pdep;
5478			/* Compute the physical address of the 4KB page. */
5479			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5480			    PG_FRAME;
5481			val = MINCORE_SUPER;
5482		} else {
5483			ptep = pmap_pte(pmap, addr);
5484			pte = *ptep;
5485			pmap_pte_release(ptep);
5486			pa = pte & PG_FRAME;
5487			val = 0;
5488		}
5489	} else {
5490		pte = 0;
5491		pa = 0;
5492		val = 0;
5493	}
5494	if ((pte & PG_V) != 0) {
5495		val |= MINCORE_INCORE;
5496		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5497			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5498		if ((pte & PG_A) != 0)
5499			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5500	}
5501	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5502	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5503	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5504		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5505		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5506			goto retry;
5507	} else
5508		PA_UNLOCK_COND(*locked_pa);
5509	PMAP_UNLOCK(pmap);
5510	return (val);
5511}
5512
5513void
5514pmap_activate(struct thread *td)
5515{
5516	pmap_t	pmap, oldpmap;
5517	u_int	cpuid;
5518	u_int32_t  cr3;
5519
5520	critical_enter();
5521	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5522	oldpmap = PCPU_GET(curpmap);
5523	cpuid = PCPU_GET(cpuid);
5524#if defined(SMP)
5525	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5526	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5527#else
5528	CPU_CLR(cpuid, &oldpmap->pm_active);
5529	CPU_SET(cpuid, &pmap->pm_active);
5530#endif
5531#if defined(PAE) || defined(PAE_TABLES)
5532	cr3 = vtophys(pmap->pm_pdpt);
5533#else
5534	cr3 = vtophys(pmap->pm_pdir);
5535#endif
5536	/*
5537	 * pmap_activate is for the current thread on the current cpu
5538	 */
5539	td->td_pcb->pcb_cr3 = cr3;
5540	load_cr3(cr3);
5541	PCPU_SET(curpmap, pmap);
5542	critical_exit();
5543}
5544
5545void
5546pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5547{
5548}
5549
5550/*
5551 *	Increase the starting virtual address of the given mapping if a
5552 *	different alignment might result in more superpage mappings.
5553 */
5554void
5555pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5556    vm_offset_t *addr, vm_size_t size)
5557{
5558	vm_offset_t superpage_offset;
5559
5560	if (size < NBPDR)
5561		return;
5562	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5563		offset += ptoa(object->pg_color);
5564	superpage_offset = offset & PDRMASK;
5565	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5566	    (*addr & PDRMASK) == superpage_offset)
5567		return;
5568	if ((*addr & PDRMASK) < superpage_offset)
5569		*addr = (*addr & ~PDRMASK) + superpage_offset;
5570	else
5571		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5572}
5573
5574
5575#if defined(PMAP_DEBUG)
5576pmap_pid_dump(int pid)
5577{
5578	pmap_t pmap;
5579	struct proc *p;
5580	int npte = 0;
5581	int index;
5582
5583	sx_slock(&allproc_lock);
5584	FOREACH_PROC_IN_SYSTEM(p) {
5585		if (p->p_pid != pid)
5586			continue;
5587
5588		if (p->p_vmspace) {
5589			int i,j;
5590			index = 0;
5591			pmap = vmspace_pmap(p->p_vmspace);
5592			for (i = 0; i < NPDEPTD; i++) {
5593				pd_entry_t *pde;
5594				pt_entry_t *pte;
5595				vm_offset_t base = i << PDRSHIFT;
5596
5597				pde = &pmap->pm_pdir[i];
5598				if (pde && pmap_pde_v(pde)) {
5599					for (j = 0; j < NPTEPG; j++) {
5600						vm_offset_t va = base + (j << PAGE_SHIFT);
5601						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5602							if (index) {
5603								index = 0;
5604								printf("\n");
5605							}
5606							sx_sunlock(&allproc_lock);
5607							return (npte);
5608						}
5609						pte = pmap_pte(pmap, va);
5610						if (pte && pmap_pte_v(pte)) {
5611							pt_entry_t pa;
5612							vm_page_t m;
5613							pa = *pte;
5614							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5615							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5616								va, pa, m->hold_count, m->wire_count, m->flags);
5617							npte++;
5618							index++;
5619							if (index >= 2) {
5620								index = 0;
5621								printf("\n");
5622							} else {
5623								printf(" ");
5624							}
5625						}
5626					}
5627				}
5628			}
5629		}
5630	}
5631	sx_sunlock(&allproc_lock);
5632	return (npte);
5633}
5634#endif
5635
5636#if defined(DEBUG)
5637
5638static void	pads(pmap_t pm);
5639void		pmap_pvdump(vm_paddr_t pa);
5640
5641/* print address space of pmap*/
5642static void
5643pads(pmap_t pm)
5644{
5645	int i, j;
5646	vm_paddr_t va;
5647	pt_entry_t *ptep;
5648
5649	if (pm == kernel_pmap)
5650		return;
5651	for (i = 0; i < NPDEPTD; i++)
5652		if (pm->pm_pdir[i])
5653			for (j = 0; j < NPTEPG; j++) {
5654				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5655				if (pm == kernel_pmap && va < KERNBASE)
5656					continue;
5657				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5658					continue;
5659				ptep = pmap_pte(pm, va);
5660				if (pmap_pte_v(ptep))
5661					printf("%x:%x ", va, *ptep);
5662			};
5663
5664}
5665
5666void
5667pmap_pvdump(vm_paddr_t pa)
5668{
5669	pv_entry_t pv;
5670	pmap_t pmap;
5671	vm_page_t m;
5672
5673	printf("pa %x", pa);
5674	m = PHYS_TO_VM_PAGE(pa);
5675	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5676		pmap = PV_PMAP(pv);
5677		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5678		pads(pmap);
5679	}
5680	printf(" ");
5681}
5682#endif
5683