1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2018 Matthew Macy
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "opt_platform.h"
29
30#include <sys/param.h>
31#include <sys/kernel.h>
32#include <sys/systm.h>
33#include <sys/conf.h>
34#include <sys/bitstring.h>
35#include <sys/queue.h>
36#include <sys/cpuset.h>
37#include <sys/endian.h>
38#include <sys/kerneldump.h>
39#include <sys/ktr.h>
40#include <sys/lock.h>
41#include <sys/syslog.h>
42#include <sys/msgbuf.h>
43#include <sys/malloc.h>
44#include <sys/mman.h>
45#include <sys/mutex.h>
46#include <sys/proc.h>
47#include <sys/rwlock.h>
48#include <sys/sched.h>
49#include <sys/sysctl.h>
51#include <sys/vmem.h>
52#include <sys/vmmeter.h>
53#include <sys/smp.h>
54
55#include <sys/kdb.h>
56
57#include <dev/ofw/openfirm.h>
58
59#include <vm/vm.h>
60#include <vm/pmap.h>
61#include <vm/vm_param.h>
62#include <vm/vm_kern.h>
63#include <vm/vm_page.h>
64#include <vm/vm_map.h>
65#include <vm/vm_object.h>
66#include <vm/vm_extern.h>
67#include <vm/vm_pageout.h>
68#include <vm/vm_phys.h>
69#include <vm/vm_radix.h>
70#include <vm/vm_reserv.h>
71#include <vm/vm_dumpset.h>
72#include <vm/uma.h>
73
74#include <machine/_inttypes.h>
75#include <machine/cpu.h>
76#include <machine/platform.h>
77#include <machine/frame.h>
78#include <machine/md_var.h>
79#include <machine/psl.h>
80#include <machine/bat.h>
81#include <machine/hid.h>
82#include <machine/pte.h>
83#include <machine/sr.h>
84#include <machine/trap.h>
85#include <machine/mmuvar.h>
86
87/* For pseries bit. */
88#include <powerpc/pseries/phyp-hvcall.h>
89
90#ifdef INVARIANTS
91#include <vm/uma_dbg.h>
92#endif
93
94#define PPC_BITLSHIFT(bit)	(sizeof(long)*NBBY - 1 - (bit))
95#define PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
96#define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit))
97
98#include "opt_ddb.h"
99
100#ifdef DDB
101static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
102#endif
103
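/*
 * Map the amd64-style PG_* names used throughout this file onto the
 * corresponding radix PTE (RPTE_*) bits.
 */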
104#define PG_W	RPTE_WIRED
105#define PG_V	RPTE_VALID
106#define PG_MANAGED	RPTE_MANAGED
107#define PG_PROMOTED	RPTE_PROMOTED
108#define PG_M	RPTE_C
109#define PG_A	RPTE_R
110#define PG_X	RPTE_EAA_X
111#define PG_RW	RPTE_EAA_W
112#define PG_PTE_CACHE RPTE_ATTR_MASK
113
114#define RPTE_SHIFT 9
115#define NLS_MASK ((1UL<<5)-1)
116#define RPTE_ENTRIES (1UL<<RPTE_SHIFT)
117#define RPTE_MASK (RPTE_ENTRIES-1)
118
119#define NLB_SHIFT 0
120#define NLB_MASK (((1UL<<52)-1) << 8)
121
122extern int nkpt;
123extern caddr_t crashdumpmap;
124
125#define RIC_FLUSH_TLB 0
126#define RIC_FLUSH_PWC 1
127#define RIC_FLUSH_ALL 2
128
129#define POWER9_TLB_SETS_RADIX	128	/* # sets in POWER9 TLB Radix mode */
130
131#define PPC_INST_TLBIE			0x7c000264
132#define PPC_INST_TLBIEL			0x7c000224
133#define PPC_INST_SLBIA			0x7c0003e4
134
135#define ___PPC_RA(a)	(((a) & 0x1f) << 16)
136#define ___PPC_RB(b)	(((b) & 0x1f) << 11)
137#define ___PPC_RS(s)	(((s) & 0x1f) << 21)
138#define ___PPC_RT(t)	___PPC_RS(t)
139#define ___PPC_R(r)	(((r) & 0x1) << 16)
140#define ___PPC_PRS(prs)	(((prs) & 0x1) << 17)
141#define ___PPC_RIC(ric)	(((ric) & 0x3) << 18)
142
143#define PPC_SLBIA(IH)	__XSTRING(.long PPC_INST_SLBIA | \
144				       ((IH & 0x7) << 21))
145#define	PPC_TLBIE_5(rb,rs,ric,prs,r)				\
146	__XSTRING(.long PPC_INST_TLBIE |			\
147			  ___PPC_RB(rb) | ___PPC_RS(rs) |	\
148			  ___PPC_RIC(ric) | ___PPC_PRS(prs) |	\
149			  ___PPC_R(r))
150
151#define	PPC_TLBIEL(rb,rs,ric,prs,r) \
152	 __XSTRING(.long PPC_INST_TLBIEL | \
153			   ___PPC_RB(rb) | ___PPC_RS(rs) |	\
154			   ___PPC_RIC(ric) | ___PPC_PRS(prs) |	\
155			   ___PPC_R(r))
156
157#define PPC_INVALIDATE_ERAT		PPC_SLBIA(7)
158
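/*
 * Complete a TLB invalidation sequence: eieio/tlbsync/ptesync ensures
 * that preceding tlbie operations have been performed by all
 * processors before execution continues.
 */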
159static __inline void
160ttusync(void)
161{
162	__asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
163}
164
165#define TLBIEL_INVAL_SEL_MASK	0xc00	/* invalidation selector */
166#define  TLBIEL_INVAL_PAGE	0x000	/* invalidate a single page */
167#define  TLBIEL_INVAL_SET_PID	0x400	/* invalidate a set for the current PID */
168#define  TLBIEL_INVAL_SET_LPID	0x800	/* invalidate a set for current LPID */
169#define  TLBIEL_INVAL_SET	0xc00	/* invalidate a set for all LPIDs */
170
171#define TLBIE_ACTUAL_PAGE_MASK		0xe0
172#define  TLBIE_ACTUAL_PAGE_4K		0x00
173#define  TLBIE_ACTUAL_PAGE_64K		0xa0
174#define  TLBIE_ACTUAL_PAGE_2M		0x20
175#define  TLBIE_ACTUAL_PAGE_1G		0x40
176
177#define TLBIE_PRS_PARTITION_SCOPE	0x0
178#define TLBIE_PRS_PROCESS_SCOPE	0x1
179
180#define TLBIE_RIC_INVALIDATE_TLB	0x0	/* Invalidate just TLB */
181#define TLBIE_RIC_INVALIDATE_PWC	0x1	/* Invalidate just PWC */
182#define TLBIE_RIC_INVALIDATE_ALL	0x2	/* Invalidate TLB, PWC,
183						 * cached {proc, part}tab entries
184						 */
#define TLBIE_RIC_INVALIDATE_SEQ	0x3	/* HPT only:
						 * Invalidate a range of translations
						 */
188
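/*
 * Issue a single tlbie.  RS carries the PID (upper 32 bits) and LPID
 * (lower 32 bits); RB carries the effective address together with the
 * invalidation selector ("is") and actual page size ("ap") fields.
 */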
189static __always_inline void
190radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
191			vm_offset_t va, uint16_t ap)
192{
193	uint64_t rb, rs;
194
195	MPASS((va & PAGE_MASK) == 0);
196
197	rs = ((uint64_t)pid << 32) | lpid;
198	rb = va | is | ap;
199	__asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
200		"r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory");
201}
202
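/*
 * Issue a throw-away invalidation with PID 0 and then repeat the real
 * invalidation.  This matches the tlbie "fixup" sequence that other
 * radix implementations use to work around a POWER9 tlbie erratum.
 */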
203static __inline void
204radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap)
205{
206
207	__asm __volatile("ptesync" ::: "memory");
208	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
209	    TLBIEL_INVAL_PAGE, 0, 0, va, ap);
210	__asm __volatile("ptesync" ::: "memory");
211	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
212	    TLBIEL_INVAL_PAGE, pid, 0, va, ap);
213}
214
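/*
 * Convenience wrappers around radix_tlbie(): invalidate a single 4KB
 * or 2MB translation, the page walk cache (PWC), or everything, for
 * either a user PID or the kernel.
 */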
215static __inline void
216radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
217{
218
219	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
220		TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
221	radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K);
222}
223
224static __inline void
225radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
226{
227
228	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
229		TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
230	radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M);
231}
232
233static __inline void
234radix_tlbie_invlpwc_user(uint32_t pid)
235{
236
237	radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
238		TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
239}
240
241static __inline void
242radix_tlbie_flush_user(uint32_t pid)
243{
244
245	radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
246		TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
247}
248
249static __inline void
250radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
251{
252
253	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
254	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
255	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K);
256}
257
258static __inline void
259radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
260{
261
262	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
263	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
264	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M);
265}
266
267/* 1GB pages aren't currently supported. */
268static __inline __unused void
269radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
270{
271
272	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
273	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
274	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G);
275}
276
277static __inline void
278radix_tlbie_invlpwc_kernel(void)
279{
280
281	radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
282	    TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
283}
284
285static __inline void
286radix_tlbie_flush_kernel(void)
287{
288
289	radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
290	    TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
291}
292
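/*
 * Page-table indexing helpers.  pmap_l3e_pindex() returns the global
 * 2MB-page index used for page-table-page lookups; the *_index()
 * helpers below return the entry index at each level of the tree.
 */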
293static __inline vm_pindex_t
294pmap_l3e_pindex(vm_offset_t va)
295{
296	return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT);
297}
298
299static __inline vm_pindex_t
300pmap_pml3e_index(vm_offset_t va)
301{
302
303	return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK);
304}
305
306static __inline vm_pindex_t
307pmap_pml2e_index(vm_offset_t va)
308{
309	return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK);
310}
311
312static __inline vm_pindex_t
313pmap_pml1e_index(vm_offset_t va)
314{
315	return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT);
316}
317
318/* Return various clipped indexes for a given VA */
319static __inline vm_pindex_t
320pmap_pte_index(vm_offset_t va)
321{
322
323	return ((va >> PAGE_SHIFT) & RPTE_MASK);
324}
325
326/* Return a pointer to the PT slot that corresponds to a VA */
327static __inline pt_entry_t *
328pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
329{
330	pt_entry_t *pte;
331	vm_paddr_t ptepa;
332
333	ptepa = (be64toh(*l3e) & NLB_MASK);
334	pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa);
335	return (&pte[pmap_pte_index(va)]);
336}
337
338/* Return a pointer to the PD slot that corresponds to a VA */
339static __inline pt_entry_t *
340pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
341{
342	pt_entry_t *l3e;
343	vm_paddr_t l3pa;
344
345	l3pa = (be64toh(*l2e) & NLB_MASK);
346	l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
347	return (&l3e[pmap_pml3e_index(va)]);
348}
349
350/* Return a pointer to the PD slot that corresponds to a VA */
351static __inline pt_entry_t *
352pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
353{
354	pt_entry_t *l2e;
355	vm_paddr_t l2pa;
356
357	l2pa = (be64toh(*l1e) & NLB_MASK);
358
359	l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa);
360	return (&l2e[pmap_pml2e_index(va)]);
361}
362
363static __inline pml1_entry_t *
364pmap_pml1e(pmap_t pmap, vm_offset_t va)
365{
366
367	return (&pmap->pm_pml1[pmap_pml1e_index(va)]);
368}
369
370static pt_entry_t *
371pmap_pml2e(pmap_t pmap, vm_offset_t va)
372{
373	pt_entry_t *l1e;
374
375	l1e = pmap_pml1e(pmap, va);
376	if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0)
377		return (NULL);
378	return (pmap_l1e_to_l2e(l1e, va));
379}
380
381static __inline pt_entry_t *
382pmap_pml3e(pmap_t pmap, vm_offset_t va)
383{
384	pt_entry_t *l2e;
385
386	l2e = pmap_pml2e(pmap, va);
387	if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0)
388		return (NULL);
389	return (pmap_l2e_to_l3e(l2e, va));
390}
391
392static __inline pt_entry_t *
393pmap_pte(pmap_t pmap, vm_offset_t va)
394{
395	pt_entry_t *l3e;
396
397	l3e = pmap_pml3e(pmap, va);
398	if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
399		return (NULL);
400	return (pmap_l3e_to_pte(l3e, va));
401}
402
403int nkpt = 64;
404SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
405    "Number of kernel page table pages allocated on bootup");
406
407vm_paddr_t dmaplimit;
408
409SYSCTL_DECL(_vm_pmap);
410
411#ifdef INVARIANTS
412#define VERBOSE_PMAP 0
413#define VERBOSE_PROTECT 0
414static int pmap_logging;
415SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
416    &pmap_logging, 0, "verbose debug logging");
417#endif
418
static u_int64_t	KPTphys;	/* phys addr of kernel page table (PTE) pages */
420
421//static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */
422
423static vm_offset_t qframe = 0;
424static struct mtx qframe_mtx;
425
426void mmu_radix_activate(struct thread *);
427void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
428void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
429    vm_size_t);
430void mmu_radix_clear_modify(vm_page_t);
431void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
432int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
433int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
434void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
435	vm_prot_t);
436void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
437vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
438vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
439void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
440vm_paddr_t mmu_radix_kextract(vm_offset_t);
441void mmu_radix_kremove(vm_offset_t);
442bool mmu_radix_is_modified(vm_page_t);
443bool mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
444bool mmu_radix_is_referenced(vm_page_t);
445void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
446	vm_pindex_t, vm_size_t);
447bool mmu_radix_page_exists_quick(pmap_t, vm_page_t);
448void mmu_radix_page_init(vm_page_t);
449bool mmu_radix_page_is_mapped(vm_page_t m);
450void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
451int mmu_radix_page_wired_mappings(vm_page_t);
452int mmu_radix_pinit(pmap_t);
453void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
454bool mmu_radix_ps_enabled(pmap_t);
455void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
456void mmu_radix_qremove(vm_offset_t, int);
457vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
458void mmu_radix_quick_remove_page(vm_offset_t);
459int mmu_radix_ts_referenced(vm_page_t);
460void mmu_radix_release(pmap_t);
461void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
462void mmu_radix_remove_all(vm_page_t);
463void mmu_radix_remove_pages(pmap_t);
464void mmu_radix_remove_write(vm_page_t);
465void mmu_radix_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz);
466void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
467void mmu_radix_zero_page(vm_page_t);
468void mmu_radix_zero_page_area(vm_page_t, int, int);
469int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
470void mmu_radix_page_array_startup(long pages);
471
472#include "mmu_oea64.h"
473
474/*
475 * Kernel MMU interface
476 */
477
478static void	mmu_radix_bootstrap(vm_offset_t, vm_offset_t);
479
480static void mmu_radix_copy_page(vm_page_t, vm_page_t);
481static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
482    vm_page_t *mb, vm_offset_t b_offset, int xfersize);
483static void mmu_radix_growkernel(vm_offset_t);
484static void mmu_radix_init(void);
485static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
486static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
487static void mmu_radix_pinit0(pmap_t);
488
489static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
490static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
491static void mmu_radix_unmapdev(void *, vm_size_t);
492static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
493static int mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
494static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
495static void mmu_radix_scan_init(void);
496static void	mmu_radix_cpu_bootstrap(int ap);
497static void	mmu_radix_tlbie_all(void);
498
499static struct pmap_funcs mmu_radix_methods = {
500	.bootstrap = mmu_radix_bootstrap,
501	.copy_page = mmu_radix_copy_page,
502	.copy_pages = mmu_radix_copy_pages,
503	.cpu_bootstrap = mmu_radix_cpu_bootstrap,
504	.growkernel = mmu_radix_growkernel,
505	.init = mmu_radix_init,
506	.map =      		mmu_radix_map,
507	.mincore =      	mmu_radix_mincore,
508	.pinit = mmu_radix_pinit,
509	.pinit0 = mmu_radix_pinit0,
510
511	.mapdev = mmu_radix_mapdev,
512	.mapdev_attr = mmu_radix_mapdev_attr,
513	.unmapdev = mmu_radix_unmapdev,
514	.kenter_attr = mmu_radix_kenter_attr,
515	.dev_direct_mapped = mmu_radix_dev_direct_mapped,
516	.dumpsys_pa_init = mmu_radix_scan_init,
517	.dumpsys_map_chunk = mmu_radix_dumpsys_map,
518	.page_is_mapped = mmu_radix_page_is_mapped,
519	.ps_enabled = mmu_radix_ps_enabled,
520	.align_superpage = mmu_radix_align_superpage,
521	.object_init_pt = mmu_radix_object_init_pt,
522	.protect = mmu_radix_protect,
523	/* pmap dispatcher interface */
524	.clear_modify = mmu_radix_clear_modify,
525	.copy = mmu_radix_copy,
526	.enter = mmu_radix_enter,
527	.enter_object = mmu_radix_enter_object,
528	.enter_quick = mmu_radix_enter_quick,
529	.extract = mmu_radix_extract,
530	.extract_and_hold = mmu_radix_extract_and_hold,
531	.is_modified = mmu_radix_is_modified,
532	.is_prefaultable = mmu_radix_is_prefaultable,
533	.is_referenced = mmu_radix_is_referenced,
534	.ts_referenced = mmu_radix_ts_referenced,
535	.page_exists_quick = mmu_radix_page_exists_quick,
536	.page_init = mmu_radix_page_init,
537	.page_wired_mappings =  mmu_radix_page_wired_mappings,
538	.qenter = mmu_radix_qenter,
539	.qremove = mmu_radix_qremove,
540	.release = mmu_radix_release,
541	.remove = mmu_radix_remove,
542	.remove_all = mmu_radix_remove_all,
543	.remove_write = mmu_radix_remove_write,
544	.sync_icache = mmu_radix_sync_icache,
545	.unwire = mmu_radix_unwire,
546	.zero_page = mmu_radix_zero_page,
547	.zero_page_area = mmu_radix_zero_page_area,
548	.activate = mmu_radix_activate,
549	.quick_enter_page =  mmu_radix_quick_enter_page,
550	.quick_remove_page =  mmu_radix_quick_remove_page,
551	.page_set_memattr = mmu_radix_page_set_memattr,
552	.page_array_startup =  mmu_radix_page_array_startup,
553
554	/* Internal interfaces */
555	.kenter = mmu_radix_kenter,
556	.kextract = mmu_radix_kextract,
557	.kremove = mmu_radix_kremove,
558	.change_attr = mmu_radix_change_attr,
559	.decode_kernel_ptr =  mmu_radix_decode_kernel_ptr,
560
561	.tlbie_all = mmu_radix_tlbie_all,
562};
563
564MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);
565
566static bool pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
567	struct rwlock **lockp);
568static bool pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
569static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
570static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
571    struct spglist *free, struct rwlock **lockp);
572static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
573    pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
574static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
575static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
576    struct spglist *free);
577static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
578	pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);
579
580static bool	pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
581		    u_int flags, struct rwlock **lockp);
582#if VM_NRESERVLEVEL > 0
583static void	pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
584	struct rwlock **lockp);
585#endif
586static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
587static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
588static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
589	vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);
590
591static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
592	vm_prot_t prot, struct rwlock **lockp);
593static int	pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
594	u_int flags, vm_page_t m, struct rwlock **lockp);
595
596static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
597static void free_pv_chunk(struct pv_chunk *pc);
598static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
599static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
600	struct rwlock **lockp);
601static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
602	struct rwlock **lockp);
603static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
604    struct spglist *free);
605static bool pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);
606
607static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
608static void pmap_invalidate_all(pmap_t pmap);
609static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);
610static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
611
612/*
613 * Internal flags for pmap_enter()'s helper functions.
614 */
615#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
616#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
617
618#define UNIMPLEMENTED() panic("%s not implemented", __func__)
619#define UNTESTED() panic("%s not yet tested", __func__)
620
621/* Number of supported PID bits */
622static unsigned int isa3_pid_bits;
623
624/* PID to start allocating from */
625static unsigned int isa3_base_pid;
626
627#define PROCTAB_SIZE_SHIFT	(isa3_pid_bits + 4)
628#define PROCTAB_ENTRIES	(1ul << isa3_pid_bits)
629
630/*
631 * Map of physical memory regions.
632 */
633static struct	mem_region *regions, *pregions;
634static struct	numa_mem_region *numa_pregions;
635static u_int	phys_avail_count;
636static int	regions_sz, pregions_sz, numa_pregions_sz;
637static struct pate *isa3_parttab;
638static struct prte *isa3_proctab;
639static vmem_t *asid_arena;
640
641extern void bs_remap_earlyboot(void);
642
643#define	RADIX_PGD_SIZE_SHIFT	16
644#define RADIX_PGD_SIZE	(1UL << RADIX_PGD_SIZE_SHIFT)
645
646#define	RADIX_PGD_INDEX_SHIFT	(RADIX_PGD_SIZE_SHIFT-3)
647#define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t))
648#define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t))
649
#define	NUPML1E		(RADIX_PGD_SIZE/sizeof(uint64_t))	/* number of userland PML1 entries */
#define	NUPDPE		(NUPML1E * NL2EPG)	/* number of userland PML2 entries */
#define	NUPDE		(NUPDPE * NL3EPG)	/* number of userland PML3 entries */
653
654/* POWER9 only permits a 64k partition table size. */
655#define	PARTTAB_SIZE_SHIFT	16
656#define PARTTAB_SIZE	(1UL << PARTTAB_SIZE_SHIFT)
657
658#define PARTTAB_HR		(1UL << 63) /* host uses radix */
#define PARTTAB_GR		(1UL << 63) /* guest uses radix; must match host */
660
661/* TLB flush actions. Used as argument to tlbiel_flush() */
662enum {
663	TLB_INVAL_SCOPE_LPID = 2,	/* invalidate TLBs for current LPID */
664	TLB_INVAL_SCOPE_GLOBAL = 3,	/* invalidate all TLBs */
665};
666
667#define	NPV_LIST_LOCKS	MAXCPU
668static int pmap_initialized;
669static vm_paddr_t proctab0pa;
670static vm_paddr_t parttab_phys;
671CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
672
/*
 * Data for the pv entry allocation mechanism.
 * The global pv_chunks list is protected by pv_chunks_mutex; the
 * per-page pv lists are protected by the pv_list_locks[] array.
 */
678static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
679static struct mtx __exclusive_cache_line pv_chunks_mutex;
680static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
681static struct md_page *pv_table;
682static struct md_page pv_dummy;
683
684#ifdef PV_STATS
685#define PV_STAT(x)	do { x ; } while (0)
686#else
687#define PV_STAT(x)	do { } while (0)
688#endif
689
690#define	pa_radix_index(pa)	((pa) >> L3_PAGE_SIZE_SHIFT)
691#define	pa_to_pvh(pa)	(&pv_table[pa_radix_index(pa)])
692
693#define	PHYS_TO_PV_LIST_LOCK(pa)	\
694			(&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS])
695
696#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
697	struct rwlock **_lockp = (lockp);		\
698	struct rwlock *_new_lock;			\
699							\
700	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
701	if (_new_lock != *_lockp) {			\
702		if (*_lockp != NULL)			\
703			rw_wunlock(*_lockp);		\
704		*_lockp = _new_lock;			\
705		rw_wlock(*_lockp);			\
706	}						\
707} while (0)
708
709#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
710	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
711
712#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
713	struct rwlock **_lockp = (lockp);		\
714							\
715	if (*_lockp != NULL) {				\
716		rw_wunlock(*_lockp);			\
717		*_lockp = NULL;				\
718	}						\
719} while (0)
720
721#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
722	PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
723
/*
 * We support a 52-bit virtual address space, hence RTS = 52 - 31 = 21,
 * i.e. 0b10101.  RTS encoding in the tree root descriptor:
 * the low 3 bits of RTS (0b101) go in bits 7:5 of the doubleword,
 * the high 2 bits of RTS (0b10) go in bits 62:61 of the doubleword.
 */
#define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5))
732
733static int powernv_enabled = 1;
734
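/*
 * Local (tlbiel) invalidation of a single TLB set on an ISA 3.0
 * (POWER9) CPU.  RB selects the set and the invalidation scope ("is");
 * RS carries the PID.
 */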
735static __always_inline void
736tlbiel_radix_set_isa300(uint32_t set, uint32_t is,
737	uint32_t pid, uint32_t ric, uint32_t prs)
738{
739	uint64_t rb;
740	uint64_t rs;
741
742	rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53);
743	rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31);
744
745	__asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
746		     : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
747		     : "memory");
748}
749
750static void
751tlbiel_flush_isa3(uint32_t num_sets, uint32_t is)
752{
753	uint32_t set;
754
755	__asm __volatile("ptesync": : :"memory");
756
757	/*
758	 * Flush the first set of the TLB, and the entire Page Walk Cache
759	 * and partition table entries. Then flush the remaining sets of the
760	 * TLB.
761	 */
762	if (is == TLB_INVAL_SCOPE_GLOBAL) {
763		tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
764		for (set = 1; set < num_sets; set++)
765			tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
766	}
767
768	/* Do the same for process scoped entries. */
769	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
770	for (set = 1; set < num_sets; set++)
771		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
772
773	__asm __volatile("ptesync": : :"memory");
774}
775
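/*
 * Flush the entire local TLB via tlbiel and then invalidate the ERAT
 * so that stale effective-to-real translations are not reused.
 */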
776static void
777mmu_radix_tlbiel_flush(int scope)
778{
779	MPASS(scope == TLB_INVAL_SCOPE_LPID ||
780		  scope == TLB_INVAL_SCOPE_GLOBAL);
781
782	tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, scope);
783	__asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
784}
785
786static void
787mmu_radix_tlbie_all(void)
788{
789	if (powernv_enabled)
790		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
791	else
792		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
793}
794
795static void
796mmu_radix_init_amor(void)
797{
	/*
	 * In HV mode, initialize the AMOR (Authority Mask Override Register)
	 * so that the hypervisor and guest can set up the IAMR (Instruction
	 * Authority Mask Register): enable key 0 by setting its mask to 0b11.
	 *
	 * AMOR = 0b1100 .... 0000 (mask for key 0 is 0b11)
	 */
805	mtspr(SPR_AMOR, (3ul << 62));
806}
807
808static void
809mmu_radix_init_iamr(void)
810{
811	/*
812	 * Radix always uses key0 of the IAMR to determine if an access is
813	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
814	 * fetch.
815	 */
816	mtspr(SPR_IAMR, (1ul << 62));
817}
818
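/* Switch the MMU to this pmap's PID so its process-scoped translations apply. */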
819static void
820mmu_radix_pid_set(pmap_t pmap)
821{
822
823	mtspr(SPR_PID, pmap->pm_pid);
824	isync();
825}
826
827/* Quick sort callout for comparing physical addresses. */
828static int
829pa_cmp(const void *a, const void *b)
830{
831	const vm_paddr_t *pa = a, *pb = b;
832
833	if (*pa < *pb)
834		return (-1);
835	else if (*pa > *pb)
836		return (1);
837	else
838		return (0);
839}
840
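/*
 * PTE update primitives.  Entries are stored big-endian; pte_store()
 * is for leaf mappings only (it sets RPTE_LEAF), while pde_store()
 * installs a pointer to the next level of the tree.
 */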
841#define	pte_load_store(ptep, pte)	atomic_swap_long(ptep, pte)
842#define	pte_load_clear(ptep)		atomic_swap_long(ptep, 0)
843#define	pte_store(ptep, pte) do {	   \
844	MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X));	\
845	*(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \
846} while (0)
847/*
848 * NB: should only be used for adding directories - not for direct mappings
849 */
850#define	pde_store(ptep, pa) do {				\
851	*(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \
852} while (0)
853
854#define	pte_clear(ptep) do {					\
855		*(u_long *)(ptep) = (u_long)(0);		\
856} while (0)
857
858#define	PMAP_PDE_SUPERPAGE	(1 << 8)	/* supports 2MB superpages */
859
860/*
861 * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
862 * (PTE) page mappings have identical settings for the following fields:
863 */
864#define	PG_PTE_PROMOTE	(PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
865	    PG_M | PG_A | RPTE_EAA_MASK | PG_V)
866
867static __inline void
868pmap_resident_count_inc(pmap_t pmap, int count)
869{
870
871	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
872	pmap->pm_stats.resident_count += count;
873}
874
875static __inline void
876pmap_resident_count_dec(pmap_t pmap, int count)
877{
878
879	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
880	KASSERT(pmap->pm_stats.resident_count >= count,
881	    ("pmap %p resident count underflow %ld %d", pmap,
882	    pmap->pm_stats.resident_count, count));
883	pmap->pm_stats.resident_count -= count;
884}
885
886static void
887pagezero(vm_offset_t va)
888{
889	va = trunc_page(va);
890
891	bzero((void *)va, PAGE_SIZE);
892}
893
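/* Allocate and zero "n" physically contiguous pages from the boot allocator. */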
894static uint64_t
895allocpages(int n)
896{
897	u_int64_t ret;
898
899	ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
900	for (int i = 0; i < n; i++)
901		pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE));
902	return (ret);
903}
904
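/* Return the kernel PTE for "va", or NULL if no valid L3 entry exists. */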
905static pt_entry_t *
906kvtopte(vm_offset_t va)
907{
908	pt_entry_t *l3e;
909
910	l3e = pmap_pml3e(kernel_pmap, va);
911	if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
912		return (NULL);
913	return (pmap_l3e_to_pte(l3e, va));
914}
915
916void
917mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
918{
919	pt_entry_t *pte;
920
921	pte = kvtopte(va);
922	MPASS(pte != NULL);
923	*pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \
924	    RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A);
925}
926
927bool
928mmu_radix_ps_enabled(pmap_t pmap)
929{
930	return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
931}
932
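/*
 * Locate the PTE backing "va" without modifying it; *is_l3e is set when
 * the mapping is a 2MB leaf, in which case the L3 entry itself is
 * returned.
 */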
933static pt_entry_t *
934pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
935{
936	pml3_entry_t *l3e;
937	pt_entry_t *pte;
938
939	va &= PG_PS_FRAME;
940	l3e = pmap_pml3e(pmap, va);
941	if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0)
942		return (NULL);
943
944	if (be64toh(*l3e) & RPTE_LEAF) {
945		*is_l3e = 1;
946		return (l3e);
947	}
948	*is_l3e = 0;
949	va &= PG_FRAME;
950	pte = pmap_l3e_to_pte(l3e, va);
951	if (pte == NULL || (be64toh(*pte) & PG_V) == 0)
952		return (NULL);
953	return (pte);
954}
955
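/*
 * Handle a referenced/changed-bit fault: if the existing mapping
 * permits the access, set PG_A (and PG_M for writes) and return
 * KERN_SUCCESS.  KERN_PROTECTION_FAILURE means the PTE does not permit
 * the access, KERN_INVALID_ADDRESS means no mapping exists, and
 * KERN_FAILURE indicates nothing was updated (the caller is expected
 * to retry via the normal fault path).
 */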
956int
957pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
958{
959	pt_entry_t *pte;
960	pt_entry_t startpte, origpte, newpte;
961	vm_page_t m;
962	int is_l3e;
963
964	startpte = 0;
965 retry:
966	if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
967		return (KERN_INVALID_ADDRESS);
968	origpte = newpte = be64toh(*pte);
969	if (startpte == 0) {
970		startpte = origpte;
971		if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
972		    ((flags & VM_PROT_READ) && (startpte & PG_A))) {
973			pmap_invalidate_all(pmap);
974#ifdef INVARIANTS
975			if (VERBOSE_PMAP || pmap_logging)
976				printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
977				    __func__, pmap, va, flags, origpte);
978#endif
979			return (KERN_FAILURE);
980		}
981	}
982#ifdef INVARIANTS
983	if (VERBOSE_PMAP || pmap_logging)
984		printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
985		    flags, origpte);
986#endif
987	PMAP_LOCK(pmap);
988	if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
989	    be64toh(*pte) != origpte) {
990		PMAP_UNLOCK(pmap);
991		return (KERN_FAILURE);
992	}
993	m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
994	MPASS(m != NULL);
995	switch (flags) {
996	case VM_PROT_READ:
997		if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
998			goto protfail;
999		newpte |= PG_A;
1000		vm_page_aflag_set(m, PGA_REFERENCED);
1001		break;
1002	case VM_PROT_WRITE:
1003		if ((newpte & RPTE_EAA_W) == 0)
1004			goto protfail;
1005		if (is_l3e)
1006			goto protfail;
1007		newpte |= PG_M;
1008		vm_page_dirty(m);
1009		break;
1010	case VM_PROT_EXECUTE:
1011		if ((newpte & RPTE_EAA_X) == 0)
1012			goto protfail;
1013		newpte |= PG_A;
1014		vm_page_aflag_set(m, PGA_REFERENCED);
1015		break;
1016	}
1017
1018	if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
1019		goto retry;
1020	ptesync();
1021	PMAP_UNLOCK(pmap);
1022	if (startpte == newpte)
1023		return (KERN_FAILURE);
	return (KERN_SUCCESS);
1025 protfail:
1026	PMAP_UNLOCK(pmap);
1027	return (KERN_PROTECTION_FAILURE);
1028}
1029
1030/*
1031 * Returns true if the given page is mapped individually or as part of
1032 * a 2mpage.  Otherwise, returns false.
1033 */
1034bool
1035mmu_radix_page_is_mapped(vm_page_t m)
1036{
1037	struct rwlock *lock;
1038	bool rv;
1039
1040	if ((m->oflags & VPO_UNMANAGED) != 0)
1041		return (false);
1042	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
1043	rw_rlock(lock);
1044	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
1045	    ((m->flags & PG_FICTITIOUS) == 0 &&
1046	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
1047	rw_runlock(lock);
1048	return (rv);
1049}
1050
1051/*
1052 * Determine the appropriate bits to set in a PTE or PDE for a specified
1053 * caching mode.
1054 */
1055static int
1056pmap_cache_bits(vm_memattr_t ma)
1057{
1058	if (ma != VM_MEMATTR_DEFAULT) {
1059		switch (ma) {
1060		case VM_MEMATTR_UNCACHEABLE:
1061			return (RPTE_ATTR_GUARDEDIO);
1062		case VM_MEMATTR_CACHEABLE:
1063			return (RPTE_ATTR_MEM);
1064		case VM_MEMATTR_WRITE_BACK:
1065		case VM_MEMATTR_PREFETCHABLE:
1066		case VM_MEMATTR_WRITE_COMBINING:
1067			return (RPTE_ATTR_UNGUARDEDIO);
1068		}
1069	}
1070	return (0);
1071}
1072
1073static void
1074pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
1075{
1076	ptesync();
1077	if (pmap == kernel_pmap)
1078		radix_tlbie_invlpg_kernel_4k(start);
1079	else
1080		radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1081	ttusync();
1082}
1083
1084static void
1085pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
1086{
1087	ptesync();
1088	if (pmap == kernel_pmap)
1089		radix_tlbie_invlpg_kernel_2m(start);
1090	else
1091		radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
1092	ttusync();
1093}
1094
1095static void
1096pmap_invalidate_pwc(pmap_t pmap)
1097{
1098	ptesync();
1099	if (pmap == kernel_pmap)
1100		radix_tlbie_invlpwc_kernel();
1101	else
1102		radix_tlbie_invlpwc_user(pmap->pm_pid);
1103	ttusync();
1104}
1105
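/*
 * Invalidate a range of 4KB translations; ranges larger than 8 pages
 * are handled with a full flush of the pmap instead of iterating.
 */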
1106static void
1107pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1108{
	if (((end - start) >> PAGE_SHIFT) > 8) {
1110		pmap_invalidate_all(pmap);
1111		return;
1112	}
1113	ptesync();
1114	if (pmap == kernel_pmap) {
1115		while (start < end) {
1116			radix_tlbie_invlpg_kernel_4k(start);
1117			start += PAGE_SIZE;
1118		}
1119	} else {
1120		while (start < end) {
1121			radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1122			start += PAGE_SIZE;
1123		}
1124	}
1125	ttusync();
1126}
1127
1128static void
1129pmap_invalidate_all(pmap_t pmap)
1130{
1131	ptesync();
1132	if (pmap == kernel_pmap)
1133		radix_tlbie_flush_kernel();
1134	else
1135		radix_tlbie_flush_user(pmap->pm_pid);
1136	ttusync();
1137}
1138
1139static void
1140pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
1141{
1142
	/*
	 * When the L3 entry has PG_PROMOTED set, the 2MB page mapping was
	 * created by a promotion that did not invalidate the 512 4KB page
	 * mappings that might exist in the TLB.  Consequently, at this point,
	 * the TLB may hold both 4KB and 2MB page mappings for the address
	 * range [va, va + L3_PAGE_SIZE).  Therefore, the entire range must be
	 * invalidated here.  In contrast, when PG_PROMOTED is clear, the TLB
	 * will not hold any 4KB page mappings for this range, and so a single
	 * 2MB-page invalidation suffices to remove the 2MB mapping from the
	 * TLB.
	 */
1154	ptesync();
1155	if ((l3e & PG_PROMOTED) != 0)
1156		pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
1157	else
1158		pmap_invalidate_page_2m(pmap, va);
1159
1160	pmap_invalidate_pwc(pmap);
1161}
1162
1163static __inline struct pv_chunk *
1164pv_to_chunk(pv_entry_t pv)
1165{
1166
1167	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1168}
1169
1170#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1171
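/*
 * pc_map values for a pv_chunk in which every pv entry is free:
 * _NPCPV entries spread across two 64-bit bitmap words.
 */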
1172#define	PC_FREE0	0xfffffffffffffffful
1173#define	PC_FREE1	((1ul << (_NPCPV % 64)) - 1)
1174
1175static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };
1176
1177/*
1178 * Ensure that the number of spare PV entries in the specified pmap meets or
1179 * exceeds the given count, "needed".
1180 *
1181 * The given PV list lock may be released.
1182 */
1183static void
1184reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
1185{
1186	struct pch new_tail;
1187	struct pv_chunk *pc;
1188	vm_page_t m;
1189	int avail, free;
1190	bool reclaimed;
1191
1192	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1193	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
1194
1195	/*
1196	 * Newly allocated PV chunks must be stored in a private list until
1197	 * the required number of PV chunks have been allocated.  Otherwise,
1198	 * reclaim_pv_chunk() could recycle one of these chunks.  In
1199	 * contrast, these chunks must be added to the pmap upon allocation.
1200	 */
1201	TAILQ_INIT(&new_tail);
1202retry:
1203	avail = 0;
1204	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
1211		if (free == 0)
1212			break;
1213		avail += free;
1214		if (avail >= needed)
1215			break;
1216	}
1217	for (reclaimed = false; avail < needed; avail += _NPCPV) {
1218		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
1219		if (m == NULL) {
1220			m = reclaim_pv_chunk(pmap, lockp);
1221			if (m == NULL)
1222				goto retry;
1223			reclaimed = true;
1224		}
1225		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1226		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1227		dump_add_page(m->phys_addr);
1228		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1229		pc->pc_pmap = pmap;
1230		pc->pc_map[0] = PC_FREE0;
1231		pc->pc_map[1] = PC_FREE1;
1232		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1233		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1234		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
1235
1236		/*
1237		 * The reclaim might have freed a chunk from the current pmap.
1238		 * If that chunk contained available entries, we need to
1239		 * re-count the number of available entries.
1240		 */
1241		if (reclaimed)
1242			goto retry;
1243	}
1244	if (!TAILQ_EMPTY(&new_tail)) {
1245		mtx_lock(&pv_chunks_mutex);
1246		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
1247		mtx_unlock(&pv_chunks_mutex);
1248	}
1249}
1250
1251/*
1252 * First find and then remove the pv entry for the specified pmap and virtual
1253 * address from the specified pv list.  Returns the pv entry if found and NULL
1254 * otherwise.  This operation can be performed on pv lists for either 4KB or
1255 * 2MB page mappings.
1256 */
1257static __inline pv_entry_t
1258pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1259{
1260	pv_entry_t pv;
1261
1262	TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
1263#ifdef INVARIANTS
1264		if (PV_PMAP(pv) == NULL) {
1265			printf("corrupted pv_chunk/pv %p\n", pv);
1266			printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
1267		}
1268		MPASS(PV_PMAP(pv) != NULL);
1269		MPASS(pv->pv_va != 0);
1270#endif
1271		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1272			TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
1273			pvh->pv_gen++;
1274			break;
1275		}
1276	}
1277	return (pv);
1278}
1279
1280/*
1281 * After demotion from a 2MB page mapping to 512 4KB page mappings,
1282 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
1283 * entries for each of the 4KB page mappings.
1284 */
1285static void
1286pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1287    struct rwlock **lockp)
1288{
1289	struct md_page *pvh;
1290	struct pv_chunk *pc;
1291	pv_entry_t pv;
1292	vm_offset_t va_last;
1293	vm_page_t m;
1294	int bit, field;
1295
1296	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1297	KASSERT((pa & L3_PAGE_MASK) == 0,
1298	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
1299	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1300
1301	/*
1302	 * Transfer the 2mpage's pv entry for this mapping to the first
1303	 * page's pv list.  Once this transfer begins, the pv list lock
1304	 * must not be released until the last pv entry is reinstantiated.
1305	 */
1306	pvh = pa_to_pvh(pa);
1307	va = trunc_2mpage(va);
1308	pv = pmap_pvh_remove(pvh, pmap, va);
1309	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
1310	m = PHYS_TO_VM_PAGE(pa);
1311	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1312
1313	m->md.pv_gen++;
1314	/* Instantiate the remaining NPTEPG - 1 pv entries. */
1315	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
1316	va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1317	for (;;) {
1318		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0,
		    ("pmap_pv_demote_pde: missing spare"));
1321		for (field = 0; field < _NPCM; field++) {
1322			while (pc->pc_map[field]) {
1323				bit = cnttzd(pc->pc_map[field]);
1324				pc->pc_map[field] &= ~(1ul << bit);
1325				pv = &pc->pc_pventry[field * 64 + bit];
1326				va += PAGE_SIZE;
1327				pv->pv_va = va;
1328				m++;
1329				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1330			    ("pmap_pv_demote_pde: page %p is not managed", m));
1331				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1332
1333				m->md.pv_gen++;
1334				if (va == va_last)
1335					goto out;
1336			}
1337		}
1338		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1339		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1340	}
1341out:
1342	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1343		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1344		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1345	}
1346	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
1347	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
1348}
1349
1350static void
1351reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
1352{
1353
1354	if (pmap == NULL)
1355		return;
1356	pmap_invalidate_all(pmap);
1357	if (pmap != locked_pmap)
1358		PMAP_UNLOCK(pmap);
1359}
1360
1361/*
1362 * We are in a serious low memory condition.  Resort to
1363 * drastic measures to free some pages so we can allocate
1364 * another pv entry chunk.
1365 *
1366 * Returns NULL if PV entries were reclaimed from the specified pmap.
1367 *
1368 * We do not, however, unmap 2mpages because subsequent accesses will
1369 * allocate per-page pv entries until repromotion occurs, thereby
1370 * exacerbating the shortage of free pv entries.
1371 */
1372static int active_reclaims = 0;
1373static vm_page_t
1374reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1375{
1376	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
1377	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
1378	struct md_page *pvh;
1379	pml3_entry_t *l3e;
1380	pmap_t next_pmap, pmap;
1381	pt_entry_t *pte, tpte;
1382	pv_entry_t pv;
1383	vm_offset_t va;
1384	vm_page_t m, m_pc;
1385	struct spglist free;
1386	uint64_t inuse;
1387	int bit, field, freed;
1388
1389	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1390	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
1391	pmap = NULL;
1392	m_pc = NULL;
1393	SLIST_INIT(&free);
1394	bzero(&pc_marker_b, sizeof(pc_marker_b));
1395	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
1396	pc_marker = (struct pv_chunk *)&pc_marker_b;
1397	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
1398
1399	mtx_lock(&pv_chunks_mutex);
1400	active_reclaims++;
1401	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
1402	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
1403	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
1404	    SLIST_EMPTY(&free)) {
1405		next_pmap = pc->pc_pmap;
1406		if (next_pmap == NULL) {
1407			/*
1408			 * The next chunk is a marker.  However, it is
1409			 * not our marker, so active_reclaims must be
1410			 * > 1.  Consequently, the next_chunk code
1411			 * will not rotate the pv_chunks list.
1412			 */
1413			goto next_chunk;
1414		}
1415		mtx_unlock(&pv_chunks_mutex);
1416
1417		/*
1418		 * A pv_chunk can only be removed from the pc_lru list
1419		 * when both pc_chunks_mutex is owned and the
1420		 * corresponding pmap is locked.
1421		 */
1422		if (pmap != next_pmap) {
1423			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1424			pmap = next_pmap;
1425			/* Avoid deadlock and lock recursion. */
1426			if (pmap > locked_pmap) {
1427				RELEASE_PV_LIST_LOCK(lockp);
1428				PMAP_LOCK(pmap);
1429				mtx_lock(&pv_chunks_mutex);
1430				continue;
1431			} else if (pmap != locked_pmap) {
1432				if (PMAP_TRYLOCK(pmap)) {
1433					mtx_lock(&pv_chunks_mutex);
1434					continue;
1435				} else {
1436					pmap = NULL; /* pmap is not locked */
1437					mtx_lock(&pv_chunks_mutex);
1438					pc = TAILQ_NEXT(pc_marker, pc_lru);
1439					if (pc == NULL ||
1440					    pc->pc_pmap != next_pmap)
1441						continue;
1442					goto next_chunk;
1443				}
1444			}
1445		}
1446
1447		/*
1448		 * Destroy every non-wired, 4 KB page mapping in the chunk.
1449		 */
1450		freed = 0;
1451		for (field = 0; field < _NPCM; field++) {
1452			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
1453			    inuse != 0; inuse &= ~(1UL << bit)) {
1454				bit = cnttzd(inuse);
1455				pv = &pc->pc_pventry[field * 64 + bit];
1456				va = pv->pv_va;
1457				l3e = pmap_pml3e(pmap, va);
1458				if ((be64toh(*l3e) & RPTE_LEAF) != 0)
1459					continue;
1460				pte = pmap_l3e_to_pte(l3e, va);
1461				if ((be64toh(*pte) & PG_W) != 0)
1462					continue;
1463				tpte = be64toh(pte_load_clear(pte));
1464				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
1465				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
1466					vm_page_dirty(m);
1467				if ((tpte & PG_A) != 0)
1468					vm_page_aflag_set(m, PGA_REFERENCED);
1469				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1470				TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
1471
1472				m->md.pv_gen++;
1473				if (TAILQ_EMPTY(&m->md.pv_list) &&
1474				    (m->flags & PG_FICTITIOUS) == 0) {
1475					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1476					if (TAILQ_EMPTY(&pvh->pv_list)) {
1477						vm_page_aflag_clear(m,
1478						    PGA_WRITEABLE);
1479					}
1480				}
1481				pc->pc_map[field] |= 1UL << bit;
1482				pmap_unuse_pt(pmap, va, be64toh(*l3e), &free);
1483				freed++;
1484			}
1485		}
1486		if (freed == 0) {
1487			mtx_lock(&pv_chunks_mutex);
1488			goto next_chunk;
1489		}
1490		/* Every freed mapping is for a 4 KB page. */
1491		pmap_resident_count_dec(pmap, freed);
1492		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
1493		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
1494		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
1495		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1496		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
1497			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1498			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1499			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1500			/* Entire chunk is free; return it. */
1501			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1502			dump_drop_page(m_pc->phys_addr);
1503			mtx_lock(&pv_chunks_mutex);
1504			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1505			break;
1506		}
1507		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1508		mtx_lock(&pv_chunks_mutex);
1509		/* One freed pv entry in locked_pmap is sufficient. */
1510		if (pmap == locked_pmap)
1511			break;
1512next_chunk:
1513		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1514		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
1515		if (active_reclaims == 1 && pmap != NULL) {
1516			/*
1517			 * Rotate the pv chunks list so that we do not
1518			 * scan the same pv chunks that could not be
1519			 * freed (because they contained a wired
1520			 * and/or superpage mapping) on every
1521			 * invocation of reclaim_pv_chunk().
1522			 */
1523			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
1524				MPASS(pc->pc_pmap != NULL);
1525				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1526				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1527			}
1528		}
1529	}
1530	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1531	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
1532	active_reclaims--;
1533	mtx_unlock(&pv_chunks_mutex);
1534	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1535	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
1536		m_pc = SLIST_FIRST(&free);
1537		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
1538		/* Recycle a freed page table page. */
1539		m_pc->ref_count = 1;
1540	}
1541	vm_page_free_pages_toq(&free, true);
1542	return (m_pc);
1543}
1544
1545/*
1546 * free the pv_entry back to the free list
1547 */
1548static void
1549free_pv_entry(pmap_t pmap, pv_entry_t pv)
1550{
1551	struct pv_chunk *pc;
1552	int idx, field, bit;
1553
1554#ifdef VERBOSE_PV
1555	if (pmap != kernel_pmap)
1556		printf("%s(%p, %p)\n", __func__, pmap, pv);
1557#endif
1558	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1559	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1560	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1561	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1562	pc = pv_to_chunk(pv);
1563	idx = pv - &pc->pc_pventry[0];
1564	field = idx / 64;
1565	bit = idx % 64;
1566	pc->pc_map[field] |= 1ul << bit;
1567	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) {
1568		/* 98% of the time, pc is already at the head of the list. */
1569		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1570			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1571			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1572		}
1573		return;
1574	}
1575	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1576	free_pv_chunk(pc);
1577}
1578
1579static void
1580free_pv_chunk(struct pv_chunk *pc)
1581{
1582	vm_page_t m;
1583
1584	mtx_lock(&pv_chunks_mutex);
1585 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1586	mtx_unlock(&pv_chunks_mutex);
1587	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1588	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1589	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1590	/* entire chunk is free, return it */
1591	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1592	dump_drop_page(m->phys_addr);
1593	vm_page_unwire_noq(m);
1594	vm_page_free(m);
1595}
1596
1597/*
1598 * Returns a new PV entry, allocating a new PV chunk from the system when
1599 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
1600 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
1601 * returned.
1602 *
1603 * The given PV list lock may be released.
1604 */
1605static pv_entry_t
1606get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1607{
1608	int bit, field;
1609	pv_entry_t pv;
1610	struct pv_chunk *pc;
1611	vm_page_t m;
1612
1613	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1614	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1615retry:
1616	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1617	if (pc != NULL) {
1618		for (field = 0; field < _NPCM; field++) {
1619			if (pc->pc_map[field]) {
1620				bit = cnttzd(pc->pc_map[field]);
1621				break;
1622			}
1623		}
1624		if (field < _NPCM) {
1625			pv = &pc->pc_pventry[field * 64 + bit];
1626			pc->pc_map[field] &= ~(1ul << bit);
1627			/* If this was the last item, move it to tail */
1628			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1629				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1630				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1631				    pc_list);
1632			}
1633			PV_STAT(atomic_add_long(&pv_entry_count, 1));
1634			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1635			MPASS(PV_PMAP(pv) != NULL);
1636			return (pv);
1637		}
1638	}
1639	/* No free items, allocate another chunk */
1640	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
1641	if (m == NULL) {
1642		if (lockp == NULL) {
1643			PV_STAT(pc_chunk_tryfail++);
1644			return (NULL);
1645		}
1646		m = reclaim_pv_chunk(pmap, lockp);
1647		if (m == NULL)
1648			goto retry;
1649	}
1650	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1651	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1652	dump_add_page(m->phys_addr);
1653	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1654	pc->pc_pmap = pmap;
1655	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
1656	pc->pc_map[1] = PC_FREE1;
1657	mtx_lock(&pv_chunks_mutex);
1658	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1659	mtx_unlock(&pv_chunks_mutex);
1660	pv = &pc->pc_pventry[0];
1661	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1662	PV_STAT(atomic_add_long(&pv_entry_count, 1));
1663	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1664	MPASS(PV_PMAP(pv) != NULL);
1665	return (pv);
1666}
1667
1668#if VM_NRESERVLEVEL > 0
1669/*
1670 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
1671 * replace the many pv entries for the 4KB page mappings by a single pv entry
1672 * for the 2MB page mapping.
1673 */
1674static void
1675pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1676    struct rwlock **lockp)
1677{
1678	struct md_page *pvh;
1679	pv_entry_t pv;
1680	vm_offset_t va_last;
1681	vm_page_t m;
1682
1683	KASSERT((pa & L3_PAGE_MASK) == 0,
1684	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
1685	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1686
1687	/*
1688	 * Transfer the first page's pv entry for this mapping to the 2mpage's
1689	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
1690	 * a transfer avoids the possibility that get_pv_entry() calls
1691	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
1692	 * mappings that is being promoted.
1693	 */
1694	m = PHYS_TO_VM_PAGE(pa);
1695	va = trunc_2mpage(va);
1696	pv = pmap_pvh_remove(&m->md, pmap, va);
1697	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
1698	pvh = pa_to_pvh(pa);
1699	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
1700	pvh->pv_gen++;
1701	/* Free the remaining NPTEPG - 1 pv entries. */
1702	va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1703	do {
1704		m++;
1705		va += PAGE_SIZE;
1706		pmap_pvh_free(&m->md, pmap, va);
1707	} while (va < va_last);
1708}
1709#endif /* VM_NRESERVLEVEL > 0 */
1710
1711/*
1712 * First find and then destroy the pv entry for the specified pmap and virtual
1713 * address.  This operation can be performed on pv lists for either 4KB or 2MB
1714 * page mappings.
1715 */
1716static void
1717pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1718{
1719	pv_entry_t pv;
1720
1721	pv = pmap_pvh_remove(pvh, pmap, va);
1722	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
1723	free_pv_entry(pmap, pv);
1724}
1725
1726/*
1727 * Conditionally create the PV entry for a 4KB page mapping if the required
1728 * memory can be allocated without resorting to reclamation.
1729 */
1730static bool
1731pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
1732    struct rwlock **lockp)
1733{
1734	pv_entry_t pv;
1735
1736	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1737	/* Pass NULL instead of the lock pointer to disable reclamation. */
1738	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
1739		pv->pv_va = va;
1740		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1741		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1742		m->md.pv_gen++;
1743		return (true);
1744	} else
1745		return (false);
1746}
1747
1748vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
1749#ifdef INVARIANTS
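/* Assert that an early allocation lies within the original phys_avail ranges. */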
1750static void
1751validate_addr(vm_paddr_t addr, vm_size_t size)
1752{
1753	vm_paddr_t end = addr + size;
1754	bool found = false;
1755
1756	for (int i = 0; i < 2 * phys_avail_count; i += 2) {
1757		if (addr >= phys_avail_debug[i] &&
1758			end <= phys_avail_debug[i + 1]) {
1759			found = true;
1760			break;
1761		}
1762	}
1763	KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
1764					addr, end));
1765}
1766#else
1767static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
1768#endif
1769#define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)
1770
1771static vm_paddr_t
1772alloc_pt_page(void)
1773{
1774	vm_paddr_t page;
1775
1776	page = allocpages(1);
1777	pagezero(PHYS_TO_DMAP(page));
1778	return (page);
1779}
1780
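/*
 * Populate the direct map for [start, end): use 1GB (L2) or 2MB (L3)
 * leaf entries where alignment and size allow, falling back to 4KB
 * PTEs, and allocating intermediate page-table pages on demand.
 */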
1781static void
1782mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end)
1783{
1784	pt_entry_t *pte, pteval;
1785	vm_paddr_t page;
1786
1787	if (bootverbose)
1788		printf("%s %lx -> %lx\n", __func__, start, end);
1789	while (start < end) {
1790		pteval = start | DMAP_PAGE_BITS;
1791		pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start));
1792		if ((be64toh(*pte) & RPTE_VALID) == 0) {
1793			page = alloc_pt_page();
1794			pde_store(pte, page);
1795		}
1796		pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start));
1797		if ((start & L2_PAGE_MASK) == 0 &&
1798			end - start >= L2_PAGE_SIZE) {
1799			start += L2_PAGE_SIZE;
1800			goto done;
1801		} else if ((be64toh(*pte) & RPTE_VALID) == 0) {
1802			page = alloc_pt_page();
1803			pde_store(pte, page);
1804		}
1805
1806		pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start));
1807		if ((start & L3_PAGE_MASK) == 0 &&
1808			end - start >= L3_PAGE_SIZE) {
1809			start += L3_PAGE_SIZE;
1810			goto done;
1811		} else if ((be64toh(*pte) & RPTE_VALID) == 0) {
1812			page = alloc_pt_page();
1813			pde_store(pte, page);
1814		}
1815		pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start));
1816		start += PAGE_SIZE;
1817	done:
1818		pte_store(pte, pteval);
1819	}
1820}
1821
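/*
 * Create direct map entries for every physical memory region reported by
 * the firmware, clamped to the hw.physmem limit (hwphyssz) when one is set.
 */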
1822static void
1823mmu_radix_dmap_populate(vm_size_t hwphyssz)
1824{
1825	vm_paddr_t start, end;
1826
1827	for (int i = 0; i < pregions_sz; i++) {
1828		start = pregions[i].mr_start;
1829		end = start + pregions[i].mr_size;
1830		if (hwphyssz && start >= hwphyssz)
1831			break;
1832		if (hwphyssz && hwphyssz < end)
1833			end = hwphyssz;
1834		mmu_radix_dmap_range(start, end);
1835	}
1836}
1837
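/*
 * Allocate and initialize the kernel radix tree root, populate the direct
 * map, and create page table pages for the initial (nkpt) region of KVA.
 */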
1838static void
1839mmu_radix_setup_pagetables(vm_size_t hwphyssz)
1840{
1841	vm_paddr_t ptpages, pages;
1842	pt_entry_t *pte;
1843	vm_paddr_t l1phys;
1844
1845	bzero(kernel_pmap, sizeof(struct pmap));
1846	PMAP_LOCK_INIT(kernel_pmap);
1847	vm_radix_init(&kernel_pmap->pm_radix);
1848
1849	ptpages = allocpages(3);
1850	l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE);
1851	validate_addr(l1phys, RADIX_PGD_SIZE);
1852	if (bootverbose)
1853		printf("l1phys=%lx\n", l1phys);
1854	MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0);
1855	for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++)
1856		pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE));
1857	kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys);
1858
1859	mmu_radix_dmap_populate(hwphyssz);
1860
1861	/*
1862	 * Create page tables for first 128MB of KVA
1863	 */
1864	pages = ptpages;
1865	pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
1866	*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1867	pages += PAGE_SIZE;
1868	pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS);
1869	*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1870	pages += PAGE_SIZE;
1871	pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS);
1872	/*
1873	 * The kernel page table pages need to be preserved in
1874	 * phys_avail and must not overlap with previous allocations.
1875	 */
1876	pages = allocpages(nkpt);
1877	if (bootverbose) {
1878		printf("phys_avail after dmap populate and nkpt allocation\n");
1879		for (int j = 0; j < 2 * phys_avail_count; j+=2)
1880			printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
1881				   j, phys_avail[j], j + 1, phys_avail[j + 1]);
1882	}
1883	KPTphys = pages;
1884	for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE)
1885		*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1886	kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE;
1887	if (bootverbose)
1888		printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1);
1889	/*
1890	 * Add a physical memory segment (vm_phys_seg) corresponding to the
1891	 * preallocated kernel page table pages so that vm_page structures
1892	 * representing these pages will be created.  The vm_page structures
1893	 * are required for promotion of the corresponding kernel virtual
1894	 * addresses to superpage mappings.
1895	 */
1896	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1897}
1898
1899static void
1900mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end)
1901{
1902	vm_paddr_t	kpstart, kpend;
1903	vm_size_t	physsz, hwphyssz;
1905	int		rm_pavail, proctab_size;
1906	int		i, j;
1907
1908	kpstart = start & ~DMAP_BASE_ADDRESS;
1909	kpend = end & ~DMAP_BASE_ADDRESS;
1910
1911	/* Get physical memory regions from firmware */
1912	mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
1913	CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory");
1914
1915	if (2 * VM_PHYSSEG_MAX < regions_sz)
1916		panic("mmu_radix_early_bootstrap: phys_avail too small");
1917
1918	if (bootverbose)
1919		for (int i = 0; i < regions_sz; i++)
1920			printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n",
1921			    i, regions[i].mr_start, i, regions[i].mr_size);
1922	/*
1923	 * XXX workaround a simulator bug
1924	 */
1925	for (int i = 0; i < regions_sz; i++)
1926		if (regions[i].mr_start & PAGE_MASK) {
1927			regions[i].mr_start += PAGE_MASK;
1928			regions[i].mr_start &= ~PAGE_MASK;
1929			regions[i].mr_size &= ~PAGE_MASK;
1930		}
1931	if (bootverbose)
1932		for (int i = 0; i < pregions_sz; i++)
1933			printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n",
1934			    i, pregions[i].mr_start, i, pregions[i].mr_size);
1935
1936	phys_avail_count = 0;
1937	physsz = 0;
1938	hwphyssz = 0;
1939	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
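	/*
	 * Build phys_avail and dump_avail from the firmware memory regions,
	 * truncating total memory to hw.physmem when it is set.
	 */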
1940	for (i = 0, j = 0; i < regions_sz; i++) {
1941		if (bootverbose)
1942			printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n",
1943			    i, regions[i].mr_start, i, regions[i].mr_size);
1944
1945		if (regions[i].mr_size < PAGE_SIZE)
1946			continue;
1947
1948		if (hwphyssz != 0 &&
1949		    (physsz + regions[i].mr_size) >= hwphyssz) {
1950			if (physsz < hwphyssz) {
1951				phys_avail[j] = regions[i].mr_start;
1952				phys_avail[j + 1] = regions[i].mr_start +
1953				    (hwphyssz - physsz);
1954				physsz = hwphyssz;
1955				phys_avail_count++;
1956				dump_avail[j] = phys_avail[j];
1957				dump_avail[j + 1] = phys_avail[j + 1];
1958			}
1959			break;
1960		}
1961		phys_avail[j] = regions[i].mr_start;
1962		phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
1963		dump_avail[j] = phys_avail[j];
1964		dump_avail[j + 1] = phys_avail[j + 1];
1965
1966		phys_avail_count++;
1967		physsz += regions[i].mr_size;
1968		j += 2;
1969	}
1970
1971	/* Check for overlap with the kernel and exception vectors */
1972	rm_pavail = 0;
1973	for (j = 0; j < 2 * phys_avail_count; j+=2) {
1974		if (phys_avail[j] < EXC_LAST)
1975			phys_avail[j] += EXC_LAST;
1976
1977		if (phys_avail[j] >= kpstart &&
1978		    phys_avail[j + 1] <= kpend) {
1979			phys_avail[j] = phys_avail[j + 1] = ~0;
1980			rm_pavail++;
1981			continue;
1982		}
1983
1984		if (kpstart >= phys_avail[j] &&
1985		    kpstart < phys_avail[j + 1]) {
1986			if (kpend < phys_avail[j + 1]) {
1987				phys_avail[2 * phys_avail_count] =
1988				    (kpend & ~PAGE_MASK) + PAGE_SIZE;
1989				phys_avail[2 * phys_avail_count + 1] =
1990				    phys_avail[j + 1];
1991				phys_avail_count++;
1992			}
1993
1994			phys_avail[j + 1] = kpstart & ~PAGE_MASK;
1995		}
1996
1997		if (kpend >= phys_avail[j] &&
1998		    kpend < phys_avail[j + 1]) {
1999			if (kpstart > phys_avail[j]) {
2000				phys_avail[2 * phys_avail_count] = phys_avail[j];
2001				phys_avail[2 * phys_avail_count + 1] =
2002				    kpstart & ~PAGE_MASK;
2003				phys_avail_count++;
2004			}
2005
2006			phys_avail[j] = (kpend & ~PAGE_MASK) +
2007			    PAGE_SIZE;
2008		}
2009	}
2010	qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp);
2011	for (i = 0; i < 2 * phys_avail_count; i++)
2012		phys_avail_debug[i] = phys_avail[i];
2013
2014	/* Remove physical available regions marked for removal (~0) */
2015	if (rm_pavail) {
2016		phys_avail_count -= rm_pavail;
2017		for (i = 2 * phys_avail_count;
2018		     i < 2*(phys_avail_count + rm_pavail); i+=2)
2019			phys_avail[i] = phys_avail[i + 1] = 0;
2020	}
2021	if (bootverbose) {
2022		printf("phys_avail ranges after filtering:\n");
2023		for (j = 0; j < 2 * phys_avail_count; j+=2)
2024			printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
2025				   j, phys_avail[j], j + 1, phys_avail[j + 1]);
2026	}
2027	physmem = btoc(physsz);
2028
2029	/*
2030	 * XXX: assume we are running non-virtualized; BHYVE is not supported.
2031	 */
2032	if (isa3_pid_bits == 0)
2033		isa3_pid_bits = 20;
2034	if (powernv_enabled) {
2035		parttab_phys =
2036		    moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE);
2037		validate_addr(parttab_phys, PARTTAB_SIZE);
2038		for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++)
2039			pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE));
2040
2041	}
2042	proctab_size = 1UL << PROCTAB_SIZE_SHIFT;
2043	proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size);
2044	validate_addr(proctab0pa, proctab_size);
2045	for (int i = 0; i < proctab_size/PAGE_SIZE; i++)
2046		pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE));
2047
2048	mmu_radix_setup_pagetables(hwphyssz);
2049}
2050
2051static void
2052mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end)
2053{
2054	int		i;
2055	vm_paddr_t	pa;
2056	void		*dpcpu;
2057	vm_offset_t va;
2058
2059	/*
2060	 * Set up the Open Firmware pmap and add its mappings if not in real
2061	 * mode.
2062	 */
2063	if (bootverbose)
2064		printf("%s enter\n", __func__);
2065
2066	/*
2067	 * Calculate the last available physical address, and reserve the
2068	 * vm_page_array (upper bound).
2069	 */
2070	Maxmem = 0;
2071	for (i = 0; phys_avail[i + 1] != 0; i += 2)
2072		Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
2073
2074	/*
2075	 * Remap any early IO mappings (console framebuffer, etc.)
2076	 */
2077	bs_remap_earlyboot();
2078
2079	/*
2080	 * Allocate a kernel stack with a guard page for thread0 and map it
2081	 * into the kernel page map.
2082	 */
2083	pa = allocpages(kstack_pages);
2084	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
2085	virtual_avail = va + kstack_pages * PAGE_SIZE;
2086	CTR2(KTR_PMAP, "mmu_radix_late_bootstrap: kstack0 at %#lx (%#lx)", pa, va);
2087	thread0.td_kstack = va;
2088	for (i = 0; i < kstack_pages; i++) {
2089		mmu_radix_kenter(va, pa);
2090		pa += PAGE_SIZE;
2091		va += PAGE_SIZE;
2092	}
2093	thread0.td_kstack_pages = kstack_pages;
2094
2095	/*
2096	 * Allocate virtual address space for the message buffer.
2097	 */
2098	pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT);
2099	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa);
2100
2101	/*
2102	 * Allocate virtual address space for the dynamic percpu area.
2103	 */
2104	pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT);
2105	dpcpu = (void *)PHYS_TO_DMAP(pa);
2106	dpcpu_init(dpcpu, curcpu);
2107
2108	crashdumpmap = (caddr_t)virtual_avail;
2109	virtual_avail += MAXDUMPPGS * PAGE_SIZE;
2110
2111	/*
2112	 * Reserve some special page table entries/VA space for temporary
2113	 * mapping of pages.
2114	 */
2115}
2116
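/*
 * Locate the partition table allocated during early bootstrap in the direct
 * map and point the partition table control register (PTCR) at it.
 */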
2117static void
2118mmu_parttab_init(void)
2119{
2120	uint64_t ptcr;
2121
2122	isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys);
2123
2124	if (bootverbose)
2125		printf("%s parttab: %p\n", __func__, isa3_parttab);
2126	ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2127	if (bootverbose)
2128		printf("setting ptcr %lx\n", ptcr);
2129	mtspr(SPR_PTCR, ptcr);
2130}
2131
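/*
 * Install new page table and process table pointers for the given LPID in
 * the partition table, then invalidate any translations cached under the
 * old entry.  The tlbie forms differ depending on whether the previous
 * entry used radix (HR) or hash translation.
 */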
2132static void
2133mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab)
2134{
2135	uint64_t prev;
2136
2137	if (bootverbose)
2138		printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab,
2139			   lpid, pagetab, proctab);
2140	prev = be64toh(isa3_parttab[lpid].pagetab);
2141	isa3_parttab[lpid].pagetab = htobe64(pagetab);
2142	isa3_parttab[lpid].proctab = htobe64(proctab);
2143
2144	if (prev & PARTTAB_HR) {
2145		__asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
2146			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2147		__asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2148			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2149	} else {
2150		__asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
2151			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2152	}
2153	ttusync();
2154}
2155
2156static void
2157mmu_radix_parttab_init(void)
2158{
2159	uint64_t pagetab;
2160
2161	mmu_parttab_init();
2162	pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
2163	    RADIX_PGD_INDEX_SHIFT | PARTTAB_HR;
2164	mmu_parttab_update(0, pagetab, 0);
2165}
2166
2167static void
2168mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size)
2169{
2170	uint64_t pagetab, proctab;
2171
2172	pagetab = be64toh(isa3_parttab[0].pagetab);
2173	proctab = proctabpa | table_size | PARTTAB_GR;
2174	mmu_parttab_update(0, pagetab, proctab);
2175}
2176
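/*
 * Initialize process table entry 0 to point at the kernel's radix tree
 * root and register the process table, either directly through the
 * partition table (powernv) or via the H_REGISTER_PROC_TBL hcall (pseries).
 */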
2177static void
2178mmu_radix_proctab_init(void)
2179{
2180
2181	isa3_base_pid = 1;
2182
2183	isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa);
2184	isa3_proctab->proctab0 =
2185	    htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
2186		RADIX_PGD_INDEX_SHIFT);
2187
2188	if (powernv_enabled) {
2189		mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12);
2190		__asm __volatile("ptesync" : : : "memory");
2191		__asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2192			     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
2193		__asm __volatile("eieio; tlbsync; ptesync" : : : "memory");
2194#ifdef PSERIES
2195	} else {
2196		int64_t rc;
2197
2198		rc = phyp_hcall(H_REGISTER_PROC_TBL,
2199		    PROC_TABLE_NEW | PROC_TABLE_RADIX | PROC_TABLE_GTSE,
2200		    proctab0pa, 0, PROCTAB_SIZE_SHIFT - 12);
2201		if (rc != H_SUCCESS)
2202			panic("mmu_radix_proctab_init: "
2203				"failed to register process table: rc=%jd",
2204				(intmax_t)rc);
2205#endif
2206	}
2207
2208	if (bootverbose)
2209		printf("process table %p and kernel radix PDE: %p\n",
2210			   isa3_proctab, kernel_pmap->pm_pml1);
2211	mtmsr(mfmsr() | PSL_DR);
2212	mtmsr(mfmsr() & ~PSL_DR);
2213	kernel_pmap->pm_pid = isa3_base_pid;
2214	isa3_base_pid++;
2215}
2216
2217void
2218mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2219    int advice)
2220{
2221	struct rwlock *lock;
2222	pml1_entry_t *l1e;
2223	pml2_entry_t *l2e;
2224	pml3_entry_t oldl3e, *l3e;
2225	pt_entry_t *pte;
2226	vm_offset_t va, va_next;
2227	vm_page_t m;
2228	bool anychanged;
2229
2230	if (advice != MADV_DONTNEED && advice != MADV_FREE)
2231		return;
2232	anychanged = false;
2233	PMAP_LOCK(pmap);
2234	for (; sva < eva; sva = va_next) {
2235		l1e = pmap_pml1e(pmap, sva);
2236		if ((be64toh(*l1e) & PG_V) == 0) {
2237			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2238			if (va_next < sva)
2239				va_next = eva;
2240			continue;
2241		}
2242		l2e = pmap_l1e_to_l2e(l1e, sva);
2243		if ((be64toh(*l2e) & PG_V) == 0) {
2244			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2245			if (va_next < sva)
2246				va_next = eva;
2247			continue;
2248		}
2249		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2250		if (va_next < sva)
2251			va_next = eva;
2252		l3e = pmap_l2e_to_l3e(l2e, sva);
2253		oldl3e = be64toh(*l3e);
2254		if ((oldl3e & PG_V) == 0)
2255			continue;
2256		else if ((oldl3e & RPTE_LEAF) != 0) {
2257			if ((oldl3e & PG_MANAGED) == 0)
2258				continue;
2259			lock = NULL;
2260			if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) {
2261				if (lock != NULL)
2262					rw_wunlock(lock);
2263
2264				/*
2265				 * The large page mapping was destroyed.
2266				 */
2267				continue;
2268			}
2269
2270			/*
2271			 * Unless the page mappings are wired, remove the
2272			 * mapping to a single page so that a subsequent
2273			 * access may repromote.  Choosing the last page
2274			 * within the address range [sva, min(va_next, eva))
2275			 * generally results in more repromotions.  Since the
2276			 * underlying page table page is fully populated, this
2277			 * removal never frees a page table page.
2278			 */
2279			if ((oldl3e & PG_W) == 0) {
2280				va = eva;
2281				if (va > va_next)
2282					va = va_next;
2283				va -= PAGE_SIZE;
2284				KASSERT(va >= sva,
2285				    ("mmu_radix_advise: no address gap"));
2286				pte = pmap_l3e_to_pte(l3e, va);
2287				KASSERT((be64toh(*pte) & PG_V) != 0,
2288				    ("pmap_advise: invalid PTE"));
2289				pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL,
2290				    &lock);
2291				anychanged = true;
2292			}
2293			if (lock != NULL)
2294				rw_wunlock(lock);
2295		}
2296		if (va_next > eva)
2297			va_next = eva;
2298		va = va_next;
2299		for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next;
2300			 pte++, sva += PAGE_SIZE) {
2301			MPASS(pte == pmap_pte(pmap, sva));
2302
2303			if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
2304				goto maybe_invlrng;
2305			else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2306				if (advice == MADV_DONTNEED) {
2307					/*
2308					 * Future calls to pmap_is_modified()
2309					 * can be avoided by making the page
2310					 * dirty now.
2311					 */
2312					m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME);
2313					vm_page_dirty(m);
2314				}
2315				atomic_clear_long(pte, htobe64(PG_M | PG_A));
2316			} else if ((be64toh(*pte) & PG_A) != 0)
2317				atomic_clear_long(pte, htobe64(PG_A));
2318			else
2319				goto maybe_invlrng;
2320			anychanged = true;
2321			continue;
2322maybe_invlrng:
2323			if (va != va_next) {
2324				anychanged = true;
2325				va = va_next;
2326			}
2327		}
2328		if (va != va_next)
2329			anychanged = true;
2330	}
2331	if (anychanged)
2332		pmap_invalidate_all(pmap);
2333	PMAP_UNLOCK(pmap);
2334}
2335
2336/*
2337 * Routines used in machine-dependent code
2338 */
2339static void
2340mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end)
2341{
2342	uint64_t lpcr;
2343
2344	if (bootverbose)
2345		printf("%s\n", __func__);
2346	hw_direct_map = 1;
2347	powernv_enabled = (mfmsr() & PSL_HV) ? 1 : 0;
2348	mmu_radix_early_bootstrap(start, end);
2349	if (bootverbose)
2350		printf("early bootstrap complete\n");
2351	if (powernv_enabled) {
2352		lpcr = mfspr(SPR_LPCR);
2353		mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2354		mmu_radix_parttab_init();
2355		mmu_radix_init_amor();
2356		if (bootverbose)
2357			printf("powernv init complete\n");
2358	}
2359	mmu_radix_init_iamr();
2360	mmu_radix_proctab_init();
2361	mmu_radix_pid_set(kernel_pmap);
2362	if (powernv_enabled)
2363		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2364	else
2365		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
2366
2367	mmu_radix_late_bootstrap(start, end);
2368	numa_mem_regions(&numa_pregions, &numa_pregions_sz);
2369	if (bootverbose)
2370		printf("%s done\n", __func__);
2371	pmap_bootstrapped = 1;
2372	dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE);
2373	PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS);
2374}
2375
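/*
 * Per-CPU MMU initialization: enable radix mode in the LPCR and point PTCR
 * at the shared partition table (powernv only), set up the IAMR and the
 * kernel PID, and flush any stale translations.
 */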
2376static void
2377mmu_radix_cpu_bootstrap(int ap)
2378{
2379	uint64_t lpcr;
2380	uint64_t ptcr;
2381
2382	if (powernv_enabled) {
2383		lpcr = mfspr(SPR_LPCR);
2384		mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2385
2386		ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2387		mtspr(SPR_PTCR, ptcr);
2388		mmu_radix_init_amor();
2389	}
2390	mmu_radix_init_iamr();
2391	mmu_radix_pid_set(kernel_pmap);
2392	if (powernv_enabled)
2393		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2394	else
2395		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
2396}
2397
2398static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0,
2399    "2MB page mapping counters");
2400
2401static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions);
2402SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD,
2403    &pmap_l3e_demotions, "2MB page demotions");
2404
2405static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings);
2406SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD,
2407    &pmap_l3e_mappings, "2MB page mappings");
2408
2409static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures);
2410SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD,
2411    &pmap_l3e_p_failures, "2MB page promotion failures");
2412
2413static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions);
2414SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD,
2415    &pmap_l3e_promotions, "2MB page promotions");
2416
2417static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0,
2418    "1GB page mapping counters");
2419
2420static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions);
2421SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD,
2422    &pmap_l2e_demotions, "1GB page demotions");
2423
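/*
 * Clear the modified (PG_M) bit in every mapping of the given page.  2MB
 * mappings are first demoted, and the mapping covering the page is
 * write-protected so that a later write fault can re-set the bit.
 */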
2424void
2425mmu_radix_clear_modify(vm_page_t m)
2426{
2427	struct md_page *pvh;
2428	pmap_t pmap;
2429	pv_entry_t next_pv, pv;
2430	pml3_entry_t oldl3e, *l3e;
2431	pt_entry_t oldpte, *pte;
2432	struct rwlock *lock;
2433	vm_offset_t va;
2434	int md_gen, pvh_gen;
2435
2436	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2437	    ("pmap_clear_modify: page %p is not managed", m));
2438	vm_page_assert_busied(m);
2439	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
2440
2441	/*
2442	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
2443	 * If the object containing the page is locked and the page is not
2444	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
2445	 */
2446	if ((m->a.flags & PGA_WRITEABLE) == 0)
2447		return;
2448	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2449	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
2450	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2451	rw_wlock(lock);
2452restart:
2453	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
2454		pmap = PV_PMAP(pv);
2455		if (!PMAP_TRYLOCK(pmap)) {
2456			pvh_gen = pvh->pv_gen;
2457			rw_wunlock(lock);
2458			PMAP_LOCK(pmap);
2459			rw_wlock(lock);
2460			if (pvh_gen != pvh->pv_gen) {
2461				PMAP_UNLOCK(pmap);
2462				goto restart;
2463			}
2464		}
2465		va = pv->pv_va;
2466		l3e = pmap_pml3e(pmap, va);
2467		oldl3e = be64toh(*l3e);
2468		if ((oldl3e & PG_RW) != 0 &&
2469		    pmap_demote_l3e_locked(pmap, l3e, va, &lock) &&
2470		    (oldl3e & PG_W) == 0) {
2471			/*
2472			 * Write protect the mapping to a
2473			 * single page so that a subsequent
2474			 * write access may repromote.
2475			 */
2476			va += VM_PAGE_TO_PHYS(m) - (oldl3e &
2477			    PG_PS_FRAME);
2478			pte = pmap_l3e_to_pte(l3e, va);
2479			oldpte = be64toh(*pte);
2480			while (!atomic_cmpset_long(pte,
2481			    htobe64(oldpte),
2482				htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))))
2483				   oldpte = be64toh(*pte);
2484			vm_page_dirty(m);
2485			pmap_invalidate_page(pmap, va);
2486		}
2487		PMAP_UNLOCK(pmap);
2488	}
2489	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
2490		pmap = PV_PMAP(pv);
2491		if (!PMAP_TRYLOCK(pmap)) {
2492			md_gen = m->md.pv_gen;
2493			pvh_gen = pvh->pv_gen;
2494			rw_wunlock(lock);
2495			PMAP_LOCK(pmap);
2496			rw_wlock(lock);
2497			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
2498				PMAP_UNLOCK(pmap);
2499				goto restart;
2500			}
2501		}
2502		l3e = pmap_pml3e(pmap, pv->pv_va);
2503		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found"
2504		    " a 2mpage in page %p's pv list", m));
2505		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
2506		if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2507			atomic_clear_long(pte, htobe64(PG_M));
2508			pmap_invalidate_page(pmap, pv->pv_va);
2509		}
2510		PMAP_UNLOCK(pmap);
2511	}
2512	rw_wunlock(lock);
2513}
2514
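/*
 * Copy valid mappings from src_pmap to dst_pmap for the given range.  As
 * on other platforms, only the fork case (dst_addr == src_addr) is
 * handled, and wired mappings are copied as unwired.
 */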
2515void
2516mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2517    vm_size_t len, vm_offset_t src_addr)
2518{
2519	struct rwlock *lock;
2520	struct spglist free;
2521	vm_offset_t addr;
2522	vm_offset_t end_addr = src_addr + len;
2523	vm_offset_t va_next;
2524	vm_page_t dst_pdpg, dstmpte, srcmpte;
2525	bool invalidate_all;
2526
2527	CTR6(KTR_PMAP,
2528	    "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n",
2529	    __func__, dst_pmap, src_pmap, dst_addr, len, src_addr);
2530
2531	if (dst_addr != src_addr)
2532		return;
2533	lock = NULL;
2534	invalidate_all = false;
2535	if (dst_pmap < src_pmap) {
2536		PMAP_LOCK(dst_pmap);
2537		PMAP_LOCK(src_pmap);
2538	} else {
2539		PMAP_LOCK(src_pmap);
2540		PMAP_LOCK(dst_pmap);
2541	}
2542
2543	for (addr = src_addr; addr < end_addr; addr = va_next) {
2544		pml1_entry_t *l1e;
2545		pml2_entry_t *l2e;
2546		pml3_entry_t srcptepaddr, *l3e;
2547		pt_entry_t *src_pte, *dst_pte;
2548
2549		l1e = pmap_pml1e(src_pmap, addr);
2550		if ((be64toh(*l1e) & PG_V) == 0) {
2551			va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2552			if (va_next < addr)
2553				va_next = end_addr;
2554			continue;
2555		}
2556
2557		l2e = pmap_l1e_to_l2e(l1e, addr);
2558		if ((be64toh(*l2e) & PG_V) == 0) {
2559			va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2560			if (va_next < addr)
2561				va_next = end_addr;
2562			continue;
2563		}
2564
2565		va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2566		if (va_next < addr)
2567			va_next = end_addr;
2568
2569		l3e = pmap_l2e_to_l3e(l2e, addr);
2570		srcptepaddr = be64toh(*l3e);
2571		if (srcptepaddr == 0)
2572			continue;
2573
2574		if (srcptepaddr & RPTE_LEAF) {
2575			if ((addr & L3_PAGE_MASK) != 0 ||
2576			    addr + L3_PAGE_SIZE > end_addr)
2577				continue;
2578			dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL);
2579			if (dst_pdpg == NULL)
2580				break;
2581			l3e = (pml3_entry_t *)
2582			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
2583			l3e = &l3e[pmap_pml3e_index(addr)];
2584			if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
2585			    pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr,
2586			    PMAP_ENTER_NORECLAIM, &lock))) {
2587				*l3e = htobe64(srcptepaddr & ~PG_W);
2588				pmap_resident_count_inc(dst_pmap,
2589				    L3_PAGE_SIZE / PAGE_SIZE);
2590				counter_u64_add(pmap_l3e_mappings, 1);
2591			} else
2592				dst_pdpg->ref_count--;
2593			continue;
2594		}
2595
2596		srcptepaddr &= PG_FRAME;
2597		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2598		KASSERT(srcmpte->ref_count > 0,
2599		    ("pmap_copy: source page table page is unused"));
2600
2601		if (va_next > end_addr)
2602			va_next = end_addr;
2603
2604		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
2605		src_pte = &src_pte[pmap_pte_index(addr)];
2606		dstmpte = NULL;
2607		while (addr < va_next) {
2608			pt_entry_t ptetemp;
2609			ptetemp = be64toh(*src_pte);
2610			/*
2611			 * We only virtual-copy managed pages.
2612			 */
2613			if ((ptetemp & PG_MANAGED) != 0) {
2614				if (dstmpte != NULL &&
2615				    dstmpte->pindex == pmap_l3e_pindex(addr))
2616					dstmpte->ref_count++;
2617				else if ((dstmpte = pmap_allocpte(dst_pmap,
2618				    addr, NULL)) == NULL)
2619					goto out;
2620				dst_pte = (pt_entry_t *)
2621				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2622				dst_pte = &dst_pte[pmap_pte_index(addr)];
2623				if (be64toh(*dst_pte) == 0 &&
2624				    pmap_try_insert_pv_entry(dst_pmap, addr,
2625				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
2626				    &lock)) {
2627					/*
2628					 * Clear the wired, modified, and
2629					 * accessed (referenced) bits
2630					 * during the copy.
2631					 */
2632					*dst_pte = htobe64(ptetemp & ~(PG_W | PG_M |
2633					    PG_A));
2634					pmap_resident_count_inc(dst_pmap, 1);
2635				} else {
2636					SLIST_INIT(&free);
2637					if (pmap_unwire_ptp(dst_pmap, addr,
2638					    dstmpte, &free)) {
2639						/*
2640						 * Although "addr" is not
2641						 * mapped, paging-structure
2642						 * caches could nonetheless
2643						 * have entries that refer to
2644						 * the freed page table pages.
2645						 * Invalidate those entries.
2646						 */
2647						invalidate_all = true;
2648						vm_page_free_pages_toq(&free,
2649						    true);
2650					}
2651					goto out;
2652				}
2653				if (dstmpte->ref_count >= srcmpte->ref_count)
2654					break;
2655			}
2656			addr += PAGE_SIZE;
2657			if (__predict_false((addr & L3_PAGE_MASK) == 0))
2658				src_pte = pmap_pte(src_pmap, addr);
2659			else
2660				src_pte++;
2661		}
2662	}
2663out:
2664	if (invalidate_all)
2665		pmap_invalidate_all(dst_pmap);
2666	if (lock != NULL)
2667		rw_wunlock(lock);
2668	PMAP_UNLOCK(src_pmap);
2669	PMAP_UNLOCK(dst_pmap);
2670}
2671
2672static void
2673mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst)
2674{
2675	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2676	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2677
2678	CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst);
2679	/*
2680	 * XXX slow
2681	 */
2682	bcopy((void *)src, (void *)dst, PAGE_SIZE);
2683}
2684
2685static void
2686mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
2687    vm_offset_t b_offset, int xfersize)
2688{
2689	void *a_cp, *b_cp;
2690	vm_offset_t a_pg_offset, b_pg_offset;
2691	int cnt;
2692
2693	CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma,
2694	    a_offset, mb, b_offset, xfersize);
2695
2696	while (xfersize > 0) {
2697		a_pg_offset = a_offset & PAGE_MASK;
2698		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
2699		a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
2700		    VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
2701		    a_pg_offset;
2702		b_pg_offset = b_offset & PAGE_MASK;
2703		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
2704		b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
2705		    VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
2706		    b_pg_offset;
2707		bcopy(a_cp, b_cp, cnt);
2708		a_offset += cnt;
2709		b_offset += cnt;
2710		xfersize -= cnt;
2711	}
2712}
2713
2714#if VM_NRESERVLEVEL > 0
2715/*
2716 * Tries to promote the 512, contiguous 4KB page mappings that are within a
2717 * single page table page (PTP) to a single 2MB page mapping.  For promotion
2718 * to occur, two conditions must be met: (1) the 4KB page mappings must map
2719 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2720 * identical characteristics.
2721 */
2722static int
2723pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va,
2724    struct rwlock **lockp)
2725{
2726	pml3_entry_t newpde;
2727	pt_entry_t *firstpte, oldpte, pa, *pte;
2728	vm_page_t mpte;
2729
2730	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2731
2732	/*
2733	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
2734	 * either invalid, unused, or does not map the first 4KB physical page
2735	 * within a 2MB page.
2736	 */
2737	firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME);
2738setpde:
2739	newpde = be64toh(*firstpte);
2740	if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
2741		CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2742		    " in pmap %p", va, pmap);
2743		goto fail;
2744	}
2745	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
2746		/*
2747		 * When PG_M is already clear, PG_RW can be cleared without
2748		 * a TLB invalidation.
2749		 */
2750		if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W)))
2751			goto setpde;
2752		newpde &= ~RPTE_EAA_W;
2753	}
2754
2755	/*
2756	 * Examine each of the other PTEs in the specified PTP.  Abort if this
2757	 * PTE maps an unexpected 4KB physical page or does not have identical
2758	 * characteristics to the first PTE.
2759	 */
2760	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE;
2761	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
2762setpte:
2763		oldpte = be64toh(*pte);
2764		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
2765			CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2766			    " in pmap %p", va, pmap);
2767			goto fail;
2768		}
2769		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2770			/*
2771			 * When PG_M is already clear, PG_RW can be cleared
2772			 * without a TLB invalidation.
2773			 */
2774			if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)))
2775				goto setpte;
2776			oldpte &= ~RPTE_EAA_W;
2777			CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx"
2778			    " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) |
2779			    (va & ~L3_PAGE_MASK), pmap);
2780		}
2781		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2782			CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2783			    " in pmap %p", va, pmap);
2784			goto fail;
2785		}
2786		pa -= PAGE_SIZE;
2787	}
2788
2789	/*
2790	 * Save the page table page in its current state until the PDE
2791	 * mapping the superpage is demoted by pmap_demote_l3e() or
2792	 * destroyed by pmap_remove_l3e().
2793	 */
2794	mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME);
2795	KASSERT(mpte >= vm_page_array &&
2796	    mpte < &vm_page_array[vm_page_array_size],
2797	    ("pmap_promote_l3e: page table page is out of range"));
2798	KASSERT(mpte->pindex == pmap_l3e_pindex(va),
2799	    ("pmap_promote_l3e: page table page's pindex is wrong"));
2800	if (pmap_insert_pt_page(pmap, mpte)) {
2801		CTR2(KTR_PMAP,
2802		    "pmap_promote_l3e: failure for va %#lx in pmap %p", va,
2803		    pmap);
2804		goto fail;
2805	}
2806
2807	/*
2808	 * Promote the pv entries.
2809	 */
2810	if ((newpde & PG_MANAGED) != 0)
2811		pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp);
2812
2813	pte_store(pde, PG_PROMOTED | newpde);
2814	ptesync();
2815	counter_u64_add(pmap_l3e_promotions, 1);
2816	CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx"
2817	    " in pmap %p", va, pmap);
2818	return (0);
2819 fail:
2820	counter_u64_add(pmap_l3e_p_failures, 1);
2821	return (KERN_FAILURE);
2822}
2823#endif /* VM_NRESERVLEVEL > 0 */
2824
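/*
 * Insert the given page at the specified virtual address in the target
 * pmap with the requested protection.  A psind of 1 requests a 2MB
 * superpage mapping; PMAP_ENTER_WIRED marks the mapping wired.
 */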
2825int
2826mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
2827    vm_prot_t prot, u_int flags, int8_t psind)
2828{
2829	struct rwlock *lock;
2830	pml3_entry_t *l3e;
2831	pt_entry_t *pte;
2832	pt_entry_t newpte, origpte;
2833	pv_entry_t pv;
2834	vm_paddr_t opa, pa;
2835	vm_page_t mpte, om;
2836	int rv, retrycount;
2837	bool nosleep, invalidate_all, invalidate_page;
2838
2839	va = trunc_page(va);
2840	retrycount = 0;
2841	invalidate_page = invalidate_all = false;
2842	CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va,
2843	    m, prot, flags, psind);
2844	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
2845	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
2846	    ("pmap_enter: managed mapping within the clean submap"));
2847	if ((m->oflags & VPO_UNMANAGED) == 0)
2848		VM_PAGE_OBJECT_BUSY_ASSERT(m);
2849
2850	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
2851	    ("pmap_enter: flags %u has reserved bits set", flags));
2852	pa = VM_PAGE_TO_PHYS(m);
2853	newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF);
2854	if ((flags & VM_PROT_WRITE) != 0)
2855		newpte |= PG_M;
2856	if ((flags & VM_PROT_READ) != 0)
2857		newpte |= PG_A;
2858	if (prot & VM_PROT_READ)
2859		newpte |= RPTE_EAA_R;
2860	if ((prot & VM_PROT_WRITE) != 0)
2861		newpte |= RPTE_EAA_W;
2862	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
2863	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
2864
2865	if (prot & VM_PROT_EXECUTE)
2866		newpte |= PG_X;
2867	if ((flags & PMAP_ENTER_WIRED) != 0)
2868		newpte |= PG_W;
2869	if (va >= DMAP_MIN_ADDRESS)
2870		newpte |= RPTE_EAA_P;
2871	newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs);
2872	/*
2873	 * Set modified bit gratuitously for writeable mappings if
2874	 * the page is unmanaged. We do not want to take a fault
2875	 * to do the dirty bit accounting for these mappings.
2876	 */
2877	if ((m->oflags & VPO_UNMANAGED) != 0) {
2878		if ((newpte & PG_RW) != 0)
2879			newpte |= PG_M;
2880	} else
2881		newpte |= PG_MANAGED;
2882
2883	lock = NULL;
2884	PMAP_LOCK(pmap);
2885	if (psind == 1) {
2886		/* Assert the required virtual and physical alignment. */
2887		KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned"));
2888		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
2889		rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock);
2890		goto out;
2891	}
2892	mpte = NULL;
2893
2894	/*
2895	 * In the case that a page table page is not
2896	 * resident, we are creating it here.
2897	 */
2898retry:
2899	l3e = pmap_pml3e(pmap, va);
2900	if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 ||
2901	    pmap_demote_l3e_locked(pmap, l3e, va, &lock))) {
2902		pte = pmap_l3e_to_pte(l3e, va);
2903		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
2904			mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
2905			mpte->ref_count++;
2906		}
2907	} else if (va < VM_MAXUSER_ADDRESS) {
2908		/*
2909		 * Here if the pte page isn't mapped, or if it has been
2910		 * deallocated.
2911		 */
2912		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2913		mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va),
2914		    nosleep ? NULL : &lock);
2915		if (mpte == NULL && nosleep) {
2916			rv = KERN_RESOURCE_SHORTAGE;
2917			goto out;
2918		}
2919		if (__predict_false(retrycount++ == 6))
2920			panic("pmap_enter: too many retries");
2921		invalidate_all = true;
2922		goto retry;
2923	} else
2924		panic("pmap_enter: invalid page directory va=%#lx", va);
2925
2926	origpte = be64toh(*pte);
2927	pv = NULL;
2928
2929	/*
2930	 * Is the specified virtual address already mapped?
2931	 */
2932	if ((origpte & PG_V) != 0) {
2933#ifdef INVARIANTS
2934		if (VERBOSE_PMAP || pmap_logging) {
2935			printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --"
2936			    " asid=%lu curpid=%d name=%s origpte0x%lx\n",
2937			    pmap, va, m, prot, flags, psind, pmap->pm_pid,
2938			    curproc->p_pid, curproc->p_comm, origpte);
2939#ifdef DDB
2940			pmap_pte_walk(pmap->pm_pml1, va);
2941#endif
2942		}
2943#endif
2944		/*
2945		 * Wiring change, just update stats. We don't worry about
2946		 * wiring PT pages as they remain resident as long as there
2947		 * are valid mappings in them. Hence, if a user page is wired,
2948		 * the PT page will be also.
2949		 */
2950		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
2951			pmap->pm_stats.wired_count++;
2952		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
2953			pmap->pm_stats.wired_count--;
2954
2955		/*
2956		 * Remove the extra PT page reference.
2957		 */
2958		if (mpte != NULL) {
2959			mpte->ref_count--;
2960			KASSERT(mpte->ref_count > 0,
2961			    ("pmap_enter: missing reference to page table page,"
2962			     " va: 0x%lx", va));
2963		}
2964
2965		/*
2966		 * Has the physical page changed?
2967		 */
2968		opa = origpte & PG_FRAME;
2969		if (opa == pa) {
2970			/*
2971			 * No, might be a protection or wiring change.
2972			 */
2973			if ((origpte & PG_MANAGED) != 0 &&
2974			    (newpte & PG_RW) != 0)
2975				vm_page_aflag_set(m, PGA_WRITEABLE);
2976			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) {
2977				if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) {
2978					if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
2979						goto retry;
2980					if ((newpte & PG_M) != (origpte & PG_M))
2981						vm_page_dirty(m);
2982					if ((newpte & PG_A) != (origpte & PG_A))
2983						vm_page_aflag_set(m, PGA_REFERENCED);
2984					ptesync();
2985				} else
2986					invalidate_all = true;
2987				if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
2988					goto unchanged;
2989			}
2990			goto validate;
2991		}
2992
2993		/*
2994		 * The physical page has changed.  Temporarily invalidate
2995		 * the mapping.  This ensures that all threads sharing the
2996		 * pmap keep a consistent view of the mapping, which is
2997		 * necessary for the correct handling of COW faults.  It
2998		 * also permits reuse of the old mapping's PV entry,
2999		 * avoiding an allocation.
3000		 *
3001		 * For consistency, handle unmanaged mappings the same way.
3002		 */
3003		origpte = be64toh(pte_load_clear(pte));
3004		KASSERT((origpte & PG_FRAME) == opa,
3005		    ("pmap_enter: unexpected pa update for %#lx", va));
3006		if ((origpte & PG_MANAGED) != 0) {
3007			om = PHYS_TO_VM_PAGE(opa);
3008
3009			/*
3010			 * The pmap lock is sufficient to synchronize with
3011			 * concurrent calls to pmap_page_test_mappings() and
3012			 * pmap_ts_referenced().
3013			 */
3014			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3015				vm_page_dirty(om);
3016			if ((origpte & PG_A) != 0)
3017				vm_page_aflag_set(om, PGA_REFERENCED);
3018			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3019			pv = pmap_pvh_remove(&om->md, pmap, va);
3020			if ((newpte & PG_MANAGED) == 0)
3021				free_pv_entry(pmap, pv);
3022#ifdef INVARIANTS
3023			else if (origpte & PG_MANAGED) {
3024				if (pv == NULL) {
3025#ifdef DDB
3026					pmap_page_print_mappings(om);
3027#endif
3028					MPASS(pv != NULL);
3029				}
3030			}
3031#endif
3032			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3033			    TAILQ_EMPTY(&om->md.pv_list) &&
3034			    ((om->flags & PG_FICTITIOUS) != 0 ||
3035			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3036				vm_page_aflag_clear(om, PGA_WRITEABLE);
3037		}
3038		if ((origpte & PG_A) != 0)
3039			invalidate_page = true;
3040		origpte = 0;
3041	} else {
3042		if (pmap != kernel_pmap) {
3043#ifdef INVARIANTS
3044			if (VERBOSE_PMAP || pmap_logging)
3045				printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n",
3046				    pmap, va, m, prot, flags, psind,
3047				    pmap->pm_pid, curproc->p_pid,
3048				    curproc->p_comm);
3049#endif
3050		}
3051
3052		/*
3053		 * Increment the counters.
3054		 */
3055		if ((newpte & PG_W) != 0)
3056			pmap->pm_stats.wired_count++;
3057		pmap_resident_count_inc(pmap, 1);
3058	}
3059
3060	/*
3061	 * Enter on the PV list if part of our managed memory.
3062	 */
3063	if ((newpte & PG_MANAGED) != 0) {
3064		if (pv == NULL) {
3065			pv = get_pv_entry(pmap, &lock);
3066			pv->pv_va = va;
3067		}
3068#ifdef VERBOSE_PV
3069		else
3070			printf("reassigning pv: %p to pmap: %p\n",
3071				   pv, pmap);
3072#endif
3073		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3074		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
3075		m->md.pv_gen++;
3076		if ((newpte & PG_RW) != 0)
3077			vm_page_aflag_set(m, PGA_WRITEABLE);
3078	}
3079
3080	/*
3081	 * Update the PTE.
3082	 */
3083	if ((origpte & PG_V) != 0) {
3084validate:
3085		origpte = be64toh(pte_load_store(pte, htobe64(newpte)));
3086		KASSERT((origpte & PG_FRAME) == pa,
3087		    ("pmap_enter: unexpected pa update for %#lx", va));
3088		if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
3089		    (PG_M | PG_RW)) {
3090			if ((origpte & PG_MANAGED) != 0)
3091				vm_page_dirty(m);
3092			invalidate_page = true;
3093
3094			/*
3095			 * Although the PTE may still have PG_RW set, TLB
3096			 * invalidation may nonetheless be required because
3097			 * the PTE no longer has PG_M set.
3098			 */
3099		} else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) {
3100			/*
3101			 * Removing access rights requires a TLB invalidation on POWER.
3102			 */
3103			invalidate_page = true;
3104			goto unchanged;
3105		}
3106		if ((origpte & PG_A) != 0)
3107			invalidate_page = true;
3108	} else {
3109		pte_store(pte, newpte);
3110		ptesync();
3111	}
3112unchanged:
3113
3114#if VM_NRESERVLEVEL > 0
3115	/*
3116	 * If both the page table page and the reservation are fully
3117	 * populated, then attempt promotion.
3118	 */
3119	if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
3120	    mmu_radix_ps_enabled(pmap) &&
3121	    (m->flags & PG_FICTITIOUS) == 0 &&
3122	    vm_reserv_level_iffullpop(m) == 0 &&
3123		pmap_promote_l3e(pmap, l3e, va, &lock) == 0)
3124		invalidate_all = true;
3125#endif
3126	if (invalidate_all)
3127		pmap_invalidate_all(pmap);
3128	else if (invalidate_page)
3129		pmap_invalidate_page(pmap, va);
3130
3131	rv = KERN_SUCCESS;
3132out:
3133	if (lock != NULL)
3134		rw_wunlock(lock);
3135	PMAP_UNLOCK(pmap);
3136
3137	return (rv);
3138}
3139
3140/*
3141 * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
3142 * if successful.  Returns false if (1) a page table page cannot be allocated
3143 * without sleeping, (2) a mapping already exists at the specified virtual
3144 * address, or (3) a PV entry cannot be allocated without reclaiming another
3145 * PV entry.
3146 */
3147static bool
3148pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3149    struct rwlock **lockp)
3150{
3151	pml3_entry_t newpde;
3152
3153	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3154	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) |
3155	    RPTE_LEAF | PG_V;
3156	if ((m->oflags & VPO_UNMANAGED) == 0)
3157		newpde |= PG_MANAGED;
3158	if (prot & VM_PROT_EXECUTE)
3159		newpde |= PG_X;
3160	if (prot & VM_PROT_READ)
3161		newpde |= RPTE_EAA_R;
3162	if (va >= DMAP_MIN_ADDRESS)
3163		newpde |= RPTE_EAA_P;
3164	return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
3165	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
3166	    KERN_SUCCESS);
3167}
3168
3169/*
3170 * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
3171 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
3172 * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
3173 * a mapping already exists at the specified virtual address.  Returns
3174 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
3175 * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
3176 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
3177 *
3178 * The parameter "m" is only used when creating a managed, writeable mapping.
3179 */
3180static int
3181pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags,
3182    vm_page_t m, struct rwlock **lockp)
3183{
3184	struct spglist free;
3185	pml3_entry_t oldl3e, *l3e;
3186	vm_page_t mt, pdpg;
3187	vm_page_t uwptpg;
3188
3189	KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
3190	    ("pmap_enter_pde: newpde is missing PG_M"));
3191	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3192
3193	if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3194	    NULL : lockp)) == NULL) {
3195		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3196		    " in pmap %p", va, pmap);
3197		return (KERN_RESOURCE_SHORTAGE);
3198	}
3199	l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3200	l3e = &l3e[pmap_pml3e_index(va)];
3201	oldl3e = be64toh(*l3e);
3202	if ((oldl3e & PG_V) != 0) {
3203		KASSERT(pdpg->ref_count > 1,
3204		    ("pmap_enter_pde: pdpg's wire count is too low"));
3205		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3206			pdpg->ref_count--;
3207			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3208			    " in pmap %p", va, pmap);
3209			return (KERN_FAILURE);
3210		}
3211		/* Break the existing mapping(s). */
3212		SLIST_INIT(&free);
3213		if ((oldl3e & RPTE_LEAF) != 0) {
3214			/*
3215			 * The reference to the PD page that was acquired by
3216			 * pmap_allocl3e() ensures that it won't be freed.
3217			 * However, if the PDE resulted from a promotion, then
3218			 * a reserved PT page could be freed.
3219			 */
3220			(void)pmap_remove_l3e(pmap, l3e, va, &free, lockp);
3221			pmap_invalidate_l3e_page(pmap, va, oldl3e);
3222		} else {
3223			if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e,
3224			    &free, lockp))
3225		               pmap_invalidate_all(pmap);
3226		}
3227		vm_page_free_pages_toq(&free, true);
3228		if (va >= VM_MAXUSER_ADDRESS) {
3229			mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
3230			if (pmap_insert_pt_page(pmap, mt)) {
3231				/*
3232				 * XXX Currently, this can't happen because
3233				 * we do not perform pmap_enter(psind == 1)
3234				 * on the kernel pmap.
3235				 */
3236				panic("pmap_enter_pde: trie insert failed");
3237			}
3238		} else
3239			KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p",
3240			    l3e));
3241	}
3242
3243	/*
3244	 * Allocate leaf ptpage for wired userspace pages.
3245	 */
3246	uwptpg = NULL;
3247	if ((newpde & PG_W) != 0 && pmap != kernel_pmap) {
3248		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3249		if (uwptpg == NULL)
3250			return (KERN_RESOURCE_SHORTAGE);
3251		uwptpg->pindex = pmap_l3e_pindex(va);
3252		if (pmap_insert_pt_page(pmap, uwptpg)) {
3253			vm_page_unwire_noq(uwptpg);
3254			vm_page_free(uwptpg);
3255			return (KERN_RESOURCE_SHORTAGE);
3256		}
3257		pmap_resident_count_inc(pmap, 1);
3258		uwptpg->ref_count = NPTEPG;
3259		pmap_fill_ptp((pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(uwptpg)),
3260		    newpde);
3261	}
3262	if ((newpde & PG_MANAGED) != 0) {
3263		/*
3264		 * Abort this mapping if its PV entry could not be created.
3265		 */
3266		if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) {
3267			SLIST_INIT(&free);
3268			if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
3269				/*
3270				 * Although "va" is not mapped, paging-
3271				 * structure caches could nonetheless have
3272				 * entries that refer to the freed page table
3273				 * pages.  Invalidate those entries.
3274				 */
3275				pmap_invalidate_page(pmap, va);
3276				vm_page_free_pages_toq(&free, true);
3277			}
3278			if (uwptpg != NULL) {
3279				mt = pmap_remove_pt_page(pmap, va);
3280				KASSERT(mt == uwptpg,
3281				    ("removed pt page %p, expected %p", mt,
3282				    uwptpg));
3283				pmap_resident_count_dec(pmap, 1);
3284				uwptpg->ref_count = 1;
3285				vm_page_unwire_noq(uwptpg);
3286				vm_page_free(uwptpg);
3287			}
3288			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3289			    " in pmap %p", va, pmap);
3290			return (KERN_RESOURCE_SHORTAGE);
3291		}
3292		if ((newpde & PG_RW) != 0) {
3293			for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
3294				vm_page_aflag_set(mt, PGA_WRITEABLE);
3295		}
3296	}
3297
3298	/*
3299	 * Increment counters.
3300	 */
3301	if ((newpde & PG_W) != 0)
3302		pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE;
3303	pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
3304
3305	/*
3306	 * Map the superpage.  (This is not a promoted mapping; there will not
3307	 * be any lingering 4KB page mappings in the TLB.)
3308	 */
3309	pte_store(l3e, newpde);
3310	ptesync();
3311
3312	counter_u64_add(pmap_l3e_mappings, 1);
3313	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3314	    " in pmap %p", va, pmap);
3315	return (KERN_SUCCESS);
3316}
3317
3318void
3319mmu_radix_enter_object(pmap_t pmap, vm_offset_t start,
3320    vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
3321{
3322
3323	struct rwlock *lock;
3324	vm_offset_t va;
3325	vm_page_t m, mpte;
3326	vm_pindex_t diff, psize;
3327	bool invalidate;
3328	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3329
3330	CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start,
3331	    end, m_start, prot);
3332
3333	invalidate = false;
3334	psize = atop(end - start);
3335	mpte = NULL;
3336	m = m_start;
3337	lock = NULL;
3338	PMAP_LOCK(pmap);
3339	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3340		va = start + ptoa(diff);
3341		if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end &&
3342		    m->psind == 1 && mmu_radix_ps_enabled(pmap) &&
3343		    pmap_enter_2mpage(pmap, va, m, prot, &lock))
3344			m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1];
3345		else
3346			mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot,
3347			    mpte, &lock, &invalidate);
3348		m = TAILQ_NEXT(m, listq);
3349	}
3350	ptesync();
3351	if (lock != NULL)
3352		rw_wunlock(lock);
3353	if (invalidate)
3354		pmap_invalidate_all(pmap);
3355	PMAP_UNLOCK(pmap);
3356}
3357
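/*
 * Create a read- and/or execute-only 4KB mapping without sleeping for a
 * page table page or PV entry.  Used by mmu_radix_enter_object() and
 * mmu_radix_enter_quick(); the callers issue the ptesync and any required
 * TLB invalidation.
 */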
3358static vm_page_t
3359mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3360    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate)
3361{
3362	struct spglist free;
3363	pt_entry_t *pte;
3364	vm_paddr_t pa;
3365
3366	KASSERT(!VA_IS_CLEANMAP(va) ||
3367	    (m->oflags & VPO_UNMANAGED) != 0,
3368	    ("mmu_radix_enter_quick_locked: managed mapping within the clean submap"));
3369	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3370
3371	/*
3372	 * In the case that a page table page is not
3373	 * resident, we are creating it here.
3374	 */
3375	if (va < VM_MAXUSER_ADDRESS) {
3376		vm_pindex_t ptepindex;
3377		pml3_entry_t *ptepa;
3378
3379		/*
3380		 * Calculate pagetable page index
3381		 */
3382		ptepindex = pmap_l3e_pindex(va);
3383		if (mpte && (mpte->pindex == ptepindex)) {
3384			mpte->ref_count++;
3385		} else {
3386			/*
3387			 * Get the page directory entry
3388			 */
3389			ptepa = pmap_pml3e(pmap, va);
3390
3391			/*
3392			 * If the page table page is mapped, we just increment
3393			 * the hold count, and activate it.  Otherwise, we
3394			 * attempt to allocate a page table page.  If this
3395			 * attempt fails, we don't retry.  Instead, we give up.
3396			 */
3397			if (ptepa && (be64toh(*ptepa) & PG_V) != 0) {
3398				if (be64toh(*ptepa) & RPTE_LEAF)
3399					return (NULL);
3400				mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME);
3401				mpte->ref_count++;
3402			} else {
3403				/*
3404				 * Pass NULL instead of the PV list lock
3405				 * pointer, because we don't intend to sleep.
3406				 */
3407				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
3408				if (mpte == NULL)
3409					return (mpte);
3410			}
3411		}
3412		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3413		pte = &pte[pmap_pte_index(va)];
3414	} else {
3415		mpte = NULL;
3416		pte = pmap_pte(pmap, va);
3417	}
3418	if (be64toh(*pte)) {
3419		if (mpte != NULL) {
3420			mpte->ref_count--;
3421			mpte = NULL;
3422		}
3423		return (mpte);
3424	}
3425
3426	/*
3427	 * Enter on the PV list if part of our managed memory.
3428	 */
3429	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3430	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3431		if (mpte != NULL) {
3432			SLIST_INIT(&free);
3433			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
3434				/*
3435				 * Although "va" is not mapped, paging-
3436				 * structure caches could nonetheless have
3437				 * entries that refer to the freed page table
3438				 * pages.  Invalidate those entries.
3439				 */
3440				*invalidate = true;
3441				vm_page_free_pages_toq(&free, true);
3442			}
3443			mpte = NULL;
3444		}
3445		return (mpte);
3446	}
3447
3448	/*
3449	 * Increment counters
3450	 */
3451	pmap_resident_count_inc(pmap, 1);
3452
3453	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs);
3454	if (prot & VM_PROT_EXECUTE)
3455		pa |= PG_X;
3456	else
3457		pa |= RPTE_EAA_R;
3458	if ((m->oflags & VPO_UNMANAGED) == 0)
3459		pa |= PG_MANAGED;
3460
3461	pte_store(pte, pa);
3462	return (mpte);
3463}
3464
3465void
3466mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
3467    vm_prot_t prot)
3468{
3469	struct rwlock *lock;
3470	bool invalidate;
3471
3472	lock = NULL;
3473	invalidate = false;
3474	PMAP_LOCK(pmap);
3475	mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock,
3476	    &invalidate);
3477	ptesync();
3478	if (lock != NULL)
3479		rw_wunlock(lock);
3480	if (invalidate)
3481		pmap_invalidate_all(pmap);
3482	PMAP_UNLOCK(pmap);
3483}
3484
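/*
 * Return the physical address mapped at va in the given pmap
 * (pmap_extract).  Both 2MB leaf and 4KB mappings are handled.
 */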
3485vm_paddr_t
3486mmu_radix_extract(pmap_t pmap, vm_offset_t va)
3487{
3488	pml3_entry_t *l3e;
3489	pt_entry_t *pte;
3490	vm_paddr_t pa;
3491
3492	l3e = pmap_pml3e(pmap, va);
3493	if (__predict_false(l3e == NULL))
3494		return (0);
3495	if (be64toh(*l3e) & RPTE_LEAF) {
3496		pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
3498	} else {
3499		/*
3500		 * Beware of a concurrent promotion that changes the
3501		 * PDE at this point!  For example, vtopte() must not
3502		 * be used to access the PTE because it would use the
3503		 * new PDE.  It is, however, safe to use the old PDE
3504		 * because the page table page is preserved by the
3505		 * promotion.
3506		 */
3507		pte = pmap_l3e_to_pte(l3e, va);
3508		if (__predict_false(pte == NULL))
3509			return (0);
3510		pa = be64toh(*pte);
3511		pa = (pa & PG_FRAME) | (va & PAGE_MASK);
3513	}
3514	return (pa);
3515}
3516
3517vm_page_t
3518mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
3519{
3520	pml3_entry_t l3e, *l3ep;
3521	pt_entry_t pte;
3522	vm_page_t m;
3523
3524	m = NULL;
3525	CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot);
3526	PMAP_LOCK(pmap);
3527	l3ep = pmap_pml3e(pmap, va);
3528	if (l3ep != NULL && (l3e = be64toh(*l3ep))) {
3529		if (l3e & RPTE_LEAF) {
3530			if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0)
3531				m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) |
3532				    (va & L3_PAGE_MASK));
3533		} else {
3534			/* Native endian PTE, do not pass to pmap functions */
3535			pte = be64toh(*pmap_l3e_to_pte(l3ep, va));
3536			if ((pte & PG_V) &&
3537			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0))
3538				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
3539		}
3540		if (m != NULL && !vm_page_wire_mapped(m))
3541			m = NULL;
3542	}
3543	PMAP_UNLOCK(pmap);
3544	return (m);
3545}
3546
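/*
 * Grow the kernel page table to cover the requested address, allocating
 * new L2 and L3 directory pages as needed and advancing kernel_vm_end in
 * L3_PAGE_SIZE (2MB) steps.
 */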
3547static void
3548mmu_radix_growkernel(vm_offset_t addr)
3549{
3550	vm_paddr_t paddr;
3551	vm_page_t nkpg;
3552	pml3_entry_t *l3e;
3553	pml2_entry_t *l2e;
3554
3555	CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
3556	if (VM_MIN_KERNEL_ADDRESS < addr &&
3557		addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE))
3558		return;
3559
3560	addr = roundup2(addr, L3_PAGE_SIZE);
3561	if (addr - 1 >= vm_map_max(kernel_map))
3562		addr = vm_map_max(kernel_map);
3563	while (kernel_vm_end < addr) {
3564		l2e = pmap_pml2e(kernel_pmap, kernel_vm_end);
3565		if ((be64toh(*l2e) & PG_V) == 0) {
3566			/* We need a new PDP entry */
3567			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3568			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3569			if (nkpg == NULL)
3570				panic("pmap_growkernel: no memory to grow kernel");
3571			nkpg->pindex = kernel_vm_end >> L2_PAGE_SIZE_SHIFT;
3572			paddr = VM_PAGE_TO_PHYS(nkpg);
3573			pde_store(l2e, paddr);
3574			continue; /* try again */
3575		}
3576		l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end);
3577		if ((be64toh(*l3e) & PG_V) != 0) {
3578			kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3579			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3580				kernel_vm_end = vm_map_max(kernel_map);
3581				break;
3582			}
3583			continue;
3584		}
3585
3586		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
3587		    VM_ALLOC_ZERO);
3588		if (nkpg == NULL)
3589			panic("pmap_growkernel: no memory to grow kernel");
3590		nkpg->pindex = pmap_l3e_pindex(kernel_vm_end);
3591		paddr = VM_PAGE_TO_PHYS(nkpg);
3592		pde_store(l3e, paddr);
3593
3594		kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3595		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3596			kernel_vm_end = vm_map_max(kernel_map);
3597			break;
3598		}
3599	}
3600	ptesync();
3601}
3602
3603static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory");
3604static uma_zone_t zone_radix_pgd;
3605
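/*
 * UMA cache import/release callbacks for radix page table roots.  Each
 * item is a physically contiguous, RADIX_PGD_SIZE-aligned allocation that
 * does not cross an L1_PAGE_SIZE boundary and is handed out by its DMAP
 * address.  Roots are obtained with uma_zalloc(zone_radix_pgd, ...) in
 * mmu_radix_pinit() and returned with uma_zfree() in mmu_radix_release().
 */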
3606static int
3607radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused,
3608    int flags)
3609{
3610	int req;
3611
3612	req = VM_ALLOC_WIRED | malloc2vm_flags(flags);
3613	for (int i = 0; i < count; i++) {
3614		vm_page_t m = vm_page_alloc_noobj_contig(req,
3615		    RADIX_PGD_SIZE / PAGE_SIZE,
3616		    0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE,
3617		    VM_MEMATTR_DEFAULT);
3618		if (m == NULL)
			return (i);
		store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3619	}
3620	return (count);
3621}
3622
3623static void
3624radix_pgd_release(void *arg __unused, void **store, int count)
3625{
3626	vm_page_t m;
3627	struct spglist free;
3628	int page_count;
3629
3630	SLIST_INIT(&free);
3631	page_count = RADIX_PGD_SIZE/PAGE_SIZE;
3632
3633	for (int i = 0; i < count; i++) {
3634		/*
3635		 * XXX selectively remove dmap and KVA entries so we don't
3636		 * need to bzero
3637		 */
3638		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
3639		for (int j = page_count-1; j >= 0; j--) {
3640			vm_page_unwire_noq(&m[j]);
3641			SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss);
3642		}
3643		vm_page_free_pages_toq(&free, false);
3644	}
3645}
3646
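/*
 * Initialize the pmap module: create the radix PGD cache, set up the
 * kernel page table pages' vm_page structures, the pv list locks, the
 * superpage pv head table, and the ASID arena.  Called once during VM
 * initialization.
 */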
3647static void
3648mmu_radix_init(void)
3649{
3650	vm_page_t mpte;
3651	vm_size_t s;
3652	int error, i, pv_npg;
3653
3654	/* XXX is this really needed for POWER? */
3655	/* L1TF, reserve page @0 unconditionally */
3656	vm_page_blacklist_add(0, bootverbose);
3657
3658	zone_radix_pgd = uma_zcache_create("radix_pgd_cache",
3659		RADIX_PGD_SIZE, NULL, NULL,
3660#ifdef INVARIANTS
3661	    trash_init, trash_fini,
3662#else
3663	    NULL, NULL,
3664#endif
3665		radix_pgd_import, radix_pgd_release,
3666		NULL, UMA_ZONE_NOBUCKET);
3667
3668	/*
3669	 * Initialize the vm page array entries for the kernel pmap's
3670	 * page table pages.
3671	 */
3672	PMAP_LOCK(kernel_pmap);
3673	for (i = 0; i < nkpt; i++) {
3674		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
3675		KASSERT(mpte >= vm_page_array &&
3676		    mpte < &vm_page_array[vm_page_array_size],
3677		    ("pmap_init: page table page is out of range size: %lu",
3678		     vm_page_array_size));
3679		mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i;
3680		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
3681		MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte);
3682		//pmap_insert_pt_page(kernel_pmap, mpte);
3683		mpte->ref_count = 1;
3684	}
3685	PMAP_UNLOCK(kernel_pmap);
3686	vm_wire_add(nkpt);
3687
3688	CTR1(KTR_PMAP, "%s()", __func__);
3689	TAILQ_INIT(&pv_dummy.pv_list);
3690
3691	/*
3692	 * Are large page mappings enabled?
3693	 */
3694	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
3695	if (superpages_enabled) {
3696		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
3697		    ("pmap_init: can't assign to pagesizes[1]"));
3698		pagesizes[1] = L3_PAGE_SIZE;
3699	}
3700
3701	/*
3702	 * Initialize the pv chunk list mutex.
3703	 */
3704	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
3705
3706	/*
3707	 * Initialize the pool of pv list locks.
3708	 */
3709	for (i = 0; i < NPV_LIST_LOCKS; i++)
3710		rw_init(&pv_list_locks[i], "pmap pv list");
3711
3712	/*
3713	 * Calculate the size of the pv head table for superpages.
3714	 */
3715	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE);
3716
3717	/*
3718	 * Allocate memory for the pv head table for superpages.
3719	 */
3720	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
3721	s = round_page(s);
3722	pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
3723	for (i = 0; i < pv_npg; i++)
3724		TAILQ_INIT(&pv_table[i].pv_list);
3725	TAILQ_INIT(&pv_dummy.pv_list);
3726
3727	pmap_initialized = 1;
3728	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
3729	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
3730	    (vmem_addr_t *)&qframe);
3731
3732	if (error != 0)
3733		panic("qframe allocation failed");
3734	asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits),
3735	    1, 1, M_WAITOK);
3736}
3737
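/*
 * Return true if any mapping of "m" (4KB or 2MB superpage) has the
 * requested accessed and/or modified attributes set.
 */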
3738static bool
3739pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
3740{
3741	struct rwlock *lock;
3742	pv_entry_t pv;
3743	struct md_page *pvh;
3744	pt_entry_t *pte, mask;
3745	pmap_t pmap;
3746	int md_gen, pvh_gen;
3747	bool rv;
3748
3749	rv = false;
3750	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3751	rw_rlock(lock);
3752restart:
3753	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
3754		pmap = PV_PMAP(pv);
3755		if (!PMAP_TRYLOCK(pmap)) {
3756			md_gen = m->md.pv_gen;
3757			rw_runlock(lock);
3758			PMAP_LOCK(pmap);
3759			rw_rlock(lock);
3760			if (md_gen != m->md.pv_gen) {
3761				PMAP_UNLOCK(pmap);
3762				goto restart;
3763			}
3764		}
3765		pte = pmap_pte(pmap, pv->pv_va);
3766		mask = 0;
3767		if (modified)
3768			mask |= PG_RW | PG_M;
3769		if (accessed)
3770			mask |= PG_V | PG_A;
3771		rv = (be64toh(*pte) & mask) == mask;
3772		PMAP_UNLOCK(pmap);
3773		if (rv)
3774			goto out;
3775	}
3776	if ((m->flags & PG_FICTITIOUS) == 0) {
3777		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3778		TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
3779			pmap = PV_PMAP(pv);
3780			if (!PMAP_TRYLOCK(pmap)) {
3781				md_gen = m->md.pv_gen;
3782				pvh_gen = pvh->pv_gen;
3783				rw_runlock(lock);
3784				PMAP_LOCK(pmap);
3785				rw_rlock(lock);
3786				if (md_gen != m->md.pv_gen ||
3787				    pvh_gen != pvh->pv_gen) {
3788					PMAP_UNLOCK(pmap);
3789					goto restart;
3790				}
3791			}
3792			pte = pmap_pml3e(pmap, pv->pv_va);
3793			mask = 0;
3794			if (modified)
3795				mask |= PG_RW | PG_M;
3796			if (accessed)
3797				mask |= PG_V | PG_A;
3798			rv = (be64toh(*pte) & mask) == mask;
3799			PMAP_UNLOCK(pmap);
3800			if (rv)
3801				goto out;
3802		}
3803	}
3804out:
3805	rw_runlock(lock);
3806	return (rv);
3807}
3808
3809/*
3810 *	pmap_is_modified:
3811 *
3812 *	Return whether or not the specified physical page was modified
3813 *	in any physical maps.
3814 */
3815bool
3816mmu_radix_is_modified(vm_page_t m)
3817{
3818
3819	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3820	    ("pmap_is_modified: page %p is not managed", m));
3821
3822	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3823	/*
3824	 * If the page is not busied then this check is racy.
3825	 */
3826	if (!pmap_page_is_write_mapped(m))
3827		return (false);
3828	return (pmap_page_test_mappings(m, false, true));
3829}
3830
3831bool
3832mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3833{
3834	pml3_entry_t *l3e;
3835	pt_entry_t *pte;
3836	bool rv;
3837
3838	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
3839	rv = false;
3840	PMAP_LOCK(pmap);
3841	l3e = pmap_pml3e(pmap, addr);
3842	if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) {
3843		pte = pmap_l3e_to_pte(l3e, addr);
3844		rv = (be64toh(*pte) & PG_V) == 0;
3845	}
3846	PMAP_UNLOCK(pmap);
3847	return (rv);
3848}
3849
3850bool
3851mmu_radix_is_referenced(vm_page_t m)
3852{
3853	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3854	    ("pmap_is_referenced: page %p is not managed", m));
3855	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3856	return (pmap_page_test_mappings(m, true, false));
3857}
3858
3859/*
3860 *	pmap_ts_referenced:
3861 *
3862 *	Return a count of reference bits for a page, clearing those bits.
3863 *	It is not necessary for every reference bit to be cleared, but it
3864 *	is necessary that 0 only be returned when there are truly no
3865 *	reference bits set.
3866 *
3867 *	As an optimization, update the page's dirty field if a modified bit is
3868 *	found while counting reference bits.  This opportunistic update can be
3869 *	performed at low cost and can eliminate the need for some future calls
3870 *	to pmap_is_modified().  However, since this function stops after
3871 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
3872 *	dirty pages.  Those dirty pages will only be detected by a future call
3873 *	to pmap_is_modified().
3874 *
3875 *	A DI block is not needed within this function, because
3876 *	invalidations are performed before the PV list lock is
3877 *	released.
3878 */
3879int
3880mmu_radix_ts_referenced(vm_page_t m)
3881{
3882	struct md_page *pvh;
3883	pv_entry_t pv, pvf;
3884	pmap_t pmap;
3885	struct rwlock *lock;
3886	pml3_entry_t oldl3e, *l3e;
3887	pt_entry_t *pte;
3888	vm_paddr_t pa;
3889	int cleared, md_gen, not_cleared, pvh_gen;
3890	struct spglist free;
3891
3892	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3893	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3894	    ("pmap_ts_referenced: page %p is not managed", m));
3895	SLIST_INIT(&free);
3896	cleared = 0;
3897	pa = VM_PAGE_TO_PHYS(m);
3898	lock = PHYS_TO_PV_LIST_LOCK(pa);
3899	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
3900	rw_wlock(lock);
3901retry:
3902	not_cleared = 0;
3903	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
3904		goto small_mappings;
3905	pv = pvf;
3906	do {
3907		if (pvf == NULL)
3908			pvf = pv;
3909		pmap = PV_PMAP(pv);
3910		if (!PMAP_TRYLOCK(pmap)) {
3911			pvh_gen = pvh->pv_gen;
3912			rw_wunlock(lock);
3913			PMAP_LOCK(pmap);
3914			rw_wlock(lock);
3915			if (pvh_gen != pvh->pv_gen) {
3916				PMAP_UNLOCK(pmap);
3917				goto retry;
3918			}
3919		}
3920		l3e = pmap_pml3e(pmap, pv->pv_va);
3921		oldl3e = be64toh(*l3e);
3922		if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3923			/*
3924			 * Although "oldl3e" is mapping a 2MB page, because
3925			 * this function is called at a 4KB page granularity,
3926			 * we only update the 4KB page under test.
3927			 */
3928			vm_page_dirty(m);
3929		}
3930		if ((oldl3e & PG_A) != 0) {
3931			/*
3932			 * Since this reference bit is shared by 512 4KB
3933			 * pages, it should not be cleared every time it is
3934			 * tested.  Apply a simple "hash" function on the
3935			 * physical page number, the virtual superpage number,
3936			 * and the pmap address to select one 4KB page out of
3937			 * the 512 on which testing the reference bit will
3938			 * result in clearing that reference bit.  This
3939			 * function is designed to avoid the selection of the
3940			 * same 4KB page for every 2MB page mapping.
3941			 *
3942			 * On demotion, a mapping that hasn't been referenced
3943			 * is simply destroyed.  To avoid the possibility of a
3944			 * subsequent page fault on a demoted wired mapping,
3945			 * always leave its reference bit set.  Moreover,
3946			 * since the superpage is wired, the current state of
3947			 * its reference bit won't affect page replacement.
3948			 */
3949			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^
3950			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
3951			    (oldl3e & PG_W) == 0) {
3952				atomic_clear_long(l3e, htobe64(PG_A));
3953				pmap_invalidate_page(pmap, pv->pv_va);
3954				cleared++;
3955				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
3956				    ("inconsistent pv lock %p %p for page %p",
3957				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
3958			} else
3959				not_cleared++;
3960		}
3961		PMAP_UNLOCK(pmap);
3962		/* Rotate the PV list if it has more than one entry. */
3963		if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
3964			TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
3965			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
3966			pvh->pv_gen++;
3967		}
3968		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
3969			goto out;
3970	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
3971small_mappings:
3972	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
3973		goto out;
3974	pv = pvf;
3975	do {
3976		if (pvf == NULL)
3977			pvf = pv;
3978		pmap = PV_PMAP(pv);
3979		if (!PMAP_TRYLOCK(pmap)) {
3980			pvh_gen = pvh->pv_gen;
3981			md_gen = m->md.pv_gen;
3982			rw_wunlock(lock);
3983			PMAP_LOCK(pmap);
3984			rw_wlock(lock);
3985			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3986				PMAP_UNLOCK(pmap);
3987				goto retry;
3988			}
3989		}
3990		l3e = pmap_pml3e(pmap, pv->pv_va);
3991		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
3992		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
3993		    m));
3994		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
3995		if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW))
3996			vm_page_dirty(m);
3997		if ((be64toh(*pte) & PG_A) != 0) {
3998			atomic_clear_long(pte, htobe64(PG_A));
3999			pmap_invalidate_page(pmap, pv->pv_va);
4000			cleared++;
4001		}
4002		PMAP_UNLOCK(pmap);
4003		/* Rotate the PV list if it has more than one entry. */
4004		if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
4005			TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
4006			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
4007			m->md.pv_gen++;
4008		}
4009	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4010	    not_cleared < PMAP_TS_REFERENCED_MAX);
4011out:
4012	rw_wunlock(lock);
4013	vm_page_free_pages_toq(&free, true);
4014	return (cleared + not_cleared);
4015}
4016
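/*
 * Map a range of physical addresses into kernel virtual address space.
 * With the radix pmap all of physical memory is covered by the direct
 * map, so simply return the DMAP address of "start".
 */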
4017static vm_offset_t
4018mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start,
4019    vm_paddr_t end, int prot __unused)
4020{
4021
4022	CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end,
4023		 prot);
4024	return (PHYS_TO_DMAP(start));
4025}
4026
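/*
 * Pre-populate the pmap with 2MB mappings for a physically contiguous
 * device or SG object.  The request is honored only when superpages are
 * enabled and both the address and the backing pages are suitably
 * aligned; otherwise it is silently ignored.
 */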
4027void
4028mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr,
4029    vm_object_t object, vm_pindex_t pindex, vm_size_t size)
4030{
4031	pml3_entry_t *l3e;
4032	vm_paddr_t pa, ptepa;
4033	vm_page_t p, pdpg;
4034	vm_memattr_t ma;
4035
4036	CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr,
4037	    object, pindex, size);
4038	VM_OBJECT_ASSERT_WLOCKED(object);
4039	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4040			("pmap_object_init_pt: non-device object"));
4041	/* NB: size can be logically ored with addr here */
4042	if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) {
4043		if (!mmu_radix_ps_enabled(pmap))
4044			return;
4045		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4046			return;
4047		p = vm_page_lookup(object, pindex);
4048		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4049		    ("pmap_object_init_pt: invalid page %p", p));
4050		ma = p->md.mdpg_cache_attrs;
4051
4052		/*
4053		 * Abort the mapping if the first page is not physically
4054		 * aligned to a 2MB page boundary.
4055		 */
4056		ptepa = VM_PAGE_TO_PHYS(p);
4057		if (ptepa & L3_PAGE_MASK)
4058			return;
4059
4060		/*
4061		 * Skip the first page.  Abort the mapping if the rest of
4062		 * the pages are not physically contiguous or have differing
4063		 * memory attributes.
4064		 */
4065		p = TAILQ_NEXT(p, listq);
4066		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4067		    pa += PAGE_SIZE) {
4068			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4069			    ("pmap_object_init_pt: invalid page %p", p));
4070			if (pa != VM_PAGE_TO_PHYS(p) ||
4071			    ma != p->md.mdpg_cache_attrs)
4072				return;
4073			p = TAILQ_NEXT(p, listq);
4074		}
4075
4076		PMAP_LOCK(pmap);
4077		for (pa = ptepa | pmap_cache_bits(ma);
4078		    pa < ptepa + size; pa += L3_PAGE_SIZE) {
4079			pdpg = pmap_allocl3e(pmap, addr, NULL);
4080			if (pdpg == NULL) {
4081				/*
4082				 * The creation of mappings below is only an
4083				 * optimization.  If a page directory page
4084				 * cannot be allocated without blocking,
4085				 * continue on to the next mapping rather than
4086				 * blocking.
4087				 */
4088				addr += L3_PAGE_SIZE;
4089				continue;
4090			}
4091			l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4092			l3e = &l3e[pmap_pml3e_index(addr)];
4093			if ((be64toh(*l3e) & PG_V) == 0) {
4094				pa |= PG_M | PG_A | PG_RW;
4095				pte_store(l3e, pa);
4096				pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
4097				counter_u64_add(pmap_l3e_mappings, 1);
4098			} else {
4099				/* Continue on if the PDE is already valid. */
4100				pdpg->ref_count--;
4101				KASSERT(pdpg->ref_count > 0,
4102				    ("pmap_object_init_pt: missing reference "
4103				    "to page directory page, va: 0x%lx", addr));
4104			}
4105			addr += L3_PAGE_SIZE;
4106		}
4107		ptesync();
4108		PMAP_UNLOCK(pmap);
4109	}
4110}
4111
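/*
 * Quickly determine whether "pmap" has a mapping for the given page by
 * examining at most the first 16 PV entries on the page's (and its
 * superpage's) PV lists.
 */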
4112bool
4113mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m)
4114{
4115	struct md_page *pvh;
4116	struct rwlock *lock;
4117	pv_entry_t pv;
4118	int loops = 0;
4119	bool rv;
4120
4121	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4122	    ("pmap_page_exists_quick: page %p is not managed", m));
4123	CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m);
4124	rv = false;
4125	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4126	rw_rlock(lock);
4127	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4128		if (PV_PMAP(pv) == pmap) {
4129			rv = true;
4130			break;
4131		}
4132		loops++;
4133		if (loops >= 16)
4134			break;
4135	}
4136	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4137		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4138		TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4139			if (PV_PMAP(pv) == pmap) {
4140				rv = true;
4141				break;
4142			}
4143			loops++;
4144			if (loops >= 16)
4145				break;
4146		}
4147	}
4148	rw_runlock(lock);
4149	return (rv);
4150}
4151
4152void
4153mmu_radix_page_init(vm_page_t m)
4154{
4155
4156	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4157	TAILQ_INIT(&m->md.pv_list);
4158	m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
4159}
4160
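/*
 * Return the number of managed mappings of "m" that are wired, counting
 * both 4KB and 2MB superpage mappings.
 */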
4161int
4162mmu_radix_page_wired_mappings(vm_page_t m)
4163{
4164	struct rwlock *lock;
4165	struct md_page *pvh;
4166	pmap_t pmap;
4167	pt_entry_t *pte;
4168	pv_entry_t pv;
4169	int count, md_gen, pvh_gen;
4170
4171	if ((m->oflags & VPO_UNMANAGED) != 0)
4172		return (0);
4173	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4174	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4175	rw_rlock(lock);
4176restart:
4177	count = 0;
4178	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4179		pmap = PV_PMAP(pv);
4180		if (!PMAP_TRYLOCK(pmap)) {
4181			md_gen = m->md.pv_gen;
4182			rw_runlock(lock);
4183			PMAP_LOCK(pmap);
4184			rw_rlock(lock);
4185			if (md_gen != m->md.pv_gen) {
4186				PMAP_UNLOCK(pmap);
4187				goto restart;
4188			}
4189		}
4190		pte = pmap_pte(pmap, pv->pv_va);
4191		if ((be64toh(*pte) & PG_W) != 0)
4192			count++;
4193		PMAP_UNLOCK(pmap);
4194	}
4195	if ((m->flags & PG_FICTITIOUS) == 0) {
4196		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4197		TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4198			pmap = PV_PMAP(pv);
4199			if (!PMAP_TRYLOCK(pmap)) {
4200				md_gen = m->md.pv_gen;
4201				pvh_gen = pvh->pv_gen;
4202				rw_runlock(lock);
4203				PMAP_LOCK(pmap);
4204				rw_rlock(lock);
4205				if (md_gen != m->md.pv_gen ||
4206				    pvh_gen != pvh->pv_gen) {
4207					PMAP_UNLOCK(pmap);
4208					goto restart;
4209				}
4210			}
4211			pte = pmap_pml3e(pmap, pv->pv_va);
4212			if ((be64toh(*pte) & PG_W) != 0)
4213				count++;
4214			PMAP_UNLOCK(pmap);
4215		}
4216	}
4217	rw_runlock(lock);
4218	return (count);
4219}
4220
4221static void
4222mmu_radix_update_proctab(int pid, pml1_entry_t l1pa)
4223{
4224	isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE |  l1pa | RADIX_PGD_INDEX_SHIFT);
4225}
4226
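/*
 * Initialize a pmap for a new address space: allocate and zero a page
 * table root from the PGD cache, allocate a PID from the ASID arena and
 * install the new root into the process table.
 */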
4227int
4228mmu_radix_pinit(pmap_t pmap)
4229{
4230	vmem_addr_t pid;
4231	vm_paddr_t l1pa;
4232
4233	CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4234
4235	/*
4236	 * allocate the page directory page
4237	 */
4238	pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK);
4239
4240	for (int j = 0; j < RADIX_PGD_SIZE / PAGE_SIZE; j++)
4241		pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE);
4242	vm_radix_init(&pmap->pm_radix);
4243	TAILQ_INIT(&pmap->pm_pvchunk);
4244	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4245	pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4246	vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid);
4247
4248	pmap->pm_pid = pid;
4249	l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1);
4250	mmu_radix_update_proctab(pid, l1pa);
4251	__asm __volatile("ptesync;isync" : : : "memory");
4252
4253	return (1);
4254}
4255
4256/*
4257 * This routine is called if the desired page table page does not exist.
4258 *
4259 * If page table page allocation fails, this routine may sleep before
4260 * returning NULL.  It sleeps only if a lock pointer was given.
4261 *
4262 * Note: If a page allocation fails at page table level two or three,
4263 * one or two pages may be held during the wait, only to be released
4264 * afterwards.  This conservative approach is easily argued to avoid
4265 * race conditions.
4266 */
4267static vm_page_t
4268_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
4269{
4270	vm_page_t m, pdppg, pdpg;
4271
4272	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4273
4274	/*
4275	 * Allocate a page table page.
4276	 */
4277	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
4278		if (lockp != NULL) {
4279			RELEASE_PV_LIST_LOCK(lockp);
4280			PMAP_UNLOCK(pmap);
4281			vm_wait(NULL);
4282			PMAP_LOCK(pmap);
4283		}
4284		/*
4285		 * Indicate the need to retry.  While waiting, the page table
4286		 * page may have been allocated.
4287		 */
4288		return (NULL);
4289	}
4290	m->pindex = ptepindex;
4291
4292	/*
4293	 * Map the pagetable page into the process address space, if
4294	 * it isn't already there.
4295	 */
4296
4297	if (ptepindex >= (NUPDE + NUPDPE)) {
4298		pml1_entry_t *l1e;
4299		vm_pindex_t pml1index;
4300
4301		/* Wire up a new PDPE page */
4302		pml1index = ptepindex - (NUPDE + NUPDPE);
4303		l1e = &pmap->pm_pml1[pml1index];
4304		KASSERT((be64toh(*l1e) & PG_V) == 0,
4305		    ("%s: L1 entry %#lx is valid", __func__, *l1e));
4306		pde_store(l1e, VM_PAGE_TO_PHYS(m));
4307	} else if (ptepindex >= NUPDE) {
4308		vm_pindex_t pml1index;
4309		vm_pindex_t pdpindex;
4310		pml1_entry_t *l1e;
4311		pml2_entry_t *l2e;
4312
4313		/* Wire up a new l2e page */
4314		pdpindex = ptepindex - NUPDE;
4315		pml1index = pdpindex >> RPTE_SHIFT;
4316
4317		l1e = &pmap->pm_pml1[pml1index];
4318		if ((be64toh(*l1e) & PG_V) == 0) {
4319			/* Have to allocate a new pdp, recurse */
4320			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index,
4321				lockp) == NULL) {
4322				vm_page_unwire_noq(m);
4323				vm_page_free_zero(m);
4324				return (NULL);
4325			}
4326		} else {
4327			/* Add reference to l2e page */
4328			pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME);
4329			pdppg->ref_count++;
4330		}
4331		l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4332
4333		/* Now find the pdp page */
4334		l2e = &l2e[pdpindex & RPTE_MASK];
4335		KASSERT((be64toh(*l2e) & PG_V) == 0,
4336		    ("%s: L2 entry %#lx is valid", __func__, *l2e));
4337		pde_store(l2e, VM_PAGE_TO_PHYS(m));
4338	} else {
4339		vm_pindex_t pml1index;
4340		vm_pindex_t pdpindex;
4341		pml1_entry_t *l1e;
4342		pml2_entry_t *l2e;
4343		pml3_entry_t *l3e;
4344
4345		/* Wire up a new PTE page */
4346		pdpindex = ptepindex >> RPTE_SHIFT;
4347		pml1index = pdpindex >> RPTE_SHIFT;
4348
4349		/* First, find the pdp and check that it's valid. */
4350		l1e = &pmap->pm_pml1[pml1index];
4351		if ((be64toh(*l1e) & PG_V) == 0) {
4352			/* Have to allocate a new pd, recurse */
4353			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4354			    lockp) == NULL) {
4355				vm_page_unwire_noq(m);
4356				vm_page_free_zero(m);
4357				return (NULL);
4358			}
4359			l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4360			l2e = &l2e[pdpindex & RPTE_MASK];
4361		} else {
4362			l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4363			l2e = &l2e[pdpindex & RPTE_MASK];
4364			if ((be64toh(*l2e) & PG_V) == 0) {
4365				/* Have to allocate a new pd, recurse */
4366				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4367				    lockp) == NULL) {
4368					vm_page_unwire_noq(m);
4369					vm_page_free_zero(m);
4370					return (NULL);
4371				}
4372			} else {
4373				/* Add reference to the pd page */
4374				pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME);
4375				pdpg->ref_count++;
4376			}
4377		}
4378		l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME);
4379
4380		/* Now we know where the page directory page is */
4381		l3e = &l3e[ptepindex & RPTE_MASK];
4382		KASSERT((be64toh(*l3e) & PG_V) == 0,
4383		    ("%s: L3 entry %#lx is valid", __func__, *l3e));
4384		pde_store(l3e, VM_PAGE_TO_PHYS(m));
4385	}
4386
4387	pmap_resident_count_inc(pmap, 1);
4388	return (m);
4389}
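
/*
 * Return the page table page backing the L3 (PDE-level) entries that
 * cover "va", allocating it if necessary and bumping its reference
 * count.  If "lockp" is non-NULL the allocation may sleep, in which case
 * the lookup is retried.
 */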
4390static vm_page_t
4391pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4392{
4393	vm_pindex_t pdpindex, ptepindex;
4394	pml2_entry_t *pdpe;
4395	vm_page_t pdpg;
4396
4397retry:
4398	pdpe = pmap_pml2e(pmap, va);
4399	if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) {
4400		/* Add a reference to the pd page. */
4401		pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME);
4402		pdpg->ref_count++;
4403	} else {
4404		/* Allocate a pd page. */
4405		ptepindex = pmap_l3e_pindex(va);
4406		pdpindex = ptepindex >> RPTE_SHIFT;
4407		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
4408		if (pdpg == NULL && lockp != NULL)
4409			goto retry;
4410	}
4411	return (pdpg);
4412}
4413
4414static vm_page_t
4415pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4416{
4417	vm_pindex_t ptepindex;
4418	pml3_entry_t *pd;
4419	vm_page_t m;
4420
4421	/*
4422	 * Calculate pagetable page index
4423	 */
4424	ptepindex = pmap_l3e_pindex(va);
4425retry:
4426	/*
4427	 * Get the page directory entry
4428	 */
4429	pd = pmap_pml3e(pmap, va);
4430
4431	/*
4432	 * This supports switching from a 2MB page to a
4433	 * normal 4K page.
4434	 */
4435	if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) {
4436		if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) {
4437			/*
4438			 * Invalidation of the 2MB page mapping may have caused
4439			 * the deallocation of the underlying PD page.
4440			 */
4441			pd = NULL;
4442		}
4443	}
4444
4445	/*
4446	 * If the page table page is mapped, we just increment the
4447	 * hold count, and activate it.
4448	 */
4449	if (pd != NULL && (be64toh(*pd) & PG_V) != 0) {
4450		m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME);
4451		m->ref_count++;
4452	} else {
4453		/*
4454		 * Here if the pte page isn't mapped, or if it has been
4455		 * deallocated.
4456		 */
4457		m = _pmap_allocpte(pmap, ptepindex, lockp);
4458		if (m == NULL && lockp != NULL)
4459			goto retry;
4460	}
4461	return (m);
4462}
4463
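/*
 * Initialize the pmap for process 0.  It shares the kernel pmap's page
 * table root and PID instead of allocating its own.
 */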
4464static void
4465mmu_radix_pinit0(pmap_t pmap)
4466{
4467
4468	CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4469	PMAP_LOCK_INIT(pmap);
4470	pmap->pm_pml1 = kernel_pmap->pm_pml1;
4471	pmap->pm_pid = kernel_pmap->pm_pid;
4472
4473	vm_radix_init(&pmap->pm_radix);
4474	TAILQ_INIT(&pmap->pm_pvchunk);
4475	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4476	kernel_pmap->pm_flags =
4477		pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4478}

4479/*
4480 * pmap_protect_l3e: do the things to protect a 2mpage in a process
4481 */
4482static bool
4483pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot)
4484{
4485	pt_entry_t newpde, oldpde;
4486	vm_offset_t eva, va;
4487	vm_page_t m;
4488	bool anychanged;
4489
4490	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4491	KASSERT((sva & L3_PAGE_MASK) == 0,
4492	    ("pmap_protect_l3e: sva is not 2mpage aligned"));
4493	anychanged = false;
4494retry:
4495	oldpde = newpde = be64toh(*l3e);
4496	if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4497	    (PG_MANAGED | PG_M | PG_RW)) {
4498		eva = sva + L3_PAGE_SIZE;
4499		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4500		    va < eva; va += PAGE_SIZE, m++)
4501			vm_page_dirty(m);
4502	}
4503	if ((prot & VM_PROT_WRITE) == 0) {
4504		newpde &= ~(PG_RW | PG_M);
4505		newpde |= RPTE_EAA_R;
4506	}
4507	if (prot & VM_PROT_EXECUTE)
4508		newpde |= PG_X;
4509	if (newpde != oldpde) {
4510		/*
4511		 * As an optimization to future operations on this PDE, clear
4512		 * PG_PROMOTED.  The impending invalidation will remove any
4513		 * lingering 4KB page mappings from the TLB.
4514		 */
4515		if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED)))
4516			goto retry;
4517		anychanged = true;
4518	}
4519	return (anychanged);
4520}
4521
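/*
 * Set the physical protection on the specified range of this map as
 * requested.  Removing write permission transfers any pending dirty bits
 * to the vm_page before the mapping is downgraded.
 */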
4522void
4523mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
4524    vm_prot_t prot)
4525{
4526	vm_offset_t va_next;
4527	pml1_entry_t *l1e;
4528	pml2_entry_t *l2e;
4529	pml3_entry_t ptpaddr, *l3e;
4530	pt_entry_t *pte;
4531	bool anychanged;
4532
4533	CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva,
4534	    prot);
4535
4536	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4537	if (prot == VM_PROT_NONE) {
4538		mmu_radix_remove(pmap, sva, eva);
4539		return;
4540	}
4541
4542	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4543	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
4544		return;
4545
4546#ifdef INVARIANTS
4547	if (VERBOSE_PROTECT || pmap_logging)
4548		printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n",
4549			   pmap, sva, eva, prot, pmap->pm_pid);
4550#endif
4551	anychanged = false;
4552
4553	PMAP_LOCK(pmap);
4554	for (; sva < eva; sva = va_next) {
4555		l1e = pmap_pml1e(pmap, sva);
4556		if ((be64toh(*l1e) & PG_V) == 0) {
4557			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
4558			if (va_next < sva)
4559				va_next = eva;
4560			continue;
4561		}
4562
4563		l2e = pmap_l1e_to_l2e(l1e, sva);
4564		if ((be64toh(*l2e) & PG_V) == 0) {
4565			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
4566			if (va_next < sva)
4567				va_next = eva;
4568			continue;
4569		}
4570
4571		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
4572		if (va_next < sva)
4573			va_next = eva;
4574
4575		l3e = pmap_l2e_to_l3e(l2e, sva);
4576		ptpaddr = be64toh(*l3e);
4577
4578		/*
4579		 * Weed out invalid mappings.
4580		 */
4581		if (ptpaddr == 0)
4582			continue;
4583
4584		/*
4585		 * Check for large page.
4586		 */
4587		if ((ptpaddr & RPTE_LEAF) != 0) {
4588			/*
4589			 * Are we protecting the entire large page?  If not,
4590			 * demote the mapping and fall through.
4591			 */
4592			if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
4593				if (pmap_protect_l3e(pmap, l3e, sva, prot))
4594					anychanged = true;
4595				continue;
4596			} else if (!pmap_demote_l3e(pmap, l3e, sva)) {
4597				/*
4598				 * The large page mapping was destroyed.
4599				 */
4600				continue;
4601			}
4602		}
4603
4604		if (va_next > eva)
4605			va_next = eva;
4606
4607		for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
4608		    sva += PAGE_SIZE) {
4609			pt_entry_t obits, pbits;
4610			vm_page_t m;
4611
4612retry:
4613			MPASS(pte == pmap_pte(pmap, sva));
4614			obits = pbits = be64toh(*pte);
4615			if ((pbits & PG_V) == 0)
4616				continue;
4617
4618			if ((prot & VM_PROT_WRITE) == 0) {
4619				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4620				    (PG_MANAGED | PG_M | PG_RW)) {
4621					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4622					vm_page_dirty(m);
4623				}
4624				pbits &= ~(PG_RW | PG_M);
4625				pbits |= RPTE_EAA_R;
4626			}
4627			if (prot & VM_PROT_EXECUTE)
4628				pbits |= PG_X;
4629
4630			if (pbits != obits) {
4631				if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits)))
4632					goto retry;
4633				if (obits & (PG_A|PG_M)) {
4634					anychanged = true;
4635#ifdef INVARIANTS
4636					if (VERBOSE_PROTECT || pmap_logging)
4637						printf("%#lx %#lx -> %#lx\n",
4638						    sva, obits, pbits);
4639#endif
4640				}
4641			}
4642		}
4643	}
4644	if (anychanged)
4645		pmap_invalidate_all(pmap);
4646	PMAP_UNLOCK(pmap);
4647}
4648
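/*
 * Enter "count" pages into the kernel virtual address space starting at
 * "sva".  This is intended for short-lived mappings; existing mappings
 * are overwritten, and the TLB is only invalidated if a valid mapping
 * was actually replaced.
 */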
4649void
4650mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count)
4651{
4652	pt_entry_t oldpte, pa, *pte;
4653	vm_page_t m;
4654	uint64_t cache_bits, attr_bits;
4655	vm_offset_t va;
4656
4657	CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count);
4658
4659	oldpte = 0;
4660	attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
4661	va = sva;
4662	pte = kvtopte(va);
4663	while (va < sva + PAGE_SIZE * count) {
4664		if (__predict_false((va & L3_PAGE_MASK) == 0))
4665			pte = kvtopte(va);
4666		MPASS(pte == pmap_pte(kernel_pmap, va));
4667
4668		/*
4669		 * XXX there has to be a more efficient way than traversing
4670		 * the page table every time - but go for correctness for
4671		 * today
4672		 */
4673
4674		m = *ma++;
4675		cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs);
4676		pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits;
4677		if (be64toh(*pte) != pa) {
4678			oldpte |= be64toh(*pte);
4679			pte_store(pte, pa);
4680		}
4681		va += PAGE_SIZE;
4682		pte++;
4683	}
4684	if (__predict_false((oldpte & RPTE_VALID) != 0))
4685		pmap_invalidate_range(kernel_pmap, sva, sva + count *
4686		    PAGE_SIZE);
4687	else
4688		ptesync();
4689}
4690
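/*
 * Remove mappings previously created by mmu_radix_qenter() and
 * invalidate the corresponding TLB entries.
 */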
4691void
4692mmu_radix_qremove(vm_offset_t sva, int count)
4693{
4694	vm_offset_t va;
4695	pt_entry_t *pte;
4696
4697	CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count);
4698	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva));
4699
4700	va = sva;
4701	pte = kvtopte(va);
4702	while (va < sva + PAGE_SIZE * count) {
4703		if (__predict_false((va & L3_PAGE_MASK) == 0))
4704			pte = kvtopte(va);
4705		pte_clear(pte);
4706		pte++;
4707		va += PAGE_SIZE;
4708	}
4709	pmap_invalidate_range(kernel_pmap, sva, va);
4710}
4711
4712/***************************************************
4713 * Page table page management routines.....
4714 ***************************************************/
4715/*
4716 * Schedule the specified unused page table page to be freed.  Specifically,
4717 * add the page to the specified list of pages that will be released to the
4718 * physical memory manager after the TLB has been updated.
4719 */
4720static __inline void
4721pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
4722{
4723
4724	if (set_PG_ZERO)
4725		m->flags |= PG_ZERO;
4726	else
4727		m->flags &= ~PG_ZERO;
4728	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
4729}
4730
4731/*
4732 * Inserts the specified page table page into the specified pmap's collection
4733 * of idle page table pages.  Each of a pmap's page table pages is responsible
4734 * for mapping a distinct range of virtual addresses.  The pmap's collection is
4735 * ordered by this virtual address range.
4736 */
4737static __inline int
4738pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
4739{
4740
4741	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4742	return (vm_radix_insert(&pmap->pm_radix, mpte));
4743}
4744
4745/*
4746 * Removes the page table page mapping the specified virtual address from the
4747 * specified pmap's collection of idle page table pages, and returns it.
4748 * Otherwise, returns NULL if there is no page table page corresponding to the
4749 * specified virtual address.
4750 */
4751static __inline vm_page_t
4752pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4753{
4754
4755	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4756	return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va)));
4757}
4758
4759/*
4760 * Decrements a page table page's wire count, which is used to record the
4761 * number of valid page table entries within the page.  If the wire count
4762 * drops to zero, then the page table page is unmapped.  Returns true if the
4763 * page table page was unmapped and false otherwise.
4764 */
4765static inline bool
4766pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4767{
4768
4769	--m->ref_count;
4770	if (m->ref_count == 0) {
4771		_pmap_unwire_ptp(pmap, va, m, free);
4772		return (true);
4773	} else
4774		return (false);
4775}
4776
4777static void
4778_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4779{
4780
4781	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4782	/*
4783	 * unmap the page table page
4784	 */
4785	if (m->pindex >= NUPDE + NUPDPE) {
4786		/* PDP page */
4787		pml1_entry_t *pml1;
4788		pml1 = pmap_pml1e(pmap, va);
4789		*pml1 = 0;
4790	} else if (m->pindex >= NUPDE) {
4791		/* PD page */
4792		pml2_entry_t *l2e;
4793		l2e = pmap_pml2e(pmap, va);
4794		*l2e = 0;
4795	} else {
4796		/* PTE page */
4797		pml3_entry_t *l3e;
4798		l3e = pmap_pml3e(pmap, va);
4799		*l3e = 0;
4800	}
4801	pmap_resident_count_dec(pmap, 1);
4802	if (m->pindex < NUPDE) {
4803		/* We just released a PT, unhold the matching PD */
4804		vm_page_t pdpg;
4805
4806		pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME);
4807		pmap_unwire_ptp(pmap, va, pdpg, free);
4808	}
4809	else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
4810		/* We just released a PD, unhold the matching PDP */
4811		vm_page_t pdppg;
4812
4813		pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME);
4814		pmap_unwire_ptp(pmap, va, pdppg, free);
4815	}
4816
4817	/*
4818	 * Put page on a list so that it is released after
4819	 * *ALL* TLB shootdown is done
4820	 */
4821	pmap_add_delayed_free_list(m, free, true);
4822}
4823
4824/*
4825 * After removing a page table entry, this routine is used to
4826 * conditionally free the page, and manage the hold/wire counts.
4827 */
4828static int
4829pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde,
4830    struct spglist *free)
4831{
4832	vm_page_t mpte;
4833
4834	if (va >= VM_MAXUSER_ADDRESS)
4835		return (0);
4836	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
4837	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
4838	return (pmap_unwire_ptp(pmap, va, mpte, free));
4839}
4840
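/*
 * Release the resources held by a pmap initialized by mmu_radix_pinit().
 * The caller must ensure that the pmap no longer contains any valid
 * mappings; the PID is returned to the ASID arena and the page table
 * root to the PGD cache.
 */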
4841void
4842mmu_radix_release(pmap_t pmap)
4843{
4844
4845	CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4846	KASSERT(pmap->pm_stats.resident_count == 0,
4847	    ("pmap_release: pmap resident count %ld != 0",
4848	    pmap->pm_stats.resident_count));
4849	KASSERT(vm_radix_is_empty(&pmap->pm_radix),
4850	    ("pmap_release: pmap has reserved page table page(s)"));
4851
4852	pmap_invalidate_all(pmap);
4853	isa3_proctab[pmap->pm_pid].proctab0 = 0;
4854	uma_zfree(zone_radix_pgd, pmap->pm_pml1);
4855	vmem_free(asid_arena, pmap->pm_pid, 1);
4856}
4857
4858/*
4859 * Create the PV entry for a 2MB page mapping.  Always returns true unless the
4860 * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
4861 * false if the PV entry cannot be allocated without resorting to reclamation.
4862 */
4863static bool
4864pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags,
4865    struct rwlock **lockp)
4866{
4867	struct md_page *pvh;
4868	pv_entry_t pv;
4869	vm_paddr_t pa;
4870
4871	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4872	/* Pass NULL instead of the lock pointer to disable reclamation. */
4873	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
4874	    NULL : lockp)) == NULL)
4875		return (false);
4876	pv->pv_va = va;
4877	pa = pde & PG_PS_FRAME;
4878	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4879	pvh = pa_to_pvh(pa);
4880	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
4881	pvh->pv_gen++;
4882	return (true);
4883}
4884
4885/*
4886 * Fills a page table page with mappings to consecutive physical pages.
4887 */
4888static void
4889pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
4890{
4891	pt_entry_t *pte;
4892
4893	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
4894		*pte = htobe64(newpte);
4895		newpte += PAGE_SIZE;
4896	}
4897}
4898
4899static bool
4900pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va)
4901{
4902	struct rwlock *lock;
4903	bool rv;
4904
4905	lock = NULL;
4906	rv = pmap_demote_l3e_locked(pmap, pde, va, &lock);
4907	if (lock != NULL)
4908		rw_wunlock(lock);
4909	return (rv);
4910}
4911
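/*
 * Demote a 2MB (leaf L3) mapping at "va" into 512 4KB mappings, reusing
 * the saved page table page when one exists or allocating a new one.  On
 * failure the 2MB mapping is removed entirely and false is returned.
 */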
4912static bool
4913pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
4914    struct rwlock **lockp)
4915{
4916	pml3_entry_t oldpde;
4917	pt_entry_t *firstpte;
4918	vm_paddr_t mptepa;
4919	vm_page_t mpte;
4920	struct spglist free;
4921	vm_offset_t sva;
4922
4923	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4924	oldpde = be64toh(*l3e);
4925	KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
4926	    ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx",
4927	    oldpde));
4928	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
4929	    NULL) {
4930		KASSERT((oldpde & PG_W) == 0,
4931		    ("pmap_demote_l3e: page table page for a wired mapping"
4932		    " is missing"));
4933
4934		/*
4935		 * Invalidate the 2MB page mapping and return "failure" if the
4936		 * mapping was never accessed or the allocation of the new
4937		 * page table page fails.  If the 2MB page mapping belongs to
4938		 * the direct map region of the kernel's address space, then
4939		 * the page allocation request specifies the highest possible
4940		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
4941		 * normal.  Page table pages are preallocated for every other
4942		 * part of the kernel address space, so the direct map region
4943		 * is the only part of the kernel address space that must be
4944		 * handled here.
4945		 */
4946		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc_noobj(
4947		    (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ?
4948		    VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED)) == NULL) {
4949			SLIST_INIT(&free);
4950			sva = trunc_2mpage(va);
4951			pmap_remove_l3e(pmap, l3e, sva, &free, lockp);
4952			pmap_invalidate_l3e_page(pmap, sva, oldpde);
4953			vm_page_free_pages_toq(&free, true);
4954			CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx"
4955			    " in pmap %p", va, pmap);
4956			return (false);
4957		}
4958		mpte->pindex = pmap_l3e_pindex(va);
4959		if (va < VM_MAXUSER_ADDRESS)
4960			pmap_resident_count_inc(pmap, 1);
4961	}
4962	mptepa = VM_PAGE_TO_PHYS(mpte);
4963	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
4964	KASSERT((oldpde & PG_A) != 0,
4965	    ("pmap_demote_l3e: oldpde is missing PG_A"));
4966	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
4967	    ("pmap_demote_l3e: oldpde is missing PG_M"));
4968
4969	/*
4970	 * If the page table page is new, initialize it.
4971	 */
4972	if (mpte->ref_count == 1) {
4973		mpte->ref_count = NPTEPG;
4974		pmap_fill_ptp(firstpte, oldpde);
4975	}
4976
4977	KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME),
4978	    ("pmap_demote_l3e: firstpte and newpte map different physical"
4979	    " addresses"));
4980
4981	/*
4982	 * If the mapping has changed attributes, update the page table
4983	 * entries.
4984	 */
4985	if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE))
4986		pmap_fill_ptp(firstpte, oldpde);
4987
4988	/*
4989	 * The spare PV entries must be reserved prior to demoting the
4990	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
4991	 * of the PDE and the PV lists will be inconsistent, which can result
4992	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
4993	 * wrong PV list and pmap_pv_demote_l3e() failing to find the expected
4994	 * PV entry for the 2MB page mapping that is being demoted.
4995	 */
4996	if ((oldpde & PG_MANAGED) != 0)
4997		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
4998
4999	/*
5000	 * Demote the mapping.  This pmap is locked.  The old PDE has
5001	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
5002	 * set.  Thus, there is no danger of a race with another
5003	 * processor changing the setting of PG_A and/or PG_M between
5004	 * the read above and the store below.
5005	 */
5006	pde_store(l3e, mptepa);
5007	pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde);
5008	/*
5009	 * Demote the PV entry.
5010	 */
5011	if ((oldpde & PG_MANAGED) != 0)
5012		pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp);
5013
5014	counter_u64_add(pmap_l3e_demotions, 1);
5015	CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx"
5016	    " in pmap %p", va, pmap);
5017	return (true);
5018}
5019
5020/*
5021 * pmap_remove_kernel_l3e: Remove a kernel superpage mapping.
5022 */
5023static void
5024pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va)
5025{
5026	vm_paddr_t mptepa;
5027	vm_page_t mpte;
5028
5029	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
5030	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5031	mpte = pmap_remove_pt_page(pmap, va);
5032	if (mpte == NULL)
5033		panic("pmap_remove_kernel_pde: Missing pt page.");
5034
5035	mptepa = VM_PAGE_TO_PHYS(mpte);
5036
5037	/*
5038	 * Initialize the page table page.
5039	 */
5040	pagezero(PHYS_TO_DMAP(mptepa));
5041
5042	/*
5043	 * Demote the mapping.
5044	 */
5045	pde_store(l3e, mptepa);
5046	ptesync();
5047}
5048
5049/*
5050 * pmap_remove_l3e: do the things to unmap a superpage in a process
5051 */
5052static int
5053pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
5054    struct spglist *free, struct rwlock **lockp)
5055{
5056	struct md_page *pvh;
5057	pml3_entry_t oldpde;
5058	vm_offset_t eva, va;
5059	vm_page_t m, mpte;
5060
5061	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5062	KASSERT((sva & L3_PAGE_MASK) == 0,
5063	    ("pmap_remove_l3e: sva is not 2mpage aligned"));
5064	oldpde = be64toh(pte_load_clear(pdq));
5065	if (oldpde & PG_W)
5066		pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE);
5067	pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
5068	if (oldpde & PG_MANAGED) {
5069		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
5070		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
5071		pmap_pvh_free(pvh, pmap, sva);
5072		eva = sva + L3_PAGE_SIZE;
5073		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
5074		    va < eva; va += PAGE_SIZE, m++) {
5075			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
5076				vm_page_dirty(m);
5077			if (oldpde & PG_A)
5078				vm_page_aflag_set(m, PGA_REFERENCED);
5079			if (TAILQ_EMPTY(&m->md.pv_list) &&
5080			    TAILQ_EMPTY(&pvh->pv_list))
5081				vm_page_aflag_clear(m, PGA_WRITEABLE);
5082		}
5083	}
5084	if (pmap == kernel_pmap) {
5085		pmap_remove_kernel_l3e(pmap, pdq, sva);
5086	} else {
5087		mpte = pmap_remove_pt_page(pmap, sva);
5088		if (mpte != NULL) {
5089			pmap_resident_count_dec(pmap, 1);
5090			KASSERT(mpte->ref_count == NPTEPG,
5091			    ("pmap_remove_l3e: pte page wire count error"));
5092			mpte->ref_count = 0;
5093			pmap_add_delayed_free_list(mpte, free, false);
5094		}
5095	}
5096	return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free));
5097}
5098
5099/*
5100 * pmap_remove_pte: do the things to unmap a page in a process
5101 */
5102static int
5103pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
5104    pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
5105{
5106	struct md_page *pvh;
5107	pt_entry_t oldpte;
5108	vm_page_t m;
5109
5110	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5111	oldpte = be64toh(pte_load_clear(ptq));
5112	if (oldpte & RPTE_WIRED)
5113		pmap->pm_stats.wired_count -= 1;
5114	pmap_resident_count_dec(pmap, 1);
5115	if (oldpte & RPTE_MANAGED) {
5116		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
5117		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5118			vm_page_dirty(m);
5119		if (oldpte & PG_A)
5120			vm_page_aflag_set(m, PGA_REFERENCED);
5121		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5122		pmap_pvh_free(&m->md, pmap, va);
5123		if (TAILQ_EMPTY(&m->md.pv_list) &&
5124		    (m->flags & PG_FICTITIOUS) == 0) {
5125			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5126			if (TAILQ_EMPTY(&pvh->pv_list))
5127				vm_page_aflag_clear(m, PGA_WRITEABLE);
5128		}
5129	}
5130	return (pmap_unuse_pt(pmap, va, ptepde, free));
5131}
5132
5133/*
5134 * Remove a single page from a process address space
5135 */
5136static bool
5137pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e,
5138    struct spglist *free)
5139{
5140	struct rwlock *lock;
5141	pt_entry_t *pte;
5142	bool invalidate_all;
5143
5144	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5145	if ((be64toh(*l3e) & RPTE_VALID) == 0) {
5146		return (false);
5147	}
5148	pte = pmap_l3e_to_pte(l3e, va);
5149	if ((be64toh(*pte) & RPTE_VALID) == 0) {
5150		return (false);
5151	}
5152	lock = NULL;
5153
5154	invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock);
5155	if (lock != NULL)
5156		rw_wunlock(lock);
5157	if (!invalidate_all)
5158		pmap_invalidate_page(pmap, va);
5159	return (invalidate_all);
5160}
5161
5162/*
5163 * Removes the specified range of addresses from the page table page.
5164 */
5165static bool
5166pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
5167    pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp)
5168{
5169	pt_entry_t *pte;
5170	vm_offset_t va;
5171	bool anyvalid;
5172
5173	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5174	anyvalid = false;
5175	va = eva;
5176	for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++,
5177	    sva += PAGE_SIZE) {
5178		MPASS(pte == pmap_pte(pmap, sva));
5179		if (*pte == 0) {
5180			if (va != eva) {
5181				anyvalid = true;
5182				va = eva;
5183			}
5184			continue;
5185		}
5186		if (va == eva)
5187			va = sva;
5188		if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) {
5189			anyvalid = true;
5190			sva += PAGE_SIZE;
5191			break;
5192		}
5193	}
5194	if (anyvalid)
5195		pmap_invalidate_all(pmap);
5196	else if (va != eva)
5197		pmap_invalidate_range(pmap, va, sva);
5198	return (anyvalid);
5199}
5200
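/*
 * Remove the given range of addresses from the specified pmap, demoting
 * or removing 2MB mappings as needed and freeing any page table pages
 * that become empty.
 */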
5201void
5202mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5203{
5204	struct rwlock *lock;
5205	vm_offset_t va_next;
5206	pml1_entry_t *l1e;
5207	pml2_entry_t *l2e;
5208	pml3_entry_t ptpaddr, *l3e;
5209	struct spglist free;
5210	bool anyvalid;
5211
5212	CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5213
5214	/*
5215	 * Perform an unsynchronized read.  This is, however, safe.
5216	 */
5217	if (pmap->pm_stats.resident_count == 0)
5218		return;
5219
5220	anyvalid = false;
5221	SLIST_INIT(&free);
5222
5223	/* XXX something fishy here */
5224	sva = (sva + PAGE_MASK) & ~PAGE_MASK;
5225	eva = (eva + PAGE_MASK) & ~PAGE_MASK;
5226
5227	PMAP_LOCK(pmap);
5228
5229	/*
5230	 * Special-case the removal of a single page: it is a very common
5231	 * operation, so short-circuit the full page table walk below.
5233	 */
5234	if (sva + PAGE_SIZE == eva) {
5235		l3e = pmap_pml3e(pmap, sva);
5236		if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) {
5237			anyvalid = pmap_remove_page(pmap, sva, l3e, &free);
5238			goto out;
5239		}
5240	}
5241
5242	lock = NULL;
5243	for (; sva < eva; sva = va_next) {
5244		if (pmap->pm_stats.resident_count == 0)
5245			break;
5246		l1e = pmap_pml1e(pmap, sva);
5247		if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) {
5248			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5249			if (va_next < sva)
5250				va_next = eva;
5251			continue;
5252		}
5253
5254		l2e = pmap_l1e_to_l2e(l1e, sva);
5255		if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) {
5256			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5257			if (va_next < sva)
5258				va_next = eva;
5259			continue;
5260		}
5261
5262		/*
5263		 * Calculate index for next page table.
5264		 */
5265		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5266		if (va_next < sva)
5267			va_next = eva;
5268
5269		l3e = pmap_l2e_to_l3e(l2e, sva);
5270		ptpaddr = be64toh(*l3e);
5271
5272		/*
5273		 * Weed out invalid mappings.
5274		 */
5275		if (ptpaddr == 0)
5276			continue;
5277
5278		/*
5279		 * Check for large page.
5280		 */
5281		if ((ptpaddr & RPTE_LEAF) != 0) {
5282			/*
5283			 * Are we removing the entire large page?  If not,
5284			 * demote the mapping and fall through.
5285			 */
5286			if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5287				pmap_remove_l3e(pmap, l3e, sva, &free, &lock);
5288				anyvalid = true;
5289				continue;
5290			} else if (!pmap_demote_l3e_locked(pmap, l3e, sva,
5291			    &lock)) {
5292				/* The large page mapping was destroyed. */
5293				continue;
5294			} else
5295				ptpaddr = be64toh(*l3e);
5296		}
5297
5298		/*
5299		 * Limit our scan to either the end of the va represented
5300		 * by the current page table page, or to the end of the
5301		 * range being removed.
5302		 */
5303		if (va_next > eva)
5304			va_next = eva;
5305
5306		if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock))
5307			anyvalid = true;
5308	}
5309	if (lock != NULL)
5310		rw_wunlock(lock);
5311out:
5312	if (anyvalid)
5313		pmap_invalidate_all(pmap);
5314	PMAP_UNLOCK(pmap);
5315	vm_page_free_pages_toq(&free, true);
5316}
5317
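/*
 * Remove all mappings of the given managed page from every pmap.  2MB
 * mappings are first demoted, and modified/referenced bits are
 * transferred back to the vm_page.
 */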
5318void
5319mmu_radix_remove_all(vm_page_t m)
5320{
5321	struct md_page *pvh;
5322	pv_entry_t pv;
5323	pmap_t pmap;
5324	struct rwlock *lock;
5325	pt_entry_t *pte, tpte;
5326	pml3_entry_t *l3e;
5327	vm_offset_t va;
5328	struct spglist free;
5329	int pvh_gen, md_gen;
5330
5331	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5332	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5333	    ("pmap_remove_all: page %p is not managed", m));
5334	SLIST_INIT(&free);
5335	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5336	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5337	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
5338retry:
5339	rw_wlock(lock);
5340	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
5341		pmap = PV_PMAP(pv);
5342		if (!PMAP_TRYLOCK(pmap)) {
5343			pvh_gen = pvh->pv_gen;
5344			rw_wunlock(lock);
5345			PMAP_LOCK(pmap);
5346			rw_wlock(lock);
5347			if (pvh_gen != pvh->pv_gen) {
5348				rw_wunlock(lock);
5349				PMAP_UNLOCK(pmap);
5350				goto retry;
5351			}
5352		}
5353		va = pv->pv_va;
5354		l3e = pmap_pml3e(pmap, va);
5355		(void)pmap_demote_l3e_locked(pmap, l3e, va, &lock);
5356		PMAP_UNLOCK(pmap);
5357	}
5358	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
5359		pmap = PV_PMAP(pv);
5360		if (!PMAP_TRYLOCK(pmap)) {
5361			pvh_gen = pvh->pv_gen;
5362			md_gen = m->md.pv_gen;
5363			rw_wunlock(lock);
5364			PMAP_LOCK(pmap);
5365			rw_wlock(lock);
5366			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5367				rw_wunlock(lock);
5368				PMAP_UNLOCK(pmap);
5369				goto retry;
5370			}
5371		}
5372		pmap_resident_count_dec(pmap, 1);
5373		l3e = pmap_pml3e(pmap, pv->pv_va);
5374		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found"
5375		    " a 2mpage in page %p's pv list", m));
5376		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5377		tpte = be64toh(pte_load_clear(pte));
5378		if (tpte & PG_W)
5379			pmap->pm_stats.wired_count--;
5380		if (tpte & PG_A)
5381			vm_page_aflag_set(m, PGA_REFERENCED);
5382
5383		/*
5384		 * Update the vm_page_t clean and reference bits.
5385		 */
5386		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5387			vm_page_dirty(m);
5388		pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free);
5389		pmap_invalidate_page(pmap, pv->pv_va);
5390		TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5391		m->md.pv_gen++;
5392		free_pv_entry(pmap, pv);
5393		PMAP_UNLOCK(pmap);
5394	}
5395	vm_page_aflag_clear(m, PGA_WRITEABLE);
5396	rw_wunlock(lock);
5397	vm_page_free_pages_toq(&free, true);
5398}
5399
5400/*
5401 * Destroy all managed, non-wired mappings in the given user-space
5402 * pmap.  This pmap cannot be active on any processor besides the
5403 * caller.
5404 *
5405 * This function cannot be applied to the kernel pmap.  Moreover, it
5406 * is not intended for general use.  It is only to be used during
5407 * process termination.  Consequently, it can be implemented in ways
5408 * that make it faster than pmap_remove().  First, it can more quickly
5409 * destroy mappings by iterating over the pmap's collection of PV
5410 * entries, rather than searching the page table.  Second, it doesn't
5411 * have to test and clear the page table entries atomically, because
5412 * no processor is currently accessing the user address space.  In
5413 * particular, a page table entry's dirty bit won't change state once
5414 * this function starts.
5415 *
5416 * Although this function destroys all of the pmap's managed,
5417 * non-wired mappings, it can delay and batch the invalidation of TLB
5418 * entries without calling pmap_delayed_invl_started() and
5419 * pmap_delayed_invl_finished().  Because the pmap is not active on
5420 * any other processor, none of these TLB entries will ever be used
5421 * before their eventual invalidation.  Consequently, there is no need
5422 * for either pmap_remove_all() or pmap_remove_write() to wait for
5423 * that eventual TLB invalidation.
5424 */
5425
5426void
5427mmu_radix_remove_pages(pmap_t pmap)
5428{
5429
5430	pml3_entry_t ptel3e;
5431	pt_entry_t *pte, tpte;
5432	struct spglist free;
5433	vm_page_t m, mpte, mt;
5434	pv_entry_t pv;
5435	struct md_page *pvh;
5436	struct pv_chunk *pc, *npc;
5437	struct rwlock *lock;
5438	int64_t bit;
5439	uint64_t inuse, bitmask;
5440	int allfree, field, idx;
5441#ifdef PV_STATS
5442	int freed;
5443#endif
5444	bool superpage;
5445	vm_paddr_t pa;
5446	CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
5447
5448	/*
5449	 * Assert that the given pmap is only active on the current
5450	 * CPU.  Unfortunately, we cannot block another CPU from
5451	 * activating the pmap while this function is executing.
5452	 */
5453	KASSERT(pmap->pm_pid == mfspr(SPR_PID),
5454	    ("non-current asid %lu - expected %lu", pmap->pm_pid,
5455	    mfspr(SPR_PID)));
5456
5457	lock = NULL;
5458
5459	SLIST_INIT(&free);
5460	PMAP_LOCK(pmap);
5461	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5462		allfree = 1;
5463#ifdef PV_STATS
5464		freed = 0;
5465#endif
5466		for (field = 0; field < _NPCM; field++) {
5467			inuse = ~pc->pc_map[field] & pc_freemask[field];
5468			while (inuse != 0) {
5469				bit = cnttzd(inuse);
5470				bitmask = 1UL << bit;
5471				idx = field * 64 + bit;
5472				pv = &pc->pc_pventry[idx];
5473				inuse &= ~bitmask;
5474
5475				pte = pmap_pml2e(pmap, pv->pv_va);
5476				ptel3e = be64toh(*pte);
5477				pte = pmap_l2e_to_l3e(pte, pv->pv_va);
5478				tpte = be64toh(*pte);
5479				if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) {
5480					superpage = false;
5481					ptel3e = tpte;
5482					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5483					    PG_FRAME);
5484					pte = &pte[pmap_pte_index(pv->pv_va)];
5485					tpte = be64toh(*pte);
5486				} else {
5487					/*
5488					 * Keep track whether 'tpte' is a
5489					 * superpage explicitly instead of
5490					 * relying on RPTE_LEAF being set.
5491					 *
5492					 * This is because RPTE_LEAF is numerically
5493					 * identical to PG_PTE_PAT and thus a
5494					 * regular page could be mistaken for
5495					 * a superpage.
5496					 */
5497					superpage = true;
5498				}
5499
5500				if ((tpte & PG_V) == 0) {
5501					panic("bad pte va %lx pte %lx",
5502					    pv->pv_va, tpte);
5503				}
5504
				/*
				 * We cannot remove wired pages from a
				 * process' mapping at this time.
				 */
5508				if (tpte & PG_W) {
5509					allfree = 0;
5510					continue;
5511				}
5512
5513				if (superpage)
5514					pa = tpte & PG_PS_FRAME;
5515				else
5516					pa = tpte & PG_FRAME;
5517
5518				m = PHYS_TO_VM_PAGE(pa);
5519				KASSERT(m->phys_addr == pa,
5520				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5521				    m, (uintmax_t)m->phys_addr,
5522				    (uintmax_t)tpte));
5523
5524				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5525				    m < &vm_page_array[vm_page_array_size],
5526				    ("pmap_remove_pages: bad tpte %#jx",
5527				    (uintmax_t)tpte));
5528
5529				pte_clear(pte);
5530
5531				/*
5532				 * Update the vm_page_t clean/reference bits.
5533				 */
5534				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5535					if (superpage) {
5536						for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5537							vm_page_dirty(mt);
5538					} else
5539						vm_page_dirty(m);
5540				}
5541
5542				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5543
5544				/* Mark free */
5545				pc->pc_map[field] |= bitmask;
5546				if (superpage) {
5547					pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
5548					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5549					TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
5550					pvh->pv_gen++;
5551					if (TAILQ_EMPTY(&pvh->pv_list)) {
5552						for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5553							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5554							    TAILQ_EMPTY(&mt->md.pv_list))
5555								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5556					}
5557					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
5558					if (mpte != NULL) {
5559						pmap_resident_count_dec(pmap, 1);
5560						KASSERT(mpte->ref_count == NPTEPG,
5561						    ("pmap_remove_pages: pte page wire count error"));
5562						mpte->ref_count = 0;
5563						pmap_add_delayed_free_list(mpte, &free, false);
5564					}
5565				} else {
5566					pmap_resident_count_dec(pmap, 1);
5567#ifdef VERBOSE_PV
5568					printf("freeing pv (%p, %p)\n",
5569						   pmap, pv);
5570#endif
5571					TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5572					m->md.pv_gen++;
5573					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5574					    TAILQ_EMPTY(&m->md.pv_list) &&
5575					    (m->flags & PG_FICTITIOUS) == 0) {
5576						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5577						if (TAILQ_EMPTY(&pvh->pv_list))
5578							vm_page_aflag_clear(m, PGA_WRITEABLE);
5579					}
5580				}
5581				pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free);
5582#ifdef PV_STATS
5583				freed++;
5584#endif
5585			}
5586		}
5587		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5588		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5589		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5590		if (allfree) {
5591			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5592			free_pv_chunk(pc);
5593		}
5594	}
5595	if (lock != NULL)
5596		rw_wunlock(lock);
5597	pmap_invalidate_all(pmap);
5598	PMAP_UNLOCK(pmap);
5599	vm_page_free_pages_toq(&free, true);
5600}
5601
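/*
 * Remove write access from all mappings of the given page: demote any
 * writable 2MB mappings that cover it, then clear PG_RW from each
 * remaining 4KB mapping, transferring the modified bit to the vm_page
 * before clearing PGA_WRITEABLE.
 */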
5602void
5603mmu_radix_remove_write(vm_page_t m)
5604{
5605	struct md_page *pvh;
5606	pmap_t pmap;
5607	struct rwlock *lock;
5608	pv_entry_t next_pv, pv;
5609	pml3_entry_t *l3e;
5610	pt_entry_t oldpte, *pte;
5611	int pvh_gen, md_gen;
5612
5613	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5614	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5615	    ("pmap_remove_write: page %p is not managed", m));
5616	vm_page_assert_busied(m);
5617
5618	if (!pmap_page_is_write_mapped(m))
5619		return;
5620	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5621	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5622	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
5623retry_pv_loop:
5624	rw_wlock(lock);
5625	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
5626		pmap = PV_PMAP(pv);
5627		if (!PMAP_TRYLOCK(pmap)) {
5628			pvh_gen = pvh->pv_gen;
5629			rw_wunlock(lock);
5630			PMAP_LOCK(pmap);
5631			rw_wlock(lock);
5632			if (pvh_gen != pvh->pv_gen) {
5633				PMAP_UNLOCK(pmap);
5634				rw_wunlock(lock);
5635				goto retry_pv_loop;
5636			}
5637		}
5638		l3e = pmap_pml3e(pmap, pv->pv_va);
5639		if ((be64toh(*l3e) & PG_RW) != 0)
5640			(void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock);
5641		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5642		    ("inconsistent pv lock %p %p for page %p",
5643		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5644		PMAP_UNLOCK(pmap);
5645	}
5646	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
5647		pmap = PV_PMAP(pv);
5648		if (!PMAP_TRYLOCK(pmap)) {
5649			pvh_gen = pvh->pv_gen;
5650			md_gen = m->md.pv_gen;
5651			rw_wunlock(lock);
5652			PMAP_LOCK(pmap);
5653			rw_wlock(lock);
5654			if (pvh_gen != pvh->pv_gen ||
5655			    md_gen != m->md.pv_gen) {
5656				PMAP_UNLOCK(pmap);
5657				rw_wunlock(lock);
5658				goto retry_pv_loop;
5659			}
5660		}
5661		l3e = pmap_pml3e(pmap, pv->pv_va);
5662		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
5663		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
5664		    m));
5665		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5666retry:
5667		oldpte = be64toh(*pte);
5668		if (oldpte & PG_RW) {
5669			if (!atomic_cmpset_long(pte, htobe64(oldpte),
5670			    htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))))
5671				goto retry;
5672			if ((oldpte & PG_M) != 0)
5673				vm_page_dirty(m);
5674			pmap_invalidate_page(pmap, pv->pv_va);
5675		}
5676		PMAP_UNLOCK(pmap);
5677	}
5678	rw_wunlock(lock);
5679	vm_page_aflag_clear(m, PGA_WRITEABLE);
5680}
5681
5682/*
5683 *	Clear the wired attribute from the mappings for the specified range of
5684 *	addresses in the given pmap.  Every valid mapping within that range
5685 *	must have the wired attribute set.  In contrast, invalid mappings
5686 *	cannot have the wired attribute set, so they are ignored.
5687 *
5688 *	The wired attribute of the page table entry is not a hardware
5689 *	feature, so there is no need to invalidate any TLB entries.
5690 *	Since pmap_demote_l3e() for the wired entry must never fail,
5691 *	pmap_delayed_invl_started()/finished() calls around the
5692 *	function are not needed.
5693 */
5694void
5695mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5696{
5697	vm_offset_t va_next;
5698	pml1_entry_t *l1e;
5699	pml2_entry_t *l2e;
5700	pml3_entry_t *l3e;
5701	pt_entry_t *pte;
5702
5703	CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5704	PMAP_LOCK(pmap);
5705	for (; sva < eva; sva = va_next) {
5706		l1e = pmap_pml1e(pmap, sva);
5707		if ((be64toh(*l1e) & PG_V) == 0) {
5708			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5709			if (va_next < sva)
5710				va_next = eva;
5711			continue;
5712		}
5713		l2e = pmap_l1e_to_l2e(l1e, sva);
5714		if ((be64toh(*l2e) & PG_V) == 0) {
5715			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5716			if (va_next < sva)
5717				va_next = eva;
5718			continue;
5719		}
5720		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5721		if (va_next < sva)
5722			va_next = eva;
5723		l3e = pmap_l2e_to_l3e(l2e, sva);
5724		if ((be64toh(*l3e) & PG_V) == 0)
5725			continue;
5726		if ((be64toh(*l3e) & RPTE_LEAF) != 0) {
5727			if ((be64toh(*l3e) & PG_W) == 0)
5728				panic("pmap_unwire: pde %#jx is missing PG_W",
5729				    (uintmax_t)(be64toh(*l3e)));
5730
5731			/*
5732			 * Are we unwiring the entire large page?  If not,
5733			 * demote the mapping and fall through.
5734			 */
5735			if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5736				atomic_clear_long(l3e, htobe64(PG_W));
5737				pmap->pm_stats.wired_count -= L3_PAGE_SIZE /
5738				    PAGE_SIZE;
5739				continue;
5740			} else if (!pmap_demote_l3e(pmap, l3e, sva))
5741				panic("pmap_unwire: demotion failed");
5742		}
5743		if (va_next > eva)
5744			va_next = eva;
5745		for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
5746		    sva += PAGE_SIZE) {
5747			MPASS(pte == pmap_pte(pmap, sva));
5748			if ((be64toh(*pte) & PG_V) == 0)
5749				continue;
5750			if ((be64toh(*pte) & PG_W) == 0)
5751				panic("pmap_unwire: pte %#jx is missing PG_W",
5752				    (uintmax_t)(be64toh(*pte)));
5753
5754			/*
5755			 * PG_W must be cleared atomically.  Although the pmap
5756			 * lock synchronizes access to PG_W, another processor
5757			 * could be setting PG_M and/or PG_A concurrently.
5758			 */
5759			atomic_clear_long(pte, htobe64(PG_W));
5760			pmap->pm_stats.wired_count--;
5761		}
5762	}
5763	PMAP_UNLOCK(pmap);
5764}
5765
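/*
 * Zero the specified page using its direct map address.
 */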
5766void
5767mmu_radix_zero_page(vm_page_t m)
5768{
5769	vm_offset_t addr;
5770
5771	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5772	addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5773	pagezero(addr);
5774}
5775
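/*
 * Zero the specified area of the page using the direct map.  The area
 * must not extend beyond a single page.
 */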
5776void
5777mmu_radix_zero_page_area(vm_page_t m, int off, int size)
5778{
5779	caddr_t addr;
5780
5781	CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size);
5782	MPASS(off + size <= PAGE_SIZE);
5783	addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5784	memset(addr + off, 0, size);
5785}
5786
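/*
 * Determine the mincore(2) status of the page mapped at "addr" in the
 * given pmap.  For a managed mapping whose referenced/modified state
 * may not be fully known from this mapping alone, the physical address
 * is returned through "locked_pa" so that the caller can examine the
 * page's other mappings.
 */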
5787static int
5788mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5789{
5790	pml3_entry_t *l3ep;
5791	pt_entry_t pte;
5792	vm_paddr_t pa;
5793	int val;
5794
5795	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
5796	PMAP_LOCK(pmap);
5797
5798	l3ep = pmap_pml3e(pmap, addr);
5799	if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) {
5800		if (be64toh(*l3ep) & RPTE_LEAF) {
5801			pte = be64toh(*l3ep);
5802			/* Compute the physical address of the 4KB page. */
5803			pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) &
5804			    PG_FRAME;
5805			val = MINCORE_PSIND(1);
5806		} else {
5807			/* Native endian PTE, do not pass to functions */
5808			pte = be64toh(*pmap_l3e_to_pte(l3ep, addr));
5809			pa = pte & PG_FRAME;
5810			val = 0;
5811		}
5812	} else {
5813		pte = 0;
5814		pa = 0;
5815		val = 0;
5816	}
5817	if ((pte & PG_V) != 0) {
5818		val |= MINCORE_INCORE;
5819		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5820			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5821		if ((pte & PG_A) != 0)
5822			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5823	}
5824	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5825	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5826	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5827		*locked_pa = pa;
5828	}
5829	PMAP_UNLOCK(pmap);
5830	return (val);
5831}
5832
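/*
 * Activate the given thread's pmap by loading its PID into the
 * hardware PID register, unless it is already the current PID.
 */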
5833void
5834mmu_radix_activate(struct thread *td)
5835{
5836	pmap_t pmap;
5837	uint32_t curpid;
5838
5839	CTR2(KTR_PMAP, "%s(%p)", __func__, td);
5840	critical_enter();
5841	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5842	curpid = mfspr(SPR_PID);
5843	if (pmap->pm_pid > isa3_base_pid &&
5844		curpid != pmap->pm_pid) {
5845		mmu_radix_pid_set(pmap);
5846	}
5847	critical_exit();
5848}
5849
5850/*
5851 *	Increase the starting virtual address of the given mapping if a
5852 *	different alignment might result in more superpage mappings.
5853 */
5854void
5855mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
5856    vm_offset_t *addr, vm_size_t size)
5857{
	vm_offset_t superpage_offset;

	CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr,
	    size);
5862
5863	if (size < L3_PAGE_SIZE)
5864		return;
5865	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5866		offset += ptoa(object->pg_color);
5867	superpage_offset = offset & L3_PAGE_MASK;
5868	if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE ||
5869	    (*addr & L3_PAGE_MASK) == superpage_offset)
5870		return;
5871	if ((*addr & L3_PAGE_MASK) < superpage_offset)
5872		*addr = (*addr & ~L3_PAGE_MASK) + superpage_offset;
5873	else
5874		*addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset;
5875}
5876
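/*
 * Map a range of physical addresses into kernel virtual address space
 * with the given memory attribute.  The range is expected to lie above
 * the end of physical memory; ordinary memory is reached through the
 * direct map instead.
 */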
5877static void *
5878mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
5879{
5880	vm_offset_t va, tmpva, ppa, offset;
5881
5882	ppa = trunc_page(pa);
5883	offset = pa & PAGE_MASK;
5884	size = roundup2(offset + size, PAGE_SIZE);
5885	if (pa < powerpc_ptob(Maxmem))
5886		panic("bad pa: %#lx less than Maxmem %#lx\n",
5887			  pa, powerpc_ptob(Maxmem));
5888	va = kva_alloc(size);
5889	if (bootverbose)
5890		printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
5891	KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));
5892
5893	if (!va)
5894		panic("%s: Couldn't alloc kernel virtual memory", __func__);
5895
5896	for (tmpva = va; size > 0;) {
5897		mmu_radix_kenter_attr(tmpva, ppa, attr);
5898		size -= PAGE_SIZE;
5899		tmpva += PAGE_SIZE;
5900		ppa += PAGE_SIZE;
5901	}
5902	ptesync();
5903
5904	return ((void *)(va + offset));
5905}
5906
5907static void *
5908mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
5909{
5910
5911	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
5912
5913	return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
5914}
5915
5916void
5917mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5918{
5919
5920	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
5921	m->md.mdpg_cache_attrs = ma;
5922
5923	/*
5924	 * If "m" is a normal page, update its direct mapping.  This update
5925	 * can be relied upon to perform any cache operations that are
5926	 * required for data coherence.
5927	 */
5928	if ((m->flags & PG_FICTITIOUS) == 0 &&
5929	    mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
5930	    PAGE_SIZE, m->md.mdpg_cache_attrs))
5931		panic("memory attribute change on the direct map failed");
5932}
5933
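/*
 * Undo a mapping created by mmu_radix_mapdev() or
 * mmu_radix_mapdev_attr().  Addresses within the direct map are left
 * untouched.
 */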
5934static void
5935mmu_radix_unmapdev(void *p, vm_size_t size)
5936{
5937	vm_offset_t offset, va;
5938
5939	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, p, size);
5940
5941	/* If we gave a direct map region in pmap_mapdev, do nothing */
5942	va = (vm_offset_t)p;
5943	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
5944		return;
5945
5946	offset = va & PAGE_MASK;
5947	size = round_page(offset + size);
5948	va = trunc_page(va);
5949
5950	if (pmap_initialized) {
5951		mmu_radix_qremove(va, atop(size));
5952		kva_free(va, size);
5953	}
5954}
5955
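/*
 * Synchronize the instruction cache with the contents of the given
 * virtual address range, one page at a time, skipping pages that have
 * no physical backing.
 */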
5956void
5957mmu_radix_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5958{
5959	vm_paddr_t pa = 0;
5960	int sync_sz;
5961
5962	if (__predict_false(pm == NULL))
5963		pm = &curthread->td_proc->p_vmspace->vm_pmap;
5964
5965	while (sz > 0) {
5966		pa = pmap_extract(pm, va);
5967		sync_sz = PAGE_SIZE - (va & PAGE_MASK);
5968		sync_sz = min(sync_sz, sz);
5969		if (pa != 0) {
5970			pa += (va & PAGE_MASK);
5971			__syncicache((void *)PHYS_TO_DMAP(pa), sync_sz);
5972		}
5973		va += sync_sz;
5974		sz -= sync_sz;
5975	}
5976}
5977
5978static __inline void
5979pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
5980{
5981	uint64_t opte, npte;
5982
	/*
	 * Spin until the PTE has been updated with the new attribute bits.
	 * The compare-and-set may fail if another processor concurrently
	 * sets PG_M or PG_A in the same PTE.
	 */
5987	do {
5988		opte = be64toh(*pte);
5989		npte = opte & ~mask;
5990		npte |= cache_bits;
5991	} while (npte != opte && !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte)));
5992}
5993
5994/*
5995 * Tries to demote a 1GB page mapping.
5996 */
5997static bool
5998pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
5999{
6000	pml2_entry_t oldpdpe;
6001	pml3_entry_t *firstpde, newpde, *pde;
6002	vm_paddr_t pdpgpa;
6003	vm_page_t pdpg;
6004
6005	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6006	oldpdpe = be64toh(*l2e);
	KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
	    ("pmap_demote_pdpe: oldpdpe is missing RPTE_LEAF and/or PG_V"));
6009	pdpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
6010	if (pdpg == NULL) {
6011		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6012		    " in pmap %p", va, pmap);
6013		return (false);
6014	}
6015	pdpg->pindex = va >> L2_PAGE_SIZE_SHIFT;
6016	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
6017	firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa);
6018	KASSERT((oldpdpe & PG_A) != 0,
6019	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6020	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6021	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6022	newpde = oldpdpe;
6023
6024	/*
6025	 * Initialize the page directory page.
6026	 */
6027	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6028		*pde = htobe64(newpde);
6029		newpde += L3_PAGE_SIZE;
6030	}
6031
6032	/*
6033	 * Demote the mapping.
6034	 */
6035	pde_store(l2e, pdpgpa);
6036
6037	/*
6038	 * Flush PWC --- XXX revisit
6039	 */
6040	pmap_invalidate_all(pmap);
6041
6042	counter_u64_add(pmap_l2e_demotions, 1);
6043	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6044	    " in pmap %p", va, pmap);
6045	return (true);
6046}
6047
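/*
 * Extract the physical address that backs the kernel virtual address
 * "va", handling direct map addresses, 2MB leaf mappings, and 4KB
 * mappings.
 */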
6048vm_paddr_t
6049mmu_radix_kextract(vm_offset_t va)
6050{
6051	pml3_entry_t l3e;
6052	vm_paddr_t pa;
6053
6054	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
6055	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
6056		pa = DMAP_TO_PHYS(va);
6057	} else {
6058		/* Big-endian PTE on stack */
6059		l3e = *pmap_pml3e(kernel_pmap, va);
6060		if (be64toh(l3e) & RPTE_LEAF) {
6061			pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
6063		} else {
6064			/*
6065			 * Beware of a concurrent promotion that changes the
6066			 * PDE at this point!  For example, vtopte() must not
6067			 * be used to access the PTE because it would use the
6068			 * new PDE.  It is, however, safe to use the old PDE
6069			 * because the page table page is preserved by the
6070			 * promotion.
6071			 */
6072			pa = be64toh(*pmap_l3e_to_pte(&l3e, va));
6073			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
6075		}
6076	}
6077	return (pa);
6078}
6079
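/*
 * Compute the PTE attribute bits for the given physical address.  An
 * explicit memory attribute takes precedence; otherwise addresses that
 * fall within a physical memory region are treated as cacheable memory
 * and everything else as guarded, cache-inhibited I/O space.
 */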
6080static pt_entry_t
6081mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
6082{
6083
	if (ma != VM_MEMATTR_DEFAULT)
		return (pmap_cache_bits(ma));
6087
6088	/*
6089	 * Assume the page is cache inhibited and access is guarded unless
6090	 * it's in our available memory array.
6091	 */
6092	for (int i = 0; i < pregions_sz; i++) {
6093		if ((pa >= pregions[i].mr_start) &&
6094		    (pa < (pregions[i].mr_start + pregions[i].mr_size)))
6095			return (RPTE_ATTR_MEM);
6096	}
6097	return (RPTE_ATTR_GUARDEDIO);
6098}
6099
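/*
 * Install a writable kernel mapping for physical address "pa" at
 * virtual address "va", with cache attribute bits derived from "ma".
 */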
6100static void
6101mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
6102{
6103	pt_entry_t *pte, pteval;
6104	uint64_t cache_bits;
6105
6106	pte = kvtopte(va);
6107	MPASS(pte != NULL);
6108	pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
6109	cache_bits = mmu_radix_calc_wimg(pa, ma);
6110	pte_store(pte, pteval | cache_bits);
6111}
6112
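/*
 * Remove the kernel mapping at the given virtual address by clearing
 * its PTE.
 */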
6113void
6114mmu_radix_kremove(vm_offset_t va)
6115{
6116	pt_entry_t *pte;
6117
6118	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
6119
6120	pte = kvtopte(va);
6121	pte_clear(pte);
6122}
6123
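/*
 * Classify the given address as a user or kernel address; no further
 * decoding is required on radix.
 */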
6124int
6125mmu_radix_decode_kernel_ptr(vm_offset_t addr,
6126    int *is_user, vm_offset_t *decoded)
6127{
6128
6129	CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
6130	*decoded = addr;
6131	*is_user = (addr < VM_MAXUSER_ADDRESS);
6132	return (0);
6133}
6134
6135static int
6136mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
6137{
6138
6139	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
6140	return (mem_valid(pa, size));
6141}
6142
6143static void
6144mmu_radix_scan_init(void)
6145{
6146
6147	CTR1(KTR_PMAP, "%s()", __func__);
6148	UNIMPLEMENTED();
6149}
6150
6151static void
6152mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz,
6153	void **va)
6154{
6155	CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
6156	UNIMPLEMENTED();
6157}
6158
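/*
 * Quick mappings are satisfied from the direct map, so no temporary
 * kernel virtual address needs to be allocated or freed.
 */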
6159vm_offset_t
6160mmu_radix_quick_enter_page(vm_page_t m)
6161{
6162	vm_paddr_t paddr;
6163
6164	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
6165	paddr = VM_PAGE_TO_PHYS(m);
6166	return (PHYS_TO_DMAP(paddr));
6167}
6168
6169void
6170mmu_radix_quick_remove_page(vm_offset_t addr __unused)
6171{
6172	/* no work to do here */
6173	CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
6174}
6175
6176static void
6177pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
6178{
6179	cpu_flush_dcache((void *)sva, eva - sva);
6180}
6181
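/*
 * Change the memory attributes of the given range of kernel virtual
 * addresses, demoting large mappings where necessary and keeping the
 * corresponding direct map range consistent.
 */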
6182int
6183mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
6184    vm_memattr_t mode)
6185{
6186	int error;
6187
6188	CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
6189	PMAP_LOCK(kernel_pmap);
6190	error = pmap_change_attr_locked(va, size, mode, true);
6191	PMAP_UNLOCK(kernel_pmap);
6192	return (error);
6193}
6194
6195static int
6196pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
6197{
6198	vm_offset_t base, offset, tmpva;
6199	vm_paddr_t pa_start, pa_end, pa_end1;
6200	pml2_entry_t *l2e;
6201	pml3_entry_t *l3e;
6202	pt_entry_t *pte;
6203	int cache_bits, error;
6204	bool changed;
6205
6206	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6207	base = trunc_page(va);
6208	offset = va & PAGE_MASK;
6209	size = round_page(offset + size);
6210
	/*
	 * Only supported on kernel virtual addresses, including the direct
	 * map.
	 */
6215	if (base < DMAP_MIN_ADDRESS)
6216		return (EINVAL);
6217
6218	cache_bits = pmap_cache_bits(mode);
6219	changed = false;
6220
	/*
	 * Pages that aren't mapped aren't supported.  Also break down
	 * 1GB pages into 2MB pages and 2MB pages into 4KB pages if
	 * required.
	 */
6225	for (tmpva = base; tmpva < base + size; ) {
6226		l2e = pmap_pml2e(kernel_pmap, tmpva);
6227		if (l2e == NULL || *l2e == 0)
6228			return (EINVAL);
6229		if (be64toh(*l2e) & RPTE_LEAF) {
6230			/*
6231			 * If the current 1GB page already has the required
6232			 * memory type, then we need not demote this page. Just
6233			 * increment tmpva to the next 1GB page frame.
6234			 */
6235			if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) {
6236				tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
6237				continue;
6238			}
6239
6240			/*
6241			 * If the current offset aligns with a 1GB page frame
6242			 * and there is at least 1GB left within the range, then
6243			 * we need not break down this page into 2MB pages.
6244			 */
6245			if ((tmpva & L2_PAGE_MASK) == 0 &&
6246			    tmpva + L2_PAGE_MASK < base + size) {
				tmpva += L2_PAGE_SIZE;
6248				continue;
6249			}
6250			if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
6251				return (ENOMEM);
6252		}
6253		l3e = pmap_l2e_to_l3e(l2e, tmpva);
6254		KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
6255		    tmpva, l2e));
6256		if (*l3e == 0)
6257			return (EINVAL);
6258		if (be64toh(*l3e) & RPTE_LEAF) {
6259			/*
6260			 * If the current 2MB page already has the required
6261			 * memory type, then we need not demote this page. Just
6262			 * increment tmpva to the next 2MB page frame.
6263			 */
6264			if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) {
6265				tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
6266				continue;
6267			}
6268
6269			/*
6270			 * If the current offset aligns with a 2MB page frame
6271			 * and there is at least 2MB left within the range, then
6272			 * we need not break down this page into 4KB pages.
6273			 */
6274			if ((tmpva & L3_PAGE_MASK) == 0 &&
6275			    tmpva + L3_PAGE_MASK < base + size) {
6276				tmpva += L3_PAGE_SIZE;
6277				continue;
6278			}
6279			if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
6280				return (ENOMEM);
6281		}
6282		pte = pmap_l3e_to_pte(l3e, tmpva);
6283		if (*pte == 0)
6284			return (EINVAL);
6285		tmpva += PAGE_SIZE;
6286	}
6287	error = 0;
6288
6289	/*
6290	 * Ok, all the pages exist, so run through them updating their
6291	 * cache mode if required.
6292	 */
6293	pa_start = pa_end = 0;
6294	for (tmpva = base; tmpva < base + size; ) {
6295		l2e = pmap_pml2e(kernel_pmap, tmpva);
6296		if (be64toh(*l2e) & RPTE_LEAF) {
6297			if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) {
6298				pmap_pte_attr(l2e, cache_bits,
6299				    RPTE_ATTR_MASK);
6300				changed = true;
6301			}
6302			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*l2e) & PG_PS_FRAME) < dmaplimit) {
6304				if (pa_start == pa_end) {
6305					/* Start physical address run. */
6306					pa_start = be64toh(*l2e) & PG_PS_FRAME;
6307					pa_end = pa_start + L2_PAGE_SIZE;
6308				} else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME))
6309					pa_end += L2_PAGE_SIZE;
6310				else {
6311					/* Run ended, update direct map. */
6312					error = pmap_change_attr_locked(
6313					    PHYS_TO_DMAP(pa_start),
6314					    pa_end - pa_start, mode, flush);
6315					if (error != 0)
6316						break;
6317					/* Start physical address run. */
6318					pa_start = be64toh(*l2e) & PG_PS_FRAME;
6319					pa_end = pa_start + L2_PAGE_SIZE;
6320				}
6321			}
6322			tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
6323			continue;
6324		}
6325		l3e = pmap_l2e_to_l3e(l2e, tmpva);
6326		if (be64toh(*l3e) & RPTE_LEAF) {
6327			if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) {
6328				pmap_pte_attr(l3e, cache_bits,
6329				    RPTE_ATTR_MASK);
6330				changed = true;
6331			}
6332			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6333			    (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) {
6334				if (pa_start == pa_end) {
6335					/* Start physical address run. */
6336					pa_start = be64toh(*l3e) & PG_PS_FRAME;
6337					pa_end = pa_start + L3_PAGE_SIZE;
6338				} else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME))
6339					pa_end += L3_PAGE_SIZE;
6340				else {
6341					/* Run ended, update direct map. */
6342					error = pmap_change_attr_locked(
6343					    PHYS_TO_DMAP(pa_start),
6344					    pa_end - pa_start, mode, flush);
6345					if (error != 0)
6346						break;
6347					/* Start physical address run. */
6348					pa_start = be64toh(*l3e) & PG_PS_FRAME;
6349					pa_end = pa_start + L3_PAGE_SIZE;
6350				}
6351			}
6352			tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
6353		} else {
6354			pte = pmap_l3e_to_pte(l3e, tmpva);
6355			if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) {
6356				pmap_pte_attr(pte, cache_bits,
6357				    RPTE_ATTR_MASK);
6358				changed = true;
6359			}
6360			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6361			    (be64toh(*pte) & PG_FRAME) < dmaplimit) {
6362				if (pa_start == pa_end) {
6363					/* Start physical address run. */
6364					pa_start = be64toh(*pte) & PG_FRAME;
6365					pa_end = pa_start + PAGE_SIZE;
6366				} else if (pa_end == (be64toh(*pte) & PG_FRAME))
6367					pa_end += PAGE_SIZE;
6368				else {
6369					/* Run ended, update direct map. */
6370					error = pmap_change_attr_locked(
6371					    PHYS_TO_DMAP(pa_start),
6372					    pa_end - pa_start, mode, flush);
6373					if (error != 0)
6374						break;
6375					/* Start physical address run. */
6376					pa_start = be64toh(*pte) & PG_FRAME;
6377					pa_end = pa_start + PAGE_SIZE;
6378				}
6379			}
6380			tmpva += PAGE_SIZE;
6381		}
6382	}
6383	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
6384		pa_end1 = MIN(pa_end, dmaplimit);
6385		if (pa_start != pa_end1)
6386			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6387			    pa_end1 - pa_start, mode, flush);
6388	}
6389
6390	/*
6391	 * Flush CPU caches if required to make sure any data isn't cached that
6392	 * shouldn't be, etc.
6393	 */
6394	if (changed) {
6395		pmap_invalidate_all(kernel_pmap);
6396
6397		if (flush)
6398			pmap_invalidate_cache_range(base, tmpva);
6399	}
6400	return (error);
6401}
6402
6403/*
6404 * Allocate physical memory for the vm_page array and map it into KVA,
6405 * attempting to back the vm_pages with domain-local memory.
6406 */
6407void
6408mmu_radix_page_array_startup(long pages)
6409{
6410#ifdef notyet
6411	pml2_entry_t *l2e;
6412	pml3_entry_t *pde;
6413	pml3_entry_t newl3;
6414	vm_offset_t va;
6415	long pfn;
6416	int domain, i;
6417#endif
6418	vm_paddr_t pa;
6419	vm_offset_t start, end;
6420
6421	vm_page_array_size = pages;
6422
6423	start = VM_MIN_KERNEL_ADDRESS;
6424	end = start + pages * sizeof(struct vm_page);
6425
6426	pa = vm_phys_early_alloc(0, end - start);
6427
6428	start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
6429#ifdef notyet
6430	/* TODO: NUMA vm_page_array.  Blocked out until then (copied from amd64). */
6431	for (va = start; va < end; va += L3_PAGE_SIZE) {
6432		pfn = first_page + (va - start) / sizeof(struct vm_page);
6433		domain = vm_phys_domain(ptoa(pfn));
6434		l2e = pmap_pml2e(kernel_pmap, va);
6435		if ((be64toh(*l2e) & PG_V) == 0) {
6436			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
6437			dump_add_page(pa);
6438			pagezero(PHYS_TO_DMAP(pa));
6439			pde_store(l2e, (pml2_entry_t)pa);
6440		}
6441		pde = pmap_l2e_to_l3e(l2e, va);
6442		if ((be64toh(*pde) & PG_V) != 0)
6443			panic("Unexpected pde %p", pde);
6444		pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
6445		for (i = 0; i < NPDEPG; i++)
6446			dump_add_page(pa + i * PAGE_SIZE);
6447		newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
6448		pte_store(pde, newl3);
6449	}
6450#endif
6451	vm_page_array = (vm_page_t)start;
6452}
6453
6454#ifdef DDB
6455#include <sys/kdb.h>
6456#include <ddb/ddb.h>
6457
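/*
 * Print the page table entries at each level for the given virtual
 * address, starting from the supplied level 1 table and stopping at
 * the first invalid or leaf entry.
 */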
6458static void
6459pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
6460{
6461	pml1_entry_t *l1e;
6462	pml2_entry_t *l2e;
6463	pml3_entry_t *l3e;
6464	pt_entry_t *pte;
6465
6466	l1e = &l1[pmap_pml1e_index(va)];
6467	db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e));
6468	if ((be64toh(*l1e) & PG_V) == 0) {
6469		db_printf("\n");
6470		return;
6471	}
6472	l2e = pmap_l1e_to_l2e(l1e, va);
6473	db_printf(" l2e %#016lx", be64toh(*l2e));
6474	if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) {
6475		db_printf("\n");
6476		return;
6477	}
6478	l3e = pmap_l2e_to_l3e(l2e, va);
6479	db_printf(" l3e %#016lx", be64toh(*l3e));
6480	if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) {
6481		db_printf("\n");
6482		return;
6483	}
6484	pte = pmap_l3e_to_pte(l3e, va);
6485	db_printf(" pte %#016lx\n", be64toh(*pte));
6486}
6487
6488void
6489pmap_page_print_mappings(vm_page_t m)
6490{
6491	pmap_t pmap;
6492	pv_entry_t pv;
6493
6494	db_printf("page %p(%lx)\n", m, m->phys_addr);
6495	/* need to elide locks if running in ddb */
6496	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
6497		db_printf("pv: %p ", pv);
6498		db_printf("va: %#016lx ", pv->pv_va);
6499		pmap = PV_PMAP(pv);
6500		db_printf("pmap %p  ", pmap);
6501		if (pmap != NULL) {
6502			db_printf("asid: %lu\n", pmap->pm_pid);
6503			pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
6504		}
6505	}
6506}
6507
6508DB_SHOW_COMMAND(pte, pmap_print_pte)
6509{
6510	vm_offset_t va;
6511	pmap_t pmap;
6512
6513	if (!have_addr) {
6514		db_printf("show pte addr\n");
6515		return;
6516	}
6517	va = (vm_offset_t)addr;
6518
6519	if (va >= DMAP_MIN_ADDRESS)
6520		pmap = kernel_pmap;
6521	else if (kdb_thread != NULL)
6522		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
6523	else
6524		pmap = vmspace_pmap(curthread->td_proc->p_vmspace);
6525
6526	pmap_pte_walk(pmap->pm_pml1, va);
6527}
6528
6529#endif
6530