mmu_oea64.c revision 260672
1/*-
2 * Copyright (c) 2001 The NetBSD Foundation, Inc.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to The NetBSD Foundation
6 * by Matt Thomas <matt@3am-software.com> of Allegro Networks, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *        This product includes software developed by the NetBSD
19 *        Foundation, Inc. and its contributors.
20 * 4. Neither the name of The NetBSD Foundation nor the names of its
21 *    contributors may be used to endorse or promote products derived
22 *    from this software without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
26 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
27 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
28 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 */
36/*-
37 * Copyright (C) 1995, 1996 Wolfgang Solfrank.
38 * Copyright (C) 1995, 1996 TooLs GmbH.
39 * All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 *    notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 *    notice, this list of conditions and the following disclaimer in the
48 *    documentation and/or other materials provided with the distribution.
49 * 3. All advertising materials mentioning features or use of this software
50 *    must display the following acknowledgement:
51 *	This product includes software developed by TooLs GmbH.
52 * 4. The name of TooLs GmbH may not be used to endorse or promote products
53 *    derived from this software without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
56 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
57 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
58 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
59 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
61 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
63 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
64 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 *
66 * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $
67 */
68/*-
69 * Copyright (C) 2001 Benno Rice.
70 * All rights reserved.
71 *
72 * Redistribution and use in source and binary forms, with or without
73 * modification, are permitted provided that the following conditions
74 * are met:
75 * 1. Redistributions of source code must retain the above copyright
76 *    notice, this list of conditions and the following disclaimer.
77 * 2. Redistributions in binary form must reproduce the above copyright
78 *    notice, this list of conditions and the following disclaimer in the
79 *    documentation and/or other materials provided with the distribution.
80 *
81 * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
82 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
83 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
84 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
85 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
86 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
87 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
88 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
89 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
90 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
91 */
92
93#include <sys/cdefs.h>
94__FBSDID("$FreeBSD: stable/10/sys/powerpc/aim/mmu_oea64.c 260672 2014-01-15 05:41:28Z jhibbits $");
95
96/*
97 * Manages physical address maps.
98 *
99 * Since the information managed by this module is also stored by the
100 * logical address mapping module, this module may throw away valid virtual
101 * to physical mappings at almost any time.  However, invalidations of
102 * mappings must be done as requested.
103 *
104 * In order to cope with hardware architectures which make virtual to
105 * physical map invalidates expensive, this module may delay invalidate
106 * or reduced protection operations until such time as they are actually
107 * necessary.  This module is given full information as to which processors
108 * are currently using which maps, and to when physical maps must be made
109 * correct.
110 */
111
112#include "opt_compat.h"
113#include "opt_kstack_pages.h"
114
115#include <sys/param.h>
116#include <sys/kernel.h>
117#include <sys/queue.h>
118#include <sys/cpuset.h>
119#include <sys/ktr.h>
120#include <sys/lock.h>
121#include <sys/msgbuf.h>
122#include <sys/malloc.h>
123#include <sys/mutex.h>
124#include <sys/proc.h>
125#include <sys/rwlock.h>
126#include <sys/sched.h>
127#include <sys/sysctl.h>
128#include <sys/systm.h>
129#include <sys/vmmeter.h>
130
131#include <sys/kdb.h>
132
133#include <dev/ofw/openfirm.h>
134
135#include <vm/vm.h>
136#include <vm/vm_param.h>
137#include <vm/vm_kern.h>
138#include <vm/vm_page.h>
139#include <vm/vm_map.h>
140#include <vm/vm_object.h>
141#include <vm/vm_extern.h>
142#include <vm/vm_pageout.h>
143#include <vm/uma.h>
144
145#include <machine/_inttypes.h>
146#include <machine/cpu.h>
147#include <machine/platform.h>
148#include <machine/frame.h>
149#include <machine/md_var.h>
150#include <machine/psl.h>
151#include <machine/bat.h>
152#include <machine/hid.h>
153#include <machine/pte.h>
154#include <machine/sr.h>
155#include <machine/trap.h>
156#include <machine/mmuvar.h>
157
158#include "mmu_oea64.h"
159#include "mmu_if.h"
160#include "moea64_if.h"
161
162void moea64_release_vsid(uint64_t vsid);
163uintptr_t moea64_get_unique_vsid(void);
164
165#define DISABLE_TRANS(msr)	msr = mfmsr(); mtmsr(msr & ~PSL_DR)
166#define ENABLE_TRANS(msr)	mtmsr(msr)
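/*
 * DISABLE_TRANS() saves the current MSR and clears PSL_DR, turning off data
 * address translation so physical addresses can be touched directly;
 * ENABLE_TRANS() restores the saved MSR.  A typical bracketed use, as seen
 * in the bootstrap code below, looks roughly like:
 *
 *	register_t msr;
 *	DISABLE_TRANS(msr);
 *	... operate on physical addresses ...
 *	ENABLE_TRANS(msr);
 */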
167
168#define	VSID_MAKE(sr, hash)	((sr) | (((hash) & 0xfffff) << 4))
169#define	VSID_TO_HASH(vsid)	(((vsid) >> 4) & 0xfffff)
170#define	VSID_HASH_MASK		0x0000007fffffffffULL
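/*
 * VSID_MAKE() packs a 20-bit hash into bits 4..23 of a VSID, leaving the low
 * four bits for the segment register number; VSID_TO_HASH() reverses this.
 * For example, VSID_MAKE(3, 0x12345) yields 0x123453, and
 * VSID_TO_HASH(0x123453) gives back 0x12345.
 */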
171
172/*
173 * Locking semantics:
174 * -- Read lock: sufficient if no modifications are being made to the PVO
175 *    lists or the page table, or if the only modifications are internal
176 *    changes (e.g. wiring, protection) that leave the PVOs in place and
177 *    associated with the same pmap; such changes should still be protected
178 *    by the pmap lock.
179 * -- Write lock: required if PTEs/PVOs are being inserted or removed.
180 */
181
182#define LOCK_TABLE_RD() rw_rlock(&moea64_table_lock)
183#define UNLOCK_TABLE_RD() rw_runlock(&moea64_table_lock)
184#define LOCK_TABLE_WR() rw_wlock(&moea64_table_lock)
185#define UNLOCK_TABLE_WR() rw_wunlock(&moea64_table_lock)
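/*
 * A sketch of how these are used elsewhere in this file: callers that insert
 * or remove mappings take the table lock and then the pmap lock, e.g.
 *
 *	LOCK_TABLE_WR();
 *	PMAP_LOCK(pm);
 *	... insert or remove PVOs ...
 *	PMAP_UNLOCK(pm);
 *	UNLOCK_TABLE_WR();
 *
 * while read-only walks of the PVO lists take LOCK_TABLE_RD() instead.
 */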
186
187struct ofw_map {
188	cell_t	om_va;
189	cell_t	om_len;
190	cell_t	om_pa_hi;
191	cell_t	om_pa_lo;
192	cell_t	om_mode;
193};
194
195extern unsigned char _etext[];
196extern unsigned char _end[];
197
198extern int dumpsys_minidump;
199
200/*
201 * Map of physical memory regions.
202 */
203static struct	mem_region *regions;
204static struct	mem_region *pregions;
205static u_int	phys_avail_count;
206static int	regions_sz, pregions_sz;
207
208extern void bs_remap_earlyboot(void);
209
210/*
211 * Lock for the pteg and pvo tables.
212 */
213struct rwlock	moea64_table_lock;
214struct mtx	moea64_slb_mutex;
215
216/*
217 * PTEG data.
218 */
219u_int		moea64_pteg_count;
220u_int		moea64_pteg_mask;
221
222/*
223 * PVO data.
224 */
225struct	pvo_head *moea64_pvo_table;		/* pvo entries by pteg index */
226
227uma_zone_t	moea64_upvo_zone; /* zone for pvo entries for unmanaged pages */
228uma_zone_t	moea64_mpvo_zone; /* zone for pvo entries for managed pages */
229
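/*
 * Statically sized pool of PVO entries used while bootstrapping, before the
 * UMA zones above are usable; entries appear to be handed out sequentially
 * via moea64_bpvo_pool_index and are not returned to the pool.
 */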
230#define	BPVO_POOL_SIZE	327680
231static struct	pvo_entry *moea64_bpvo_pool;
232static int	moea64_bpvo_pool_index = 0;
233
234#define	VSID_NBPW	(sizeof(u_int32_t) * 8)
235#ifdef __powerpc64__
236#define	NVSIDS		(NPMAPS * 16)
237#define VSID_HASHMASK	0xffffffffUL
238#else
239#define NVSIDS		NPMAPS
240#define VSID_HASHMASK	0xfffffUL
241#endif
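/*
 * One bit per VSID, VSID_NBPW (32) bits per array word: the bitmap below
 * tracks which VSIDs have been handed out (see moea64_get_unique_vsid()).
 */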
242static u_int	moea64_vsid_bitmap[NVSIDS / VSID_NBPW];
243
244static boolean_t moea64_initialized = FALSE;
245
246/*
247 * Statistics.
248 */
249u_int	moea64_pte_valid = 0;
250u_int	moea64_pte_overflow = 0;
251u_int	moea64_pvo_entries = 0;
252u_int	moea64_pvo_enter_calls = 0;
253u_int	moea64_pvo_remove_calls = 0;
254SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD,
255    &moea64_pte_valid, 0, "");
256SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD,
257    &moea64_pte_overflow, 0, "");
258SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD,
259    &moea64_pvo_entries, 0, "");
260SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD,
261    &moea64_pvo_enter_calls, 0, "");
262SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD,
263    &moea64_pvo_remove_calls, 0, "");
264
265vm_offset_t	moea64_scratchpage_va[2];
266struct pvo_entry *moea64_scratchpage_pvo[2];
267uintptr_t	moea64_scratchpage_pte[2];
268struct	mtx	moea64_scratchpage_mtx;
269
270uint64_t 	moea64_large_page_mask = 0;
271uint64_t	moea64_large_page_size = 0;
272int		moea64_large_page_shift = 0;
273
274/*
275 * PVO calls.
276 */
277static int	moea64_pvo_enter(mmu_t, pmap_t, uma_zone_t, struct pvo_head *,
278		    vm_offset_t, vm_offset_t, uint64_t, int);
279static void	moea64_pvo_remove(mmu_t, struct pvo_entry *);
280static struct	pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t);
281
282/*
283 * Utility routines.
284 */
285static boolean_t	moea64_query_bit(mmu_t, vm_page_t, u_int64_t);
286static u_int		moea64_clear_bit(mmu_t, vm_page_t, u_int64_t);
287static void		moea64_kremove(mmu_t, vm_offset_t);
288static void		moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va,
289			    vm_offset_t pa, vm_size_t sz);
290
291/*
292 * Kernel MMU interface
293 */
294void moea64_change_wiring(mmu_t, pmap_t, vm_offset_t, boolean_t);
295void moea64_clear_modify(mmu_t, vm_page_t);
296void moea64_copy_page(mmu_t, vm_page_t, vm_page_t);
297void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
298    vm_page_t *mb, vm_offset_t b_offset, int xfersize);
299void moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t);
300void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
301    vm_prot_t);
302void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
303vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t);
304vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t);
305void moea64_init(mmu_t);
306boolean_t moea64_is_modified(mmu_t, vm_page_t);
307boolean_t moea64_is_prefaultable(mmu_t, pmap_t, vm_offset_t);
308boolean_t moea64_is_referenced(mmu_t, vm_page_t);
309int moea64_ts_referenced(mmu_t, vm_page_t);
310vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
311boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t);
312int moea64_page_wired_mappings(mmu_t, vm_page_t);
313void moea64_pinit(mmu_t, pmap_t);
314void moea64_pinit0(mmu_t, pmap_t);
315void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
316void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int);
317void moea64_qremove(mmu_t, vm_offset_t, int);
318void moea64_release(mmu_t, pmap_t);
319void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t);
320void moea64_remove_pages(mmu_t, pmap_t);
321void moea64_remove_all(mmu_t, vm_page_t);
322void moea64_remove_write(mmu_t, vm_page_t);
323void moea64_zero_page(mmu_t, vm_page_t);
324void moea64_zero_page_area(mmu_t, vm_page_t, int, int);
325void moea64_zero_page_idle(mmu_t, vm_page_t);
326void moea64_activate(mmu_t, struct thread *);
327void moea64_deactivate(mmu_t, struct thread *);
328void *moea64_mapdev(mmu_t, vm_paddr_t, vm_size_t);
329void *moea64_mapdev_attr(mmu_t, vm_offset_t, vm_size_t, vm_memattr_t);
330void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t);
331vm_paddr_t moea64_kextract(mmu_t, vm_offset_t);
332void moea64_page_set_memattr(mmu_t, vm_page_t m, vm_memattr_t ma);
333void moea64_kenter_attr(mmu_t, vm_offset_t, vm_offset_t, vm_memattr_t ma);
334void moea64_kenter(mmu_t, vm_offset_t, vm_paddr_t);
335boolean_t moea64_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t);
336static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t);
337vm_offset_t moea64_dumpsys_map(mmu_t mmu, struct pmap_md *md, vm_size_t ofs,
338    vm_size_t *sz);
339struct pmap_md * moea64_scan_md(mmu_t mmu, struct pmap_md *prev);
340
341static mmu_method_t moea64_methods[] = {
342	MMUMETHOD(mmu_change_wiring,	moea64_change_wiring),
343	MMUMETHOD(mmu_clear_modify,	moea64_clear_modify),
344	MMUMETHOD(mmu_copy_page,	moea64_copy_page),
345	MMUMETHOD(mmu_copy_pages,	moea64_copy_pages),
346	MMUMETHOD(mmu_enter,		moea64_enter),
347	MMUMETHOD(mmu_enter_object,	moea64_enter_object),
348	MMUMETHOD(mmu_enter_quick,	moea64_enter_quick),
349	MMUMETHOD(mmu_extract,		moea64_extract),
350	MMUMETHOD(mmu_extract_and_hold,	moea64_extract_and_hold),
351	MMUMETHOD(mmu_init,		moea64_init),
352	MMUMETHOD(mmu_is_modified,	moea64_is_modified),
353	MMUMETHOD(mmu_is_prefaultable,	moea64_is_prefaultable),
354	MMUMETHOD(mmu_is_referenced,	moea64_is_referenced),
355	MMUMETHOD(mmu_ts_referenced,	moea64_ts_referenced),
356	MMUMETHOD(mmu_map,     		moea64_map),
357	MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick),
358	MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings),
359	MMUMETHOD(mmu_pinit,		moea64_pinit),
360	MMUMETHOD(mmu_pinit0,		moea64_pinit0),
361	MMUMETHOD(mmu_protect,		moea64_protect),
362	MMUMETHOD(mmu_qenter,		moea64_qenter),
363	MMUMETHOD(mmu_qremove,		moea64_qremove),
364	MMUMETHOD(mmu_release,		moea64_release),
365	MMUMETHOD(mmu_remove,		moea64_remove),
366	MMUMETHOD(mmu_remove_pages,	moea64_remove_pages),
367	MMUMETHOD(mmu_remove_all,      	moea64_remove_all),
368	MMUMETHOD(mmu_remove_write,	moea64_remove_write),
369	MMUMETHOD(mmu_sync_icache,	moea64_sync_icache),
370	MMUMETHOD(mmu_zero_page,       	moea64_zero_page),
371	MMUMETHOD(mmu_zero_page_area,	moea64_zero_page_area),
372	MMUMETHOD(mmu_zero_page_idle,	moea64_zero_page_idle),
373	MMUMETHOD(mmu_activate,		moea64_activate),
374	MMUMETHOD(mmu_deactivate,      	moea64_deactivate),
375	MMUMETHOD(mmu_page_set_memattr,	moea64_page_set_memattr),
376
377	/* Internal interfaces */
378	MMUMETHOD(mmu_mapdev,		moea64_mapdev),
379	MMUMETHOD(mmu_mapdev_attr,	moea64_mapdev_attr),
380	MMUMETHOD(mmu_unmapdev,		moea64_unmapdev),
381	MMUMETHOD(mmu_kextract,		moea64_kextract),
382	MMUMETHOD(mmu_kenter,		moea64_kenter),
383	MMUMETHOD(mmu_kenter_attr,	moea64_kenter_attr),
384	MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped),
385	MMUMETHOD(mmu_scan_md,		moea64_scan_md),
386	MMUMETHOD(mmu_dumpsys_map,	moea64_dumpsys_map),
387
388	{ 0, 0 }
389};
390
391MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0);
392
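/*
 * Compute the primary PTEG index for a (VSID, virtual address) pair: the
 * low-order VSID bits are XORed with the virtual page index and masked by
 * moea64_pteg_mask, matching the 64-bit PowerPC hashed page table scheme.
 */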
393static __inline u_int
394va_to_pteg(uint64_t vsid, vm_offset_t addr, int large)
395{
396	uint64_t hash;
397	int shift;
398
399	shift = large ? moea64_large_page_shift : ADDR_PIDX_SHFT;
400	hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >>
401	    shift);
402	return (hash & moea64_pteg_mask);
403}
404
405static __inline struct pvo_head *
406vm_page_to_pvoh(vm_page_t m)
407{
408
409	return (&m->md.mdpg_pvoh);
410}
411
412static __inline void
413moea64_pte_create(struct lpte *pt, uint64_t vsid, vm_offset_t va,
414    uint64_t pte_lo, int flags)
415{
416
417	/*
418	 * Construct a PTE.  Default to IMB initially.  Valid bit only gets
419	 * set when the real pte is set in memory.
420	 *
421	 * Note: Don't set the valid bit for correct operation of tlb update.
422	 */
423	pt->pte_hi = (vsid << LPTE_VSID_SHIFT) |
424	    (((uint64_t)(va & ADDR_PIDX) >> ADDR_API_SHFT64) & LPTE_API);
425
426	if (flags & PVO_LARGE)
427		pt->pte_hi |= LPTE_BIG;
428
429	pt->pte_lo = pte_lo;
430}
431
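/*
 * Translate a vm_memattr_t into WIMG bits for the low PTE word.  Roughly:
 * UNCACHEABLE -> I|G, WRITE_COMBINING/WRITE_BACK/PREFETCHABLE -> I,
 * WRITE_THROUGH -> W|M, and for the default attribute, M for addresses that
 * fall within a known physical memory region and I|G otherwise.
 */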
432static __inline uint64_t
433moea64_calc_wimg(vm_offset_t pa, vm_memattr_t ma)
434{
435	uint64_t pte_lo;
436	int i;
437
438	if (ma != VM_MEMATTR_DEFAULT) {
439		switch (ma) {
440		case VM_MEMATTR_UNCACHEABLE:
441			return (LPTE_I | LPTE_G);
442		case VM_MEMATTR_WRITE_COMBINING:
443		case VM_MEMATTR_WRITE_BACK:
444		case VM_MEMATTR_PREFETCHABLE:
445			return (LPTE_I);
446		case VM_MEMATTR_WRITE_THROUGH:
447			return (LPTE_W | LPTE_M);
448		}
449	}
450
451	/*
452	 * Assume the page is cache inhibited and access is guarded unless
453	 * it's in our available memory array.
454	 */
455	pte_lo = LPTE_I | LPTE_G;
456	for (i = 0; i < pregions_sz; i++) {
457		if ((pa >= pregions[i].mr_start) &&
458		    (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
459			pte_lo &= ~(LPTE_I | LPTE_G);
460			pte_lo |= LPTE_M;
461			break;
462		}
463	}
464
465	return pte_lo;
466}
467
468/*
469 * Quick sort callout for comparing memory regions.
470 */
471static int	om_cmp(const void *a, const void *b);
472
473static int
474om_cmp(const void *a, const void *b)
475{
476	const struct	ofw_map *mapa;
477	const struct	ofw_map *mapb;
478
479	mapa = a;
480	mapb = b;
481	if (mapa->om_pa_hi < mapb->om_pa_hi)
482		return (-1);
483	else if (mapa->om_pa_hi > mapb->om_pa_hi)
484		return (1);
485	else if (mapa->om_pa_lo < mapb->om_pa_lo)
486		return (-1);
487	else if (mapa->om_pa_lo > mapb->om_pa_lo)
488		return (1);
489	else
490		return (0);
491}
492
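/*
 * Enter the Open Firmware "translations" property into the kernel pmap.
 * Each entry in the property is laid out like struct ofw_map above
 * (om_va, om_len, om_pa_hi, om_pa_lo, om_mode); entries are sorted by
 * physical address and any page already mapped by the kernel is skipped.
 */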
493static void
494moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz)
495{
496	struct ofw_map	translations[sz/sizeof(struct ofw_map)];
497	register_t	msr;
498	vm_offset_t	off;
499	vm_paddr_t	pa_base;
500	int		i;
501
502	bzero(translations, sz);
503	if (OF_getprop(mmu, "translations", translations, sz) == -1)
504		panic("moea64_bootstrap: can't get ofw translations");
505
506	CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations");
507	sz /= sizeof(*translations);
508	qsort(translations, sz, sizeof (*translations), om_cmp);
509
510	for (i = 0; i < sz; i++) {
511		CTR3(KTR_PMAP, "translation: pa=%#x va=%#x len=%#x",
512		    (uint32_t)(translations[i].om_pa_lo), translations[i].om_va,
513		    translations[i].om_len);
514
515		if (translations[i].om_pa_lo % PAGE_SIZE)
516			panic("OFW translation not page-aligned!");
517
518		pa_base = translations[i].om_pa_lo;
519
520	      #ifdef __powerpc64__
521		pa_base += (vm_offset_t)translations[i].om_pa_hi << 32;
522	      #else
523		if (translations[i].om_pa_hi)
524			panic("OFW translations above 32-bit boundary!");
525	      #endif
526
527		/* Now enter the pages for this mapping */
528
529		DISABLE_TRANS(msr);
530		for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) {
531			if (moea64_pvo_find_va(kernel_pmap,
532			    translations[i].om_va + off) != NULL)
533				continue;
534
535			moea64_kenter(mmup, translations[i].om_va + off,
536			    pa_base + off);
537		}
538		ENABLE_TRANS(msr);
539	}
540}
541
542#ifdef __powerpc64__
543static void
544moea64_probe_large_page(void)
545{
546	uint16_t pvr = mfpvr() >> 16;
547
548	switch (pvr) {
549	case IBM970:
550	case IBM970FX:
551	case IBM970MP:
552		powerpc_sync(); isync();
553		mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG);
554		powerpc_sync(); isync();
555
556		/* FALLTHROUGH */
557	default:
558		moea64_large_page_size = 0x1000000; /* 16 MB */
559		moea64_large_page_shift = 24;
560	}
561
562	moea64_large_page_mask = moea64_large_page_size - 1;
563}
564
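/*
 * Preload a kernel SLB entry covering 'va' so that later accesses to this
 * part of KVA cannot take an SLB fault; used below to cover the whole
 * kernel virtual address range during bootstrap.
 */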
565static void
566moea64_bootstrap_slb_prefault(vm_offset_t va, int large)
567{
568	struct slb *cache;
569	struct slb entry;
570	uint64_t esid, slbe;
571	uint64_t i;
572
573	cache = PCPU_GET(slb);
574	esid = va >> ADDR_SR_SHFT;
575	slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
576
577	for (i = 0; i < 64; i++) {
578		if (cache[i].slbe == (slbe | i))
579			return;
580	}
581
582	entry.slbe = slbe;
583	entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT;
584	if (large)
585		entry.slbv |= SLBV_L;
586
587	slb_insert_kernel(entry.slbe, entry.slbv);
588}
589#endif
590
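/*
 * Establish the early 1:1 mappings.  With large-page support, all of
 * physical memory is entered with wired large-page PVOs (the direct map);
 * otherwise only the PVO table, the bootstrap PVO pool, and the kernel
 * image itself are mapped 1:1 with ordinary pages.
 */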
591static void
592moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart,
593    vm_offset_t kernelend)
594{
595	register_t msr;
596	vm_paddr_t pa;
597	vm_offset_t size, off;
598	uint64_t pte_lo;
599	int i;
600
601	if (moea64_large_page_size == 0)
602		hw_direct_map = 0;
603
604	DISABLE_TRANS(msr);
605	if (hw_direct_map) {
606		LOCK_TABLE_WR();
607		PMAP_LOCK(kernel_pmap);
608		for (i = 0; i < pregions_sz; i++) {
609		  for (pa = pregions[i].mr_start; pa < pregions[i].mr_start +
610		     pregions[i].mr_size; pa += moea64_large_page_size) {
611			pte_lo = LPTE_M;
612
613			/*
614			 * Set memory access as guarded if prefetch within
615			 * the page could leave the available physmem area.
616			 */
617			if (pa & moea64_large_page_mask) {
618				pa &= moea64_large_page_mask;
619				pte_lo |= LPTE_G;
620			}
621			if (pa + moea64_large_page_size >
622			    pregions[i].mr_start + pregions[i].mr_size)
623				pte_lo |= LPTE_G;
624
625			moea64_pvo_enter(mmup, kernel_pmap, moea64_upvo_zone,
626				    NULL, pa, pa, pte_lo,
627				    PVO_WIRED | PVO_LARGE);
628		  }
629		}
630		PMAP_UNLOCK(kernel_pmap);
631		UNLOCK_TABLE_WR();
632	} else {
633		size = sizeof(struct pvo_head) * moea64_pteg_count;
634		off = (vm_offset_t)(moea64_pvo_table);
635		for (pa = off; pa < off + size; pa += PAGE_SIZE)
636			moea64_kenter(mmup, pa, pa);
637		size = BPVO_POOL_SIZE*sizeof(struct pvo_entry);
638		off = (vm_offset_t)(moea64_bpvo_pool);
639		for (pa = off; pa < off + size; pa += PAGE_SIZE)
640			moea64_kenter(mmup, pa, pa);
641
642		/*
643		 * Map certain important things, like ourselves.
644		 *
645		 * NOTE: We do not map the exception vector space. That code is
646		 * used only in real mode, and leaving it unmapped allows us to
647		 * catch NULL pointer dereferences, instead of making NULL a valid
648		 * address.
649		 */
650
651		for (pa = kernelstart & ~PAGE_MASK; pa < kernelend;
652		    pa += PAGE_SIZE)
653			moea64_kenter(mmup, pa, pa);
654	}
655	ENABLE_TRANS(msr);
656
657	/*
658	 * Allow user to override unmapped_buf_allowed for testing.
659	 * XXXKIB Only direct map implementation was tested.
660	 */
661	if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
662	    &unmapped_buf_allowed))
663		unmapped_buf_allowed = hw_direct_map;
664}
665
666void
667moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
668{
669	int		i, j;
670	vm_size_t	physsz, hwphyssz;
671
672#ifndef __powerpc64__
673	/* We don't have a direct map since there is no BAT */
674	hw_direct_map = 0;
675
676	/* Make sure battable is zero, since we have no BAT */
677	for (i = 0; i < 16; i++) {
678		battable[i].batu = 0;
679		battable[i].batl = 0;
680	}
681#else
682	moea64_probe_large_page();
683
684	/* Use a direct map if we have large page support */
685	if (moea64_large_page_size > 0)
686		hw_direct_map = 1;
687	else
688		hw_direct_map = 0;
689#endif
690
691	/* Get physical memory regions from firmware */
692	mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
693	CTR0(KTR_PMAP, "moea64_bootstrap: physical memory");
694
695	if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz)
696		panic("moea64_bootstrap: phys_avail too small");
697
698	phys_avail_count = 0;
699	physsz = 0;
700	hwphyssz = 0;
701	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
702	for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
703		CTR3(KTR_PMAP, "region: %#x - %#x (%#x)", regions[i].mr_start,
704		    regions[i].mr_start + regions[i].mr_size,
705		    regions[i].mr_size);
706		if (hwphyssz != 0 &&
707		    (physsz + regions[i].mr_size) >= hwphyssz) {
708			if (physsz < hwphyssz) {
709				phys_avail[j] = regions[i].mr_start;
710				phys_avail[j + 1] = regions[i].mr_start +
711				    hwphyssz - physsz;
712				physsz = hwphyssz;
713				phys_avail_count++;
714			}
715			break;
716		}
717		phys_avail[j] = regions[i].mr_start;
718		phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
719		phys_avail_count++;
720		physsz += regions[i].mr_size;
721	}
722
723	/* Check for overlap with the kernel and exception vectors */
724	for (j = 0; j < 2*phys_avail_count; j+=2) {
725		if (phys_avail[j] < EXC_LAST)
726			phys_avail[j] += EXC_LAST;
727
728		if (kernelstart >= phys_avail[j] &&
729		    kernelstart < phys_avail[j+1]) {
730			if (kernelend < phys_avail[j+1]) {
731				phys_avail[2*phys_avail_count] =
732				    (kernelend & ~PAGE_MASK) + PAGE_SIZE;
733				phys_avail[2*phys_avail_count + 1] =
734				    phys_avail[j+1];
735				phys_avail_count++;
736			}
737
738			phys_avail[j+1] = kernelstart & ~PAGE_MASK;
739		}
740
741		if (kernelend >= phys_avail[j] &&
742		    kernelend < phys_avail[j+1]) {
743			if (kernelstart > phys_avail[j]) {
744				phys_avail[2*phys_avail_count] = phys_avail[j];
745				phys_avail[2*phys_avail_count + 1] =
746				    kernelstart & ~PAGE_MASK;
747				phys_avail_count++;
748			}
749
750			phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE;
751		}
752	}
753
754	physmem = btoc(physsz);
755
756#ifdef PTEGCOUNT
757	moea64_pteg_count = PTEGCOUNT;
758#else
759	moea64_pteg_count = 0x1000;
760
761	while (moea64_pteg_count < physmem)
762		moea64_pteg_count <<= 1;
763
764	moea64_pteg_count >>= 1;
765#endif /* PTEGCOUNT */
766}
767
768void
769moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
770{
771	vm_size_t	size;
772	register_t	msr;
773	int		i;
774
775	/*
776	 * Set PTEG mask
777	 */
778	moea64_pteg_mask = moea64_pteg_count - 1;
779
780	/*
781	 * Allocate pv/overflow lists.
782	 */
783	size = sizeof(struct pvo_head) * moea64_pteg_count;
784
785	moea64_pvo_table = (struct pvo_head *)moea64_bootstrap_alloc(size,
786	    PAGE_SIZE);
787	CTR1(KTR_PMAP, "moea64_bootstrap: PVO table at %p", moea64_pvo_table);
788
789	DISABLE_TRANS(msr);
790	for (i = 0; i < moea64_pteg_count; i++)
791		LIST_INIT(&moea64_pvo_table[i]);
792	ENABLE_TRANS(msr);
793
794	/*
795	 * Initialize the lock that synchronizes access to the pteg and pvo
796	 * tables.
797	 */
798	rw_init_flags(&moea64_table_lock, "pmap tables", RW_RECURSE);
799	mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF);
800
801	/*
802	 * Initialize the unmanaged pvo pool.
803	 */
804	moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc(
805		BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0);
806	moea64_bpvo_pool_index = 0;
807
808	/*
809	 * Make sure kernel vsid is allocated as well as VSID 0.
810	 */
811	#ifndef __powerpc64__
812	moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW]
813		|= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
814	moea64_vsid_bitmap[0] |= 1;
815	#endif
816
817	/*
818	 * Initialize the kernel pmap (which is statically allocated).
819	 */
820	#ifdef __powerpc64__
821	for (i = 0; i < 64; i++) {
822		pcpup->pc_slb[i].slbv = 0;
823		pcpup->pc_slb[i].slbe = 0;
824	}
825	#else
826	for (i = 0; i < 16; i++)
827		kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;
828	#endif
829
830	kernel_pmap->pmap_phys = kernel_pmap;
831	CPU_FILL(&kernel_pmap->pm_active);
832	RB_INIT(&kernel_pmap->pmap_pvo);
833
834	PMAP_LOCK_INIT(kernel_pmap);
835
836	/*
837	 * Now map in all the other buffers we allocated earlier
838	 */
839
840	moea64_setup_direct_map(mmup, kernelstart, kernelend);
841}
842
843void
844moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
845{
846	ihandle_t	mmui;
847	phandle_t	chosen;
848	phandle_t	mmu;
849	size_t		sz;
850	int		i;
851	vm_offset_t	pa, va;
852	void		*dpcpu;
853
854	/*
855	 * Set up the Open Firmware pmap and add its mappings if not in real
856	 * mode.
857	 */
858
859	chosen = OF_finddevice("/chosen");
860	if (chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1) {
861	    mmu = OF_instance_to_package(mmui);
862	    if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1)
863		sz = 0;
864	    if (sz > 6144 /* tmpstksz - 2 KB headroom */)
865		panic("moea64_bootstrap: too many ofw translations");
866
867	    if (sz > 0)
868		moea64_add_ofw_mappings(mmup, mmu, sz);
869	}
870
871	/*
872	 * Calculate the last available physical address.
873	 */
874	for (i = 0; phys_avail[i + 2] != 0; i += 2)
875		;
876	Maxmem = powerpc_btop(phys_avail[i + 1]);
877
878	/*
879	 * Initialize MMU and remap early physical mappings
880	 */
881	MMU_CPU_BOOTSTRAP(mmup,0);
882	mtmsr(mfmsr() | PSL_DR | PSL_IR);
883	pmap_bootstrapped++;
884	bs_remap_earlyboot();
885
886	/*
887	 * Set the start and end of kva.
888	 */
889	virtual_avail = VM_MIN_KERNEL_ADDRESS;
890	virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
891
892	/*
893	 * Map the entire KVA range into the SLB. We must not fault there.
894	 */
895	#ifdef __powerpc64__
896	for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH)
897		moea64_bootstrap_slb_prefault(va, 0);
898	#endif
899
900	/*
901	 * Figure out how far we can extend virtual_end into segment 16
902	 * without running into existing mappings. Segment 16 is guaranteed
903	 * to contain neither RAM nor devices (at least on Apple hardware),
904	 * but will generally contain some OFW mappings we should not
905	 * step on.
906	 */
907
908	#ifndef __powerpc64__	/* KVA is in high memory on PPC64 */
909	PMAP_LOCK(kernel_pmap);
910	while (virtual_end < VM_MAX_KERNEL_ADDRESS &&
911	    moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL)
912		virtual_end += PAGE_SIZE;
913	PMAP_UNLOCK(kernel_pmap);
914	#endif
915
916	/*
917	 * Allocate a kernel stack with a guard page for thread0 and map it
918	 * into the kernel page map.
919	 */
920	pa = moea64_bootstrap_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE);
921	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
922	virtual_avail = va + KSTACK_PAGES * PAGE_SIZE;
923	CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
924	thread0.td_kstack = va;
925	thread0.td_kstack_pages = KSTACK_PAGES;
926	for (i = 0; i < KSTACK_PAGES; i++) {
927		moea64_kenter(mmup, va, pa);
928		pa += PAGE_SIZE;
929		va += PAGE_SIZE;
930	}
931
932	/*
933	 * Allocate virtual address space for the message buffer.
934	 */
935	pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE);
936	msgbufp = (struct msgbuf *)virtual_avail;
937	va = virtual_avail;
938	virtual_avail += round_page(msgbufsize);
939	while (va < virtual_avail) {
940		moea64_kenter(mmup, va, pa);
941		pa += PAGE_SIZE;
942		va += PAGE_SIZE;
943	}
944
945	/*
946	 * Allocate virtual address space for the dynamic percpu area.
947	 */
948	pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
949	dpcpu = (void *)virtual_avail;
950	va = virtual_avail;
951	virtual_avail += DPCPU_SIZE;
952	while (va < virtual_avail) {
953		moea64_kenter(mmup, va, pa);
954		pa += PAGE_SIZE;
955		va += PAGE_SIZE;
956	}
957	dpcpu_init(dpcpu, 0);
958
959	/*
960	 * Allocate some things for page zeroing. We put this directly
961	 * in the page table, marked with LPTE_LOCKED, to keep the PVO
962	 * book-keeping and other parts of the VM system from even
963	 * knowing that this hack exists.
964	 */
965
966	if (!hw_direct_map) {
967		mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL,
968		    MTX_DEF);
969		for (i = 0; i < 2; i++) {
970			moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE;
971			virtual_end -= PAGE_SIZE;
972
973			moea64_kenter(mmup, moea64_scratchpage_va[i], 0);
974
975			moea64_scratchpage_pvo[i] = moea64_pvo_find_va(
976			    kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]);
977			LOCK_TABLE_RD();
978			moea64_scratchpage_pte[i] = MOEA64_PVO_TO_PTE(
979			    mmup, moea64_scratchpage_pvo[i]);
980			moea64_scratchpage_pvo[i]->pvo_pte.lpte.pte_hi
981			    |= LPTE_LOCKED;
982			MOEA64_PTE_CHANGE(mmup, moea64_scratchpage_pte[i],
983			    &moea64_scratchpage_pvo[i]->pvo_pte.lpte,
984			    moea64_scratchpage_pvo[i]->pvo_vpn);
985			UNLOCK_TABLE_RD();
986		}
987	}
988}
989
990/*
991 * Activate a user pmap.  The pmap must be activated before its address
992 * space can be accessed in any way.
993 */
994void
995moea64_activate(mmu_t mmu, struct thread *td)
996{
997	pmap_t	pm;
998
999	pm = &td->td_proc->p_vmspace->vm_pmap;
1000	CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
1001
1002	#ifdef __powerpc64__
1003	PCPU_SET(userslb, pm->pm_slb);
1004	#else
1005	PCPU_SET(curpmap, pm->pmap_phys);
1006	#endif
1007}
1008
1009void
1010moea64_deactivate(mmu_t mmu, struct thread *td)
1011{
1012	pmap_t	pm;
1013
1014	pm = &td->td_proc->p_vmspace->vm_pmap;
1015	CPU_CLR(PCPU_GET(cpuid), &pm->pm_active);
1016	#ifdef __powerpc64__
1017	PCPU_SET(userslb, NULL);
1018	#else
1019	PCPU_SET(curpmap, NULL);
1020	#endif
1021}
1022
1023void
1024moea64_change_wiring(mmu_t mmu, pmap_t pm, vm_offset_t va, boolean_t wired)
1025{
1026	struct	pvo_entry *pvo;
1027	uintptr_t pt;
1028	uint64_t vsid;
1029	int	i, ptegidx;
1030
1031	LOCK_TABLE_WR();
1032	PMAP_LOCK(pm);
1033	pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF);
1034
1035	if (pvo != NULL) {
1036		pt = MOEA64_PVO_TO_PTE(mmu, pvo);
1037
1038		if (wired) {
1039			if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
1040				pm->pm_stats.wired_count++;
1041			pvo->pvo_vaddr |= PVO_WIRED;
1042			pvo->pvo_pte.lpte.pte_hi |= LPTE_WIRED;
1043		} else {
1044			if ((pvo->pvo_vaddr & PVO_WIRED) != 0)
1045				pm->pm_stats.wired_count--;
1046			pvo->pvo_vaddr &= ~PVO_WIRED;
1047			pvo->pvo_pte.lpte.pte_hi &= ~LPTE_WIRED;
1048		}
1049
1050		if (pt != -1) {
1051			/* Update wiring flag in page table. */
1052			MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte,
1053			    pvo->pvo_vpn);
1054		} else if (wired) {
1055			/*
1056			 * If we are wiring the page, and it wasn't in the
1057			 * page table before, add it.
1058			 */
1059			vsid = PVO_VSID(pvo);
1060			ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo),
1061			    pvo->pvo_vaddr & PVO_LARGE);
1062
1063			i = MOEA64_PTE_INSERT(mmu, ptegidx, &pvo->pvo_pte.lpte);
1064
1065			if (i >= 0) {
1066				PVO_PTEGIDX_CLR(pvo);
1067				PVO_PTEGIDX_SET(pvo, i);
1068			}
1069		}
1070
1071	}
1072	UNLOCK_TABLE_WR();
1073	PMAP_UNLOCK(pm);
1074}
1075
1076/*
1077 * This goes through and sets the physical address of our
1078 * special scratch PTE to the PA we want to zero or copy. Because
1079 * of locking issues (this can get called in pvo_enter() by
1080 * the UMA allocator), we can't use most other utility functions here.
1081 */
1082
1083static __inline void
1084moea64_set_scratchpage_pa(mmu_t mmup, int which, vm_offset_t pa)
1085{
1086	KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!"));
1087	mtx_assert(&moea64_scratchpage_mtx, MA_OWNED);
1088
1089	moea64_scratchpage_pvo[which]->pvo_pte.lpte.pte_lo &=
1090	    ~(LPTE_WIMG | LPTE_RPGN);
1091	moea64_scratchpage_pvo[which]->pvo_pte.lpte.pte_lo |=
1092	    moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa;
1093	MOEA64_PTE_CHANGE(mmup, moea64_scratchpage_pte[which],
1094	    &moea64_scratchpage_pvo[which]->pvo_pte.lpte,
1095	    moea64_scratchpage_pvo[which]->pvo_vpn);
1096	isync();
1097}
1098
1099void
1100moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst)
1101{
1102	vm_offset_t	dst;
1103	vm_offset_t	src;
1104
1105	dst = VM_PAGE_TO_PHYS(mdst);
1106	src = VM_PAGE_TO_PHYS(msrc);
1107
1108	if (hw_direct_map) {
1109		bcopy((void *)src, (void *)dst, PAGE_SIZE);
1110	} else {
1111		mtx_lock(&moea64_scratchpage_mtx);
1112
1113		moea64_set_scratchpage_pa(mmu, 0, src);
1114		moea64_set_scratchpage_pa(mmu, 1, dst);
1115
1116		bcopy((void *)moea64_scratchpage_va[0],
1117		    (void *)moea64_scratchpage_va[1], PAGE_SIZE);
1118
1119		mtx_unlock(&moea64_scratchpage_mtx);
1120	}
1121}
1122
1123static inline void
1124moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
1125    vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1126{
1127	void *a_cp, *b_cp;
1128	vm_offset_t a_pg_offset, b_pg_offset;
1129	int cnt;
1130
1131	while (xfersize > 0) {
1132		a_pg_offset = a_offset & PAGE_MASK;
1133		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1134		a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) +
1135		    a_pg_offset;
1136		b_pg_offset = b_offset & PAGE_MASK;
1137		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1138		b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) +
1139		    b_pg_offset;
1140		bcopy(a_cp, b_cp, cnt);
1141		a_offset += cnt;
1142		b_offset += cnt;
1143		xfersize -= cnt;
1144	}
1145}
1146
1147static inline void
1148moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
1149    vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1150{
1151	void *a_cp, *b_cp;
1152	vm_offset_t a_pg_offset, b_pg_offset;
1153	int cnt;
1154
1155	mtx_lock(&moea64_scratchpage_mtx);
1156	while (xfersize > 0) {
1157		a_pg_offset = a_offset & PAGE_MASK;
1158		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1159		moea64_set_scratchpage_pa(mmu, 0,
1160		    VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]));
1161		a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset;
1162		b_pg_offset = b_offset & PAGE_MASK;
1163		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1164		moea64_set_scratchpage_pa(mmu, 1,
1165		    VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]));
1166		b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset;
1167		bcopy(a_cp, b_cp, cnt);
1168		a_offset += cnt;
1169		b_offset += cnt;
1170		xfersize -= cnt;
1171	}
1172	mtx_unlock(&moea64_scratchpage_mtx);
1173}
1174
1175void
1176moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
1177    vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1178{
1179
1180	if (hw_direct_map) {
1181		moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset,
1182		    xfersize);
1183	} else {
1184		moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset,
1185		    xfersize);
1186	}
1187}
1188
1189void
1190moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size)
1191{
1192	vm_offset_t pa = VM_PAGE_TO_PHYS(m);
1193
1194	if (size + off > PAGE_SIZE)
1195		panic("moea64_zero_page_area: size + off > PAGE_SIZE");
1196
1197	if (hw_direct_map) {
1198		bzero((caddr_t)pa + off, size);
1199	} else {
1200		mtx_lock(&moea64_scratchpage_mtx);
1201		moea64_set_scratchpage_pa(mmu, 0, pa);
1202		bzero((caddr_t)moea64_scratchpage_va[0] + off, size);
1203		mtx_unlock(&moea64_scratchpage_mtx);
1204	}
1205}
1206
1207/*
1208 * Zero a page of physical memory by temporarily mapping it
1209 */
1210void
1211moea64_zero_page(mmu_t mmu, vm_page_t m)
1212{
1213	vm_offset_t pa = VM_PAGE_TO_PHYS(m);
1214	vm_offset_t va, off;
1215
1216	if (!hw_direct_map) {
1217		mtx_lock(&moea64_scratchpage_mtx);
1218
1219		moea64_set_scratchpage_pa(mmu, 0, pa);
1220		va = moea64_scratchpage_va[0];
1221	} else {
1222		va = pa;
1223	}
1224
1225	for (off = 0; off < PAGE_SIZE; off += cacheline_size)
1226		__asm __volatile("dcbz 0,%0" :: "r"(va + off));
1227
1228	if (!hw_direct_map)
1229		mtx_unlock(&moea64_scratchpage_mtx);
1230}
1231
1232void
1233moea64_zero_page_idle(mmu_t mmu, vm_page_t m)
1234{
1235
1236	moea64_zero_page(mmu, m);
1237}
1238
1239/*
1240 * Map the given physical page at the specified virtual address in the
1241 * target pmap with the protection requested.  If specified the page
1242 * will be wired down.
1243 */
1244
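/*
 * Protection is encoded in the low PTE word below: VM_PROT_WRITE selects
 * LPTE_BW (read/write), anything else LPTE_BR (read-only), and the absence
 * of VM_PROT_EXECUTE sets LPTE_NOEXEC; cacheability comes from
 * moea64_calc_wimg() and the page's memory attribute.
 */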
1245void
1246moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m,
1247    vm_prot_t prot, boolean_t wired)
1248{
1249	struct		pvo_head *pvo_head;
1250	uma_zone_t	zone;
1251	vm_page_t	pg;
1252	uint64_t	pte_lo;
1253	u_int		pvo_flags;
1254	int		error;
1255
1256	if (!moea64_initialized) {
1257		pvo_head = NULL;
1258		pg = NULL;
1259		zone = moea64_upvo_zone;
1260		pvo_flags = 0;
1261	} else {
1262		pvo_head = vm_page_to_pvoh(m);
1263		pg = m;
1264		zone = moea64_mpvo_zone;
1265		pvo_flags = PVO_MANAGED;
1266	}
1267
1268	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
1269		VM_OBJECT_ASSERT_LOCKED(m->object);
1270
1271	/* XXX change the pvo head for fake pages */
1272	if ((m->oflags & VPO_UNMANAGED) != 0) {
1273		pvo_flags &= ~PVO_MANAGED;
1274		pvo_head = NULL;
1275		zone = moea64_upvo_zone;
1276	}
1277
1278	pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m));
1279
1280	if (prot & VM_PROT_WRITE) {
1281		pte_lo |= LPTE_BW;
1282		if (pmap_bootstrapped &&
1283		    (m->oflags & VPO_UNMANAGED) == 0)
1284			vm_page_aflag_set(m, PGA_WRITEABLE);
1285	} else
1286		pte_lo |= LPTE_BR;
1287
1288	if ((prot & VM_PROT_EXECUTE) == 0)
1289		pte_lo |= LPTE_NOEXEC;
1290
1291	if (wired)
1292		pvo_flags |= PVO_WIRED;
1293
1294	LOCK_TABLE_WR();
1295	PMAP_LOCK(pmap);
1296	error = moea64_pvo_enter(mmu, pmap, zone, pvo_head, va,
1297	    VM_PAGE_TO_PHYS(m), pte_lo, pvo_flags);
1298	PMAP_UNLOCK(pmap);
1299	UNLOCK_TABLE_WR();
1300
1301	/*
1302	 * Flush the page from the instruction cache if this page is
1303	 * mapped executable and cacheable.
1304	 */
1305	if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) &&
1306	    (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
1307		vm_page_aflag_set(m, PGA_EXECUTABLE);
1308		moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE);
1309	}
1310}
1311
1312static void
1313moea64_syncicache(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_offset_t pa,
1314    vm_size_t sz)
1315{
1316
1317	/*
1318	 * This is much trickier than on older systems because
1319	 * we can't sync the icache on physical addresses directly
1320	 * without a direct map. Instead we check a couple of cases
1321	 * where the memory is already mapped in and, failing that,
1322	 * use the same trick we use for page zeroing to create
1323	 * a temporary mapping for this physical address.
1324	 */
1325
1326	if (!pmap_bootstrapped) {
1327		/*
1328		 * If PMAP is not bootstrapped, we are likely to be
1329		 * in real mode.
1330		 */
1331		__syncicache((void *)pa, sz);
1332	} else if (pmap == kernel_pmap) {
1333		__syncicache((void *)va, sz);
1334	} else if (hw_direct_map) {
1335		__syncicache((void *)pa, sz);
1336	} else {
1337		/* Use the scratch page to set up a temp mapping */
1338
1339		mtx_lock(&moea64_scratchpage_mtx);
1340
1341		moea64_set_scratchpage_pa(mmu, 1, pa & ~ADDR_POFF);
1342		__syncicache((void *)(moea64_scratchpage_va[1] +
1343		    (va & ADDR_POFF)), sz);
1344
1345		mtx_unlock(&moea64_scratchpage_mtx);
1346	}
1347}
1348
1349/*
1350 * Maps a sequence of resident pages belonging to the same object.
1351 * The sequence begins with the given page m_start.  This page is
1352 * mapped at the given virtual address start.  Each subsequent page is
1353 * mapped at a virtual address that is offset from start by the same
1354 * amount as the page is offset from m_start within the object.  The
1355 * last page in the sequence is the page with the largest offset from
1356 * m_start that can be mapped at a virtual address less than the given
1357 * virtual address end.  Not every virtual page between start and end
1358 * is mapped; only those for which a resident page exists with the
1359 * corresponding offset from m_start are mapped.
1360 */
1361void
1362moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end,
1363    vm_page_t m_start, vm_prot_t prot)
1364{
1365	vm_page_t m;
1366	vm_pindex_t diff, psize;
1367
1368	VM_OBJECT_ASSERT_LOCKED(m_start->object);
1369
1370	psize = atop(end - start);
1371	m = m_start;
1372	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1373		moea64_enter(mmu, pm, start + ptoa(diff), m, prot &
1374		    (VM_PROT_READ | VM_PROT_EXECUTE), FALSE);
1375		m = TAILQ_NEXT(m, listq);
1376	}
1377}
1378
1379void
1380moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m,
1381    vm_prot_t prot)
1382{
1383
1384	moea64_enter(mmu, pm, va, m,
1385	    prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE);
1386}
1387
1388vm_paddr_t
1389moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va)
1390{
1391	struct	pvo_entry *pvo;
1392	vm_paddr_t pa;
1393
1394	PMAP_LOCK(pm);
1395	pvo = moea64_pvo_find_va(pm, va);
1396	if (pvo == NULL)
1397		pa = 0;
1398	else
1399		pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) |
1400		    (va - PVO_VADDR(pvo));
1401	PMAP_UNLOCK(pm);
1402	return (pa);
1403}
1404
1405/*
1406 * Atomically extract and hold the physical page with the given
1407 * pmap and virtual address pair if that mapping permits the given
1408 * protection.
1409 */
1410vm_page_t
1411moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1412{
1413	struct	pvo_entry *pvo;
1414	vm_page_t m;
1415	vm_paddr_t pa;
1416
1417	m = NULL;
1418	pa = 0;
1419	PMAP_LOCK(pmap);
1420retry:
1421	pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1422	if (pvo != NULL && (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) &&
1423	    ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) == LPTE_RW ||
1424	     (prot & VM_PROT_WRITE) == 0)) {
1425		if (vm_page_pa_tryrelock(pmap,
1426			pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, &pa))
1427			goto retry;
1428		m = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN);
1429		vm_page_hold(m);
1430	}
1431	PA_UNLOCK_COND(pa);
1432	PMAP_UNLOCK(pmap);
1433	return (m);
1434}
1435
1436static mmu_t installed_mmu;
1437
1438static void *
1439moea64_uma_page_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
1440{
1441	/*
1442	 * This entire routine is a horrible hack to avoid bothering kmem
1443	 * for new KVA addresses. Because this can get called from inside
1444	 * kmem allocation routines, calling kmem for a new address here
1445	 * can lead to multiply locking non-recursive mutexes.
1446	 */
1447	vm_offset_t va;
1448
1449	vm_page_t m;
1450	int pflags, needed_lock;
1451
1452	*flags = UMA_SLAB_PRIV;
1453	needed_lock = !PMAP_LOCKED(kernel_pmap);
1454	pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED;
1455
1456	for (;;) {
1457		m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ);
1458		if (m == NULL) {
1459			if (wait & M_NOWAIT)
1460				return (NULL);
1461			VM_WAIT;
1462		} else
1463			break;
1464	}
1465
1466	va = VM_PAGE_TO_PHYS(m);
1467
1468	LOCK_TABLE_WR();
1469	if (needed_lock)
1470		PMAP_LOCK(kernel_pmap);
1471
1472	moea64_pvo_enter(installed_mmu, kernel_pmap, moea64_upvo_zone,
1473	    NULL, va, VM_PAGE_TO_PHYS(m), LPTE_M, PVO_WIRED | PVO_BOOTSTRAP);
1474
1475	if (needed_lock)
1476		PMAP_UNLOCK(kernel_pmap);
1477	UNLOCK_TABLE_WR();
1478
1479	if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0)
1480		bzero((void *)va, PAGE_SIZE);
1481
1482	return (void *)va;
1483}
1484
1485extern int elf32_nxstack;
1486
1487void
1488moea64_init(mmu_t mmu)
1489{
1490
1491	CTR0(KTR_PMAP, "moea64_init");
1492
1493	moea64_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
1494	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1495	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
1496	moea64_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry),
1497	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1498	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
1499
1500	if (!hw_direct_map) {
1501		installed_mmu = mmu;
1502		uma_zone_set_allocf(moea64_upvo_zone,moea64_uma_page_alloc);
1503		uma_zone_set_allocf(moea64_mpvo_zone,moea64_uma_page_alloc);
1504	}
1505
1506#ifdef COMPAT_FREEBSD32
1507	elf32_nxstack = 1;
1508#endif
1509
1510	moea64_initialized = TRUE;
1511}
1512
1513boolean_t
1514moea64_is_referenced(mmu_t mmu, vm_page_t m)
1515{
1516
1517	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1518	    ("moea64_is_referenced: page %p is not managed", m));
1519	return (moea64_query_bit(mmu, m, PTE_REF));
1520}
1521
1522boolean_t
1523moea64_is_modified(mmu_t mmu, vm_page_t m)
1524{
1525
1526	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1527	    ("moea64_is_modified: page %p is not managed", m));
1528
1529	/*
1530	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
1531	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
1532	 * is clear, no PTEs can have LPTE_CHG set.
1533	 */
1534	VM_OBJECT_ASSERT_LOCKED(m->object);
1535	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
1536		return (FALSE);
1537	return (moea64_query_bit(mmu, m, LPTE_CHG));
1538}
1539
1540boolean_t
1541moea64_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va)
1542{
1543	struct pvo_entry *pvo;
1544	boolean_t rv;
1545
1546	PMAP_LOCK(pmap);
1547	pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1548	rv = pvo == NULL || (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0;
1549	PMAP_UNLOCK(pmap);
1550	return (rv);
1551}
1552
1553void
1554moea64_clear_modify(mmu_t mmu, vm_page_t m)
1555{
1556
1557	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1558	    ("moea64_clear_modify: page %p is not managed", m));
1559	VM_OBJECT_ASSERT_WLOCKED(m->object);
1560	KASSERT(!vm_page_xbusied(m),
1561	    ("moea64_clear_modify: page %p is exclusive busied", m));
1562
1563	/*
1564	 * If the page is not PGA_WRITEABLE, then no PTEs can have LPTE_CHG
1565	 * set.  If the object containing the page is locked and the page is
1566	 * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
1567	 */
1568	if ((m->aflags & PGA_WRITEABLE) == 0)
1569		return;
1570	moea64_clear_bit(mmu, m, LPTE_CHG);
1571}
1572
1573/*
1574 * Clear the write and modified bits in each of the given page's mappings.
1575 */
1576void
1577moea64_remove_write(mmu_t mmu, vm_page_t m)
1578{
1579	struct	pvo_entry *pvo;
1580	uintptr_t pt;
1581	pmap_t	pmap;
1582	uint64_t lo = 0;
1583
1584	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1585	    ("moea64_remove_write: page %p is not managed", m));
1586
1587	/*
1588	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
1589	 * set by another thread while the object is locked.  Thus,
1590	 * if PGA_WRITEABLE is clear, no page table entries need updating.
1591	 */
1592	VM_OBJECT_ASSERT_WLOCKED(m->object);
1593	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
1594		return;
1595	powerpc_sync();
1596	LOCK_TABLE_RD();
1597	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1598		pmap = pvo->pvo_pmap;
1599		PMAP_LOCK(pmap);
1600		if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) {
1601			pt = MOEA64_PVO_TO_PTE(mmu, pvo);
1602			pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP;
1603			pvo->pvo_pte.lpte.pte_lo |= LPTE_BR;
1604			if (pt != -1) {
1605				MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte);
1606				lo |= pvo->pvo_pte.lpte.pte_lo;
1607				pvo->pvo_pte.lpte.pte_lo &= ~LPTE_CHG;
1608				MOEA64_PTE_CHANGE(mmu, pt,
1609				    &pvo->pvo_pte.lpte, pvo->pvo_vpn);
1610				if (pvo->pvo_pmap == kernel_pmap)
1611					isync();
1612			}
1613		}
1614		if ((lo & LPTE_CHG) != 0)
1615			vm_page_dirty(m);
1616		PMAP_UNLOCK(pmap);
1617	}
1618	UNLOCK_TABLE_RD();
1619	vm_page_aflag_clear(m, PGA_WRITEABLE);
1620}
1621
1622/*
1623 *	moea64_ts_referenced:
1624 *
1625 *	Return a count of reference bits for a page, clearing those bits.
1626 *	It is not necessary for every reference bit to be cleared, but it
1627 *	is necessary that 0 only be returned when there are truly no
1628 *	reference bits set.
1629 *
1630 *	XXX: The exact number of bits to check and clear is a matter that
1631 *	should be tested and standardized at some point in the future for
1632 *	optimal aging of shared pages.
1633 */
1634int
1635moea64_ts_referenced(mmu_t mmu, vm_page_t m)
1636{
1637
1638	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1639	    ("moea64_ts_referenced: page %p is not managed", m));
1640	return (moea64_clear_bit(mmu, m, LPTE_REF));
1641}
1642
1643/*
1644 * Modify the WIMG settings of all mappings for a page.
1645 */
1646void
1647moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma)
1648{
1649	struct	pvo_entry *pvo;
1650	struct  pvo_head *pvo_head;
1651	uintptr_t pt;
1652	pmap_t	pmap;
1653	uint64_t lo;
1654
1655	if ((m->oflags & VPO_UNMANAGED) != 0) {
1656		m->md.mdpg_cache_attrs = ma;
1657		return;
1658	}
1659
1660	pvo_head = vm_page_to_pvoh(m);
1661	lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma);
1662	LOCK_TABLE_RD();
1663	LIST_FOREACH(pvo, pvo_head, pvo_vlink) {
1664		pmap = pvo->pvo_pmap;
1665		PMAP_LOCK(pmap);
1666		pt = MOEA64_PVO_TO_PTE(mmu, pvo);
1667		pvo->pvo_pte.lpte.pte_lo &= ~LPTE_WIMG;
1668		pvo->pvo_pte.lpte.pte_lo |= lo;
1669		if (pt != -1) {
1670			MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte,
1671			    pvo->pvo_vpn);
1672			if (pvo->pvo_pmap == kernel_pmap)
1673				isync();
1674		}
1675		PMAP_UNLOCK(pmap);
1676	}
1677	UNLOCK_TABLE_RD();
1678	m->md.mdpg_cache_attrs = ma;
1679}
1680
1681/*
1682 * Map a wired page into kernel virtual address space.
1683 */
1684void
1685moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_offset_t pa, vm_memattr_t ma)
1686{
1687	uint64_t	pte_lo;
1688	int		error;
1689
1690	pte_lo = moea64_calc_wimg(pa, ma);
1691
1692	LOCK_TABLE_WR();
1693	PMAP_LOCK(kernel_pmap);
1694	error = moea64_pvo_enter(mmu, kernel_pmap, moea64_upvo_zone,
1695	    NULL, va, pa, pte_lo, PVO_WIRED);
1696	PMAP_UNLOCK(kernel_pmap);
1697	UNLOCK_TABLE_WR();
1698
1699	if (error != 0 && error != ENOENT)
1700		panic("moea64_kenter: failed to enter va %#zx pa %#zx: %d", va,
1701		    pa, error);
1702}
1703
1704void
1705moea64_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa)
1706{
1707
1708	moea64_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT);
1709}
1710
1711/*
1712 * Extract the physical page address associated with the given kernel virtual
1713 * address.
1714 */
1715vm_paddr_t
1716moea64_kextract(mmu_t mmu, vm_offset_t va)
1717{
1718	struct		pvo_entry *pvo;
1719	vm_paddr_t pa;
1720
1721	/*
1722	 * Shortcut the direct-mapped case when applicable.  We never put
1723	 * anything but 1:1 mappings below VM_MIN_KERNEL_ADDRESS.
1724	 */
1725	if (va < VM_MIN_KERNEL_ADDRESS)
1726		return (va);
1727
1728	PMAP_LOCK(kernel_pmap);
1729	pvo = moea64_pvo_find_va(kernel_pmap, va);
1730	KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR,
1731	    va));
1732	pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | (va - PVO_VADDR(pvo));
1733	PMAP_UNLOCK(kernel_pmap);
1734	return (pa);
1735}
1736
1737/*
1738 * Remove a wired page from kernel virtual address space.
1739 */
1740void
1741moea64_kremove(mmu_t mmu, vm_offset_t va)
1742{
1743	moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE);
1744}
1745
1746/*
1747 * Map a range of physical addresses into kernel virtual address space.
1748 *
1749 * The value passed in *virt is a suggested virtual address for the mapping.
1750 * Architectures which can support a direct-mapped physical to virtual region
1751 * can return the appropriate address within that region, leaving '*virt'
1752 * unchanged.  We cannot and therefore do not; *virt is updated with the
1753 * first usable address after the mapped region.
1754 */
1755vm_offset_t
1756moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start,
1757    vm_paddr_t pa_end, int prot)
1758{
1759	vm_offset_t	sva, va;
1760
1761	sva = *virt;
1762	va = sva;
1763	for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
1764		moea64_kenter(mmu, va, pa_start);
1765	*virt = va;
1766
1767	return (sva);
1768}
1769
1770/*
1771 * Returns true if the pmap's pv is one of the first
1772 * 16 pvs linked to from this page.  This count may
1773 * be changed upwards or downwards in the future; it
1774 * is only necessary that true be returned for a small
1775 * subset of pmaps for proper page aging.
1776 */
1777boolean_t
1778moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m)
1779{
1780	int loops;
1781	struct pvo_entry *pvo;
1782	boolean_t rv;
1783
1784	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1785	    ("moea64_page_exists_quick: page %p is not managed", m));
1786	loops = 0;
1787	rv = FALSE;
1788	LOCK_TABLE_RD();
1789	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1790		if (pvo->pvo_pmap == pmap) {
1791			rv = TRUE;
1792			break;
1793		}
1794		if (++loops >= 16)
1795			break;
1796	}
1797	UNLOCK_TABLE_RD();
1798	return (rv);
1799}
1800
1801/*
1802 * Return the number of managed mappings to the given physical page
1803 * that are wired.
1804 */
1805int
1806moea64_page_wired_mappings(mmu_t mmu, vm_page_t m)
1807{
1808	struct pvo_entry *pvo;
1809	int count;
1810
1811	count = 0;
1812	if ((m->oflags & VPO_UNMANAGED) != 0)
1813		return (count);
1814	LOCK_TABLE_RD();
1815	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
1816		if ((pvo->pvo_vaddr & PVO_WIRED) != 0)
1817			count++;
1818	UNLOCK_TABLE_RD();
1819	return (count);
1820}
1821
1822static uintptr_t	moea64_vsidcontext;
1823
1824uintptr_t
1825moea64_get_unique_vsid(void) {
1826	u_int entropy;
1827	register_t hash;
1828	uint32_t mask;
1829	int i;
1830
1831	entropy = 0;
1832	__asm __volatile("mftb %0" : "=r"(entropy));
1833
1834	mtx_lock(&moea64_slb_mutex);
1835	for (i = 0; i < NVSIDS; i += VSID_NBPW) {
1836		u_int	n;
1837
1838		/*
1839		 * Create a new value by mutiplying by a prime and adding in
1840		 * entropy from the timebase register.  This is to make the
1841		 * VSID more random so that the PT hash function collides
1842		 * less often.  (Note that the prime casues gcc to do shifts
1843		 * instead of a multiply.)
1844		 */
1845		moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy;
1846		hash = moea64_vsidcontext & (NVSIDS - 1);
1847		if (hash == 0)		/* 0 is special, avoid it */
1848			continue;
1849		n = hash >> 5;
1850		mask = 1 << (hash & (VSID_NBPW - 1));
1851		hash = (moea64_vsidcontext & VSID_HASHMASK);
1852		if (moea64_vsid_bitmap[n] & mask) {	/* collision? */
1853			/* anything free in this bucket? */
1854			if (moea64_vsid_bitmap[n] == 0xffffffff) {
1855				entropy = (moea64_vsidcontext >> 20);
1856				continue;
1857			}
1858			i = ffs(~moea64_vsid_bitmap[n]) - 1;
1859			mask = 1 << i;
1860			hash &= VSID_HASHMASK & ~(VSID_NBPW - 1);
1861			hash |= i;
1862		}
1863		KASSERT(!(moea64_vsid_bitmap[n] & mask),
1864		    ("Allocating in-use VSID %#zx\n", hash));
1865		moea64_vsid_bitmap[n] |= mask;
1866		mtx_unlock(&moea64_slb_mutex);
1867		return (hash);
1868	}
1869
1870	mtx_unlock(&moea64_slb_mutex);
1871	panic("%s: out of segments",__func__);
1872}
1873
1874#ifdef __powerpc64__
1875void
1876moea64_pinit(mmu_t mmu, pmap_t pmap)
1877{
1878
1879	RB_INIT(&pmap->pmap_pvo);
1880
1881	pmap->pm_slb_tree_root = slb_alloc_tree();
1882	pmap->pm_slb = slb_alloc_user_cache();
1883	pmap->pm_slb_len = 0;
1884}
1885#else
1886void
1887moea64_pinit(mmu_t mmu, pmap_t pmap)
1888{
1889	int	i;
1890	uint32_t hash;
1891
1892	RB_INIT(&pmap->pmap_pvo);
1893
1894	if (pmap_bootstrapped)
1895		pmap->pmap_phys = (pmap_t)moea64_kextract(mmu,
1896		    (vm_offset_t)pmap);
1897	else
1898		pmap->pmap_phys = pmap;
1899
1900	/*
1901	 * Allocate some segment registers for this pmap.
1902	 */
1903	hash = moea64_get_unique_vsid();
1904
1905	for (i = 0; i < 16; i++)
1906		pmap->pm_sr[i] = VSID_MAKE(i, hash);
1907
1908	KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0"));
1909}
1910#endif
1911
1912/*
1913 * Initialize the pmap associated with process 0.
1914 */
1915void
1916moea64_pinit0(mmu_t mmu, pmap_t pm)
1917{
1918
1919	PMAP_LOCK_INIT(pm);
1920	moea64_pinit(mmu, pm);
1921	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
1922}
1923
1924/*
1925 * Set the physical protection on the specified range of this map as requested.
1926 */
1927static void
1928moea64_pvo_protect(mmu_t mmu,  pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot)
1929{
1930	uintptr_t pt;
1931	struct	vm_page *pg;
1932	uint64_t oldlo;
1933
1934	PMAP_LOCK_ASSERT(pm, MA_OWNED);
1935
1936	/*
1937	 * Grab the PTE pointer before we diddle with the cached PTE
1938	 * copy.
1939	 */
1940	pt = MOEA64_PVO_TO_PTE(mmu, pvo);
1941
1942	/*
1943	 * Change the protection of the page.
1944	 */
1945	oldlo = pvo->pvo_pte.lpte.pte_lo;
1946	pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP;
1947	pvo->pvo_pte.lpte.pte_lo &= ~LPTE_NOEXEC;
1948	if ((prot & VM_PROT_EXECUTE) == 0)
1949		pvo->pvo_pte.lpte.pte_lo |= LPTE_NOEXEC;
1950	if (prot & VM_PROT_WRITE)
1951		pvo->pvo_pte.lpte.pte_lo |= LPTE_BW;
1952	else
1953		pvo->pvo_pte.lpte.pte_lo |= LPTE_BR;
1954
1955	pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN);
1956
1957	/*
1958	 * If the PVO is in the page table, update that pte as well.
1959	 */
1960	if (pt != -1)
1961		MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte,
1962		    pvo->pvo_vpn);
1963	if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) &&
1964	    (pvo->pvo_pte.lpte.pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
1965		if ((pg->oflags & VPO_UNMANAGED) == 0)
1966			vm_page_aflag_set(pg, PGA_EXECUTABLE);
1967		moea64_syncicache(mmu, pm, PVO_VADDR(pvo),
1968		    pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, PAGE_SIZE);
1969	}
1970
1971	/*
1972	 * Update vm about the REF/CHG bits if the page is managed and we have
1973	 * removed write access.
1974	 */
1975	if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED &&
1976	    (oldlo & LPTE_PP) != LPTE_BR && !(prot & VM_PROT_WRITE)) {
1977		if (pg != NULL) {
1978			if (pvo->pvo_pte.lpte.pte_lo & LPTE_CHG)
1979				vm_page_dirty(pg);
1980			if (pvo->pvo_pte.lpte.pte_lo & LPTE_REF)
1981				vm_page_aflag_set(pg, PGA_REFERENCED);
1982		}
1983	}
1984}
1985
1986void
1987moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva,
1988    vm_prot_t prot)
1989{
1990	struct	pvo_entry *pvo, *tpvo, key;
1991
1992	CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm,
1993	    sva, eva, prot);
1994
1995	KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
1996	    ("moea64_protect: non current pmap"));
1997
1998	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1999		moea64_remove(mmu, pm, sva, eva);
2000		return;
2001	}
2002
2003	LOCK_TABLE_RD();
2004	PMAP_LOCK(pm);
2005	key.pvo_vaddr = sva;
2006	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2007	    pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
2008		tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
2009		moea64_pvo_protect(mmu, pm, pvo, prot);
2010	}
2011	UNLOCK_TABLE_RD();
2012	PMAP_UNLOCK(pm);
2013}
2014
2015/*
2016 * Map a list of wired pages into kernel virtual address space.  This is
2017 * intended for temporary mappings which do not need page modification or
2018 * references recorded.  Existing mappings in the region are overwritten.
2019 */
2020void
2021moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count)
2022{
2023	while (count-- > 0) {
2024		moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m));
2025		va += PAGE_SIZE;
2026		m++;
2027	}
2028}
2029
2030/*
2031 * Remove page mappings from kernel virtual address space.  Intended for
2032 * temporary mappings entered by moea64_qenter.
2033 */
2034void
2035moea64_qremove(mmu_t mmu, vm_offset_t va, int count)
2036{
2037	while (count-- > 0) {
2038		moea64_kremove(mmu, va);
2039		va += PAGE_SIZE;
2040	}
2041}
2042
2043void
2044moea64_release_vsid(uint64_t vsid)
2045{
2046	int idx, mask;
2047
2048	mtx_lock(&moea64_slb_mutex);
2049	idx = vsid & (NVSIDS-1);
2050	mask = 1 << (idx % VSID_NBPW);
2051	idx /= VSID_NBPW;
2052	KASSERT(moea64_vsid_bitmap[idx] & mask,
2053	    ("Freeing unallocated VSID %#jx", vsid));
2054	moea64_vsid_bitmap[idx] &= ~mask;
2055	mtx_unlock(&moea64_slb_mutex);
2056}
2057
2058
2059void
2060moea64_release(mmu_t mmu, pmap_t pmap)
2061{
2062
2063	/*
2064	 * Free segment registers' VSIDs
2065	 */
2066    #ifdef __powerpc64__
2067	slb_free_tree(pmap);
2068	slb_free_user_cache(pmap->pm_slb);
2069    #else
2070	KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0"));
2071
2072	moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0]));
2073    #endif
2074}
2075
2076/*
2077 * Remove all pages mapped by the specified pmap
2078 */
2079void
2080moea64_remove_pages(mmu_t mmu, pmap_t pm)
2081{
2082	struct	pvo_entry *pvo, *tpvo;
2083
2084	LOCK_TABLE_WR();
2085	PMAP_LOCK(pm);
2086	RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) {
2087		if (!(pvo->pvo_vaddr & PVO_WIRED))
2088			moea64_pvo_remove(mmu, pvo);
2089	}
2090	UNLOCK_TABLE_WR();
2091	PMAP_UNLOCK(pm);
2092}
2093
2094/*
2095 * Remove the given range of addresses from the specified map.
2096 */
2097void
2098moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva)
2099{
2100	struct	pvo_entry *pvo, *tpvo, key;
2101
2102	/*
2103	 * Perform an unsynchronized read.  This is, however, safe.
2104	 */
2105	if (pm->pm_stats.resident_count == 0)
2106		return;
2107
2108	LOCK_TABLE_WR();
2109	PMAP_LOCK(pm);
2110	key.pvo_vaddr = sva;
2111	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2112	    pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
2113		tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
2114		moea64_pvo_remove(mmu, pvo);
2115	}
2116	UNLOCK_TABLE_WR();
2117	PMAP_UNLOCK(pm);
2118}
2119
2120/*
2121 * Remove physical page from all pmaps in which it resides. moea64_pvo_remove()
2122 * will reflect changes in pte's back to the vm_page.
2123 */
2124void
2125moea64_remove_all(mmu_t mmu, vm_page_t m)
2126{
2127	struct	pvo_entry *pvo, *next_pvo;
2128	pmap_t	pmap;
2129
2130	LOCK_TABLE_WR();
2131	LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) {
2132		pmap = pvo->pvo_pmap;
2133		PMAP_LOCK(pmap);
2134		moea64_pvo_remove(mmu, pvo);
2135		PMAP_UNLOCK(pmap);
2136	}
2137	UNLOCK_TABLE_WR();
2138	if ((m->aflags & PGA_WRITEABLE) && moea64_is_modified(mmu, m))
2139		vm_page_dirty(m);
2140	vm_page_aflag_clear(m, PGA_WRITEABLE);
2141	vm_page_aflag_clear(m, PGA_EXECUTABLE);
2142}
2143
2144/*
2145 * Allocate a physical page of memory directly from the phys_avail map.
2146 * Can only be called from moea64_bootstrap before avail start and end are
2147 * calculated.
2148 */
2149vm_offset_t
2150moea64_bootstrap_alloc(vm_size_t size, u_int align)
2151{
2152	vm_offset_t	s, e;
2153	int		i, j;
2154
2155	size = round_page(size);
2156	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2157		if (align != 0)
2158			s = (phys_avail[i] + align - 1) & ~(align - 1);
2159		else
2160			s = phys_avail[i];
2161		e = s + size;
2162
2163		if (s < phys_avail[i] || e > phys_avail[i + 1])
2164			continue;
2165
2166		if (s + size > platform_real_maxaddr())
2167			continue;
2168
2169		if (s == phys_avail[i]) {
2170			phys_avail[i] += size;
2171		} else if (e == phys_avail[i + 1]) {
2172			phys_avail[i + 1] -= size;
2173		} else {
2174			for (j = phys_avail_count * 2; j > i; j -= 2) {
2175				phys_avail[j] = phys_avail[j - 2];
2176				phys_avail[j + 1] = phys_avail[j - 1];
2177			}
2178
2179			phys_avail[i + 3] = phys_avail[i + 1];
2180			phys_avail[i + 1] = s;
2181			phys_avail[i + 2] = e;
2182			phys_avail_count++;
2183		}
2184
2185		return (s);
2186	}
2187	panic("moea64_bootstrap_alloc: could not allocate memory");
2188}
2189
2190static int
2191moea64_pvo_enter(mmu_t mmu, pmap_t pm, uma_zone_t zone,
2192    struct pvo_head *pvo_head, vm_offset_t va, vm_offset_t pa,
2193    uint64_t pte_lo, int flags)
2194{
2195	struct	 pvo_entry *pvo;
2196	uint64_t vsid;
2197	int	 first;
2198	u_int	 ptegidx;
2199	int	 i;
2200	int      bootstrap;
2201
2202	/*
2203	 * One nasty thing that can happen here is that the UMA calls to
2204	 * allocate new PVOs need to map more memory, which calls pvo_enter(),
2205	 * which calls UMA...
2206	 *
2207	 * We break the loop by detecting recursion and allocating out of
2208	 * the bootstrap pool.
2209	 */
2210
2211	first = 0;
2212	bootstrap = (flags & PVO_BOOTSTRAP);
2213
2214	if (!moea64_initialized)
2215		bootstrap = 1;
2216
2217	PMAP_LOCK_ASSERT(pm, MA_OWNED);
2218	rw_assert(&moea64_table_lock, RA_WLOCKED);
2219
2220	/*
2221	 * Compute the PTE Group index.
2222	 */
2223	va &= ~ADDR_POFF;
2224	vsid = va_to_vsid(pm, va);
2225	ptegidx = va_to_pteg(vsid, va, flags & PVO_LARGE);
2226
2227	/*
2228	 * If an identical mapping already exists for this page, revalidate it
2229	 * and return; otherwise remove any existing mapping first.
2230	 */
2231	moea64_pvo_enter_calls++;
2232
2233	LIST_FOREACH(pvo, &moea64_pvo_table[ptegidx], pvo_olink) {
2234		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
2235			if ((pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) == pa &&
2236			    (pvo->pvo_pte.lpte.pte_lo & (LPTE_NOEXEC | LPTE_PP))
2237			    == (pte_lo & (LPTE_NOEXEC | LPTE_PP))) {
2238			    	if (!(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) {
2239					/* Re-insert if spilled */
2240					i = MOEA64_PTE_INSERT(mmu, ptegidx,
2241					    &pvo->pvo_pte.lpte);
2242					if (i >= 0)
2243						PVO_PTEGIDX_SET(pvo, i);
2244					moea64_pte_overflow--;
2245				}
2246				return (0);
2247			}
2248			moea64_pvo_remove(mmu, pvo);
2249			break;
2250		}
2251	}
2252
2253	/*
2254	 * Allocate a new PVO entry, from the bootstrap pool if UMA is not yet usable.
2255	 */
2256	if (bootstrap) {
2257		if (moea64_bpvo_pool_index >= BPVO_POOL_SIZE) {
2258			panic("moea64_enter: bpvo pool exhausted, %d, %d, %zd",
2259			      moea64_bpvo_pool_index, BPVO_POOL_SIZE,
2260			      BPVO_POOL_SIZE * sizeof(struct pvo_entry));
2261		}
2262		pvo = &moea64_bpvo_pool[moea64_bpvo_pool_index];
2263		moea64_bpvo_pool_index++;
2264		bootstrap = 1;
2265	} else {
2266		pvo = uma_zalloc(zone, M_NOWAIT);
2267	}
2268
2269	if (pvo == NULL)
2270		return (ENOMEM);
2271
2272	moea64_pvo_entries++;
2273	pvo->pvo_vaddr = va;
2274	pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT)
2275	    | (vsid << 16);
2276	pvo->pvo_pmap = pm;
2277	LIST_INSERT_HEAD(&moea64_pvo_table[ptegidx], pvo, pvo_olink);
2278	pvo->pvo_vaddr &= ~ADDR_POFF;
2279
2280	if (flags & PVO_WIRED)
2281		pvo->pvo_vaddr |= PVO_WIRED;
2282	if (pvo_head != NULL)
2283		pvo->pvo_vaddr |= PVO_MANAGED;
2284	if (bootstrap)
2285		pvo->pvo_vaddr |= PVO_BOOTSTRAP;
2286	if (flags & PVO_LARGE)
2287		pvo->pvo_vaddr |= PVO_LARGE;
2288
2289	moea64_pte_create(&pvo->pvo_pte.lpte, vsid, va,
2290	    (uint64_t)(pa) | pte_lo, flags);
2291
2292	/*
2293	 * Add to pmap list
2294	 */
2295	RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo);
2296
2297	/*
2298	 * Remember if the list was empty, in which case this mapping will be
2299	 * the first item on it.
2300	 */
2301	if (pvo_head != NULL) {
2302		if (LIST_FIRST(pvo_head) == NULL)
2303			first = 1;
2304		LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
2305	}
2306
2307	if (pvo->pvo_vaddr & PVO_WIRED) {
2308		pvo->pvo_pte.lpte.pte_hi |= LPTE_WIRED;
2309		pm->pm_stats.wired_count++;
2310	}
2311	pm->pm_stats.resident_count++;
2312
2313	/*
2314	 * We hope this succeeds; with the panic below, it currently must.
2315	 */
2316	i = MOEA64_PTE_INSERT(mmu, ptegidx, &pvo->pvo_pte.lpte);
2317	if (i >= 0) {
2318		PVO_PTEGIDX_SET(pvo, i);
2319	} else {
2320		panic("moea64_pvo_enter: overflow");
2321		moea64_pte_overflow++;
2322	}
2323
2324	if (pm == kernel_pmap)
2325		isync();
2326
2327#ifdef __powerpc64__
2328	/*
2329	 * Make sure all our bootstrap mappings are in the SLB as soon
2330	 * as virtual memory is switched on.
2331	 */
2332	if (!pmap_bootstrapped)
2333		moea64_bootstrap_slb_prefault(va, flags & PVO_LARGE);
2334#endif
2335
2336	return (first ? ENOENT : 0);
2337}
2338
2339static void
2340moea64_pvo_remove(mmu_t mmu, struct pvo_entry *pvo)
2341{
2342	struct	vm_page *pg;
2343	uintptr_t pt;
2344
2345	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2346	rw_assert(&moea64_table_lock, RA_WLOCKED);
2347
2348	/*
2349	 * If there is an active pte entry, we need to deactivate it (and
2350	 * save the ref & cfg bits).
2351	 */
2352	pt = MOEA64_PVO_TO_PTE(mmu, pvo);
2353	if (pt != -1) {
2354		MOEA64_PTE_UNSET(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn);
2355		PVO_PTEGIDX_CLR(pvo);
2356	} else {
2357		moea64_pte_overflow--;
2358	}
2359
2360	/*
2361	 * Update our statistics.
2362	 */
2363	pvo->pvo_pmap->pm_stats.resident_count--;
2364	if (pvo->pvo_vaddr & PVO_WIRED)
2365		pvo->pvo_pmap->pm_stats.wired_count--;
2366
2367	/*
2368	 * Remove this PVO from the pmap list.
2369	 */
2370	RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2371
2372	/*
2373	 * Remove this from the overflow list and return it to the pool
2374	 * if we aren't going to reuse it.
2375	 */
2376	LIST_REMOVE(pvo, pvo_olink);
2377
2378	/*
2379	 * Update vm about the REF/CHG bits if the page is managed.
2380	 */
2381	pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN);
2382
2383	if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED && pg != NULL) {
2384		LIST_REMOVE(pvo, pvo_vlink);
2385		if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) {
2386			if (pvo->pvo_pte.lpte.pte_lo & LPTE_CHG)
2387				vm_page_dirty(pg);
2388			if (pvo->pvo_pte.lpte.pte_lo & LPTE_REF)
2389				vm_page_aflag_set(pg, PGA_REFERENCED);
2390			if (LIST_EMPTY(vm_page_to_pvoh(pg)))
2391				vm_page_aflag_clear(pg, PGA_WRITEABLE);
2392		}
2393		if (LIST_EMPTY(vm_page_to_pvoh(pg)))
2394			vm_page_aflag_clear(pg, PGA_EXECUTABLE);
2395	}
2396
2397	moea64_pvo_entries--;
2398	moea64_pvo_remove_calls++;
2399
2400	if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
2401		uma_zfree((pvo->pvo_vaddr & PVO_MANAGED) ? moea64_mpvo_zone :
2402		    moea64_upvo_zone, pvo);
2403}
2404
2405static struct pvo_entry *
2406moea64_pvo_find_va(pmap_t pm, vm_offset_t va)
2407{
2408	struct pvo_entry key;
2409
2410	key.pvo_vaddr = va & ~ADDR_POFF;
2411	return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key));
2412}
2413
2414static boolean_t
2415moea64_query_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit)
2416{
2417	struct	pvo_entry *pvo;
2418	uintptr_t pt;
2419
2420	LOCK_TABLE_RD();
2421	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2422		/*
2423		 * See if we saved the bit off.  If so, return success.
2424		 */
2425		if (pvo->pvo_pte.lpte.pte_lo & ptebit) {
2426			UNLOCK_TABLE_RD();
2427			return (TRUE);
2428		}
2429	}
2430
2431	/*
2432	 * No luck, now go through the hard part of looking at the PTEs
2433	 * themselves.  Sync so that any pending REF/CHG bits are flushed to
2434	 * the PTEs.
2435	 */
2436	powerpc_sync();
2437	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2438
2439		/*
2440		 * See if this pvo has a valid PTE.  if so, fetch the
2441		 * REF/CHG bits from the valid PTE.  If the appropriate
2442		 * ptebit is set, return success.
2443		 */
2444		PMAP_LOCK(pvo->pvo_pmap);
2445		pt = MOEA64_PVO_TO_PTE(mmu, pvo);
2446		if (pt != -1) {
2447			MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte);
2448			if (pvo->pvo_pte.lpte.pte_lo & ptebit) {
2449				PMAP_UNLOCK(pvo->pvo_pmap);
2450				UNLOCK_TABLE_RD();
2451				return (TRUE);
2452			}
2453		}
2454		PMAP_UNLOCK(pvo->pvo_pmap);
2455	}
2456
2457	UNLOCK_TABLE_RD();
2458	return (FALSE);
2459}
2460
2461static u_int
2462moea64_clear_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit)
2463{
2464	u_int	count;
2465	struct	pvo_entry *pvo;
2466	uintptr_t pt;
2467
2468	/*
2469	 * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
2470	 * we can reset the right ones).  Note that since the PVO entries and
2471	 * list heads are never themselves mapped by the PTEs being examined
2472	 * here, we don't have to worry about further accesses setting the
2473	 * REF/CHG bits.
2474	 */
2475	powerpc_sync();
2476
2477	/*
2478	 * For each pvo entry, clear the pvo's ptebit.  If this pvo has a
2479	 * valid pte clear the ptebit from the valid pte.
2480	 */
2481	count = 0;
2482	LOCK_TABLE_RD();
2483	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2484		PMAP_LOCK(pvo->pvo_pmap);
2485		pt = MOEA64_PVO_TO_PTE(mmu, pvo);
2486		if (pt != -1) {
2487			MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte);
2488			if (pvo->pvo_pte.lpte.pte_lo & ptebit) {
2489				count++;
2490				MOEA64_PTE_CLEAR(mmu, pt, &pvo->pvo_pte.lpte,
2491				    pvo->pvo_vpn, ptebit);
2492			}
2493		}
2494		pvo->pvo_pte.lpte.pte_lo &= ~ptebit;
2495		PMAP_UNLOCK(pvo->pvo_pmap);
2496	}
2497
2498	UNLOCK_TABLE_RD();
2499	return (count);
2500}
2501
2502boolean_t
2503moea64_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size)
2504{
2505	struct pvo_entry *pvo, key;
2506	vm_offset_t ppa;
2507	int error = 0;
2508
2509	PMAP_LOCK(kernel_pmap);
2510	key.pvo_vaddr = ppa = pa & ~ADDR_POFF;
2511	for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key);
2512	    ppa < pa + size; ppa += PAGE_SIZE,
2513	    pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) {
2514		if (pvo == NULL ||
2515		    (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) != ppa) {
2516			error = EFAULT;
2517			break;
2518		}
2519	}
2520	PMAP_UNLOCK(kernel_pmap);
2521
2522	return (error);
2523}
2524
2525/*
2526 * Map a set of physical memory pages into the kernel virtual
2527 * address space. Return a pointer to where it is mapped. This
2528 * routine is intended to be used for mapping device memory,
2529 * NOT real memory.
2530 */
2531void *
2532moea64_mapdev_attr(mmu_t mmu, vm_offset_t pa, vm_size_t size, vm_memattr_t ma)
2533{
2534	vm_offset_t va, tmpva, ppa, offset;
2535
2536	ppa = trunc_page(pa);
2537	offset = pa & PAGE_MASK;
2538	size = roundup2(offset + size, PAGE_SIZE);
2539
2540	va = kva_alloc(size);
2541
2542	if (!va)
2543		panic("moea64_mapdev: Couldn't alloc kernel virtual memory");
2544
2545	for (tmpva = va; size > 0;) {
2546		moea64_kenter_attr(mmu, tmpva, ppa, ma);
2547		size -= PAGE_SIZE;
2548		tmpva += PAGE_SIZE;
2549		ppa += PAGE_SIZE;
2550	}
2551
2552	return ((void *)(va + offset));
2553}
2554
2555void *
2556moea64_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size)
2557{
2558
2559	return moea64_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT);
2560}
2561
2562void
2563moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size)
2564{
2565	vm_offset_t base, offset;
2566
2567	base = trunc_page(va);
2568	offset = va & PAGE_MASK;
2569	size = roundup2(offset + size, PAGE_SIZE);
2570
2571	kva_free(base, size);
2572}
2573
2574void
2575moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz)
2576{
2577	struct pvo_entry *pvo;
2578	vm_offset_t lim;
2579	vm_paddr_t pa;
2580	vm_size_t len;
2581
2582	PMAP_LOCK(pm);
2583	while (sz > 0) {
2584		lim = round_page(va + 1);	/* first page boundary above va */
2585		len = MIN(lim - va, sz);
2586		pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF);
2587		if (pvo != NULL && !(pvo->pvo_pte.lpte.pte_lo & LPTE_I)) {
2588			pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) |
2589			    (va & ADDR_POFF);
2590			moea64_syncicache(mmu, pm, va, pa, len);
2591		}
2592		va += len;
2593		sz -= len;
2594	}
2595	PMAP_UNLOCK(pm);
2596}
2597
2598vm_offset_t
2599moea64_dumpsys_map(mmu_t mmu, struct pmap_md *md, vm_size_t ofs,
2600    vm_size_t *sz)
2601{
2602	if (md->md_vaddr == ~0UL)
2603	    return (md->md_paddr + ofs);
2604	else
2605	    return (md->md_vaddr + ofs);
2606}
2607
2608struct pmap_md *
2609moea64_scan_md(mmu_t mmu, struct pmap_md *prev)
2610{
2611	static struct pmap_md md;
2612	struct pvo_entry *pvo;
2613	vm_offset_t va;
2614
2615	if (dumpsys_minidump) {
2616		md.md_paddr = ~0UL;	/* Minidumps use virtual addresses. */
2617		if (prev == NULL) {
2618			/* 1st: kernel .data and .bss. */
2619			md.md_index = 1;
2620			md.md_vaddr = trunc_page((uintptr_t)_etext);
2621			md.md_size = round_page((uintptr_t)_end) - md.md_vaddr;
2622			return (&md);
2623		}
2624		switch (prev->md_index) {
2625		case 1:
2626			/* 2nd: msgbuf and tables (see pmap_bootstrap()). */
2627			md.md_index = 2;
2628			md.md_vaddr = (vm_offset_t)msgbufp->msg_ptr;
2629			md.md_size = round_page(msgbufp->msg_size);
2630			break;
2631		case 2:
2632			/* 3rd: kernel VM. */
2633			va = prev->md_vaddr + prev->md_size;
2634			/* Find start of next chunk (from va). */
2635			while (va < virtual_end) {
2636				/* Don't dump the buffer cache. */
2637				if (va >= kmi.buffer_sva &&
2638				    va < kmi.buffer_eva) {
2639					va = kmi.buffer_eva;
2640					continue;
2641				}
2642				pvo = moea64_pvo_find_va(kernel_pmap,
2643				    va & ~ADDR_POFF);
2644				if (pvo != NULL &&
2645				    (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID))
2646					break;
2647				va += PAGE_SIZE;
2648			}
2649			if (va < virtual_end) {
2650				md.md_vaddr = va;
2651				va += PAGE_SIZE;
2652				/* Find last page in chunk. */
2653				while (va < virtual_end) {
2654					/* Don't run into the buffer cache. */
2655					if (va == kmi.buffer_sva)
2656						break;
2657					pvo = moea64_pvo_find_va(kernel_pmap,
2658					    va & ~ADDR_POFF);
2659					if (pvo == NULL ||
2660					    !(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID))
2661						break;
2662					va += PAGE_SIZE;
2663				}
2664				md.md_size = va - md.md_vaddr;
2665				break;
2666			}
2667			md.md_index = 3;
2668			/* FALLTHROUGH */
2669		default:
2670			return (NULL);
2671		}
2672	} else { /* full dump: walk the physical memory regions */
2673		if (prev == NULL) {
2674			/* first physical chunk. */
2675			md.md_paddr = pregions[0].mr_start;
2676			md.md_size = pregions[0].mr_size;
2677			md.md_vaddr = ~0UL;
2678			md.md_index = 1;
2679		} else if (md.md_index < pregions_sz) {
2680			md.md_paddr = pregions[md.md_index].mr_start;
2681			md.md_size = pregions[md.md_index].mr_size;
2682			md.md_vaddr = ~0UL;
2683			md.md_index++;
2684		} else {
2685			/* There's no next physical chunk. */
2686			return (NULL);
2687		}
2688	}
2689
2690	return (&md);
2691}
2692