/*-
 * Copyright (c) 2001 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas <matt@3am-software.com> of Allegro Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (C) 1995, 1996 Wolfgang Solfrank.
 * Copyright (C) 1995, 1996 TooLs GmbH.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by TooLs GmbH.
 * 4. The name of TooLs GmbH may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $
 */
/*-
 * Copyright (C) 2001 Benno Rice.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/powerpc/aim/mmu_oea64.c 208175 2010-05-16 23:45:10Z alc $");

/*
 * Manages physical address maps.
 *
 * In addition to hardware address maps, this module is called upon to
 * provide software-use-only maps which may or may not be stored in the
 * same form as hardware maps.  These pseudo-maps are used to store
 * intermediate results from copy operations to and from address spaces.
 *
 * Since the information managed by this module is also stored by the
 * logical address mapping module, this module may throw away valid virtual
 * to physical mappings at almost any time.  However, invalidations of
 * mappings must be done as requested.
 *
 * In order to cope with hardware architectures which make virtual to
 * physical map invalidates expensive, this module may delay invalidate
 * reduced protection operations until such time as they are actually
 * necessary.  This module is given full information as to which processors
 * are currently using which maps, and to when physical maps must be made
 * correct.
 */

#include "opt_kstack_pages.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>

#include <sys/kdb.h>

#include <dev/ofw/openfirm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/uma.h>

#include <machine/cpu.h>
#include <machine/platform.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <machine/psl.h>
#include <machine/bat.h>
#include <machine/pte.h>
#include <machine/sr.h>
#include <machine/trap.h>
#include <machine/mmuvar.h>

#include "mmu_if.h"

#define	MOEA_DEBUG

#define TODO	panic("%s: not implemented", __func__);

static __inline u_int32_t
cntlzw(volatile u_int32_t a) {
	u_int32_t b;
	__asm ("cntlzw %0, %1" : "=r"(b) : "r"(a));
	return b;
}

static __inline uint64_t
va_to_vsid(pmap_t pm, vm_offset_t va)
{
	return ((pm->pm_sr[(uintptr_t)va >> ADDR_SR_SHFT]) & SR_VSID_MASK);
}
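
/*
 * Added note: the top four bits of the 32-bit effective address select one
 * of the 16 entries in pm_sr[]; the VSID stored there is what tags this
 * pmap's entries in the hash table lookups below.
 */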

#define	PTESYNC()	__asm __volatile("ptesync");
#define	TLBSYNC()	__asm __volatile("tlbsync; ptesync");
#define	SYNC()		__asm __volatile("sync");
#define	EIEIO()		__asm __volatile("eieio");

/*
 * The tlbie instruction must be executed in 64-bit mode
 * so we have to twiddle MSR[SF] around every invocation.
 * Just to add to the fun, exceptions must be off as well
 * so that we can't trap in 64-bit mode. What a pain.
 */
struct mtx	tlbie_mutex;

static __inline void
TLBIE(pmap_t pmap, vm_offset_t va) {
	uint64_t vpn;
	register_t vpn_hi, vpn_lo;
	register_t msr;
	register_t scratch;

	vpn = (uint64_t)(va & ADDR_PIDX);
	if (pmap != NULL)
		vpn |= (va_to_vsid(pmap,va) << 28);
	vpn &= ~(0xffffULL << 48);

	vpn_hi = (uint32_t)(vpn >> 32);
	vpn_lo = (uint32_t)vpn;

	mtx_lock_spin(&tlbie_mutex);
	__asm __volatile("\
	    mfmsr %0; \
	    mr %1, %0; \
	    insrdi %1,%5,1,0; \
	    mtmsrd %1; \
	    ptesync; \
	    \
	    sld %1,%2,%4; \
	    or %1,%1,%3; \
	    tlbie %1; \
	    \
	    mtmsrd %0; \
	    eieio; \
	    tlbsync; \
	    ptesync;"
	: "=r"(msr), "=r"(scratch) : "r"(vpn_hi), "r"(vpn_lo), "r"(32), "r"(1)
	    : "memory");
	mtx_unlock_spin(&tlbie_mutex);
}

#define DISABLE_TRANS(msr)	msr = mfmsr(); mtmsr(msr & ~PSL_DR); isync()
#define ENABLE_TRANS(msr)	mtmsr(msr); isync()

#define	VSID_MAKE(sr, hash)	((sr) | (((hash) & 0xfffff) << 4))
#define	VSID_TO_SR(vsid)	((vsid) & 0xf)
#define	VSID_TO_HASH(vsid)	(((vsid) >> 4) & 0xfffff)
#define	VSID_HASH_MASK		0x0000007fffffffffULL
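
/*
 * Example (added for illustration, values are hypothetical): VSID_MAKE(0x3,
 * 0x12345) packs to 0x123453 -- the low four bits name the segment register
 * and the next twenty bits carry the context hash, which VSID_TO_SR() and
 * VSID_TO_HASH() recover as 0x3 and 0x12345 respectively.
 */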

#define	PVO_PTEGIDX_MASK	0x007UL		/* which PTEG slot */
#define	PVO_PTEGIDX_VALID	0x008UL		/* slot is valid */
#define	PVO_WIRED		0x010UL		/* PVO entry is wired */
#define	PVO_MANAGED		0x020UL		/* PVO entry is managed */
#define	PVO_BOOTSTRAP		0x080UL		/* PVO entry allocated during
						   bootstrap */
#define PVO_FAKE		0x100UL		/* fictitious phys page */
#define	PVO_VADDR(pvo)		((pvo)->pvo_vaddr & ~ADDR_POFF)
#define PVO_ISFAKE(pvo)		((pvo)->pvo_vaddr & PVO_FAKE)
#define	PVO_PTEGIDX_GET(pvo)	((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK)
#define	PVO_PTEGIDX_ISSET(pvo)	((pvo)->pvo_vaddr & PVO_PTEGIDX_VALID)
#define	PVO_PTEGIDX_CLR(pvo)	\
	((void)((pvo)->pvo_vaddr &= ~(PVO_PTEGIDX_VALID|PVO_PTEGIDX_MASK)))
#define	PVO_PTEGIDX_SET(pvo, i)	\
	((void)((pvo)->pvo_vaddr |= (i)|PVO_PTEGIDX_VALID))

#define	MOEA_PVO_CHECK(pvo)

#define LOCK_TABLE() mtx_lock(&moea64_table_mutex)
#define UNLOCK_TABLE() mtx_unlock(&moea64_table_mutex);
#define ASSERT_TABLE_LOCK() mtx_assert(&moea64_table_mutex, MA_OWNED)

struct ofw_map {
	vm_offset_t	om_va;
	vm_size_t	om_len;
	vm_offset_t	om_pa_hi;
	vm_offset_t	om_pa_lo;
	u_int		om_mode;
};

/*
 * Map of physical memory regions.
 */
static struct	mem_region *regions;
static struct	mem_region *pregions;
extern u_int	phys_avail_count;
extern int	regions_sz, pregions_sz;
extern int	ofw_real_mode;

extern struct pmap ofw_pmap;

extern void bs_remap_earlyboot(void);


/*
 * Lock for the pteg and pvo tables.
 */
struct mtx	moea64_table_mutex;

/*
 * PTEG data.
 */
static struct	lpteg *moea64_pteg_table;
u_int		moea64_pteg_count;
u_int		moea64_pteg_mask;

/*
 * PVO data.
 */
struct	pvo_head *moea64_pvo_table;		/* pvo entries by pteg index */
/* lists of unmanaged pages */
struct	pvo_head moea64_pvo_kunmanaged =
    LIST_HEAD_INITIALIZER(moea64_pvo_kunmanaged);
struct	pvo_head moea64_pvo_unmanaged =
    LIST_HEAD_INITIALIZER(moea64_pvo_unmanaged);

uma_zone_t	moea64_upvo_zone; /* zone for pvo entries for unmanaged pages */
uma_zone_t	moea64_mpvo_zone; /* zone for pvo entries for managed pages */

#define	BPVO_POOL_SIZE	327680
static struct	pvo_entry *moea64_bpvo_pool;
static int	moea64_bpvo_pool_index = 0;

#define	VSID_NBPW	(sizeof(u_int32_t) * 8)
static u_int	moea64_vsid_bitmap[NPMAPS / VSID_NBPW];

static boolean_t moea64_initialized = FALSE;

/*
 * Statistics.
 */
u_int	moea64_pte_valid = 0;
u_int	moea64_pte_overflow = 0;
u_int	moea64_pvo_entries = 0;
u_int	moea64_pvo_enter_calls = 0;
u_int	moea64_pvo_remove_calls = 0;
SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD,
    &moea64_pte_valid, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD,
    &moea64_pte_overflow, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD,
    &moea64_pvo_entries, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD,
    &moea64_pvo_enter_calls, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD,
    &moea64_pvo_remove_calls, 0, "");

vm_offset_t	moea64_scratchpage_va[2];
struct	lpte 	*moea64_scratchpage_pte[2];
struct	mtx	moea64_scratchpage_mtx;

/*
 * Allocate physical memory for use in moea64_bootstrap.
 */
static vm_offset_t	moea64_bootstrap_alloc(vm_size_t, u_int);

/*
 * PTE calls.
 */
static int		moea64_pte_insert(u_int, struct lpte *);

/*
 * PVO calls.
 */
static int	moea64_pvo_enter(pmap_t, uma_zone_t, struct pvo_head *,
		    vm_offset_t, vm_offset_t, uint64_t, int);
static void	moea64_pvo_remove(struct pvo_entry *, int);
static struct	pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t, int *);
static struct	lpte *moea64_pvo_to_pte(const struct pvo_entry *, int);

/*
 * Utility routines.
 */
static void		moea64_bridge_bootstrap(mmu_t mmup,
			    vm_offset_t kernelstart, vm_offset_t kernelend);
static void		moea64_bridge_cpu_bootstrap(mmu_t, int ap);
static void		moea64_enter_locked(pmap_t, vm_offset_t, vm_page_t,
			    vm_prot_t, boolean_t);
static boolean_t	moea64_query_bit(vm_page_t, u_int64_t);
static u_int		moea64_clear_bit(vm_page_t, u_int64_t, u_int64_t *);
static void		moea64_kremove(mmu_t, vm_offset_t);
static void		moea64_syncicache(pmap_t pmap, vm_offset_t va,
			    vm_offset_t pa, vm_size_t sz);
static void		tlbia(void);

/*
 * Kernel MMU interface
 */
void moea64_change_wiring(mmu_t, pmap_t, vm_offset_t, boolean_t);
void moea64_clear_modify(mmu_t, vm_page_t);
void moea64_clear_reference(mmu_t, vm_page_t);
void moea64_copy_page(mmu_t, vm_page_t, vm_page_t);
void moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t);
void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
    vm_prot_t);
void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t);
vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t);
void moea64_init(mmu_t);
boolean_t moea64_is_modified(mmu_t, vm_page_t);
boolean_t moea64_is_referenced(mmu_t, vm_page_t);
boolean_t moea64_ts_referenced(mmu_t, vm_page_t);
vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_offset_t, vm_offset_t, int);
boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t);
int moea64_page_wired_mappings(mmu_t, vm_page_t);
void moea64_pinit(mmu_t, pmap_t);
void moea64_pinit0(mmu_t, pmap_t);
void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int);
void moea64_qremove(mmu_t, vm_offset_t, int);
void moea64_release(mmu_t, pmap_t);
void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t);
void moea64_remove_all(mmu_t, vm_page_t);
void moea64_remove_write(mmu_t, vm_page_t);
void moea64_zero_page(mmu_t, vm_page_t);
void moea64_zero_page_area(mmu_t, vm_page_t, int, int);
void moea64_zero_page_idle(mmu_t, vm_page_t);
void moea64_activate(mmu_t, struct thread *);
void moea64_deactivate(mmu_t, struct thread *);
void *moea64_mapdev(mmu_t, vm_offset_t, vm_size_t);
void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t);
vm_offset_t moea64_kextract(mmu_t, vm_offset_t);
void moea64_kenter(mmu_t, vm_offset_t, vm_offset_t);
boolean_t moea64_dev_direct_mapped(mmu_t, vm_offset_t, vm_size_t);
static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t);

static mmu_method_t moea64_bridge_methods[] = {
	MMUMETHOD(mmu_change_wiring,	moea64_change_wiring),
	MMUMETHOD(mmu_clear_modify,	moea64_clear_modify),
	MMUMETHOD(mmu_clear_reference,	moea64_clear_reference),
	MMUMETHOD(mmu_copy_page,	moea64_copy_page),
	MMUMETHOD(mmu_enter,		moea64_enter),
	MMUMETHOD(mmu_enter_object,	moea64_enter_object),
	MMUMETHOD(mmu_enter_quick,	moea64_enter_quick),
	MMUMETHOD(mmu_extract,		moea64_extract),
	MMUMETHOD(mmu_extract_and_hold,	moea64_extract_and_hold),
	MMUMETHOD(mmu_init,		moea64_init),
	MMUMETHOD(mmu_is_modified,	moea64_is_modified),
	MMUMETHOD(mmu_is_referenced,	moea64_is_referenced),
	MMUMETHOD(mmu_ts_referenced,	moea64_ts_referenced),
	MMUMETHOD(mmu_map,     		moea64_map),
	MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick),
	MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings),
	MMUMETHOD(mmu_pinit,		moea64_pinit),
	MMUMETHOD(mmu_pinit0,		moea64_pinit0),
	MMUMETHOD(mmu_protect,		moea64_protect),
	MMUMETHOD(mmu_qenter,		moea64_qenter),
	MMUMETHOD(mmu_qremove,		moea64_qremove),
	MMUMETHOD(mmu_release,		moea64_release),
	MMUMETHOD(mmu_remove,		moea64_remove),
	MMUMETHOD(mmu_remove_all,      	moea64_remove_all),
	MMUMETHOD(mmu_remove_write,	moea64_remove_write),
	MMUMETHOD(mmu_sync_icache,	moea64_sync_icache),
	MMUMETHOD(mmu_zero_page,       	moea64_zero_page),
	MMUMETHOD(mmu_zero_page_area,	moea64_zero_page_area),
	MMUMETHOD(mmu_zero_page_idle,	moea64_zero_page_idle),
	MMUMETHOD(mmu_activate,		moea64_activate),
	MMUMETHOD(mmu_deactivate,      	moea64_deactivate),

	/* Internal interfaces */
	MMUMETHOD(mmu_bootstrap,       	moea64_bridge_bootstrap),
	MMUMETHOD(mmu_cpu_bootstrap,   	moea64_bridge_cpu_bootstrap),
	MMUMETHOD(mmu_mapdev,		moea64_mapdev),
	MMUMETHOD(mmu_unmapdev,		moea64_unmapdev),
	MMUMETHOD(mmu_kextract,		moea64_kextract),
	MMUMETHOD(mmu_kenter,		moea64_kenter),
	MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped),

	{ 0, 0 }
};

static mmu_def_t oea64_bridge_mmu = {
	MMU_TYPE_G5,
	moea64_bridge_methods,
	0
};
MMU_DEF(oea64_bridge_mmu);

static __inline u_int
va_to_pteg(uint64_t vsid, vm_offset_t addr)
{
	uint64_t hash;

	hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >>
	    ADDR_PIDX_SHFT);
	return (hash & moea64_pteg_mask);
}
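
/*
 * Added note: this is the primary hash of the 64-bit PowerPC hashed page
 * table -- (VSID & VSID_HASH_MASK) XOR the page index of the EA -- truncated
 * to the PTEG count.  The secondary hash used on collisions is its
 * complement within the table (tracked via LPTE_HID), hence the
 * "ptegidx ^= moea64_pteg_mask" seen elsewhere in this file.
 */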

static __inline struct pvo_head *
pa_to_pvoh(vm_offset_t pa, vm_page_t *pg_p)
{
	struct	vm_page *pg;

	pg = PHYS_TO_VM_PAGE(pa);

	if (pg_p != NULL)
		*pg_p = pg;

	if (pg == NULL)
		return (&moea64_pvo_unmanaged);

	return (&pg->md.mdpg_pvoh);
}

static __inline struct pvo_head *
vm_page_to_pvoh(vm_page_t m)
{

	return (&m->md.mdpg_pvoh);
}

static __inline void
moea64_attr_clear(vm_page_t m, u_int64_t ptebit)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->md.mdpg_attrs &= ~ptebit;
}

static __inline u_int64_t
moea64_attr_fetch(vm_page_t m)
{

	return (m->md.mdpg_attrs);
}

static __inline void
moea64_attr_save(vm_page_t m, u_int64_t ptebit)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->md.mdpg_attrs |= ptebit;
}

static __inline void
moea64_pte_create(struct lpte *pt, uint64_t vsid, vm_offset_t va,
    uint64_t pte_lo)
{
	ASSERT_TABLE_LOCK();

	/*
	 * Construct a PTE.  Default to IMB initially.  Valid bit only gets
	 * set when the real pte is set in memory.
	 *
	 * Note: Don't set the valid bit for correct operation of tlb update.
	 */
	pt->pte_hi = (vsid << LPTE_VSID_SHIFT) |
	    (((uint64_t)(va & ADDR_PIDX) >> ADDR_API_SHFT64) & LPTE_API);

	pt->pte_lo = pte_lo;
}

static __inline void
moea64_pte_synch(struct lpte *pt, struct lpte *pvo_pt)
{

	ASSERT_TABLE_LOCK();

	pvo_pt->pte_lo |= pt->pte_lo & (LPTE_REF | LPTE_CHG);
}

static __inline void
moea64_pte_clear(struct lpte *pt, pmap_t pmap, vm_offset_t va, u_int64_t ptebit)
{
	ASSERT_TABLE_LOCK();

	/*
	 * As shown in Section 7.6.3.2.3
	 */
	pt->pte_lo &= ~ptebit;
	TLBIE(pmap,va);
}

static __inline void
moea64_pte_set(struct lpte *pt, struct lpte *pvo_pt)
{

	ASSERT_TABLE_LOCK();
	pvo_pt->pte_hi |= LPTE_VALID;

	/*
	 * Update the PTE as defined in section 7.6.3.1.
	 * Note that the REF/CHG bits are from pvo_pt and thus should have
	 * been saved so this routine can restore them (if desired).
	 */
	pt->pte_lo = pvo_pt->pte_lo;
	EIEIO();
	pt->pte_hi = pvo_pt->pte_hi;
	PTESYNC();
	moea64_pte_valid++;
}

static __inline void
moea64_pte_unset(struct lpte *pt, struct lpte *pvo_pt, pmap_t pmap, vm_offset_t va)
{
	ASSERT_TABLE_LOCK();
	pvo_pt->pte_hi &= ~LPTE_VALID;

	/*
	 * Force the ref & chg bits back into the PTEs.
	 */
	SYNC();

	/*
	 * Invalidate the pte.
	 */
	pt->pte_hi &= ~LPTE_VALID;
	TLBIE(pmap,va);

	/*
	 * Save the ref & chg bits.
	 */
	moea64_pte_synch(pt, pvo_pt);
	moea64_pte_valid--;
}

static __inline void
moea64_pte_change(struct lpte *pt, struct lpte *pvo_pt, pmap_t pmap, vm_offset_t va)
{

	/*
	 * Invalidate the PTE
	 */
	moea64_pte_unset(pt, pvo_pt, pmap, va);
	moea64_pte_set(pt, pvo_pt);
	if (pmap == kernel_pmap)
		isync();
}

static __inline uint64_t
moea64_calc_wimg(vm_offset_t pa)
{
	uint64_t pte_lo;
	int i;

	/*
	 * Assume the page is cache inhibited and access is guarded unless
	 * it's in our available memory array.
	 */
	pte_lo = LPTE_I | LPTE_G;
	for (i = 0; i < pregions_sz; i++) {
		if ((pa >= pregions[i].mr_start) &&
		    (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
			pte_lo &= ~(LPTE_I | LPTE_G);
			pte_lo |= LPTE_M;
			break;
		}
	}

	return pte_lo;
}
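
/*
 * Added note: the effect is that anything backed by a known physical memory
 * region gets WIMG = M (cacheable, memory coherent), while everything else,
 * e.g. memory-mapped device space, stays I | G (cache-inhibited, guarded).
 */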

/*
 * Quick sort callout for comparing memory regions.
 */
static int	mr_cmp(const void *a, const void *b);
static int	om_cmp(const void *a, const void *b);

static int
mr_cmp(const void *a, const void *b)
{
	const struct	mem_region *regiona;
	const struct	mem_region *regionb;

	regiona = a;
	regionb = b;
	if (regiona->mr_start < regionb->mr_start)
		return (-1);
	else if (regiona->mr_start > regionb->mr_start)
		return (1);
	else
		return (0);
}

static int
om_cmp(const void *a, const void *b)
{
	const struct	ofw_map *mapa;
	const struct	ofw_map *mapb;

	mapa = a;
	mapb = b;
	if (mapa->om_pa_hi < mapb->om_pa_hi)
		return (-1);
	else if (mapa->om_pa_hi > mapb->om_pa_hi)
		return (1);
	else if (mapa->om_pa_lo < mapb->om_pa_lo)
		return (-1);
	else if (mapa->om_pa_lo > mapb->om_pa_lo)
		return (1);
	else
		return (0);
}

static void
moea64_bridge_cpu_bootstrap(mmu_t mmup, int ap)
{
	int i = 0;

	/*
	 * Initialize segment registers and MMU
	 */

	mtmsr(mfmsr() & ~PSL_DR & ~PSL_IR); isync();
	for (i = 0; i < 16; i++) {
		mtsrin(i << ADDR_SR_SHFT, kernel_pmap->pm_sr[i]);
	}
	__asm __volatile ("ptesync; mtsdr1 %0; isync"
	    :: "r"((u_int)moea64_pteg_table
		     | (32 - cntlzw(moea64_pteg_mask >> 11))));
	tlbia();
}

static void
moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz)
{
	struct ofw_map	translations[sz/sizeof(struct ofw_map)];
	register_t	msr;
	vm_offset_t	off;
	vm_paddr_t	pa_base;
	int		i, ofw_mappings;

	bzero(translations, sz);
	if (OF_getprop(mmu, "translations", translations, sz) == -1)
		panic("moea64_bootstrap: can't get ofw translations");

	CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations");
	sz /= sizeof(*translations);
	qsort(translations, sz, sizeof (*translations), om_cmp);

	for (i = 0, ofw_mappings = 0; i < sz; i++) {
		CTR3(KTR_PMAP, "translation: pa=%#x va=%#x len=%#x",
		    (uint32_t)(translations[i].om_pa_lo), translations[i].om_va,
		    translations[i].om_len);

		if (translations[i].om_pa_lo % PAGE_SIZE)
			panic("OFW translation not page-aligned!");

		if (translations[i].om_pa_hi)
			panic("OFW translations above 32-bit boundary!");

		pa_base = translations[i].om_pa_lo;

		/* Now enter the pages for this mapping */

		DISABLE_TRANS(msr);
		for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) {
			moea64_kenter(mmup, translations[i].om_va + off,
			    pa_base + off);

			ofw_mappings++;
		}
		ENABLE_TRANS(msr);
	}
}

static void
moea64_bridge_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
{
	ihandle_t	mmui;
	phandle_t	chosen;
	phandle_t	mmu;
	size_t		sz;
	int		i, j;
	vm_size_t	size, physsz, hwphyssz;
	vm_offset_t	pa, va, off;
	register_t	msr;
	void		*dpcpu;

	/* We don't have a direct map since there is no BAT */
	hw_direct_map = 0;

	/* Make sure battable is zero, since we have no BAT */
	for (i = 0; i < 16; i++) {
		battable[i].batu = 0;
		battable[i].batl = 0;
	}

	/* Get physical memory regions from firmware */
	mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
	CTR0(KTR_PMAP, "moea64_bootstrap: physical memory");

	qsort(pregions, pregions_sz, sizeof(*pregions), mr_cmp);
	if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz)
		panic("moea64_bootstrap: phys_avail too small");
	qsort(regions, regions_sz, sizeof(*regions), mr_cmp);
	phys_avail_count = 0;
	physsz = 0;
	hwphyssz = 0;
	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
	for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
		CTR3(KTR_PMAP, "region: %#x - %#x (%#x)", regions[i].mr_start,
		    regions[i].mr_start + regions[i].mr_size,
		    regions[i].mr_size);
		if (hwphyssz != 0 &&
		    (physsz + regions[i].mr_size) >= hwphyssz) {
			if (physsz < hwphyssz) {
				phys_avail[j] = regions[i].mr_start;
				phys_avail[j + 1] = regions[i].mr_start +
				    hwphyssz - physsz;
				physsz = hwphyssz;
				phys_avail_count++;
			}
			break;
		}
		phys_avail[j] = regions[i].mr_start;
		phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
		phys_avail_count++;
		physsz += regions[i].mr_size;
	}
	physmem = btoc(physsz);

	/*
	 * Allocate PTEG table.
	 */
#ifdef PTEGCOUNT
	moea64_pteg_count = PTEGCOUNT;
#else
	moea64_pteg_count = 0x1000;

	while (moea64_pteg_count < physmem)
		moea64_pteg_count <<= 1;
#endif /* PTEGCOUNT */

	size = moea64_pteg_count * sizeof(struct lpteg);
	CTR2(KTR_PMAP, "moea64_bootstrap: %d PTEGs, %d bytes",
	    moea64_pteg_count, size);

	/*
	 * We now need to allocate memory.  This memory, to be allocated,
	 * has to reside in a page table -- the very page table we are about
	 * to allocate -- and we have no BATs to fall back on, so as a last
	 * resort we drop to data real mode for a minute while we set it up.
	 * We do this a couple of times below.
	 */

	moea64_pteg_table = (struct lpteg *)moea64_bootstrap_alloc(size, size);
	DISABLE_TRANS(msr);
	bzero((void *)moea64_pteg_table, moea64_pteg_count * sizeof(struct lpteg));
	ENABLE_TRANS(msr);

	moea64_pteg_mask = moea64_pteg_count - 1;

	CTR1(KTR_PMAP, "moea64_bootstrap: PTEG table at %p", moea64_pteg_table);

	/*
	 * Allocate pv/overflow lists.
	 */
	size = sizeof(struct pvo_head) * moea64_pteg_count;

	moea64_pvo_table = (struct pvo_head *)moea64_bootstrap_alloc(size,
	    PAGE_SIZE);
	CTR1(KTR_PMAP, "moea64_bootstrap: PVO table at %p", moea64_pvo_table);

	DISABLE_TRANS(msr);
	for (i = 0; i < moea64_pteg_count; i++)
		LIST_INIT(&moea64_pvo_table[i]);
	ENABLE_TRANS(msr);

	/*
	 * Initialize the lock that synchronizes access to the pteg and pvo
	 * tables.
	 */
	mtx_init(&moea64_table_mutex, "pmap table", NULL, MTX_DEF |
	    MTX_RECURSE);

	/*
	 * Initialize the TLBIE lock. TLBIE can only be executed by one CPU.
	 */
	mtx_init(&tlbie_mutex, "tlbie mutex", NULL, MTX_SPIN);

	/*
	 * Initialise the unmanaged pvo pool.
	 */
	moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc(
		BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0);
	moea64_bpvo_pool_index = 0;

	/*
	 * Make sure kernel vsid is allocated as well as VSID 0.
	 */
	moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NPMAPS - 1)) / VSID_NBPW]
		|= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
	moea64_vsid_bitmap[0] |= 1;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	for (i = 0; i < 16; i++)
		kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;

	kernel_pmap->pmap_phys = kernel_pmap;
	kernel_pmap->pm_active = ~0;

	PMAP_LOCK_INIT(kernel_pmap);

	/*
	 * Now map in all the other buffers we allocated earlier
	 */

	DISABLE_TRANS(msr);
	size = moea64_pteg_count * sizeof(struct lpteg);
	off = (vm_offset_t)(moea64_pteg_table);
	for (pa = off; pa < off + size; pa += PAGE_SIZE)
		moea64_kenter(mmup, pa, pa);
	size = sizeof(struct pvo_head) * moea64_pteg_count;
	off = (vm_offset_t)(moea64_pvo_table);
	for (pa = off; pa < off + size; pa += PAGE_SIZE)
		moea64_kenter(mmup, pa, pa);
	size = BPVO_POOL_SIZE*sizeof(struct pvo_entry);
	off = (vm_offset_t)(moea64_bpvo_pool);
	for (pa = off; pa < off + size; pa += PAGE_SIZE)
		moea64_kenter(mmup, pa, pa);

	/*
	 * Map certain important things, like ourselves.
	 *
	 * NOTE: We do not map the exception vector space. That code is
	 * used only in real mode, and leaving it unmapped allows us to
	 * catch NULL pointer dereferences, instead of making NULL a valid
	 * address.
	 */

	for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE)
		moea64_kenter(mmup, pa, pa);
	ENABLE_TRANS(msr);

	if (!ofw_real_mode) {
	    /*
	     * Set up the Open Firmware pmap and add its mappings.
	     */

	    moea64_pinit(mmup, &ofw_pmap);
	    for (i = 0; i < 16; i++)
		ofw_pmap.pm_sr[i] = kernel_pmap->pm_sr[i];

	    if ((chosen = OF_finddevice("/chosen")) == -1)
		panic("moea64_bootstrap: can't find /chosen");
	    OF_getprop(chosen, "mmu", &mmui, 4);
	    if ((mmu = OF_instance_to_package(mmui)) == -1)
		panic("moea64_bootstrap: can't get mmu package");
	    if ((sz = OF_getproplen(mmu, "translations")) == -1)
		panic("moea64_bootstrap: can't get ofw translation count");
	    if (sz > 6144 /* tmpstksz - 2 KB headroom */)
		panic("moea64_bootstrap: too many ofw translations");

	    moea64_add_ofw_mappings(mmup, mmu, sz);
	}

#ifdef SMP
	TLBSYNC();
#endif

	/*
	 * Calculate the last available physical address.
	 */
	for (i = 0; phys_avail[i + 2] != 0; i += 2)
		;
	Maxmem = powerpc_btop(phys_avail[i + 1]);

	/*
	 * Initialize MMU and remap early physical mappings
	 */
	moea64_bridge_cpu_bootstrap(mmup,0);
	mtmsr(mfmsr() | PSL_DR | PSL_IR); isync();
	pmap_bootstrapped++;
	bs_remap_earlyboot();

	/*
	 * Set the start and end of kva.
	 */
	virtual_avail = VM_MIN_KERNEL_ADDRESS;
	virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;

	/*
	 * Figure out how far we can extend virtual_end into segment 16
	 * without running into existing mappings. Segment 16 is guaranteed
	 * to contain neither RAM nor devices (at least on Apple hardware),
	 * but will generally contain some OFW mappings we should not
	 * step on.
	 */

	PMAP_LOCK(kernel_pmap);
	while (moea64_pvo_find_va(kernel_pmap, virtual_end+1, NULL) == NULL)
		virtual_end += PAGE_SIZE;
	PMAP_UNLOCK(kernel_pmap);

	/*
	 * Allocate some things for page zeroing. We put this directly
	 * in the page table, marked with LPTE_LOCKED, to avoid any
	 * of the PVO book-keeping or other parts of the VM system
	 * from even knowing that this hack exists.
	 */

	mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, MTX_DEF);
	for (i = 0; i < 2; i++) {
		struct lpte pt;
		uint64_t vsid;
		int pteidx, ptegidx;

		moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE;
		virtual_end -= PAGE_SIZE;

		LOCK_TABLE();

		vsid = va_to_vsid(kernel_pmap, moea64_scratchpage_va[i]);
		moea64_pte_create(&pt, vsid, moea64_scratchpage_va[i],
		    LPTE_NOEXEC);
		pt.pte_hi |= LPTE_LOCKED;

		ptegidx = va_to_pteg(vsid, moea64_scratchpage_va[i]);
		pteidx = moea64_pte_insert(ptegidx, &pt);
		if (pt.pte_hi & LPTE_HID)
			ptegidx ^= moea64_pteg_mask;

		moea64_scratchpage_pte[i] =
		    &moea64_pteg_table[ptegidx].pt[pteidx];

		UNLOCK_TABLE();
	}

	/*
	 * Allocate a kernel stack with a guard page for thread0 and map it
	 * into the kernel page map.
	 */
	pa = moea64_bootstrap_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE);
	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
	virtual_avail = va + KSTACK_PAGES * PAGE_SIZE;
	CTR2(KTR_PMAP, "moea_bootstrap: kstack0 at %#x (%#x)", pa, va);
	thread0.td_kstack = va;
	thread0.td_kstack_pages = KSTACK_PAGES;
	for (i = 0; i < KSTACK_PAGES; i++) {
		moea64_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}

	/*
	 * Allocate virtual address space for the message buffer.
	 */
	pa = msgbuf_phys = moea64_bootstrap_alloc(MSGBUF_SIZE, PAGE_SIZE);
	msgbufp = (struct msgbuf *)virtual_avail;
	va = virtual_avail;
	virtual_avail += round_page(MSGBUF_SIZE);
	while (va < virtual_avail) {
		moea64_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}

	/*
	 * Allocate virtual address space for the dynamic percpu area.
	 */
	pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
	dpcpu = (void *)virtual_avail;
	virtual_avail += DPCPU_SIZE;
	while (va < virtual_avail) {
		moea64_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}
	dpcpu_init(dpcpu, 0);
}

/*
 * Activate a user pmap.  The pmap must be activated before its address
 * space can be accessed in any way.
 */
void
moea64_activate(mmu_t mmu, struct thread *td)
{
	pmap_t	pm, pmr;

	/*
	 * Load all the data we need up front to encourage the compiler to
	 * not issue any loads while we have interrupts disabled below.
	 */
	pm = &td->td_proc->p_vmspace->vm_pmap;
	pmr = pm->pmap_phys;

	pm->pm_active |= PCPU_GET(cpumask);
	PCPU_SET(curpmap, pmr);
}

void
moea64_deactivate(mmu_t mmu, struct thread *td)
{
	pmap_t	pm;

	pm = &td->td_proc->p_vmspace->vm_pmap;
	pm->pm_active &= ~(PCPU_GET(cpumask));
	PCPU_SET(curpmap, NULL);
}

void
moea64_change_wiring(mmu_t mmu, pmap_t pm, vm_offset_t va, boolean_t wired)
{
	struct	pvo_entry *pvo;

	PMAP_LOCK(pm);
	pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF, NULL);

	if (pvo != NULL) {
		if (wired) {
			if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
				pm->pm_stats.wired_count++;
			pvo->pvo_vaddr |= PVO_WIRED;
		} else {
			if ((pvo->pvo_vaddr & PVO_WIRED) != 0)
				pm->pm_stats.wired_count--;
			pvo->pvo_vaddr &= ~PVO_WIRED;
		}
	}
	PMAP_UNLOCK(pm);
}

/*
 * This goes through and sets the physical address of our
 * special scratch PTE to the PA we want to zero or copy. Because
 * of locking issues (this can get called in pvo_enter() by
 * the UMA allocator), we can't use most other utility functions here.
 */

static __inline
void moea64_set_scratchpage_pa(int which, vm_offset_t pa) {

	mtx_assert(&moea64_scratchpage_mtx, MA_OWNED);

	moea64_scratchpage_pte[which]->pte_hi &= ~LPTE_VALID;
	TLBIE(kernel_pmap, moea64_scratchpage_va[which]);

	moea64_scratchpage_pte[which]->pte_lo &=
	    ~(LPTE_WIMG | LPTE_RPGN);
	moea64_scratchpage_pte[which]->pte_lo |=
	    moea64_calc_wimg(pa) | (uint64_t)pa;
	EIEIO();

	moea64_scratchpage_pte[which]->pte_hi |= LPTE_VALID;
	PTESYNC(); isync();
}
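
/*
 * Added note: callers (moea64_copy_page(), moea64_zero_page*() and
 * moea64_syncicache() below) hold moea64_scratchpage_mtx, point one of the
 * two scratch slots at the target physical page with the routine above, and
 * then operate on it through moea64_scratchpage_va[].
 */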

void
moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst)
{
	vm_offset_t	dst;
	vm_offset_t	src;

	dst = VM_PAGE_TO_PHYS(mdst);
	src = VM_PAGE_TO_PHYS(msrc);

	mtx_lock(&moea64_scratchpage_mtx);

	moea64_set_scratchpage_pa(0,src);
	moea64_set_scratchpage_pa(1,dst);

	kcopy((void *)moea64_scratchpage_va[0],
	    (void *)moea64_scratchpage_va[1], PAGE_SIZE);

	mtx_unlock(&moea64_scratchpage_mtx);
}

void
moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size)
{
	vm_offset_t pa = VM_PAGE_TO_PHYS(m);

	if (!moea64_initialized)
		panic("moea64_zero_page_area: can't zero pa %#x", pa);
	if (size + off > PAGE_SIZE)
		panic("moea64_zero_page_area: size + off > PAGE_SIZE");

	mtx_lock(&moea64_scratchpage_mtx);

	moea64_set_scratchpage_pa(0,pa);
	bzero((caddr_t)moea64_scratchpage_va[0] + off, size);
	mtx_unlock(&moea64_scratchpage_mtx);
}

/*
 * Zero a page of physical memory by temporarily mapping it
 */
void
moea64_zero_page(mmu_t mmu, vm_page_t m)
{
	vm_offset_t pa = VM_PAGE_TO_PHYS(m);
	vm_offset_t off;

	if (!moea64_initialized)
		panic("moea64_zero_page: can't zero pa %#x", pa);

	mtx_lock(&moea64_scratchpage_mtx);

	moea64_set_scratchpage_pa(0,pa);
	for (off = 0; off < PAGE_SIZE; off += cacheline_size)
		__asm __volatile("dcbz 0,%0" ::
		    "r"(moea64_scratchpage_va[0] + off));
	mtx_unlock(&moea64_scratchpage_mtx);
}

void
moea64_zero_page_idle(mmu_t mmu, vm_page_t m)
{

	moea64_zero_page(mmu, m);
}

/*
 * Map the given physical page at the specified virtual address in the
 * target pmap with the protection requested.  If specified the page
 * will be wired down.
 */
void
moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, boolean_t wired)
{

	vm_page_lock_queues();
	PMAP_LOCK(pmap);
	moea64_enter_locked(pmap, va, m, prot, wired);
	vm_page_unlock_queues();
	PMAP_UNLOCK(pmap);
}

/*
 * Map the given physical page at the specified virtual address in the
 * target pmap with the protection requested.  If specified the page
 * will be wired down.
 *
 * The page queues and pmap must be locked.
 */

static void
moea64_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    boolean_t wired)
{
	struct		pvo_head *pvo_head;
	uma_zone_t	zone;
	vm_page_t	pg;
	uint64_t	pte_lo;
	u_int		pvo_flags;
	int		error;

	if (!moea64_initialized) {
		pvo_head = &moea64_pvo_kunmanaged;
		pg = NULL;
		zone = moea64_upvo_zone;
		pvo_flags = 0;
	} else {
		pvo_head = vm_page_to_pvoh(m);
		pg = m;
		zone = moea64_mpvo_zone;
		pvo_flags = PVO_MANAGED;
	}

	if (pmap_bootstrapped)
		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((m->oflags & VPO_BUSY) != 0 || VM_OBJECT_LOCKED(m->object),
	    ("moea64_enter_locked: page %p is not busy", m));

	/* XXX change the pvo head for fake pages */
	if ((m->flags & PG_FICTITIOUS) == PG_FICTITIOUS) {
		pvo_flags &= ~PVO_MANAGED;
		pvo_head = &moea64_pvo_kunmanaged;
		zone = moea64_upvo_zone;
	}

	pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m));

	if (prot & VM_PROT_WRITE) {
		pte_lo |= LPTE_BW;
		if (pmap_bootstrapped)
			vm_page_flag_set(m, PG_WRITEABLE);
	} else
		pte_lo |= LPTE_BR;

	if (prot & VM_PROT_EXECUTE)
		pvo_flags |= VM_PROT_EXECUTE;

	if (wired)
		pvo_flags |= PVO_WIRED;

	if ((m->flags & PG_FICTITIOUS) != 0)
		pvo_flags |= PVO_FAKE;

	error = moea64_pvo_enter(pmap, zone, pvo_head, va, VM_PAGE_TO_PHYS(m),
	    pte_lo, pvo_flags);

	/*
	 * Flush the page from the instruction cache if this page is
	 * mapped executable and cacheable.
	 */
	if ((pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
		moea64_syncicache(pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE);
	}
}

static void
moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_size_t sz)
{

	/*
	 * This is much trickier than on older systems because
	 * we can't sync the icache on physical addresses directly
	 * without a direct map. Instead we check a couple of cases
	 * where the memory is already mapped in and, failing that,
	 * use the same trick we use for page zeroing to create
	 * a temporary mapping for this physical address.
	 */

	if (!pmap_bootstrapped) {
		/*
		 * If PMAP is not bootstrapped, we are likely to be
		 * in real mode.
		 */
		__syncicache((void *)pa, sz);
	} else if (pmap == kernel_pmap) {
		__syncicache((void *)va, sz);
	} else {
		/* Use the scratch page to set up a temp mapping */

		mtx_lock(&moea64_scratchpage_mtx);

		moea64_set_scratchpage_pa(1,pa & ~ADDR_POFF);
		__syncicache((void *)(moea64_scratchpage_va[1] +
		    (va & ADDR_POFF)), sz);

		mtx_unlock(&moea64_scratchpage_mtx);
	}
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.  Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
 */
void
moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	vm_page_t m;
	vm_pindex_t diff, psize;

	psize = atop(end - start);
	m = m_start;
	PMAP_LOCK(pm);
	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
		moea64_enter_locked(pm, start + ptoa(diff), m, prot &
		    (VM_PROT_READ | VM_PROT_EXECUTE), FALSE);
		m = TAILQ_NEXT(m, listq);
	}
	PMAP_UNLOCK(pm);
}

void
moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m,
    vm_prot_t prot)
{

	vm_page_lock_queues();
	PMAP_LOCK(pm);
	moea64_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
	    FALSE);
	vm_page_unlock_queues();
	PMAP_UNLOCK(pm);
}

vm_paddr_t
moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va)
{
	struct	pvo_entry *pvo;
	vm_paddr_t pa;

	PMAP_LOCK(pm);
	pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
	if (pvo == NULL)
		pa = 0;
	else
		pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | (va & ADDR_POFF);
	PMAP_UNLOCK(pm);
	return (pa);
}

/*
 * Atomically extract and hold the physical page with the given
 * pmap and virtual address pair if that mapping permits the given
 * protection.
 */
vm_page_t
moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	struct	pvo_entry *pvo;
	vm_page_t m;
        vm_paddr_t pa;

	m = NULL;
	pa = 0;
	PMAP_LOCK(pmap);
retry:
	pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF, NULL);
	if (pvo != NULL && (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) &&
	    ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) == LPTE_RW ||
	     (prot & VM_PROT_WRITE) == 0)) {
		if (vm_page_pa_tryrelock(pmap,
			pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, &pa))
			goto retry;
		m = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN);
		vm_page_hold(m);
	}
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}

static void *
moea64_uma_page_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{
	/*
	 * This entire routine is a horrible hack to avoid bothering kmem
	 * for new KVA addresses. Because this can get called from inside
	 * kmem allocation routines, calling kmem for a new address here
	 * can lead to multiply locking non-recursive mutexes.
	 */
	static vm_pindex_t color;
        vm_offset_t va;

        vm_page_t m;
        int pflags, needed_lock;

	*flags = UMA_SLAB_PRIV;
	needed_lock = !PMAP_LOCKED(kernel_pmap);

	if (needed_lock)
		PMAP_LOCK(kernel_pmap);

        if ((wait & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
                pflags = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED;
        else
                pflags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED;
        if (wait & M_ZERO)
                pflags |= VM_ALLOC_ZERO;

        for (;;) {
                m = vm_page_alloc(NULL, color++, pflags | VM_ALLOC_NOOBJ);
                if (m == NULL) {
                        if (wait & M_NOWAIT)
                                return (NULL);
                        VM_WAIT;
                } else
                        break;
        }

	va = VM_PAGE_TO_PHYS(m);

	moea64_pvo_enter(kernel_pmap, moea64_upvo_zone,
	    &moea64_pvo_kunmanaged, va, VM_PAGE_TO_PHYS(m), LPTE_M,
	    PVO_WIRED | PVO_BOOTSTRAP);

	if (needed_lock)
		PMAP_UNLOCK(kernel_pmap);

	if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0)
                bzero((void *)va, PAGE_SIZE);

	return (void *)va;
}

void
moea64_init(mmu_t mmu)
{

	CTR0(KTR_PMAP, "moea64_init");

	moea64_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	moea64_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);

	if (!hw_direct_map) {
		uma_zone_set_allocf(moea64_upvo_zone,moea64_uma_page_alloc);
		uma_zone_set_allocf(moea64_mpvo_zone,moea64_uma_page_alloc);
	}

	moea64_initialized = TRUE;
}

boolean_t
moea64_is_referenced(mmu_t mmu, vm_page_t m)
{

	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
		return (FALSE);
	return (moea64_query_bit(m, LPTE_REF));
1483}
1484
1485boolean_t
1486moea64_is_modified(mmu_t mmu, vm_page_t m)
1487{
1488
1489	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
1490		return (FALSE);
1491
1492	return (moea64_query_bit(m, LPTE_CHG));
1493}
1494
1495void
1496moea64_clear_reference(mmu_t mmu, vm_page_t m)
1497{
1498
1499	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
1500		return;
1501	moea64_clear_bit(m, LPTE_REF, NULL);
1502}
1503
1504void
1505moea64_clear_modify(mmu_t mmu, vm_page_t m)
1506{
1507
1508	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
1509		return;
1510	moea64_clear_bit(m, LPTE_CHG, NULL);
1511}
1512
1513/*
1514 * Clear the write and modified bits in each of the given page's mappings.
1515 */
1516void
1517moea64_remove_write(mmu_t mmu, vm_page_t m)
1518{
1519	struct	pvo_entry *pvo;
1520	struct	lpte *pt;
1521	pmap_t	pmap;
1522	uint64_t lo;
1523
1524	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
1525	    ("moea64_remove_write: page %p is not managed", m));
1526
1527	/*
1528	 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be set by
1529	 * another thread while the object is locked.  Thus, if PG_WRITEABLE
1530	 * is clear, no page table entries need updating.
1531	 */
1532	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1533	if ((m->oflags & VPO_BUSY) == 0 &&
1534	    (m->flags & PG_WRITEABLE) == 0)
1535		return;
1536	vm_page_lock_queues();
1537	lo = moea64_attr_fetch(m);
1538	SYNC();
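	/*
	 * Downgrade every writable mapping of the page to read-only
	 * (LPTE_BR), accumulating the change bits so the page can be
	 * marked dirty afterwards if any mapping had modified it.
	 */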
1539	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1540		pmap = pvo->pvo_pmap;
1541		PMAP_LOCK(pmap);
1542		LOCK_TABLE();
1543		if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) {
1544			pt = moea64_pvo_to_pte(pvo, -1);
1545			pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP;
1546			pvo->pvo_pte.lpte.pte_lo |= LPTE_BR;
1547			if (pt != NULL) {
1548				moea64_pte_synch(pt, &pvo->pvo_pte.lpte);
1549				lo |= pvo->pvo_pte.lpte.pte_lo;
1550				pvo->pvo_pte.lpte.pte_lo &= ~LPTE_CHG;
1551				moea64_pte_change(pt, &pvo->pvo_pte.lpte,
1552				    pvo->pvo_pmap, PVO_VADDR(pvo));
1553			}
1554		}
1555		UNLOCK_TABLE();
1556		PMAP_UNLOCK(pmap);
1557	}
1558	if ((lo & LPTE_CHG) != 0) {
1559		moea64_attr_clear(m, LPTE_CHG);
1560		vm_page_dirty(m);
1561	}
1562	vm_page_flag_clear(m, PG_WRITEABLE);
1563	vm_page_unlock_queues();
1564}
1565
1566/*
1567 *	moea64_ts_referenced:
1568 *
1569 *	Return a count of reference bits for a page, clearing those bits.
1570 *	It is not necessary for every reference bit to be cleared, but it
1571 *	is necessary that 0 only be returned when there are truly no
1572 *	reference bits set.
1573 *
1574 *	XXX: The exact number of bits to check and clear is a matter that
1575 *	should be tested and standardized at some point in the future for
1576 *	optimal aging of shared pages.
1577 */
1578boolean_t
1579moea64_ts_referenced(mmu_t mmu, vm_page_t m)
1580{
1581	int count;
1582
1583	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
1584		return (0);
1585
1586	count = moea64_clear_bit(m, LPTE_REF, NULL);
1587
1588	return (count);
1589}
1590
1591/*
1592 * Map a wired page into kernel virtual address space.
1593 */
1594void
1595moea64_kenter(mmu_t mmu, vm_offset_t va, vm_offset_t pa)
1596{
1597	uint64_t	pte_lo;
1598	int		error;
1599
1600#if 0
1601	if (!pmap_bootstrapped) {
1602		if (va >= VM_MIN_KERNEL_ADDRESS && va < virtual_end)
1603			panic("Trying to enter an address in KVA -- %#x!\n",pa);
1604	}
1605#endif
1606
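	/*
	 * Pick up the caching/coherency (WIMG) attributes for this
	 * physical address; addresses outside physical memory are
	 * presumably treated as device memory (cache-inhibited and
	 * guarded).
	 */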
1607	pte_lo = moea64_calc_wimg(pa);
1608
1609	PMAP_LOCK(kernel_pmap);
1610	error = moea64_pvo_enter(kernel_pmap, moea64_upvo_zone,
1611	    &moea64_pvo_kunmanaged, va, pa, pte_lo,
1612	    PVO_WIRED | VM_PROT_EXECUTE);
1613
1614	if (error != 0 && error != ENOENT)
1615		panic("moea64_kenter: failed to enter va %#zx pa %#zx: %d", va,
1616		    pa, error);
1617
1618	/*
1619	 * Flush the memory from the instruction cache.
1620	 */
1621	if ((pte_lo & (LPTE_I | LPTE_G)) == 0) {
1622		__syncicache((void *)va, PAGE_SIZE);
1623	}
1624	PMAP_UNLOCK(kernel_pmap);
1625}
1626
1627/*
1628 * Extract the physical page address associated with the given kernel virtual
1629 * address.
1630 */
1631vm_offset_t
1632moea64_kextract(mmu_t mmu, vm_offset_t va)
1633{
1634	struct		pvo_entry *pvo;
1635	vm_paddr_t pa;
1636
1637	/*
1638	 * Shortcut the direct-mapped case when applicable.  We never put
1639	 * anything but 1:1 mappings below VM_MIN_KERNEL_ADDRESS.
1640	 */
1641	if (va < VM_MIN_KERNEL_ADDRESS)
1642		return (va);
1643
1644	PMAP_LOCK(kernel_pmap);
1645	pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL);
1646	KASSERT(pvo != NULL, ("moea64_kextract: no addr found"));
1647	pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | (va & ADDR_POFF);
1648	PMAP_UNLOCK(kernel_pmap);
1649	return (pa);
1650}
1651
1652/*
1653 * Remove a wired page from kernel virtual address space.
1654 */
1655void
1656moea64_kremove(mmu_t mmu, vm_offset_t va)
1657{
1658	moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE);
1659}
1660
1661/*
1662 * Map a range of physical addresses into kernel virtual address space.
1663 *
1664 * The value passed in *virt is a suggested virtual address for the mapping.
1665 * Architectures which can support a direct-mapped physical to virtual region
1666 * can return the appropriate address within that region, leaving '*virt'
1667 * unchanged.  We cannot and therefore do not; *virt is updated with the
1668 * first usable address after the mapped region.
1669 */
1670vm_offset_t
1671moea64_map(mmu_t mmu, vm_offset_t *virt, vm_offset_t pa_start,
1672    vm_offset_t pa_end, int prot)
1673{
1674	vm_offset_t	sva, va;
1675
1676	sva = *virt;
1677	va = sva;
1678	for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
1679		moea64_kenter(mmu, va, pa_start);
1680	*virt = va;
1681
1682	return (sva);
1683}
1684
1685/*
1686 * Returns true if the pmap's pv is one of the first
1687 * 16 pvs linked to from this page.  This count may
1688 * be changed upwards or downwards in the future; it
1689 * is only necessary that true be returned for a small
1690 * subset of pmaps for proper page aging.
1691 */
1692boolean_t
1693moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m)
1694{
1695	int loops;
1696	struct pvo_entry *pvo;
1697
1698	if (!moea64_initialized || (m->flags & PG_FICTITIOUS))
1699		return (FALSE);
1700
1701	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1702
1703	loops = 0;
1704	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1705		if (pvo->pvo_pmap == pmap)
1706			return (TRUE);
1707		if (++loops >= 16)
1708			break;
1709	}
1710
1711	return (FALSE);
1712}
1713
1714/*
1715 * Return the number of managed mappings to the given physical page
1716 * that are wired.
1717 */
1718int
1719moea64_page_wired_mappings(mmu_t mmu, vm_page_t m)
1720{
1721	struct pvo_entry *pvo;
1722	int count;
1723
1724	count = 0;
1725	if (!moea64_initialized || (m->flags & PG_FICTITIOUS) != 0)
1726		return (count);
1727	vm_page_lock_queues();
1728	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
1729		if ((pvo->pvo_vaddr & PVO_WIRED) != 0)
1730			count++;
1731	vm_page_unlock_queues();
1732	return (count);
1733}
1734
1735static u_int	moea64_vsidcontext;
1736
1737void
1738moea64_pinit(mmu_t mmu, pmap_t pmap)
1739{
1740	int	i, mask;
1741	u_int	entropy;
1742
1743	PMAP_LOCK_INIT(pmap);
1744
1745	entropy = 0;
1746	__asm __volatile("mftb %0" : "=r"(entropy));
1747
1748	if (pmap_bootstrapped)
1749		pmap->pmap_phys = (pmap_t)moea64_kextract(mmu, (vm_offset_t)pmap);
1750	else
1751		pmap->pmap_phys = pmap;
1752
1753	/*
1754	 * Allocate some segment registers for this pmap.
1755	 */
1756	for (i = 0; i < NPMAPS; i += VSID_NBPW) {
1757		u_int	hash, n;
1758
1759		/*
1760		 * Create a new value by multiplying by a prime and adding in
1761		 * entropy from the timebase register.  This is to make the
1762		 * VSID more random so that the PT hash function collides
1763		 * less often.  (Note that the prime causes gcc to do shifts
1764		 * instead of a multiply.)
1765		 */
1766		moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy;
1767		hash = moea64_vsidcontext & (NPMAPS - 1);
1768		if (hash == 0)		/* 0 is special, avoid it */
1769			continue;
1770		n = hash >> 5;
1771		mask = 1 << (hash & (VSID_NBPW - 1));
1772		hash = (moea64_vsidcontext & 0xfffff);
1773		if (moea64_vsid_bitmap[n] & mask) {	/* collision? */
1774			/* anything free in this bucket? */
1775			if (moea64_vsid_bitmap[n] == 0xffffffff) {
1776				entropy = (moea64_vsidcontext >> 20);
1777				continue;
1778			}
1779			i = ffs(~moea64_vsid_bitmap[n]) - 1;
1780			mask = 1 << i;
1781			hash &= 0xfffff & ~(VSID_NBPW - 1);
1782			hash |= i;
1783		}
1784		moea64_vsid_bitmap[n] |= mask;
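		/*
		 * The VSID is now reserved in the bitmap; derive one VSID
		 * per 256MB segment from it for this pmap's 16 segment
		 * registers.
		 */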
1785		for (i = 0; i < 16; i++) {
1786			pmap->pm_sr[i] = VSID_MAKE(i, hash);
1787		}
1788		return;
1789	}
1790
1791	panic("moea64_pinit: out of segments");
1792}
1793
1794/*
1795 * Initialize the pmap associated with process 0.
1796 */
1797void
1798moea64_pinit0(mmu_t mmu, pmap_t pm)
1799{
1800	moea64_pinit(mmu, pm);
1801	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
1802}
1803
1804/*
1805 * Set the physical protection on the specified range of this map as requested.
1806 */
1807void
1808moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva,
1809    vm_prot_t prot)
1810{
1811	struct	pvo_entry *pvo;
1812	struct	lpte *pt;
1813	int	pteidx;
1814
1815	CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva,
1816	    eva, prot);
1817
1818
1819	KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
1820	    ("moea64_protect: non current pmap"));
1821
1822	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1823		moea64_remove(mmu, pm, sva, eva);
1824		return;
1825	}
1826
1827	vm_page_lock_queues();
1828	PMAP_LOCK(pm);
1829	for (; sva < eva; sva += PAGE_SIZE) {
1830		pvo = moea64_pvo_find_va(pm, sva, &pteidx);
1831		if (pvo == NULL)
1832			continue;
1833
1834		/*
1835		 * Grab the PTE pointer before we diddle with the cached PTE
1836		 * copy.
1837		 */
1838		LOCK_TABLE();
1839		pt = moea64_pvo_to_pte(pvo, pteidx);
1840
1841		/*
1842		 * Change the protection of the page.
1843		 */
1844		pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP;
1845		pvo->pvo_pte.lpte.pte_lo |= LPTE_BR;
1846		pvo->pvo_pte.lpte.pte_lo &= ~LPTE_NOEXEC;
1847		if ((prot & VM_PROT_EXECUTE) == 0)
1848			pvo->pvo_pte.lpte.pte_lo |= LPTE_NOEXEC;
1849
1850		/*
1851		 * If the PVO is in the page table, update that pte as well.
1852		 */
1853		if (pt != NULL) {
1854			moea64_pte_change(pt, &pvo->pvo_pte.lpte,
1855			    pvo->pvo_pmap, PVO_VADDR(pvo));
1856			if ((pvo->pvo_pte.lpte.pte_lo &
1857			    (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
1858				moea64_syncicache(pm, sva,
1859				    pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN,
1860				    PAGE_SIZE);
1861			}
1862		}
1863		UNLOCK_TABLE();
1864	}
1865	vm_page_unlock_queues();
1866	PMAP_UNLOCK(pm);
1867}
1868
1869/*
1870 * Map a list of wired pages into kernel virtual address space.  This is
1871 * intended for temporary mappings which do not need page modification or
1872 * references recorded.  Existing mappings in the region are overwritten.
1873 */
1874void
1875moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count)
1876{
1877	while (count-- > 0) {
1878		moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m));
1879		va += PAGE_SIZE;
1880		m++;
1881	}
1882}
1883
1884/*
1885 * Remove page mappings from kernel virtual address space.  Intended for
1886 * temporary mappings entered by moea64_qenter.
1887 */
1888void
1889moea64_qremove(mmu_t mmu, vm_offset_t va, int count)
1890{
1891	while (count-- > 0) {
1892		moea64_kremove(mmu, va);
1893		va += PAGE_SIZE;
1894	}
1895}
1896
1897void
1898moea64_release(mmu_t mmu, pmap_t pmap)
1899{
1900	int idx, mask;
1901
1902	/*
1903	 * Free the VSID that backs this pmap's segment registers.
1904	 */
1905	if (pmap->pm_sr[0] == 0)
1906		panic("moea64_release");
1907
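	/*
	 * Recover the bitmap word (idx) and bit (mask) that were set for
	 * this pmap's VSID in moea64_pinit() and clear them so the VSID
	 * can be reused.
	 */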
1908	idx = VSID_TO_HASH(pmap->pm_sr[0]) & (NPMAPS - 1);
1909	mask = 1 << (idx % VSID_NBPW);
1910	idx /= VSID_NBPW;
1911	moea64_vsid_bitmap[idx] &= ~mask;
1912	PMAP_LOCK_DESTROY(pmap);
1913}
1914
1915/*
1916 * Remove the given range of addresses from the specified map.
1917 */
1918void
1919moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1920{
1921	struct	pvo_entry *pvo;
1922	int	pteidx;
1923
1924	vm_page_lock_queues();
1925	PMAP_LOCK(pm);
1926	for (; sva < eva; sva += PAGE_SIZE) {
1927		pvo = moea64_pvo_find_va(pm, sva, &pteidx);
1928		if (pvo != NULL) {
1929			moea64_pvo_remove(pvo, pteidx);
1930		}
1931	}
1932	vm_page_unlock_queues();
1933	PMAP_UNLOCK(pm);
1934}
1935
1936/*
1937 * Remove physical page from all pmaps in which it resides. moea64_pvo_remove()
1938 * will reflect changes in pte's back to the vm_page.
1939 */
1940void
1941moea64_remove_all(mmu_t mmu, vm_page_t m)
1942{
1943	struct  pvo_head *pvo_head;
1944	struct	pvo_entry *pvo, *next_pvo;
1945	pmap_t	pmap;
1946
1947	vm_page_lock_queues();
1948	pvo_head = vm_page_to_pvoh(m);
1949	for (pvo = LIST_FIRST(pvo_head); pvo != NULL; pvo = next_pvo) {
1950		next_pvo = LIST_NEXT(pvo, pvo_vlink);
1951
1952		MOEA_PVO_CHECK(pvo);	/* sanity check */
1953		pmap = pvo->pvo_pmap;
1954		PMAP_LOCK(pmap);
1955		moea64_pvo_remove(pvo, -1);
1956		PMAP_UNLOCK(pmap);
1957	}
1958	if ((m->flags & PG_WRITEABLE) && moea64_is_modified(mmu, m)) {
1959		moea64_attr_clear(m, LPTE_CHG);
1960		vm_page_dirty(m);
1961	}
1962	vm_page_flag_clear(m, PG_WRITEABLE);
1963	vm_page_unlock_queues();
1964}
1965
1966/*
1967 * Allocate a physical page of memory directly from the phys_avail map.
1968 * Can only be called from moea64_bootstrap before avail start and end are
1969 * calculated.
1970 */
1971static vm_offset_t
1972moea64_bootstrap_alloc(vm_size_t size, u_int align)
1973{
1974	vm_offset_t	s, e;
1975	int		i, j;
1976
1977	size = round_page(size);
1978	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1979		if (align != 0)
1980			s = (phys_avail[i] + align - 1) & ~(align - 1);
1981		else
1982			s = phys_avail[i];
1983		e = s + size;
1984
1985		if (s < phys_avail[i] || e > phys_avail[i + 1])
1986			continue;
1987
1988		if (s == phys_avail[i]) {
1989			phys_avail[i] += size;
1990		} else if (e == phys_avail[i + 1]) {
1991			phys_avail[i + 1] -= size;
1992		} else {
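			/*
			 * The allocation punches a hole in the middle of
			 * this region, so split it in two and shift the
			 * remaining phys_avail entries up to make room.
			 */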
1993			for (j = phys_avail_count * 2; j > i; j -= 2) {
1994				phys_avail[j] = phys_avail[j - 2];
1995				phys_avail[j + 1] = phys_avail[j - 1];
1996			}
1997
1998			phys_avail[i + 3] = phys_avail[i + 1];
1999			phys_avail[i + 1] = s;
2000			phys_avail[i + 2] = e;
2001			phys_avail_count++;
2002		}
2003
2004		return (s);
2005	}
2006	panic("moea64_bootstrap_alloc: could not allocate memory");
2007}
2008
2009static void
2010tlbia(void)
2011{
2012	vm_offset_t i;
2013	register_t msr, scratch;
2014
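	/*
	 * Step through the effective-address bits that select TLB
	 * congruence classes, invalidating each with tlbiel.  The MSR SF
	 * (64-bit mode) bit is forced on via insrdi around the
	 * invalidation, presumably because tlbiel requires it here.
	 */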
2015	for (i = 0; i < 0xFF000; i += 0x00001000) {
2016		__asm __volatile("\
2017		    mfmsr %0; \
2018		    mr %1, %0; \
2019		    insrdi %1,%3,1,0; \
2020		    mtmsrd %1; \
2021		    ptesync; \
2022		    \
2023		    tlbiel %2; \
2024		    \
2025		    mtmsrd %0; \
2026		    eieio; \
2027		    tlbsync; \
2028		    ptesync;"
2029		: "=r"(msr), "=r"(scratch) : "r"(i), "r"(1));
2030	}
2031}
2032
2033static int
2034moea64_pvo_enter(pmap_t pm, uma_zone_t zone, struct pvo_head *pvo_head,
2035    vm_offset_t va, vm_offset_t pa, uint64_t pte_lo, int flags)
2036{
2037	struct	 pvo_entry *pvo;
2038	uint64_t vsid;
2039	int	 first;
2040	u_int	 ptegidx;
2041	int	 i;
2042	int      bootstrap;
2043
2044	/*
2045	 * One nasty thing that can happen here is that the UMA calls to
2046	 * allocate new PVOs need to map more memory, which calls pvo_enter(),
2047	 * which calls UMA...
2048	 *
2049	 * We break the loop by detecting recursion and allocating out of
2050	 * the bootstrap pool.
2051	 */
2052
2053	moea64_pvo_enter_calls++;
2054	first = 0;
2055	bootstrap = (flags & PVO_BOOTSTRAP);
2056
2057	if (!moea64_initialized)
2058		bootstrap = 1;
2059
2060	/*
2061	 * Compute the PTE Group index.
2062	 */
2063	va &= ~ADDR_POFF;
2064	vsid = va_to_vsid(pm, va);
2065	ptegidx = va_to_pteg(vsid, va);
2066
2067	/*
2068	 * Remove any existing mapping for this page.  Reuse the pvo entry if
2069	 * there is a mapping.
2070	 */
2071	LOCK_TABLE();
2072
2073	LIST_FOREACH(pvo, &moea64_pvo_table[ptegidx], pvo_olink) {
2074		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
2075			if ((pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) == pa &&
2076			    (pvo->pvo_pte.lpte.pte_lo & LPTE_PP) ==
2077			    (pte_lo & LPTE_PP)) {
2078				UNLOCK_TABLE();
2079				return (0);
2080			}
2081			moea64_pvo_remove(pvo, -1);
2082			break;
2083		}
2084	}
2085
2086	/*
2087	 * If we aren't overwriting a mapping, try to allocate.
2088	 */
2089	if (bootstrap) {
2090		if (moea64_bpvo_pool_index >= BPVO_POOL_SIZE) {
2091			panic("moea64_enter: bpvo pool exhausted, %d, %d, %d",
2092			      moea64_bpvo_pool_index, BPVO_POOL_SIZE,
2093			      BPVO_POOL_SIZE * sizeof(struct pvo_entry));
2094		}
2095		pvo = &moea64_bpvo_pool[moea64_bpvo_pool_index];
2096		moea64_bpvo_pool_index++;
2097		bootstrap = 1;
2098	} else {
2099		/*
2100		 * Note: drop the table lock around the UMA allocation in
2101		 * case the UMA allocator needs to manipulate the page
2102		 * table. The mapping we are working with is already
2103		 * protected by the PMAP lock.
2104		 */
2105		UNLOCK_TABLE();
2106		pvo = uma_zalloc(zone, M_NOWAIT);
2107		LOCK_TABLE();
2108	}
2109
2110	if (pvo == NULL) {
2111		UNLOCK_TABLE();
2112		return (ENOMEM);
2113	}
2114
2115	moea64_pvo_entries++;
2116	pvo->pvo_vaddr = va;
2117	pvo->pvo_pmap = pm;
2118	LIST_INSERT_HEAD(&moea64_pvo_table[ptegidx], pvo, pvo_olink);
2119	pvo->pvo_vaddr &= ~ADDR_POFF;
2120
2121	if (!(flags & VM_PROT_EXECUTE))
2122		pte_lo |= LPTE_NOEXEC;
2123	if (flags & PVO_WIRED)
2124		pvo->pvo_vaddr |= PVO_WIRED;
2125	if (pvo_head != &moea64_pvo_kunmanaged)
2126		pvo->pvo_vaddr |= PVO_MANAGED;
2127	if (bootstrap)
2128		pvo->pvo_vaddr |= PVO_BOOTSTRAP;
2129	if (flags & PVO_FAKE)
2130		pvo->pvo_vaddr |= PVO_FAKE;
2131
2132	moea64_pte_create(&pvo->pvo_pte.lpte, vsid, va,
2133	    (uint64_t)(pa) | pte_lo);
2134
2135	/*
2136	 * Remember if the list was empty, in which case this pvo will be the
2137	 * first item on it.
2138	 */
2139	if (LIST_FIRST(pvo_head) == NULL)
2140		first = 1;
2141	LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
2142
2143	if (pvo->pvo_vaddr & PVO_WIRED)
2144		pm->pm_stats.wired_count++;
2145	pm->pm_stats.resident_count++;
2146
2147	/*
2148	 * We hope this succeeds, but a failed insertion is currently fatal.
2149	 */
2150	i = moea64_pte_insert(ptegidx, &pvo->pvo_pte.lpte);
2151	if (i >= 0) {
2152		PVO_PTEGIDX_SET(pvo, i);
2153	} else {
2154		panic("moea64_pvo_enter: overflow");
2155		moea64_pte_overflow++;
2156	}
2157
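	/*
	 * For kernel mappings, issue an isync so the new translation is
	 * usable before we return, since the caller may touch the address
	 * immediately.
	 */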
2158	if (pm == kernel_pmap)
2159		isync();
2160
2161	UNLOCK_TABLE();
2162
2163	return (first ? ENOENT : 0);
2164}
2165
2166static void
2167moea64_pvo_remove(struct pvo_entry *pvo, int pteidx)
2168{
2169	struct	lpte *pt;
2170
2171	/*
2172	 * If there is an active pte entry, we need to deactivate it (and
2173	 * save the ref & chg bits).
2174	 */
2175	LOCK_TABLE();
2176	pt = moea64_pvo_to_pte(pvo, pteidx);
2177	if (pt != NULL) {
2178		moea64_pte_unset(pt, &pvo->pvo_pte.lpte, pvo->pvo_pmap,
2179		    PVO_VADDR(pvo));
2180		PVO_PTEGIDX_CLR(pvo);
2181	} else {
2182		moea64_pte_overflow--;
2183	}
2184
2185	/*
2186	 * Update our statistics.
2187	 */
2188	pvo->pvo_pmap->pm_stats.resident_count--;
2189	if (pvo->pvo_vaddr & PVO_WIRED)
2190		pvo->pvo_pmap->pm_stats.wired_count--;
2191
2192	/*
2193	 * Save the REF/CHG bits into their cache if the page is managed.
2194	 */
2195	if ((pvo->pvo_vaddr & (PVO_MANAGED|PVO_FAKE)) == PVO_MANAGED) {
2196		struct	vm_page *pg;
2197
2198		pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN);
2199		if (pg != NULL) {
2200			moea64_attr_save(pg, pvo->pvo_pte.lpte.pte_lo &
2201			    (LPTE_REF | LPTE_CHG));
2202		}
2203	}
2204
2205	/*
2206	 * Remove this PVO from the PV list.
2207	 */
2208	LIST_REMOVE(pvo, pvo_vlink);
2209
2210	/*
2211	 * Remove this from the overflow list and return it to the pool
2212	 * if we aren't going to reuse it.
2213	 */
2214	LIST_REMOVE(pvo, pvo_olink);
2215	UNLOCK_TABLE();
2216
2217	if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
2218		uma_zfree((pvo->pvo_vaddr & PVO_MANAGED) ? moea64_mpvo_zone :
2219		    moea64_upvo_zone, pvo);
2220
2221	moea64_pvo_entries--;
2222	moea64_pvo_remove_calls++;
2223}
2224
2225static __inline int
2226moea64_pvo_pte_index(const struct pvo_entry *pvo, int ptegidx)
2227{
2228
2229	/*
2230	 * We can find the actual pte entry without searching by grabbing
2231	 * the PTEG index from 3 unused bits in pvo_vaddr and by
2232	 * noticing the HID bit.
2233	 */
2234	if (pvo->pvo_pte.lpte.pte_hi & LPTE_HID)
2235		ptegidx ^= moea64_pteg_mask;
2236
2237	return ((ptegidx << 3) | PVO_PTEGIDX_GET(pvo));
2238}
2239
2240static struct pvo_entry *
2241moea64_pvo_find_va(pmap_t pm, vm_offset_t va, int *pteidx_p)
2242{
2243	struct		pvo_entry *pvo;
2244	int		ptegidx;
2245	uint64_t	vsid;
2246
2247	va &= ~ADDR_POFF;
2248	vsid = va_to_vsid(pm, va);
2249	ptegidx = va_to_pteg(vsid, va);
2250
2251	LOCK_TABLE();
2252	LIST_FOREACH(pvo, &moea64_pvo_table[ptegidx], pvo_olink) {
2253		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
2254			if (pteidx_p)
2255				*pteidx_p = moea64_pvo_pte_index(pvo, ptegidx);
2256			break;
2257		}
2258	}
2259	UNLOCK_TABLE();
2260
2261	return (pvo);
2262}
2263
2264static struct lpte *
2265moea64_pvo_to_pte(const struct pvo_entry *pvo, int pteidx)
2266{
2267	struct lpte *pt;
2268
2269	/*
2270	 * If we haven't been supplied the ptegidx, calculate it.
2271	 */
2272	if (pteidx == -1) {
2273		int		ptegidx;
2274		uint64_t	vsid;
2275
2276		vsid = va_to_vsid(pvo->pvo_pmap, PVO_VADDR(pvo));
2277		ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo));
2278		pteidx = moea64_pvo_pte_index(pvo, ptegidx);
2279	}
2280
2281	pt = &moea64_pteg_table[pteidx >> 3].pt[pteidx & 7];
2282
2283	if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) &&
2284	    !PVO_PTEGIDX_ISSET(pvo)) {
2285		panic("moea64_pvo_to_pte: pvo %p has valid pte in pvo but no "
2286		    "valid pte index", pvo);
2287	}
2288
2289	if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0 &&
2290	    PVO_PTEGIDX_ISSET(pvo)) {
2291		panic("moea64_pvo_to_pte: pvo %p has valid pte index in pvo "
2292		    "but no valid pte", pvo);
2293	}
2294
2295	if ((pt->pte_hi ^ (pvo->pvo_pte.lpte.pte_hi & ~LPTE_VALID)) ==
2296	    LPTE_VALID) {
2297		if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0) {
2298			panic("moea64_pvo_to_pte: pvo %p has valid pte in "
2299			    "moea64_pteg_table %p but invalid in pvo", pvo, pt);
2300		}
2301
2302		if (((pt->pte_lo ^ pvo->pvo_pte.lpte.pte_lo) &
2303		    ~(LPTE_M|LPTE_CHG|LPTE_REF)) != 0) {
2304			panic("moea64_pvo_to_pte: pvo %p pte does not match "
2305			    "pte %p in moea64_pteg_table difference is %#x",
2306			    pvo, pt,
2307			    (uint32_t)(pt->pte_lo ^ pvo->pvo_pte.lpte.pte_lo));
2308		}
2309
2310		ASSERT_TABLE_LOCK();
2311		return (pt);
2312	}
2313
2314	if (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) {
2315		panic("moea64_pvo_to_pte: pvo %p has invalid pte %p in "
2316		    "moea64_pteg_table but valid in pvo", pvo, pt);
2317	}
2318
2319	return (NULL);
2320}
2321
2322static int
2323moea64_pte_insert(u_int ptegidx, struct lpte *pvo_pt)
2324{
2325	struct	lpte *pt;
2326	int	i;
2327
2328	ASSERT_TABLE_LOCK();
2329
2330	/*
2331	 * First try primary hash.
2332	 */
2333	for (pt = moea64_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) {
2334		if ((pt->pte_hi & LPTE_VALID) == 0 &&
2335		    (pt->pte_hi & LPTE_LOCKED) == 0) {
2336			pvo_pt->pte_hi &= ~LPTE_HID;
2337			moea64_pte_set(pt, pvo_pt);
2338			return (i);
2339		}
2340	}
2341
2342	/*
2343	 * Now try secondary hash.
2344	 */
2345	ptegidx ^= moea64_pteg_mask;
2346
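	/*
	 * PTEs installed from the secondary hash are tagged with LPTE_HID
	 * so that moea64_pvo_pte_index() knows to flip the PTEG index when
	 * locating them later.
	 */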
2347	for (pt = moea64_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) {
2348		if ((pt->pte_hi & LPTE_VALID) == 0 &&
2349		    (pt->pte_hi & LPTE_LOCKED) == 0) {
2350			pvo_pt->pte_hi |= LPTE_HID;
2351			moea64_pte_set(pt, pvo_pt);
2352			return (i);
2353		}
2354	}
2355
2356	panic("moea64_pte_insert: overflow");
2357	return (-1);
2358}
2359
2360static boolean_t
2361moea64_query_bit(vm_page_t m, u_int64_t ptebit)
2362{
2363	struct	pvo_entry *pvo;
2364	struct	lpte *pt;
2365
2366	if (moea64_attr_fetch(m) & ptebit)
2367		return (TRUE);
2368
2369	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2370
2371	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2372		MOEA_PVO_CHECK(pvo);	/* sanity check */
2373
2374		/*
2375		 * See if we saved the bit off.  If so, cache it and return
2376		 * success.
2377		 */
2378		if (pvo->pvo_pte.lpte.pte_lo & ptebit) {
2379			moea64_attr_save(m, ptebit);
2380			MOEA_PVO_CHECK(pvo);	/* sanity check */
2381			return (TRUE);
2382		}
2383	}
2384
2385	/*
2386	 * No luck, now go through the hard part of looking at the PTEs
2387	 * themselves.  Sync so that any pending REF/CHG bits are flushed to
2388	 * the PTEs.
2389	 */
2390	SYNC();
2391	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2392		MOEA_PVO_CHECK(pvo);	/* sanity check */
2393
2394		/*
2395		 * See if this pvo has a valid PTE.  if so, fetch the
2396		 * REF/CHG bits from the valid PTE.  If the appropriate
2397		 * ptebit is set, cache it and return success.
2398		 */
2399		LOCK_TABLE();
2400		pt = moea64_pvo_to_pte(pvo, -1);
2401		if (pt != NULL) {
2402			moea64_pte_synch(pt, &pvo->pvo_pte.lpte);
2403			if (pvo->pvo_pte.lpte.pte_lo & ptebit) {
2404				UNLOCK_TABLE();
2405
2406				moea64_attr_save(m, ptebit);
2407				MOEA_PVO_CHECK(pvo);	/* sanity check */
2408				return (TRUE);
2409			}
2410		}
2411		UNLOCK_TABLE();
2412	}
2413
2414	return (FALSE);
2415}
2416
2417static u_int
2418moea64_clear_bit(vm_page_t m, u_int64_t ptebit, u_int64_t *origbit)
2419{
2420	u_int	count;
2421	struct	pvo_entry *pvo;
2422	struct	lpte *pt;
2423	uint64_t rv;
2424
2425	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2426
2427	/*
2428	 * Clear the cached value.
2429	 */
2430	rv = moea64_attr_fetch(m);
2431	moea64_attr_clear(m, ptebit);
2432
2433	/*
2434	 * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
2435	 * we can reset the right ones).  Note that since the pvo entries and
2436	 * list heads are only reached through unmanaged kernel mappings, we
2437	 * don't have to worry about further accesses setting the
2438	 * REF/CHG bits.
2439	 */
2440	SYNC();
2441
2442	/*
2443	 * For each pvo entry, clear the pvo's ptebit.  If this pvo has a
2444	 * valid pte clear the ptebit from the valid pte.
2445	 */
2446	count = 0;
2447	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2448		MOEA_PVO_CHECK(pvo);	/* sanity check */
2449
2450		LOCK_TABLE();
2451		pt = moea64_pvo_to_pte(pvo, -1);
2452		if (pt != NULL) {
2453			moea64_pte_synch(pt, &pvo->pvo_pte.lpte);
2454			if (pvo->pvo_pte.lpte.pte_lo & ptebit) {
2455				count++;
2456				moea64_pte_clear(pt, pvo->pvo_pmap, PVO_VADDR(pvo), ptebit);
2457			}
2458		}
2459		rv |= pvo->pvo_pte.lpte.pte_lo;
2460		pvo->pvo_pte.lpte.pte_lo &= ~ptebit;
2461		MOEA_PVO_CHECK(pvo);	/* sanity check */
2462		UNLOCK_TABLE();
2463	}
2464
2465	if (origbit != NULL) {
2466		*origbit = rv;
2467	}
2468
2469	return (count);
2470}
2471
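/*
 * Check whether a range of physical addresses is already mapped 1:1 in
 * the kernel pmap, returning EFAULT (nonzero) if any page in the range
 * is missing or maps elsewhere.
 */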
2472boolean_t
2473moea64_dev_direct_mapped(mmu_t mmu, vm_offset_t pa, vm_size_t size)
2474{
2475	struct pvo_entry *pvo;
2476	vm_offset_t ppa;
2477	int error = 0;
2478
2479	PMAP_LOCK(kernel_pmap);
2480	for (ppa = pa & ~ADDR_POFF; ppa < pa + size; ppa += PAGE_SIZE) {
2481		pvo = moea64_pvo_find_va(kernel_pmap, ppa, NULL);
2482		if (pvo == NULL ||
2483		    (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) != ppa) {
2484			error = EFAULT;
2485			break;
2486		}
2487	}
2488	PMAP_UNLOCK(kernel_pmap);
2489
2490	return (error);
2491}
2492
2493/*
2494 * Map a set of physical memory pages into the kernel virtual
2495 * address space. Return a pointer to where it is mapped. This
2496 * routine is intended to be used for mapping device memory,
2497 * NOT real memory.
2498 */
2499void *
2500moea64_mapdev(mmu_t mmu, vm_offset_t pa, vm_size_t size)
2501{
2502	vm_offset_t va, tmpva, ppa, offset;
2503
2504	ppa = trunc_page(pa);
2505	offset = pa & PAGE_MASK;
2506	size = roundup(offset + size, PAGE_SIZE);
2507
2508	va = kmem_alloc_nofault(kernel_map, size);
2509
2510	if (!va)
2511		panic("moea64_mapdev: Couldn't alloc kernel virtual memory");
2512
2513	for (tmpva = va; size > 0;) {
2514		moea64_kenter(mmu, tmpva, ppa);
2515		size -= PAGE_SIZE;
2516		tmpva += PAGE_SIZE;
2517		ppa += PAGE_SIZE;
2518	}
2519
2520	return ((void *)(va + offset));
2521}
2522
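/*
 * Tear down a mapping created by moea64_mapdev() and release the kernel
 * virtual address range backing it.
 */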
2523void
2524moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size)
2525{
2526	vm_offset_t base, offset;
2527
2528	base = trunc_page(va);
2529	offset = va & PAGE_MASK;
2530	size = roundup(offset + size, PAGE_SIZE);
2531
2532	kmem_free(kernel_map, base, size);
2533}
2534
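/*
 * Synchronize the instruction cache with the data cache for a range of
 * addresses in the given pmap, one page at a time, using the physical
 * address recorded in each PVO.
 */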
2535static void
2536moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz)
2537{
2538	struct pvo_entry *pvo;
2539	vm_offset_t lim;
2540	vm_paddr_t pa;
2541	vm_size_t len;
2542
2543	PMAP_LOCK(pm);
2544	while (sz > 0) {
2545		lim = round_page(va);
2546		len = MIN(lim - va, sz);
2547		pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
2548		if (pvo != NULL) {
2549			pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) |
2550			    (va & ADDR_POFF);
2551			moea64_syncicache(pm, va, pa, len);
2552		}
2553		va += len;
2554		sz -= len;
2555	}
2556	PMAP_UNLOCK(pm);
2557}
2558