mmu_oea.c revision 331722
1/*-
2 * Copyright (c) 2001 The NetBSD Foundation, Inc.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to The NetBSD Foundation
6 * by Matt Thomas <matt@3am-software.com> of Allegro Networks, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29/*-
30 * Copyright (C) 1995, 1996 Wolfgang Solfrank.
31 * Copyright (C) 1995, 1996 TooLs GmbH.
32 * All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 *    notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 *    notice, this list of conditions and the following disclaimer in the
41 *    documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 *    must display the following acknowledgement:
44 *	This product includes software developed by TooLs GmbH.
45 * 4. The name of TooLs GmbH may not be used to endorse or promote products
46 *    derived from this software without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
49 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
50 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
51 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
52 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
53 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
54 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
55 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
56 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
57 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
58 *
59 * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $
60 */
61/*-
62 * Copyright (C) 2001 Benno Rice.
63 * All rights reserved.
64 *
65 * Redistribution and use in source and binary forms, with or without
66 * modification, are permitted provided that the following conditions
67 * are met:
68 * 1. Redistributions of source code must retain the above copyright
69 *    notice, this list of conditions and the following disclaimer.
70 * 2. Redistributions in binary form must reproduce the above copyright
71 *    notice, this list of conditions and the following disclaimer in the
72 *    documentation and/or other materials provided with the distribution.
73 *
74 * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
75 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
76 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
77 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
78 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
79 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
80 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
81 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
82 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
83 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
84 */
85
86#include <sys/cdefs.h>
87__FBSDID("$FreeBSD: stable/11/sys/powerpc/aim/mmu_oea.c 331722 2018-03-29 02:50:57Z eadler $");
88
89/*
90 * Manages physical address maps.
91 *
92 * Since the information managed by this module is also stored by the
93 * logical address mapping module, this module may throw away valid virtual
94 * to physical mappings at almost any time.  However, invalidations of
95 * mappings must be done as requested.
96 *
97 * In order to cope with hardware architectures which make virtual to
98 * physical map invalidates expensive, this module may delay invalidate
99 * reduced protection operations until such time as they are actually
100 * necessary.  This module is given full information as to which processors
101 * are currently using which maps, and to when physical maps must be made
102 * correct.
103 */
104
105#include "opt_kstack_pages.h"
106
107#include <sys/param.h>
108#include <sys/kernel.h>
109#include <sys/conf.h>
110#include <sys/queue.h>
111#include <sys/cpuset.h>
112#include <sys/kerneldump.h>
113#include <sys/ktr.h>
114#include <sys/lock.h>
115#include <sys/msgbuf.h>
116#include <sys/mutex.h>
117#include <sys/proc.h>
118#include <sys/rwlock.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/systm.h>
122#include <sys/vmmeter.h>
123
124#include <dev/ofw/openfirm.h>
125
126#include <vm/vm.h>
127#include <vm/vm_param.h>
128#include <vm/vm_kern.h>
129#include <vm/vm_page.h>
130#include <vm/vm_map.h>
131#include <vm/vm_object.h>
132#include <vm/vm_extern.h>
133#include <vm/vm_pageout.h>
134#include <vm/uma.h>
135
136#include <machine/cpu.h>
137#include <machine/platform.h>
138#include <machine/bat.h>
139#include <machine/frame.h>
140#include <machine/md_var.h>
141#include <machine/psl.h>
142#include <machine/pte.h>
143#include <machine/smp.h>
144#include <machine/sr.h>
145#include <machine/mmuvar.h>
146#include <machine/trap.h>
147
148#include "mmu_if.h"
149
/* Debug switch for this file; what it gates is not visible here. */
#define	MOEA_DEBUG

/*
 * Marker for unimplemented code paths: panics, naming the enclosing
 * function.  The trailing ';' is part of the expansion.
 */
#define TODO	panic("%s: not implemented", __func__);

/*
 * VSID packing helpers.  A VSID value carries a 4-bit segment register
 * number in bits 0-3 and a 20-bit hash in bits 4-23.
 */
#define	VSID_MAKE(sr, hash)	((((hash) & 0x000fffff) << 4) | (sr))
#define	VSID_TO_SR(vsid)	((vsid) & 0x0000000f)
#define	VSID_TO_HASH(vsid)	(((vsid) >> 4) & 0x000fffff)
157
/*
 * One element of the Open Firmware "translations" property as it is read
 * and walked by moea_bootstrap(): a virtual range and the physical address
 * it is mapped to.
 */
struct ofw_map {
	vm_offset_t	om_va;		/* virtual start of the mapping */
	vm_size_t	om_len;		/* length of the mapping, in bytes */
	vm_offset_t	om_pa;		/* physical start of the mapping */
	u_int		om_mode;	/* mode word; not interpreted in the code visible here */
};
164
/* Symbols defined outside this file (presumably by the linker script). */
extern unsigned char _etext[];
extern unsigned char _end[];

/*
 * Map of physical memory regions.
 *
 * pregions/regions (and their element counts) are filled in by
 * mem_regions() from moea_bootstrap().  pregions drives the on-demand BAT
 * table and the cacheability decision in moea_calc_wimg(); regions is
 * copied into phys_avail[].  phys_avail_count is the number of start/end
 * pairs currently stored in phys_avail[].  translations points at the
 * buffer holding the Open Firmware "translations" property.
 */
static struct	mem_region *regions;
static struct	mem_region *pregions;
static u_int    phys_avail_count;
static int	regions_sz, pregions_sz;
static struct	ofw_map *translations;

/*
 * Lock for the pteg and pvo tables.
 *
 * moea_table_mutex is initialized as a recursive default mutex and
 * moea_vsid_mutex ("VSID table") as a plain default mutex in
 * moea_bootstrap().
 */
struct mtx	moea_table_mutex;
struct mtx	moea_vsid_mutex;

/* tlbie instruction synchronization (spin mutex taken by tlbie()) */
static struct mtx tlbie_mtx;

/*
 * PTEG data.
 *
 * moea_pteg_mask is moea_pteg_count - 1 and is used to reduce a hash to a
 * PTEG index.
 */
static struct	pteg *moea_pteg_table;
u_int		moea_pteg_count;
u_int		moea_pteg_mask;

/*
 * PVO data.
 */
struct	pvo_head *moea_pvo_table;		/* pvo entries by pteg index */
struct	pvo_head moea_pvo_kunmanaged =
    LIST_HEAD_INITIALIZER(moea_pvo_kunmanaged);	/* list of unmanaged pages */

/* Global pv list lock; asserted write-held by the attribute helpers. */
static struct rwlock_padalign pvh_global_lock;

uma_zone_t	moea_upvo_zone;	/* zone for pvo entries for unmanaged pages */
uma_zone_t	moea_mpvo_zone;	/* zone for pvo entries for managed pages */

/*
 * Bootstrap pvo pool: BPVO_POOL_SIZE entries allocated in
 * moea_bootstrap(); moea_bpvo_pool_index starts at 0.
 */
#define	BPVO_POOL_SIZE	32768
static struct	pvo_entry *moea_bpvo_pool;
static int	moea_bpvo_pool_index = 0;

/*
 * VSID allocation bitmap, one bit per pmap slot.  VSID_NBPW is the number
 * of bits in one bitmap word.
 */
#define	VSID_NBPW	(sizeof(u_int32_t) * 8)
static u_int	moea_vsid_bitmap[NPMAPS / VSID_NBPW];

/* Starts FALSE; presumably set once moea_init() has run - confirm. */
static boolean_t moea_initialized = FALSE;
213
/*
 * Statistics.
 *
 * Event counters, each exported read-only under the machdep sysctl tree
 * with the same name as the variable.
 *
 * NOTE(review): the counters are u_int but are exported with SYSCTL_INT;
 * SYSCTL_UINT may be the better match - confirm before changing, since it
 * alters the reported sysctl type.
 */
u_int	moea_pte_valid = 0;		/* ++ in moea_pte_set, -- in moea_pte_unset */
u_int	moea_pte_overflow = 0;
u_int	moea_pte_replacements = 0;
u_int	moea_pvo_entries = 0;
u_int	moea_pvo_enter_calls = 0;
u_int	moea_pvo_remove_calls = 0;
u_int	moea_pte_spills = 0;
SYSCTL_INT(_machdep, OID_AUTO, moea_pte_valid, CTLFLAG_RD, &moea_pte_valid,
    0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea_pte_overflow, CTLFLAG_RD,
    &moea_pte_overflow, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea_pte_replacements, CTLFLAG_RD,
    &moea_pte_replacements, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_entries, CTLFLAG_RD, &moea_pvo_entries,
    0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_enter_calls, CTLFLAG_RD,
    &moea_pvo_enter_calls, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_remove_calls, CTLFLAG_RD,
    &moea_pvo_remove_calls, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea_pte_spills, CTLFLAG_RD,
    &moea_pte_spills, 0, "");
238
/*
 * Allocate physical memory for use in moea_bootstrap.
 * Called there as moea_bootstrap_alloc(size, alignment).
 */
static vm_offset_t	moea_bootstrap_alloc(vm_size_t, u_int);

/*
 * PTE calls.
 */
static int		moea_pte_insert(u_int, struct pte *);

/*
 * PVO calls.
 */
static int	moea_pvo_enter(pmap_t, uma_zone_t, struct pvo_head *,
		    vm_offset_t, vm_paddr_t, u_int, int);
static void	moea_pvo_remove(struct pvo_entry *, int);
static struct	pvo_entry *moea_pvo_find_va(pmap_t, vm_offset_t, int *);
static struct	pte *moea_pvo_to_pte(const struct pvo_entry *, int);

/*
 * Utility routines.
 */
static int		moea_enter_locked(pmap_t, vm_offset_t, vm_page_t,
			    vm_prot_t, u_int, int8_t);
static void		moea_syncicache(vm_paddr_t, vm_size_t);
static boolean_t	moea_query_bit(vm_page_t, int);
static u_int		moea_clear_bit(vm_page_t, int);
static void		moea_kremove(mmu_t, vm_offset_t);
int		moea_pte_spill(vm_offset_t);

/*
 * Kernel MMU interface
 *
 * These are the implementations registered in moea_methods[] below; each
 * takes the mmu_t as its first argument.
 */
void moea_clear_modify(mmu_t, vm_page_t);
void moea_copy_page(mmu_t, vm_page_t, vm_page_t);
void moea_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
    vm_page_t *mb, vm_offset_t b_offset, int xfersize);
int moea_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int,
    int8_t);
void moea_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
    vm_prot_t);
void moea_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
vm_paddr_t moea_extract(mmu_t, pmap_t, vm_offset_t);
vm_page_t moea_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t);
void moea_init(mmu_t);
boolean_t moea_is_modified(mmu_t, vm_page_t);
boolean_t moea_is_prefaultable(mmu_t, pmap_t, vm_offset_t);
boolean_t moea_is_referenced(mmu_t, vm_page_t);
int moea_ts_referenced(mmu_t, vm_page_t);
vm_offset_t moea_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
boolean_t moea_page_exists_quick(mmu_t, pmap_t, vm_page_t);
void moea_page_init(mmu_t, vm_page_t);
int moea_page_wired_mappings(mmu_t, vm_page_t);
void moea_pinit(mmu_t, pmap_t);
void moea_pinit0(mmu_t, pmap_t);
void moea_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
void moea_qenter(mmu_t, vm_offset_t, vm_page_t *, int);
void moea_qremove(mmu_t, vm_offset_t, int);
void moea_release(mmu_t, pmap_t);
void moea_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t);
void moea_remove_all(mmu_t, vm_page_t);
void moea_remove_write(mmu_t, vm_page_t);
void moea_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t);
void moea_zero_page(mmu_t, vm_page_t);
void moea_zero_page_area(mmu_t, vm_page_t, int, int);
void moea_zero_page_idle(mmu_t, vm_page_t);
void moea_activate(mmu_t, struct thread *);
void moea_deactivate(mmu_t, struct thread *);
void moea_cpu_bootstrap(mmu_t, int);
void moea_bootstrap(mmu_t, vm_offset_t, vm_offset_t);
void *moea_mapdev(mmu_t, vm_paddr_t, vm_size_t);
void *moea_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t);
void moea_unmapdev(mmu_t, vm_offset_t, vm_size_t);
vm_paddr_t moea_kextract(mmu_t, vm_offset_t);
void moea_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t);
void moea_kenter(mmu_t, vm_offset_t, vm_paddr_t);
void moea_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma);
boolean_t moea_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t);
static void moea_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t);
void moea_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va);
void moea_scan_init(mmu_t mmu);
vm_offset_t moea_quick_enter_page(mmu_t mmu, vm_page_t m);
void moea_quick_remove_page(mmu_t mmu, vm_offset_t addr);
322
/*
 * Method table binding each mmu_* interface method to its moea_*
 * implementation in this file.  The list is terminated by the all-zero
 * entry.
 */
static mmu_method_t moea_methods[] = {
	MMUMETHOD(mmu_clear_modify,	moea_clear_modify),
	MMUMETHOD(mmu_copy_page,	moea_copy_page),
	MMUMETHOD(mmu_copy_pages,	moea_copy_pages),
	MMUMETHOD(mmu_enter,		moea_enter),
	MMUMETHOD(mmu_enter_object,	moea_enter_object),
	MMUMETHOD(mmu_enter_quick,	moea_enter_quick),
	MMUMETHOD(mmu_extract,		moea_extract),
	MMUMETHOD(mmu_extract_and_hold,	moea_extract_and_hold),
	MMUMETHOD(mmu_init,		moea_init),
	MMUMETHOD(mmu_is_modified,	moea_is_modified),
	MMUMETHOD(mmu_is_prefaultable,	moea_is_prefaultable),
	MMUMETHOD(mmu_is_referenced,	moea_is_referenced),
	MMUMETHOD(mmu_ts_referenced,	moea_ts_referenced),
	MMUMETHOD(mmu_map,     		moea_map),
	MMUMETHOD(mmu_page_exists_quick,moea_page_exists_quick),
	MMUMETHOD(mmu_page_init,	moea_page_init),
	MMUMETHOD(mmu_page_wired_mappings,moea_page_wired_mappings),
	MMUMETHOD(mmu_pinit,		moea_pinit),
	MMUMETHOD(mmu_pinit0,		moea_pinit0),
	MMUMETHOD(mmu_protect,		moea_protect),
	MMUMETHOD(mmu_qenter,		moea_qenter),
	MMUMETHOD(mmu_qremove,		moea_qremove),
	MMUMETHOD(mmu_release,		moea_release),
	MMUMETHOD(mmu_remove,		moea_remove),
	MMUMETHOD(mmu_remove_all,      	moea_remove_all),
	MMUMETHOD(mmu_remove_write,	moea_remove_write),
	MMUMETHOD(mmu_sync_icache,	moea_sync_icache),
	MMUMETHOD(mmu_unwire,		moea_unwire),
	MMUMETHOD(mmu_zero_page,       	moea_zero_page),
	MMUMETHOD(mmu_zero_page_area,	moea_zero_page_area),
	MMUMETHOD(mmu_zero_page_idle,	moea_zero_page_idle),
	MMUMETHOD(mmu_activate,		moea_activate),
	MMUMETHOD(mmu_deactivate,      	moea_deactivate),
	MMUMETHOD(mmu_page_set_memattr,	moea_page_set_memattr),
	MMUMETHOD(mmu_quick_enter_page, moea_quick_enter_page),
	MMUMETHOD(mmu_quick_remove_page, moea_quick_remove_page),

	/* Internal interfaces */
	MMUMETHOD(mmu_bootstrap,       	moea_bootstrap),
	MMUMETHOD(mmu_cpu_bootstrap,   	moea_cpu_bootstrap),
	MMUMETHOD(mmu_mapdev_attr,	moea_mapdev_attr),
	MMUMETHOD(mmu_mapdev,		moea_mapdev),
	MMUMETHOD(mmu_unmapdev,		moea_unmapdev),
	MMUMETHOD(mmu_kextract,		moea_kextract),
	MMUMETHOD(mmu_kenter,		moea_kenter),
	MMUMETHOD(mmu_kenter_attr,	moea_kenter_attr),
	MMUMETHOD(mmu_dev_direct_mapped,moea_dev_direct_mapped),
	MMUMETHOD(mmu_scan_init,	moea_scan_init),
	MMUMETHOD(mmu_dumpsys_map,	moea_dumpsys_map),

	/* End-of-table sentinel. */
	{ 0, 0 }
};

/* Register the method table above as the MMU_TYPE_OEA implementation. */
MMU_DEF(oea_mmu, MMU_TYPE_OEA, moea_methods, 0);
378
379static __inline uint32_t
380moea_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
381{
382	uint32_t pte_lo;
383	int i;
384
385	if (ma != VM_MEMATTR_DEFAULT) {
386		switch (ma) {
387		case VM_MEMATTR_UNCACHEABLE:
388			return (PTE_I | PTE_G);
389		case VM_MEMATTR_CACHEABLE:
390			return (PTE_M);
391		case VM_MEMATTR_WRITE_COMBINING:
392		case VM_MEMATTR_WRITE_BACK:
393		case VM_MEMATTR_PREFETCHABLE:
394			return (PTE_I);
395		case VM_MEMATTR_WRITE_THROUGH:
396			return (PTE_W | PTE_M);
397		}
398	}
399
400	/*
401	 * Assume the page is cache inhibited and access is guarded unless
402	 * it's in our available memory array.
403	 */
404	pte_lo = PTE_I | PTE_G;
405	for (i = 0; i < pregions_sz; i++) {
406		if ((pa >= pregions[i].mr_start) &&
407		    (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
408			pte_lo = PTE_M;
409			break;
410		}
411	}
412
413	return pte_lo;
414}
415
/*
 * Issue a tlbie for the given virtual address.
 *
 * The whole sequence runs under the tlbie_mtx spin mutex.  The tlbie is
 * preceded by a ptesync and followed by eieio; tlbsync; ptesync.  The
 * order of these instructions must not be changed.
 */
static void
tlbie(vm_offset_t va)
{

	mtx_lock_spin(&tlbie_mtx);
	__asm __volatile("ptesync");
	__asm __volatile("tlbie %0" :: "r"(va));
	__asm __volatile("eieio; tlbsync; ptesync");
	mtx_unlock_spin(&tlbie_mtx);
}
426
427static void
428tlbia(void)
429{
430	vm_offset_t va;
431
432	for (va = 0; va < 0x00040000; va += 0x00001000) {
433		__asm __volatile("tlbie %0" :: "r"(va));
434		powerpc_sync();
435	}
436	__asm __volatile("tlbsync");
437	powerpc_sync();
438}
439
440static __inline int
441va_to_sr(u_int *sr, vm_offset_t va)
442{
443	return (sr[(uintptr_t)va >> ADDR_SR_SHFT]);
444}
445
446static __inline u_int
447va_to_pteg(u_int sr, vm_offset_t addr)
448{
449	u_int hash;
450
451	hash = (sr & SR_VSID_MASK) ^ (((u_int)addr & ADDR_PIDX) >>
452	    ADDR_PIDX_SHFT);
453	return (hash & moea_pteg_mask);
454}
455
456static __inline struct pvo_head *
457vm_page_to_pvoh(vm_page_t m)
458{
459
460	return (&m->md.mdpg_pvoh);
461}
462
463static __inline void
464moea_attr_clear(vm_page_t m, int ptebit)
465{
466
467	rw_assert(&pvh_global_lock, RA_WLOCKED);
468	m->md.mdpg_attrs &= ~ptebit;
469}
470
471static __inline int
472moea_attr_fetch(vm_page_t m)
473{
474
475	return (m->md.mdpg_attrs);
476}
477
478static __inline void
479moea_attr_save(vm_page_t m, int ptebit)
480{
481
482	rw_assert(&pvh_global_lock, RA_WLOCKED);
483	m->md.mdpg_attrs |= ptebit;
484}
485
486static __inline int
487moea_pte_compare(const struct pte *pt, const struct pte *pvo_pt)
488{
489	if (pt->pte_hi == pvo_pt->pte_hi)
490		return (1);
491
492	return (0);
493}
494
495static __inline int
496moea_pte_match(struct pte *pt, u_int sr, vm_offset_t va, int which)
497{
498	return (pt->pte_hi & ~PTE_VALID) ==
499	    (((sr & SR_VSID_MASK) << PTE_VSID_SHFT) |
500	    ((va >> ADDR_API_SHFT) & PTE_API) | which);
501}
502
503static __inline void
504moea_pte_create(struct pte *pt, u_int sr, vm_offset_t va, u_int pte_lo)
505{
506
507	mtx_assert(&moea_table_mutex, MA_OWNED);
508
509	/*
510	 * Construct a PTE.  Default to IMB initially.  Valid bit only gets
511	 * set when the real pte is set in memory.
512	 *
513	 * Note: Don't set the valid bit for correct operation of tlb update.
514	 */
515	pt->pte_hi = ((sr & SR_VSID_MASK) << PTE_VSID_SHFT) |
516	    (((va & ADDR_PIDX) >> ADDR_API_SHFT) & PTE_API);
517	pt->pte_lo = pte_lo;
518}
519
520static __inline void
521moea_pte_synch(struct pte *pt, struct pte *pvo_pt)
522{
523
524	mtx_assert(&moea_table_mutex, MA_OWNED);
525	pvo_pt->pte_lo |= pt->pte_lo & (PTE_REF | PTE_CHG);
526}
527
/*
 * Clear the bits in ptebit from the low word of pt, then invalidate the
 * TLB entry for va.  The table mutex must be held.
 */
static __inline void
moea_pte_clear(struct pte *pt, vm_offset_t va, int ptebit)
{

	mtx_assert(&moea_table_mutex, MA_OWNED);

	/*
	 * As shown in Section 7.6.3.2.3
	 */
	pt->pte_lo &= ~ptebit;
	tlbie(va);
}
540
/*
 * Copy pvo_pt into pt and make it valid.
 *
 * PTE_VALID is set in pvo_pt first.  The low word is then stored to pt,
 * followed by a sync, and only after that the high word (which carries
 * PTE_VALID), followed by another sync.  Bumps the moea_pte_valid counter.
 * The table mutex must be held.
 */
static __inline void
moea_pte_set(struct pte *pt, struct pte *pvo_pt)
{

	mtx_assert(&moea_table_mutex, MA_OWNED);
	pvo_pt->pte_hi |= PTE_VALID;

	/*
	 * Update the PTE as defined in section 7.6.3.1.
	 * Note that the REF/CHG bits are from pvo_pt and thus should have
	 * been saved so this routine can restore them (if desired).
	 */
	pt->pte_lo = pvo_pt->pte_lo;
	powerpc_sync();
	pt->pte_hi = pvo_pt->pte_hi;
	powerpc_sync();
	moea_pte_valid++;
}
559
/*
 * Invalidate pt and collect its referenced/changed bits into pvo_pt.
 *
 * PTE_VALID is cleared in pvo_pt and then, after a sync, in pt itself.
 * The TLB entry for va is invalidated before the REF/CHG bits are copied
 * back with moea_pte_synch().  Decrements the moea_pte_valid counter.
 * The table mutex must be held.
 */
static __inline void
moea_pte_unset(struct pte *pt, struct pte *pvo_pt, vm_offset_t va)
{

	mtx_assert(&moea_table_mutex, MA_OWNED);
	pvo_pt->pte_hi &= ~PTE_VALID;

	/*
	 * Force the ref & chg bits back into the PTEs.
	 */
	powerpc_sync();

	/*
	 * Invalidate the pte.
	 */
	pt->pte_hi &= ~PTE_VALID;

	tlbie(va);

	/*
	 * Save the ref & chg bits.
	 */
	moea_pte_synch(pt, pvo_pt);
	moea_pte_valid--;
}
585
/*
 * Replace the contents of pt with pvo_pt: the old entry is torn down with
 * moea_pte_unset() (which also invalidates the TLB entry for va) and the
 * new one is then installed with moea_pte_set().
 */
static __inline void
moea_pte_change(struct pte *pt, struct pte *pvo_pt, vm_offset_t va)
{

	/*
	 * Invalidate the PTE
	 */
	moea_pte_unset(pt, pvo_pt, va);
	/* Re-install it from pvo_pt, marking it valid again. */
	moea_pte_set(pt, pvo_pt);
}
596
597/*
598 * Quick sort callout for comparing memory regions.
599 */
600static int	om_cmp(const void *a, const void *b);
601
602static int
603om_cmp(const void *a, const void *b)
604{
605	const struct	ofw_map *mapa;
606	const struct	ofw_map *mapb;
607
608	mapa = a;
609	mapb = b;
610	if (mapa->om_pa < mapb->om_pa)
611		return (-1);
612	else if (mapa->om_pa > mapb->om_pa)
613		return (1);
614	else
615		return (0);
616}
617
/*
 * Program the MMU registers of the calling CPU.
 *
 *	mmup	MMU object (not used in this function)
 *	ap	non-zero when called for a CPU that must also load
 *		BAT0; moea_bootstrap() passes 0 because it has already
 *		programmed BAT0 itself
 *
 * Loads the BAT registers, the 16 segment registers and SDR1 from the
 * state set up by moea_bootstrap(), then flushes the TLB with tlbia().
 */
void
moea_cpu_bootstrap(mmu_t mmup, int ap)
{
	u_int sdr;
	int i;

	if (ap) {
		/* Load battable[0] into both DBAT0 and IBAT0. */
		powerpc_sync();
		__asm __volatile("mtdbatu 0,%0" :: "r"(battable[0].batu));
		__asm __volatile("mtdbatl 0,%0" :: "r"(battable[0].batl));
		isync();
		__asm __volatile("mtibatu 0,%0" :: "r"(battable[0].batu));
		__asm __volatile("mtibatl 0,%0" :: "r"(battable[0].batl));
		isync();
	}

	/* DBAT1 gets battable[8] (set up as PCI memory in moea_bootstrap). */
	__asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu));
	__asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl));
	isync();

	/*
	 * Write zero to the upper words of the remaining BATs
	 * (presumably to mark them invalid).
	 */
	__asm __volatile("mtibatu 1,%0" :: "r"(0));
	__asm __volatile("mtdbatu 2,%0" :: "r"(0));
	__asm __volatile("mtibatu 2,%0" :: "r"(0));
	__asm __volatile("mtdbatu 3,%0" :: "r"(0));
	__asm __volatile("mtibatu 3,%0" :: "r"(0));
	isync();

	/* Load all 16 segment registers from the kernel pmap. */
	for (i = 0; i < 16; i++)
		mtsrin(i << ADDR_SR_SHFT, kernel_pmap->pm_sr[i]);
	powerpc_sync();

	/* SDR1: PTEG table address combined with the mask shifted down by 10. */
	sdr = (u_int)moea_pteg_table | (moea_pteg_mask >> 10);
	__asm __volatile("mtsdr1 %0" :: "r"(sdr));
	isync();

	tlbia();
}
655
/*
 * Bootstrap the pmap module on the boot CPU.
 *
 *	mmup		MMU object, passed through to moea_kenter() and
 *			moea_cpu_bootstrap()
 *	kernelstart	start address of the loaded kernel
 *	kernelend	end address of the loaded kernel
 *
 * In order, this: fills in battable[] and programs BAT0/DBAT1; builds
 * phys_avail[] from the memory regions (honouring the hw.physmem tunable
 * and carving out the exception vectors and the kernel); allocates the
 * PTEG table, the PVO table and the bootstrap PVO pool; initializes the
 * locks and the kernel pmap; enters the non 1:1 Open Firmware
 * translations; programs the MMU and turns on address translation; and
 * finally allocates and maps thread0's kernel stack, the message buffer
 * and the dynamic per-CPU area.
 */
void
moea_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
{
	ihandle_t	mmui;
	phandle_t	chosen, mmu;
	int		sz;
	int		i, j;
	vm_size_t	size, physsz, hwphyssz;
	vm_offset_t	pa, va, off;
	void		*dpcpu;
	register_t	msr;

        /*
         * Set up BAT0 to map the lowest 256 MB area
         */
        battable[0x0].batl = BATL(0x00000000, BAT_M, BAT_PP_RW);
        battable[0x0].batu = BATU(0x00000000, BAT_BL_256M, BAT_Vs);

	/*
	 * Map PCI memory space.
	 */
	battable[0x8].batl = BATL(0x80000000, BAT_I|BAT_G, BAT_PP_RW);
	battable[0x8].batu = BATU(0x80000000, BAT_BL_256M, BAT_Vs);

	battable[0x9].batl = BATL(0x90000000, BAT_I|BAT_G, BAT_PP_RW);
	battable[0x9].batu = BATU(0x90000000, BAT_BL_256M, BAT_Vs);

	battable[0xa].batl = BATL(0xa0000000, BAT_I|BAT_G, BAT_PP_RW);
	battable[0xa].batu = BATU(0xa0000000, BAT_BL_256M, BAT_Vs);

	battable[0xb].batl = BATL(0xb0000000, BAT_I|BAT_G, BAT_PP_RW);
	battable[0xb].batu = BATU(0xb0000000, BAT_BL_256M, BAT_Vs);

	/*
	 * Map obio devices.
	 */
	battable[0xf].batl = BATL(0xf0000000, BAT_I|BAT_G, BAT_PP_RW);
	battable[0xf].batu = BATU(0xf0000000, BAT_BL_256M, BAT_Vs);

	/*
	 * Use an IBAT and a DBAT to map the bottom segment of memory
	 * where we are. Turn off instruction relocation temporarily
	 * to prevent faults while reprogramming the IBAT.
	 */
	msr = mfmsr();
	mtmsr(msr & ~PSL_IR);
	__asm (".balign 32; \n"
	       "mtibatu 0,%0; mtibatl 0,%1; isync; \n"
	       "mtdbatu 0,%0; mtdbatl 0,%1; isync"
	    :: "r"(battable[0].batu), "r"(battable[0].batl));
	mtmsr(msr);

	/* map pci space */
	__asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu));
	__asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl));
	isync();

	/* set global direct map flag */
	hw_direct_map = 1;

	mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
	CTR0(KTR_PMAP, "moea_bootstrap: physical memory");

	for (i = 0; i < pregions_sz; i++) {
		/* NOTE(review): this `pa' shadows the function-scope `pa'. */
		vm_offset_t pa;
		vm_offset_t end;

		CTR3(KTR_PMAP, "physregion: %#x - %#x (%#x)",
			pregions[i].mr_start,
			pregions[i].mr_start + pregions[i].mr_size,
			pregions[i].mr_size);
		/*
		 * Install entries into the BAT table to allow all
		 * of physmem to be covered by on-demand BAT entries.
		 * The loop will sometimes set the same battable element
		 * twice, but that's fine since they won't be used for
		 * a while yet.
		 */
		pa = pregions[i].mr_start & 0xf0000000;
		end = pregions[i].mr_start + pregions[i].mr_size;
		do {
                        u_int n = pa >> ADDR_SR_SHFT;

			battable[n].batl = BATL(pa, BAT_M, BAT_PP_RW);
			battable[n].batu = BATU(pa, BAT_BL_256M, BAT_Vs);
			pa += SEGMENT_LENGTH;
		} while (pa < end);
	}

	if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz)
		panic("moea_bootstrap: phys_avail too small");

	/*
	 * Copy the available regions into phys_avail[] as start/end pairs.
	 * A non-zero hw.physmem tunable caps the total: the region that
	 * would reach the cap is truncated and the copy stops there.
	 */
	phys_avail_count = 0;
	physsz = 0;
	hwphyssz = 0;
	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
	for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
		CTR3(KTR_PMAP, "region: %#x - %#x (%#x)", regions[i].mr_start,
		    regions[i].mr_start + regions[i].mr_size,
		    regions[i].mr_size);
		if (hwphyssz != 0 &&
		    (physsz + regions[i].mr_size) >= hwphyssz) {
			if (physsz < hwphyssz) {
				phys_avail[j] = regions[i].mr_start;
				phys_avail[j + 1] = regions[i].mr_start +
				    hwphyssz - physsz;
				physsz = hwphyssz;
				phys_avail_count++;
			}
			break;
		}
		phys_avail[j] = regions[i].mr_start;
		phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
		phys_avail_count++;
		physsz += regions[i].mr_size;
	}

	/* Check for overlap with the kernel and exception vectors */
	for (j = 0; j < 2*phys_avail_count; j+=2) {
		/* Push a range that starts below EXC_LAST up by EXC_LAST. */
		if (phys_avail[j] < EXC_LAST)
			phys_avail[j] += EXC_LAST;

		/*
		 * Kernel starts inside this range: end the range at the
		 * kernel start.  If the kernel also ends inside it, the
		 * part after the kernel is appended as a new range.
		 */
		if (kernelstart >= phys_avail[j] &&
		    kernelstart < phys_avail[j+1]) {
			if (kernelend < phys_avail[j+1]) {
				phys_avail[2*phys_avail_count] =
				    (kernelend & ~PAGE_MASK) + PAGE_SIZE;
				phys_avail[2*phys_avail_count + 1] =
				    phys_avail[j+1];
				phys_avail_count++;
			}

			phys_avail[j+1] = kernelstart & ~PAGE_MASK;
		}

		/*
		 * Kernel ends inside this range: start the range on the
		 * page after the kernel end.  If the kernel also starts
		 * inside it, the part before the kernel is appended as a
		 * new range.
		 */
		if (kernelend >= phys_avail[j] &&
		    kernelend < phys_avail[j+1]) {
			if (kernelstart > phys_avail[j]) {
				phys_avail[2*phys_avail_count] = phys_avail[j];
				phys_avail[2*phys_avail_count + 1] =
				    kernelstart & ~PAGE_MASK;
				phys_avail_count++;
			}

			phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE;
		}
	}

	physmem = btoc(physsz);

	/*
	 * Allocate PTEG table.
	 *
	 * Unless PTEGCOUNT is defined, the count starts at 0x1000, is
	 * doubled until it is no longer below physmem, and is then halved.
	 */
#ifdef PTEGCOUNT
	moea_pteg_count = PTEGCOUNT;
#else
	moea_pteg_count = 0x1000;

	while (moea_pteg_count < physmem)
		moea_pteg_count <<= 1;

	moea_pteg_count >>= 1;
#endif /* PTEGCOUNT */

	/* The table is allocated aligned to its own size. */
	size = moea_pteg_count * sizeof(struct pteg);
	CTR2(KTR_PMAP, "moea_bootstrap: %d PTEGs, %d bytes", moea_pteg_count,
	    size);
	moea_pteg_table = (struct pteg *)moea_bootstrap_alloc(size, size);
	CTR1(KTR_PMAP, "moea_bootstrap: PTEG table at %p", moea_pteg_table);
	bzero((void *)moea_pteg_table, moea_pteg_count * sizeof(struct pteg));
	moea_pteg_mask = moea_pteg_count - 1;

	/*
	 * Allocate pv/overflow lists.
	 */
	size = sizeof(struct pvo_head) * moea_pteg_count;
	moea_pvo_table = (struct pvo_head *)moea_bootstrap_alloc(size,
	    PAGE_SIZE);
	CTR1(KTR_PMAP, "moea_bootstrap: PVO table at %p", moea_pvo_table);
	for (i = 0; i < moea_pteg_count; i++)
		LIST_INIT(&moea_pvo_table[i]);

	/*
	 * Initialize the lock that synchronizes access to the pteg and pvo
	 * tables.
	 */
	mtx_init(&moea_table_mutex, "pmap table", NULL, MTX_DEF |
	    MTX_RECURSE);
	mtx_init(&moea_vsid_mutex, "VSID table", NULL, MTX_DEF);

	mtx_init(&tlbie_mtx, "tlbie", NULL, MTX_SPIN);

	/*
	 * Initialise the unmanaged pvo pool.
	 */
	moea_bpvo_pool = (struct pvo_entry *)moea_bootstrap_alloc(
		BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0);
	moea_bpvo_pool_index = 0;

	/*
	 * Make sure kernel vsid is allocated as well as VSID 0.
	 */
	moea_vsid_bitmap[(KERNEL_VSIDBITS & (NPMAPS - 1)) / VSID_NBPW]
		|= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
	moea_vsid_bitmap[0] |= 1;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	for (i = 0; i < 16; i++)
		kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;
	CPU_FILL(&kernel_pmap->pm_active);
	RB_INIT(&kernel_pmap->pmap_pvo);

 	/*
	 * Initialize the global pv list lock.
	 */
	rw_init(&pvh_global_lock, "pmap pv global");

	/*
	 * Set up the Open Firmware mappings
	 */
	chosen = OF_finddevice("/chosen");
	if (chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1 &&
	    (mmu = OF_instance_to_package(mmui)) != -1 &&
	    (sz = OF_getproplen(mmu, "translations")) != -1) {
		/*
		 * The property is read into the start of the first
		 * phys_avail range selected below; that memory is used
		 * in place, not allocated.
		 * NOTE(review): the test compares a range's end address
		 * with the property length - confirm this is intended.
		 */
		translations = NULL;
		for (i = 0; phys_avail[i] != 0; i += 2) {
			if (phys_avail[i + 1] >= sz) {
				translations = (struct ofw_map *)phys_avail[i];
				break;
			}
		}
		if (translations == NULL)
			panic("moea_bootstrap: no space to copy translations");
		bzero(translations, sz);
		if (OF_getprop(mmu, "translations", translations, sz) == -1)
			panic("moea_bootstrap: can't get ofw translations");
		CTR0(KTR_PMAP, "moea_bootstrap: translations");
		/* From here on sz is an entry count, not a byte count. */
		sz /= sizeof(*translations);
		qsort(translations, sz, sizeof (*translations), om_cmp);
		for (i = 0; i < sz; i++) {
			CTR3(KTR_PMAP, "translation: pa=%#x va=%#x len=%#x",
			    translations[i].om_pa, translations[i].om_va,
			    translations[i].om_len);

			/*
			 * If the mapping is 1:1, let the RAM and device
			 * on-demand BAT tables take care of the translation.
			 */
			if (translations[i].om_va == translations[i].om_pa)
				continue;

			/* Enter the pages */
			for (off = 0; off < translations[i].om_len;
			    off += PAGE_SIZE)
				moea_kenter(mmup, translations[i].om_va + off,
					    translations[i].om_pa + off);
		}
	}

	/*
	 * Calculate the last available physical address.
	 * The loop stops on the last pair before the zero terminator.
	 */
	for (i = 0; phys_avail[i + 2] != 0; i += 2)
		;
	Maxmem = powerpc_btop(phys_avail[i + 1]);

	/* Program this CPU's MMU, then enable data and instruction relocation. */
	moea_cpu_bootstrap(mmup,0);
	mtmsr(mfmsr() | PSL_DR | PSL_IR);
	pmap_bootstrapped++;

	/*
	 * Set the start and end of kva.
	 */
	virtual_avail = VM_MIN_KERNEL_ADDRESS;
	virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;

	/*
	 * Allocate a kernel stack with a guard page for thread0 and map it
	 * into the kernel page map.
	 */
	pa = moea_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE);
	/* The guard pages are skipped in KVA and left unmapped. */
	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
	virtual_avail = va + kstack_pages * PAGE_SIZE;
	CTR2(KTR_PMAP, "moea_bootstrap: kstack0 at %#x (%#x)", pa, va);
	thread0.td_kstack = va;
	thread0.td_kstack_pages = kstack_pages;
	for (i = 0; i < kstack_pages; i++) {
		moea_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}

	/*
	 * Allocate virtual address space for the message buffer.
	 */
	pa = msgbuf_phys = moea_bootstrap_alloc(msgbufsize, PAGE_SIZE);
	msgbufp = (struct msgbuf *)virtual_avail;
	va = virtual_avail;
	virtual_avail += round_page(msgbufsize);
	while (va < virtual_avail) {
		moea_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}

	/*
	 * Allocate virtual address space for the dynamic percpu area.
	 */
	pa = moea_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
	dpcpu = (void *)virtual_avail;
	va = virtual_avail;
	virtual_avail += DPCPU_SIZE;
	while (va < virtual_avail) {
		moea_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}
	dpcpu_init(dpcpu, 0);
}
978
979/*
980 * Activate a user pmap.  The pmap must be activated before it's address
981 * space can be accessed in any way.
982 */
983void
984moea_activate(mmu_t mmu, struct thread *td)
985{
986	pmap_t	pm, pmr;
987
988	/*
989	 * Load all the data we need up front to encourage the compiler to
990	 * not issue any loads while we have interrupts disabled below.
991	 */
992	pm = &td->td_proc->p_vmspace->vm_pmap;
993	pmr = pm->pmap_phys;
994
995	CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
996	PCPU_SET(curpmap, pmr);
997
998	mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid);
999}
1000
1001void
1002moea_deactivate(mmu_t mmu, struct thread *td)
1003{
1004	pmap_t	pm;
1005
1006	pm = &td->td_proc->p_vmspace->vm_pmap;
1007	CPU_CLR(PCPU_GET(cpuid), &pm->pm_active);
1008	PCPU_SET(curpmap, NULL);
1009}
1010
1011void
1012moea_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1013{
1014	struct	pvo_entry key, *pvo;
1015
1016	PMAP_LOCK(pm);
1017	key.pvo_vaddr = sva;
1018	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1019	    pvo != NULL && PVO_VADDR(pvo) < eva;
1020	    pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
1021		if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
1022			panic("moea_unwire: pvo %p is missing PVO_WIRED", pvo);
1023		pvo->pvo_vaddr &= ~PVO_WIRED;
1024		pm->pm_stats.wired_count--;
1025	}
1026	PMAP_UNLOCK(pm);
1027}
1028
1029void
1030moea_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst)
1031{
1032	vm_offset_t	dst;
1033	vm_offset_t	src;
1034
1035	dst = VM_PAGE_TO_PHYS(mdst);
1036	src = VM_PAGE_TO_PHYS(msrc);
1037
1038	bcopy((void *)src, (void *)dst, PAGE_SIZE);
1039}
1040
1041void
1042moea_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
1043    vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1044{
1045	void *a_cp, *b_cp;
1046	vm_offset_t a_pg_offset, b_pg_offset;
1047	int cnt;
1048
1049	while (xfersize > 0) {
1050		a_pg_offset = a_offset & PAGE_MASK;
1051		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1052		a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) +
1053		    a_pg_offset;
1054		b_pg_offset = b_offset & PAGE_MASK;
1055		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1056		b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) +
1057		    b_pg_offset;
1058		bcopy(a_cp, b_cp, cnt);
1059		a_offset += cnt;
1060		b_offset += cnt;
1061		xfersize -= cnt;
1062	}
1063}
1064
1065/*
1066 * Zero a page of physical memory by temporarily mapping it into the tlb.
1067 */
1068void
1069moea_zero_page(mmu_t mmu, vm_page_t m)
1070{
1071	vm_offset_t off, pa = VM_PAGE_TO_PHYS(m);
1072
1073	for (off = 0; off < PAGE_SIZE; off += cacheline_size)
1074		__asm __volatile("dcbz 0,%0" :: "r"(pa + off));
1075}
1076
1077void
1078moea_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size)
1079{
1080	vm_offset_t pa = VM_PAGE_TO_PHYS(m);
1081	void *va = (void *)(pa + off);
1082
1083	bzero(va, size);
1084}
1085
1086void
1087moea_zero_page_idle(mmu_t mmu, vm_page_t m)
1088{
1089
1090	moea_zero_page(mmu, m);
1091}
1092
1093vm_offset_t
1094moea_quick_enter_page(mmu_t mmu, vm_page_t m)
1095{
1096
1097	return (VM_PAGE_TO_PHYS(m));
1098}
1099
1100void
1101moea_quick_remove_page(mmu_t mmu, vm_offset_t addr)
1102{
1103}
1104
1105/*
1106 * Map the given physical page at the specified virtual address in the
1107 * target pmap with the protection requested.  If specified the page
1108 * will be wired down.
1109 */
1110int
1111moea_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1112    u_int flags, int8_t psind)
1113{
1114	int error;
1115
1116	for (;;) {
1117		rw_wlock(&pvh_global_lock);
1118		PMAP_LOCK(pmap);
1119		error = moea_enter_locked(pmap, va, m, prot, flags, psind);
1120		rw_wunlock(&pvh_global_lock);
1121		PMAP_UNLOCK(pmap);
1122		if (error != ENOMEM)
1123			return (KERN_SUCCESS);
1124		if ((flags & PMAP_ENTER_NOSLEEP) != 0)
1125			return (KERN_RESOURCE_SHORTAGE);
1126		VM_OBJECT_ASSERT_UNLOCKED(m->object);
1127		VM_WAIT;
1128	}
1129}
1130
1131/*
1132 * Map the given physical page at the specified virtual address in the
1133 * target pmap with the protection requested.  If specified the page
1134 * will be wired down.
1135 *
1136 * The global pvh and pmap must be locked.
1137 */
1138static int
1139moea_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1140    u_int flags, int8_t psind __unused)
1141{
1142	struct		pvo_head *pvo_head;
1143	uma_zone_t	zone;
1144	u_int		pte_lo, pvo_flags;
1145	int		error;
1146
1147	if (pmap_bootstrapped)
1148		rw_assert(&pvh_global_lock, RA_WLOCKED);
1149	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1150	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
1151		VM_OBJECT_ASSERT_LOCKED(m->object);
1152
1153	if ((m->oflags & VPO_UNMANAGED) != 0 || !moea_initialized) {
1154		pvo_head = &moea_pvo_kunmanaged;
1155		zone = moea_upvo_zone;
1156		pvo_flags = 0;
1157	} else {
1158		pvo_head = vm_page_to_pvoh(m);
1159		zone = moea_mpvo_zone;
1160		pvo_flags = PVO_MANAGED;
1161	}
1162
1163	pte_lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m));
1164
1165	if (prot & VM_PROT_WRITE) {
1166		pte_lo |= PTE_BW;
1167		if (pmap_bootstrapped &&
1168		    (m->oflags & VPO_UNMANAGED) == 0)
1169			vm_page_aflag_set(m, PGA_WRITEABLE);
1170	} else
1171		pte_lo |= PTE_BR;
1172
1173	if ((flags & PMAP_ENTER_WIRED) != 0)
1174		pvo_flags |= PVO_WIRED;
1175
1176	error = moea_pvo_enter(pmap, zone, pvo_head, va, VM_PAGE_TO_PHYS(m),
1177	    pte_lo, pvo_flags);
1178
1179	/*
1180	 * Flush the real page from the instruction cache. This has be done
1181	 * for all user mappings to prevent information leakage via the
1182	 * instruction cache. moea_pvo_enter() returns ENOENT for the first
1183	 * mapping for a page.
1184	 */
1185	if (pmap != kernel_pmap && error == ENOENT &&
1186	    (pte_lo & (PTE_I | PTE_G)) == 0)
1187		moea_syncicache(VM_PAGE_TO_PHYS(m), PAGE_SIZE);
1188
1189	return (error);
1190}
1191
1192/*
1193 * Maps a sequence of resident pages belonging to the same object.
1194 * The sequence begins with the given page m_start.  This page is
1195 * mapped at the given virtual address start.  Each subsequent page is
1196 * mapped at a virtual address that is offset from start by the same
1197 * amount as the page is offset from m_start within the object.  The
1198 * last page in the sequence is the page with the largest offset from
1199 * m_start that can be mapped at a virtual address less than the given
1200 * virtual address end.  Not every virtual page between start and end
1201 * is mapped; only those for which a resident page exists with the
1202 * corresponding offset from m_start are mapped.
1203 */
1204void
1205moea_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end,
1206    vm_page_t m_start, vm_prot_t prot)
1207{
1208	vm_page_t m;
1209	vm_pindex_t diff, psize;
1210
1211	VM_OBJECT_ASSERT_LOCKED(m_start->object);
1212
1213	psize = atop(end - start);
1214	m = m_start;
1215	rw_wlock(&pvh_global_lock);
1216	PMAP_LOCK(pm);
1217	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1218		moea_enter_locked(pm, start + ptoa(diff), m, prot &
1219		    (VM_PROT_READ | VM_PROT_EXECUTE), 0, 0);
1220		m = TAILQ_NEXT(m, listq);
1221	}
1222	rw_wunlock(&pvh_global_lock);
1223	PMAP_UNLOCK(pm);
1224}
1225
1226void
1227moea_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m,
1228    vm_prot_t prot)
1229{
1230
1231	rw_wlock(&pvh_global_lock);
1232	PMAP_LOCK(pm);
1233	moea_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
1234	    0, 0);
1235	rw_wunlock(&pvh_global_lock);
1236	PMAP_UNLOCK(pm);
1237}
1238
1239vm_paddr_t
1240moea_extract(mmu_t mmu, pmap_t pm, vm_offset_t va)
1241{
1242	struct	pvo_entry *pvo;
1243	vm_paddr_t pa;
1244
1245	PMAP_LOCK(pm);
1246	pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
1247	if (pvo == NULL)
1248		pa = 0;
1249	else
1250		pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF);
1251	PMAP_UNLOCK(pm);
1252	return (pa);
1253}
1254
/*
 * Atomically extract and hold the physical page with the given
 * pmap and virtual address pair if that mapping permits the given
 * protection.
 *
 * Returns the held page, or NULL when there is no valid mapping for va or
 * when write access is requested but the mapping is not PTE_RW.
 */
vm_page_t
moea_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	struct	pvo_entry *pvo;
	vm_page_t m;
        vm_paddr_t pa;

	m = NULL;
	pa = 0;
	PMAP_LOCK(pmap);
retry:
	/* Re-lookup on every pass: the mapping may have changed meanwhile. */
	pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL);
	if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID) &&
	    ((pvo->pvo_pte.pte.pte_lo & PTE_PP) == PTE_RW ||
	     (prot & VM_PROT_WRITE) == 0)) {
		/*
		 * A non-zero return from vm_page_pa_tryrelock() means the
		 * lookup has to be repeated.
		 * NOTE(review): presumably because the pmap lock was dropped
		 * while acquiring the page lock -- confirm against vm_page.
		 */
		if (vm_page_pa_tryrelock(pmap, pvo->pvo_pte.pte.pte_lo & PTE_RPGN, &pa))
			goto retry;
		m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pte.pte_lo & PTE_RPGN);
		vm_page_hold(m);
	}
	/* Release whatever physical-address lock 'pa' tracks, if any. */
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}
1284
1285void
1286moea_init(mmu_t mmu)
1287{
1288
1289	moea_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
1290	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1291	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
1292	moea_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry),
1293	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1294	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
1295	moea_initialized = TRUE;
1296}
1297
1298boolean_t
1299moea_is_referenced(mmu_t mmu, vm_page_t m)
1300{
1301	boolean_t rv;
1302
1303	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1304	    ("moea_is_referenced: page %p is not managed", m));
1305	rw_wlock(&pvh_global_lock);
1306	rv = moea_query_bit(m, PTE_REF);
1307	rw_wunlock(&pvh_global_lock);
1308	return (rv);
1309}
1310
1311boolean_t
1312moea_is_modified(mmu_t mmu, vm_page_t m)
1313{
1314	boolean_t rv;
1315
1316	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1317	    ("moea_is_modified: page %p is not managed", m));
1318
1319	/*
1320	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
1321	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
1322	 * is clear, no PTEs can have PTE_CHG set.
1323	 */
1324	VM_OBJECT_ASSERT_WLOCKED(m->object);
1325	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
1326		return (FALSE);
1327	rw_wlock(&pvh_global_lock);
1328	rv = moea_query_bit(m, PTE_CHG);
1329	rw_wunlock(&pvh_global_lock);
1330	return (rv);
1331}
1332
1333boolean_t
1334moea_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va)
1335{
1336	struct pvo_entry *pvo;
1337	boolean_t rv;
1338
1339	PMAP_LOCK(pmap);
1340	pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL);
1341	rv = pvo == NULL || (pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0;
1342	PMAP_UNLOCK(pmap);
1343	return (rv);
1344}
1345
1346void
1347moea_clear_modify(mmu_t mmu, vm_page_t m)
1348{
1349
1350	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1351	    ("moea_clear_modify: page %p is not managed", m));
1352	VM_OBJECT_ASSERT_WLOCKED(m->object);
1353	KASSERT(!vm_page_xbusied(m),
1354	    ("moea_clear_modify: page %p is exclusive busy", m));
1355
1356	/*
1357	 * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_CHG
1358	 * set.  If the object containing the page is locked and the page is
1359	 * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
1360	 */
1361	if ((m->aflags & PGA_WRITEABLE) == 0)
1362		return;
1363	rw_wlock(&pvh_global_lock);
1364	moea_clear_bit(m, PTE_CHG);
1365	rw_wunlock(&pvh_global_lock);
1366}
1367
/*
 * Clear the write and modified bits in each of the given page's mappings.
 *
 * Every mapping of m that is not already PTE_BR is downgraded to PTE_BR.
 * If any mapping (or the page's cached attributes) had PTE_CHG set, the
 * page is marked dirty.  PGA_WRITEABLE is cleared on the page at the end.
 */
void
moea_remove_write(mmu_t mmu, vm_page_t m)
{
	struct	pvo_entry *pvo;
	struct	pte *pt;
	pmap_t	pmap;
	u_int	lo;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("moea_remove_write: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * set by another thread while the object is locked.  Thus,
	 * if PGA_WRITEABLE is clear, no page table entries need updating.
	 */
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
		return;
	rw_wlock(&pvh_global_lock);
	/* Start from the attribute bits already cached for the page. */
	lo = moea_attr_fetch(m);
	powerpc_sync();
	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
		pmap = pvo->pvo_pmap;
		PMAP_LOCK(pmap);
		if ((pvo->pvo_pte.pte.pte_lo & PTE_PP) != PTE_BR) {
			/*
			 * Look up the hardware PTE before editing the cached
			 * copy; a non-NULL result is unlocked below, after
			 * the PTE has been rewritten.
			 */
			pt = moea_pvo_to_pte(pvo, -1);
			pvo->pvo_pte.pte.pte_lo &= ~PTE_PP;
			pvo->pvo_pte.pte.pte_lo |= PTE_BR;
			if (pt != NULL) {
				/*
				 * Sync from the hardware PTE, accumulate the
				 * resulting bits into lo, then clear PTE_CHG
				 * in the copy that is written back.
				 */
				moea_pte_synch(pt, &pvo->pvo_pte.pte);
				lo |= pvo->pvo_pte.pte.pte_lo;
				pvo->pvo_pte.pte.pte_lo &= ~PTE_CHG;
				moea_pte_change(pt, &pvo->pvo_pte.pte,
				    pvo->pvo_vaddr);
				mtx_unlock(&moea_table_mutex);
			}
		}
		PMAP_UNLOCK(pmap);
	}
	/* Transfer any observed modification to the vm_page itself. */
	if ((lo & PTE_CHG) != 0) {
		moea_attr_clear(m, PTE_CHG);
		vm_page_dirty(m);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(&pvh_global_lock);
}
1418
1419/*
1420 *	moea_ts_referenced:
1421 *
1422 *	Return a count of reference bits for a page, clearing those bits.
1423 *	It is not necessary for every reference bit to be cleared, but it
1424 *	is necessary that 0 only be returned when there are truly no
1425 *	reference bits set.
1426 *
1427 *	XXX: The exact number of bits to check and clear is a matter that
1428 *	should be tested and standardized at some point in the future for
1429 *	optimal aging of shared pages.
1430 */
1431int
1432moea_ts_referenced(mmu_t mmu, vm_page_t m)
1433{
1434	int count;
1435
1436	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1437	    ("moea_ts_referenced: page %p is not managed", m));
1438	rw_wlock(&pvh_global_lock);
1439	count = moea_clear_bit(m, PTE_REF);
1440	rw_wunlock(&pvh_global_lock);
1441	return (count);
1442}
1443
1444/*
1445 * Modify the WIMG settings of all mappings for a page.
1446 */
1447void
1448moea_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma)
1449{
1450	struct	pvo_entry *pvo;
1451	struct	pvo_head *pvo_head;
1452	struct	pte *pt;
1453	pmap_t	pmap;
1454	u_int	lo;
1455
1456	if ((m->oflags & VPO_UNMANAGED) != 0) {
1457		m->md.mdpg_cache_attrs = ma;
1458		return;
1459	}
1460
1461	rw_wlock(&pvh_global_lock);
1462	pvo_head = vm_page_to_pvoh(m);
1463	lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), ma);
1464
1465	LIST_FOREACH(pvo, pvo_head, pvo_vlink) {
1466		pmap = pvo->pvo_pmap;
1467		PMAP_LOCK(pmap);
1468		pt = moea_pvo_to_pte(pvo, -1);
1469		pvo->pvo_pte.pte.pte_lo &= ~PTE_WIMG;
1470		pvo->pvo_pte.pte.pte_lo |= lo;
1471		if (pt != NULL) {
1472			moea_pte_change(pt, &pvo->pvo_pte.pte,
1473			    pvo->pvo_vaddr);
1474			if (pvo->pvo_pmap == kernel_pmap)
1475				isync();
1476		}
1477		mtx_unlock(&moea_table_mutex);
1478		PMAP_UNLOCK(pmap);
1479	}
1480	m->md.mdpg_cache_attrs = ma;
1481	rw_wunlock(&pvh_global_lock);
1482}
1483
1484/*
1485 * Map a wired page into kernel virtual address space.
1486 */
1487void
1488moea_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa)
1489{
1490
1491	moea_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT);
1492}
1493
1494void
1495moea_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
1496{
1497	u_int		pte_lo;
1498	int		error;
1499
1500#if 0
1501	if (va < VM_MIN_KERNEL_ADDRESS)
1502		panic("moea_kenter: attempt to enter non-kernel address %#x",
1503		    va);
1504#endif
1505
1506	pte_lo = moea_calc_wimg(pa, ma);
1507
1508	PMAP_LOCK(kernel_pmap);
1509	error = moea_pvo_enter(kernel_pmap, moea_upvo_zone,
1510	    &moea_pvo_kunmanaged, va, pa, pte_lo, PVO_WIRED);
1511
1512	if (error != 0 && error != ENOENT)
1513		panic("moea_kenter: failed to enter va %#x pa %#x: %d", va,
1514		    pa, error);
1515
1516	PMAP_UNLOCK(kernel_pmap);
1517}
1518
1519/*
1520 * Extract the physical page address associated with the given kernel virtual
1521 * address.
1522 */
1523vm_paddr_t
1524moea_kextract(mmu_t mmu, vm_offset_t va)
1525{
1526	struct		pvo_entry *pvo;
1527	vm_paddr_t pa;
1528
1529	/*
1530	 * Allow direct mappings on 32-bit OEA
1531	 */
1532	if (va < VM_MIN_KERNEL_ADDRESS) {
1533		return (va);
1534	}
1535
1536	PMAP_LOCK(kernel_pmap);
1537	pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL);
1538	KASSERT(pvo != NULL, ("moea_kextract: no addr found"));
1539	pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF);
1540	PMAP_UNLOCK(kernel_pmap);
1541	return (pa);
1542}
1543
1544/*
1545 * Remove a wired page from kernel virtual address space.
1546 */
1547void
1548moea_kremove(mmu_t mmu, vm_offset_t va)
1549{
1550
1551	moea_remove(mmu, kernel_pmap, va, va + PAGE_SIZE);
1552}
1553
1554/*
1555 * Map a range of physical addresses into kernel virtual address space.
1556 *
1557 * The value passed in *virt is a suggested virtual address for the mapping.
1558 * Architectures which can support a direct-mapped physical to virtual region
1559 * can return the appropriate address within that region, leaving '*virt'
1560 * unchanged.  We cannot and therefore do not; *virt is updated with the
1561 * first usable address after the mapped region.
1562 */
1563vm_offset_t
1564moea_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start,
1565    vm_paddr_t pa_end, int prot)
1566{
1567	vm_offset_t	sva, va;
1568
1569	sva = *virt;
1570	va = sva;
1571	for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
1572		moea_kenter(mmu, va, pa_start);
1573	*virt = va;
1574	return (sva);
1575}
1576
1577/*
1578 * Returns true if the pmap's pv is one of the first
1579 * 16 pvs linked to from this page.  This count may
1580 * be changed upwards or downwards in the future; it
1581 * is only necessary that true be returned for a small
1582 * subset of pmaps for proper page aging.
1583 */
1584boolean_t
1585moea_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m)
1586{
1587        int loops;
1588	struct pvo_entry *pvo;
1589	boolean_t rv;
1590
1591	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1592	    ("moea_page_exists_quick: page %p is not managed", m));
1593	loops = 0;
1594	rv = FALSE;
1595	rw_wlock(&pvh_global_lock);
1596	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1597		if (pvo->pvo_pmap == pmap) {
1598			rv = TRUE;
1599			break;
1600		}
1601		if (++loops >= 16)
1602			break;
1603	}
1604	rw_wunlock(&pvh_global_lock);
1605	return (rv);
1606}
1607
1608void
1609moea_page_init(mmu_t mmu __unused, vm_page_t m)
1610{
1611
1612	m->md.mdpg_attrs = 0;
1613	m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
1614	LIST_INIT(&m->md.mdpg_pvoh);
1615}
1616
1617/*
1618 * Return the number of managed mappings to the given physical page
1619 * that are wired.
1620 */
1621int
1622moea_page_wired_mappings(mmu_t mmu, vm_page_t m)
1623{
1624	struct pvo_entry *pvo;
1625	int count;
1626
1627	count = 0;
1628	if ((m->oflags & VPO_UNMANAGED) != 0)
1629		return (count);
1630	rw_wlock(&pvh_global_lock);
1631	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
1632		if ((pvo->pvo_vaddr & PVO_WIRED) != 0)
1633			count++;
1634	rw_wunlock(&pvh_global_lock);
1635	return (count);
1636}
1637
1638static u_int	moea_vsidcontext;
1639
1640void
1641moea_pinit(mmu_t mmu, pmap_t pmap)
1642{
1643	int	i, mask;
1644	u_int	entropy;
1645
1646	KASSERT((int)pmap < VM_MIN_KERNEL_ADDRESS, ("moea_pinit: virt pmap"));
1647	RB_INIT(&pmap->pmap_pvo);
1648
1649	entropy = 0;
1650	__asm __volatile("mftb %0" : "=r"(entropy));
1651
1652	if ((pmap->pmap_phys = (pmap_t)moea_kextract(mmu, (vm_offset_t)pmap))
1653	    == NULL) {
1654		pmap->pmap_phys = pmap;
1655	}
1656
1657
1658	mtx_lock(&moea_vsid_mutex);
1659	/*
1660	 * Allocate some segment registers for this pmap.
1661	 */
1662	for (i = 0; i < NPMAPS; i += VSID_NBPW) {
1663		u_int	hash, n;
1664
1665		/*
1666		 * Create a new value by mutiplying by a prime and adding in
1667		 * entropy from the timebase register.  This is to make the
1668		 * VSID more random so that the PT hash function collides
1669		 * less often.  (Note that the prime casues gcc to do shifts
1670		 * instead of a multiply.)
1671		 */
1672		moea_vsidcontext = (moea_vsidcontext * 0x1105) + entropy;
1673		hash = moea_vsidcontext & (NPMAPS - 1);
1674		if (hash == 0)		/* 0 is special, avoid it */
1675			continue;
1676		n = hash >> 5;
1677		mask = 1 << (hash & (VSID_NBPW - 1));
1678		hash = (moea_vsidcontext & 0xfffff);
1679		if (moea_vsid_bitmap[n] & mask) {	/* collision? */
1680			/* anything free in this bucket? */
1681			if (moea_vsid_bitmap[n] == 0xffffffff) {
1682				entropy = (moea_vsidcontext >> 20);
1683				continue;
1684			}
1685			i = ffs(~moea_vsid_bitmap[n]) - 1;
1686			mask = 1 << i;
1687			hash &= rounddown2(0xfffff, VSID_NBPW);
1688			hash |= i;
1689		}
1690		KASSERT(!(moea_vsid_bitmap[n] & mask),
1691		    ("Allocating in-use VSID group %#x\n", hash));
1692		moea_vsid_bitmap[n] |= mask;
1693		for (i = 0; i < 16; i++)
1694			pmap->pm_sr[i] = VSID_MAKE(i, hash);
1695		mtx_unlock(&moea_vsid_mutex);
1696		return;
1697	}
1698
1699	mtx_unlock(&moea_vsid_mutex);
1700	panic("moea_pinit: out of segments");
1701}
1702
1703/*
1704 * Initialize the pmap associated with process 0.
1705 */
1706void
1707moea_pinit0(mmu_t mmu, pmap_t pm)
1708{
1709
1710	PMAP_LOCK_INIT(pm);
1711	moea_pinit(mmu, pm);
1712	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
1713}
1714
1715/*
1716 * Set the physical protection on the specified range of this map as requested.
1717 */
1718void
1719moea_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva,
1720    vm_prot_t prot)
1721{
1722	struct	pvo_entry *pvo, *tpvo, key;
1723	struct	pte *pt;
1724
1725	KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
1726	    ("moea_protect: non current pmap"));
1727
1728	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1729		moea_remove(mmu, pm, sva, eva);
1730		return;
1731	}
1732
1733	rw_wlock(&pvh_global_lock);
1734	PMAP_LOCK(pm);
1735	key.pvo_vaddr = sva;
1736	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1737	    pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
1738		tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
1739
1740		/*
1741		 * Grab the PTE pointer before we diddle with the cached PTE
1742		 * copy.
1743		 */
1744		pt = moea_pvo_to_pte(pvo, -1);
1745		/*
1746		 * Change the protection of the page.
1747		 */
1748		pvo->pvo_pte.pte.pte_lo &= ~PTE_PP;
1749		pvo->pvo_pte.pte.pte_lo |= PTE_BR;
1750
1751		/*
1752		 * If the PVO is in the page table, update that pte as well.
1753		 */
1754		if (pt != NULL) {
1755			moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr);
1756			mtx_unlock(&moea_table_mutex);
1757		}
1758	}
1759	rw_wunlock(&pvh_global_lock);
1760	PMAP_UNLOCK(pm);
1761}
1762
1763/*
1764 * Map a list of wired pages into kernel virtual address space.  This is
1765 * intended for temporary mappings which do not need page modification or
1766 * references recorded.  Existing mappings in the region are overwritten.
1767 */
1768void
1769moea_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *m, int count)
1770{
1771	vm_offset_t va;
1772
1773	va = sva;
1774	while (count-- > 0) {
1775		moea_kenter(mmu, va, VM_PAGE_TO_PHYS(*m));
1776		va += PAGE_SIZE;
1777		m++;
1778	}
1779}
1780
1781/*
1782 * Remove page mappings from kernel virtual address space.  Intended for
1783 * temporary mappings entered by moea_qenter.
1784 */
1785void
1786moea_qremove(mmu_t mmu, vm_offset_t sva, int count)
1787{
1788	vm_offset_t va;
1789
1790	va = sva;
1791	while (count-- > 0) {
1792		moea_kremove(mmu, va);
1793		va += PAGE_SIZE;
1794	}
1795}
1796
1797void
1798moea_release(mmu_t mmu, pmap_t pmap)
1799{
1800        int idx, mask;
1801
1802	/*
1803	 * Free segment register's VSID
1804	 */
1805        if (pmap->pm_sr[0] == 0)
1806                panic("moea_release");
1807
1808	mtx_lock(&moea_vsid_mutex);
1809        idx = VSID_TO_HASH(pmap->pm_sr[0]) & (NPMAPS-1);
1810        mask = 1 << (idx % VSID_NBPW);
1811        idx /= VSID_NBPW;
1812        moea_vsid_bitmap[idx] &= ~mask;
1813	mtx_unlock(&moea_vsid_mutex);
1814}
1815
1816/*
1817 * Remove the given range of addresses from the specified map.
1818 */
1819void
1820moea_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1821{
1822	struct	pvo_entry *pvo, *tpvo, key;
1823
1824	rw_wlock(&pvh_global_lock);
1825	PMAP_LOCK(pm);
1826	key.pvo_vaddr = sva;
1827	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1828	    pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
1829		tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
1830		moea_pvo_remove(pvo, -1);
1831	}
1832	PMAP_UNLOCK(pm);
1833	rw_wunlock(&pvh_global_lock);
1834}
1835
1836/*
1837 * Remove physical page from all pmaps in which it resides. moea_pvo_remove()
1838 * will reflect changes in pte's back to the vm_page.
1839 */
1840void
1841moea_remove_all(mmu_t mmu, vm_page_t m)
1842{
1843	struct  pvo_head *pvo_head;
1844	struct	pvo_entry *pvo, *next_pvo;
1845	pmap_t	pmap;
1846
1847	rw_wlock(&pvh_global_lock);
1848	pvo_head = vm_page_to_pvoh(m);
1849	for (pvo = LIST_FIRST(pvo_head); pvo != NULL; pvo = next_pvo) {
1850		next_pvo = LIST_NEXT(pvo, pvo_vlink);
1851
1852		pmap = pvo->pvo_pmap;
1853		PMAP_LOCK(pmap);
1854		moea_pvo_remove(pvo, -1);
1855		PMAP_UNLOCK(pmap);
1856	}
1857	if ((m->aflags & PGA_WRITEABLE) && moea_query_bit(m, PTE_CHG)) {
1858		moea_attr_clear(m, PTE_CHG);
1859		vm_page_dirty(m);
1860	}
1861	vm_page_aflag_clear(m, PGA_WRITEABLE);
1862	rw_wunlock(&pvh_global_lock);
1863}
1864
/*
 * Allocate physical memory directly from the phys_avail map.
 * Can only be called from moea_bootstrap before avail start and end are
 * calculated.
 *
 * size is rounded up to a whole number of pages.  When align is non-zero
 * the start of the allocation is rounded up with roundup2() to that
 * alignment.  Returns the physical start address; panics if no phys_avail
 * region can satisfy the request.
 */
static vm_offset_t
moea_bootstrap_alloc(vm_size_t size, u_int align)
{
	vm_offset_t	s, e;
	int		i, j;

	size = round_page(size);
	/* phys_avail holds (start, end) pairs; an end of 0 stops the scan. */
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		if (align != 0)
			s = roundup2(phys_avail[i], align);
		else
			s = phys_avail[i];
		e = s + size;

		/* Skip regions the aligned candidate does not fit into. */
		if (s < phys_avail[i] || e > phys_avail[i + 1])
			continue;

		if (s == phys_avail[i]) {
			/* Carve from the front of the region. */
			phys_avail[i] += size;
		} else if (e == phys_avail[i + 1]) {
			/* Carve from the tail of the region. */
			phys_avail[i + 1] -= size;
		} else {
			/*
			 * Carve from the middle: shift the later pairs up by
			 * one slot, then split region i into [start, s) and
			 * [e, end).
			 */
			for (j = phys_avail_count * 2; j > i; j -= 2) {
				phys_avail[j] = phys_avail[j - 2];
				phys_avail[j + 1] = phys_avail[j - 1];
			}

			phys_avail[i + 3] = phys_avail[i + 1];
			phys_avail[i + 1] = s;
			phys_avail[i + 2] = e;
			phys_avail_count++;
		}

		return (s);
	}
	panic("moea_bootstrap_alloc: could not allocate memory");
}
1907
1908static void
1909moea_syncicache(vm_paddr_t pa, vm_size_t len)
1910{
1911	__syncicache((void *)pa, len);
1912}
1913
1914static int
1915moea_pvo_enter(pmap_t pm, uma_zone_t zone, struct pvo_head *pvo_head,
1916    vm_offset_t va, vm_paddr_t pa, u_int pte_lo, int flags)
1917{
1918	struct	pvo_entry *pvo;
1919	u_int	sr;
1920	int	first;
1921	u_int	ptegidx;
1922	int	i;
1923	int     bootstrap;
1924
1925	moea_pvo_enter_calls++;
1926	first = 0;
1927	bootstrap = 0;
1928
1929	/*
1930	 * Compute the PTE Group index.
1931	 */
1932	va &= ~ADDR_POFF;
1933	sr = va_to_sr(pm->pm_sr, va);
1934	ptegidx = va_to_pteg(sr, va);
1935
1936	/*
1937	 * Remove any existing mapping for this page.  Reuse the pvo entry if
1938	 * there is a mapping.
1939	 */
1940	mtx_lock(&moea_table_mutex);
1941	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
1942		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
1943			if ((pvo->pvo_pte.pte.pte_lo & PTE_RPGN) == pa &&
1944			    (pvo->pvo_pte.pte.pte_lo & PTE_PP) ==
1945			    (pte_lo & PTE_PP)) {
1946				/*
1947				 * The PTE is not changing.  Instead, this may
1948				 * be a request to change the mapping's wired
1949				 * attribute.
1950				 */
1951				mtx_unlock(&moea_table_mutex);
1952				if ((flags & PVO_WIRED) != 0 &&
1953				    (pvo->pvo_vaddr & PVO_WIRED) == 0) {
1954					pvo->pvo_vaddr |= PVO_WIRED;
1955					pm->pm_stats.wired_count++;
1956				} else if ((flags & PVO_WIRED) == 0 &&
1957				    (pvo->pvo_vaddr & PVO_WIRED) != 0) {
1958					pvo->pvo_vaddr &= ~PVO_WIRED;
1959					pm->pm_stats.wired_count--;
1960				}
1961				return (0);
1962			}
1963			moea_pvo_remove(pvo, -1);
1964			break;
1965		}
1966	}
1967
1968	/*
1969	 * If we aren't overwriting a mapping, try to allocate.
1970	 */
1971	if (moea_initialized) {
1972		pvo = uma_zalloc(zone, M_NOWAIT);
1973	} else {
1974		if (moea_bpvo_pool_index >= BPVO_POOL_SIZE) {
1975			panic("moea_enter: bpvo pool exhausted, %d, %d, %d",
1976			      moea_bpvo_pool_index, BPVO_POOL_SIZE,
1977			      BPVO_POOL_SIZE * sizeof(struct pvo_entry));
1978		}
1979		pvo = &moea_bpvo_pool[moea_bpvo_pool_index];
1980		moea_bpvo_pool_index++;
1981		bootstrap = 1;
1982	}
1983
1984	if (pvo == NULL) {
1985		mtx_unlock(&moea_table_mutex);
1986		return (ENOMEM);
1987	}
1988
1989	moea_pvo_entries++;
1990	pvo->pvo_vaddr = va;
1991	pvo->pvo_pmap = pm;
1992	LIST_INSERT_HEAD(&moea_pvo_table[ptegidx], pvo, pvo_olink);
1993	pvo->pvo_vaddr &= ~ADDR_POFF;
1994	if (flags & PVO_WIRED)
1995		pvo->pvo_vaddr |= PVO_WIRED;
1996	if (pvo_head != &moea_pvo_kunmanaged)
1997		pvo->pvo_vaddr |= PVO_MANAGED;
1998	if (bootstrap)
1999		pvo->pvo_vaddr |= PVO_BOOTSTRAP;
2000
2001	moea_pte_create(&pvo->pvo_pte.pte, sr, va, pa | pte_lo);
2002
2003	/*
2004	 * Add to pmap list
2005	 */
2006	RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo);
2007
2008	/*
2009	 * Remember if the list was empty and therefore will be the first
2010	 * item.
2011	 */
2012	if (LIST_FIRST(pvo_head) == NULL)
2013		first = 1;
2014	LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
2015
2016	if (pvo->pvo_vaddr & PVO_WIRED)
2017		pm->pm_stats.wired_count++;
2018	pm->pm_stats.resident_count++;
2019
2020	i = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte);
2021	KASSERT(i < 8, ("Invalid PTE index"));
2022	if (i >= 0) {
2023		PVO_PTEGIDX_SET(pvo, i);
2024	} else {
2025		panic("moea_pvo_enter: overflow");
2026		moea_pte_overflow++;
2027	}
2028	mtx_unlock(&moea_table_mutex);
2029
2030	return (first ? ENOENT : 0);
2031}
2032
/*
 * Tear down a PVO: invalidate its hardware PTE if it has one, update the
 * owning pmap's statistics, save REF/CHG for managed pages, unlink the PVO
 * from all of its lists and free it unless it came from the bootstrap
 * pool.  pteidx is passed through to moea_pvo_to_pte(); -1 asks it to
 * compute the index.
 */
static void
moea_pvo_remove(struct pvo_entry *pvo, int pteidx)
{
	struct	pte *pt;

	/*
	 * If there is an active pte entry, we need to deactivate it (and
	 * save the ref & chg bits).
	 */
	pt = moea_pvo_to_pte(pvo, pteidx);
	if (pt != NULL) {
		moea_pte_unset(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr);
		mtx_unlock(&moea_table_mutex);
		PVO_PTEGIDX_CLR(pvo);
	} else {
		/* No hardware PTE: account for it in the overflow counter. */
		moea_pte_overflow--;
	}

	/*
	 * Update our statistics.
	 */
	pvo->pvo_pmap->pm_stats.resident_count--;
	if (pvo->pvo_vaddr & PVO_WIRED)
		pvo->pvo_pmap->pm_stats.wired_count--;

	/*
	 * Save the REF/CHG bits into their cache if the page is managed.
	 */
	if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED) {
		struct	vm_page *pg;

		pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pte.pte_lo & PTE_RPGN);
		if (pg != NULL) {
			moea_attr_save(pg, pvo->pvo_pte.pte.pte_lo &
			    (PTE_REF | PTE_CHG));
		}
	}

	/*
	 * Remove this PVO from the PV and pmap lists.
	 */
	LIST_REMOVE(pvo, pvo_vlink);
	RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);

	/*
	 * Remove this from the overflow list and return it to the pool
	 * if we aren't going to reuse it.
	 */
	LIST_REMOVE(pvo, pvo_olink);
	/* Bootstrap-pool entries were never allocated from a UMA zone. */
	if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
		uma_zfree(pvo->pvo_vaddr & PVO_MANAGED ? moea_mpvo_zone :
		    moea_upvo_zone, pvo);
	moea_pvo_entries--;
	moea_pvo_remove_calls++;
}
2088
2089static __inline int
2090moea_pvo_pte_index(const struct pvo_entry *pvo, int ptegidx)
2091{
2092	int	pteidx;
2093
2094	/*
2095	 * We can find the actual pte entry without searching by grabbing
2096	 * the PTEG index from 3 unused bits in pte_lo[11:9] and by
2097	 * noticing the HID bit.
2098	 */
2099	pteidx = ptegidx * 8 + PVO_PTEGIDX_GET(pvo);
2100	if (pvo->pvo_pte.pte.pte_hi & PTE_HID)
2101		pteidx ^= moea_pteg_mask * 8;
2102
2103	return (pteidx);
2104}
2105
2106static struct pvo_entry *
2107moea_pvo_find_va(pmap_t pm, vm_offset_t va, int *pteidx_p)
2108{
2109	struct	pvo_entry *pvo;
2110	int	ptegidx;
2111	u_int	sr;
2112
2113	va &= ~ADDR_POFF;
2114	sr = va_to_sr(pm->pm_sr, va);
2115	ptegidx = va_to_pteg(sr, va);
2116
2117	mtx_lock(&moea_table_mutex);
2118	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
2119		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
2120			if (pteidx_p)
2121				*pteidx_p = moea_pvo_pte_index(pvo, ptegidx);
2122			break;
2123		}
2124	}
2125	mtx_unlock(&moea_table_mutex);
2126
2127	return (pvo);
2128}
2129
2130static struct pte *
2131moea_pvo_to_pte(const struct pvo_entry *pvo, int pteidx)
2132{
2133	struct	pte *pt;
2134
2135	/*
2136	 * If we haven't been supplied the ptegidx, calculate it.
2137	 */
2138	if (pteidx == -1) {
2139		int	ptegidx;
2140		u_int	sr;
2141
2142		sr = va_to_sr(pvo->pvo_pmap->pm_sr, pvo->pvo_vaddr);
2143		ptegidx = va_to_pteg(sr, pvo->pvo_vaddr);
2144		pteidx = moea_pvo_pte_index(pvo, ptegidx);
2145	}
2146
2147	pt = &moea_pteg_table[pteidx >> 3].pt[pteidx & 7];
2148	mtx_lock(&moea_table_mutex);
2149
2150	if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) && !PVO_PTEGIDX_ISSET(pvo)) {
2151		panic("moea_pvo_to_pte: pvo %p has valid pte in pvo but no "
2152		    "valid pte index", pvo);
2153	}
2154
2155	if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0 && PVO_PTEGIDX_ISSET(pvo)) {
2156		panic("moea_pvo_to_pte: pvo %p has valid pte index in pvo "
2157		    "pvo but no valid pte", pvo);
2158	}
2159
2160	if ((pt->pte_hi ^ (pvo->pvo_pte.pte.pte_hi & ~PTE_VALID)) == PTE_VALID) {
2161		if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0) {
2162			panic("moea_pvo_to_pte: pvo %p has valid pte in "
2163			    "moea_pteg_table %p but invalid in pvo", pvo, pt);
2164		}
2165
2166		if (((pt->pte_lo ^ pvo->pvo_pte.pte.pte_lo) & ~(PTE_CHG|PTE_REF))
2167		    != 0) {
2168			panic("moea_pvo_to_pte: pvo %p pte does not match "
2169			    "pte %p in moea_pteg_table", pvo, pt);
2170		}
2171
2172		mtx_assert(&moea_table_mutex, MA_OWNED);
2173		return (pt);
2174	}
2175
2176	if (pvo->pvo_pte.pte.pte_hi & PTE_VALID) {
2177		panic("moea_pvo_to_pte: pvo %p has invalid pte %p in "
2178		    "moea_pteg_table but valid in pvo: %8x, %8x", pvo, pt, pvo->pvo_pte.pte.pte_hi, pt->pte_hi);
2179	}
2180
2181	mtx_unlock(&moea_table_mutex);
2182	return (NULL);
2183}
2184
2185/*
2186 * XXX: THIS STUFF SHOULD BE IN pte.c?
2187 */
2188int
2189moea_pte_spill(vm_offset_t addr)
2190{
2191	struct	pvo_entry *source_pvo, *victim_pvo;
2192	struct	pvo_entry *pvo;
2193	int	ptegidx, i, j;
2194	u_int	sr;
2195	struct	pteg *pteg;
2196	struct	pte *pt;
2197
2198	moea_pte_spills++;
2199
2200	sr = mfsrin(addr);
2201	ptegidx = va_to_pteg(sr, addr);
2202
2203	/*
2204	 * Have to substitute some entry.  Use the primary hash for this.
2205	 * Use low bits of timebase as random generator.
2206	 */
2207	pteg = &moea_pteg_table[ptegidx];
2208	mtx_lock(&moea_table_mutex);
2209	__asm __volatile("mftb %0" : "=r"(i));
2210	i &= 7;
2211	pt = &pteg->pt[i];
2212
2213	source_pvo = NULL;
2214	victim_pvo = NULL;
2215	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
2216		/*
2217		 * We need to find a pvo entry for this address.
2218		 */
2219		if (source_pvo == NULL &&
2220		    moea_pte_match(&pvo->pvo_pte.pte, sr, addr,
2221		    pvo->pvo_pte.pte.pte_hi & PTE_HID)) {
2222			/*
2223			 * Now found an entry to be spilled into the pteg.
2224			 * The PTE is now valid, so we know it's active.
2225			 */
2226			j = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte);
2227
2228			if (j >= 0) {
2229				PVO_PTEGIDX_SET(pvo, j);
2230				moea_pte_overflow--;
2231				mtx_unlock(&moea_table_mutex);
2232				return (1);
2233			}
2234
2235			source_pvo = pvo;
2236
2237			if (victim_pvo != NULL)
2238				break;
2239		}
2240
2241		/*
2242		 * We also need the pvo entry of the victim we are replacing
2243		 * so save the R & C bits of the PTE.
2244		 */
2245		if ((pt->pte_hi & PTE_HID) == 0 && victim_pvo == NULL &&
2246		    moea_pte_compare(pt, &pvo->pvo_pte.pte)) {
2247			victim_pvo = pvo;
2248			if (source_pvo != NULL)
2249				break;
2250		}
2251	}
2252
2253	if (source_pvo == NULL) {
2254		mtx_unlock(&moea_table_mutex);
2255		return (0);
2256	}
2257
2258	if (victim_pvo == NULL) {
2259		if ((pt->pte_hi & PTE_HID) == 0)
2260			panic("moea_pte_spill: victim p-pte (%p) has no pvo"
2261			    "entry", pt);
2262
2263		/*
2264		 * If this is a secondary PTE, we need to search it's primary
2265		 * pvo bucket for the matching PVO.
2266		 */
2267		LIST_FOREACH(pvo, &moea_pvo_table[ptegidx ^ moea_pteg_mask],
2268		    pvo_olink) {
2269			/*
2270			 * We also need the pvo entry of the victim we are
2271			 * replacing so save the R & C bits of the PTE.
2272			 */
2273			if (moea_pte_compare(pt, &pvo->pvo_pte.pte)) {
2274				victim_pvo = pvo;
2275				break;
2276			}
2277		}
2278
2279		if (victim_pvo == NULL)
2280			panic("moea_pte_spill: victim s-pte (%p) has no pvo"
2281			    "entry", pt);
2282	}
2283
2284	/*
2285	 * We are invalidating the TLB entry for the EA we are replacing even
2286	 * though it's valid.  If we don't, we lose any ref/chg bit changes
2287	 * contained in the TLB entry.
2288	 */
2289	source_pvo->pvo_pte.pte.pte_hi &= ~PTE_HID;
2290
2291	moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr);
2292	moea_pte_set(pt, &source_pvo->pvo_pte.pte);
2293
2294	PVO_PTEGIDX_CLR(victim_pvo);
2295	PVO_PTEGIDX_SET(source_pvo, i);
2296	moea_pte_replacements++;
2297
2298	mtx_unlock(&moea_table_mutex);
2299	return (1);
2300}
2301
2302static __inline struct pvo_entry *
2303moea_pte_spillable_ident(u_int ptegidx)
2304{
2305	struct	pte *pt;
2306	struct	pvo_entry *pvo_walk, *pvo = NULL;
2307
2308	LIST_FOREACH(pvo_walk, &moea_pvo_table[ptegidx], pvo_olink) {
2309		if (pvo_walk->pvo_vaddr & PVO_WIRED)
2310			continue;
2311
2312		if (!(pvo_walk->pvo_pte.pte.pte_hi & PTE_VALID))
2313			continue;
2314
2315		pt = moea_pvo_to_pte(pvo_walk, -1);
2316
2317		if (pt == NULL)
2318			continue;
2319
2320		pvo = pvo_walk;
2321
2322		mtx_unlock(&moea_table_mutex);
2323		if (!(pt->pte_lo & PTE_REF))
2324			return (pvo_walk);
2325	}
2326
2327	return (pvo);
2328}
2329
2330static int
2331moea_pte_insert(u_int ptegidx, struct pte *pvo_pt)
2332{
2333	struct	pte *pt;
2334	struct	pvo_entry *victim_pvo;
2335	int	i;
2336	int	victim_idx;
2337	u_int	pteg_bkpidx = ptegidx;
2338
2339	mtx_assert(&moea_table_mutex, MA_OWNED);
2340
2341	/*
2342	 * First try primary hash.
2343	 */
2344	for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) {
2345		if ((pt->pte_hi & PTE_VALID) == 0) {
2346			pvo_pt->pte_hi &= ~PTE_HID;
2347			moea_pte_set(pt, pvo_pt);
2348			return (i);
2349		}
2350	}
2351
2352	/*
2353	 * Now try secondary hash.
2354	 */
2355	ptegidx ^= moea_pteg_mask;
2356
2357	for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) {
2358		if ((pt->pte_hi & PTE_VALID) == 0) {
2359			pvo_pt->pte_hi |= PTE_HID;
2360			moea_pte_set(pt, pvo_pt);
2361			return (i);
2362		}
2363	}
2364
2365	/* Try again, but this time try to force a PTE out. */
2366	ptegidx = pteg_bkpidx;
2367
2368	victim_pvo = moea_pte_spillable_ident(ptegidx);
2369	if (victim_pvo == NULL) {
2370		ptegidx ^= moea_pteg_mask;
2371		victim_pvo = moea_pte_spillable_ident(ptegidx);
2372	}
2373
2374	if (victim_pvo == NULL) {
2375		panic("moea_pte_insert: overflow");
2376		return (-1);
2377	}
2378
2379	victim_idx = moea_pvo_pte_index(victim_pvo, ptegidx);
2380
2381	if (pteg_bkpidx == ptegidx)
2382		pvo_pt->pte_hi &= ~PTE_HID;
2383	else
2384		pvo_pt->pte_hi |= PTE_HID;
2385
2386	/*
2387	 * Synchronize the sacrifice PTE with its PVO, then mark both
2388	 * invalid. The PVO will be reused when/if the VM system comes
2389	 * here after a fault.
2390	 */
2391	pt = &moea_pteg_table[victim_idx >> 3].pt[victim_idx & 7];
2392
2393	if (pt->pte_hi != victim_pvo->pvo_pte.pte.pte_hi)
2394	    panic("Victim PVO doesn't match PTE! PVO: %8x, PTE: %8x", victim_pvo->pvo_pte.pte.pte_hi, pt->pte_hi);
2395
2396	/*
2397	 * Set the new PTE.
2398	 */
2399	moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr);
2400	PVO_PTEGIDX_CLR(victim_pvo);
2401	moea_pte_overflow++;
2402	moea_pte_set(pt, pvo_pt);
2403
2404	return (victim_idx & 7);
2405}
2406
2407static boolean_t
2408moea_query_bit(vm_page_t m, int ptebit)
2409{
2410	struct	pvo_entry *pvo;
2411	struct	pte *pt;
2412
2413	rw_assert(&pvh_global_lock, RA_WLOCKED);
2414	if (moea_attr_fetch(m) & ptebit)
2415		return (TRUE);
2416
2417	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2418
2419		/*
2420		 * See if we saved the bit off.  If so, cache it and return
2421		 * success.
2422		 */
2423		if (pvo->pvo_pte.pte.pte_lo & ptebit) {
2424			moea_attr_save(m, ptebit);
2425			return (TRUE);
2426		}
2427	}
2428
2429	/*
2430	 * No luck, now go through the hard part of looking at the PTEs
2431	 * themselves.  Sync so that any pending REF/CHG bits are flushed to
2432	 * the PTEs.
2433	 */
2434	powerpc_sync();
2435	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2436
2437		/*
2438		 * See if this pvo has a valid PTE.  if so, fetch the
2439		 * REF/CHG bits from the valid PTE.  If the appropriate
2440		 * ptebit is set, cache it and return success.
2441		 */
2442		pt = moea_pvo_to_pte(pvo, -1);
2443		if (pt != NULL) {
2444			moea_pte_synch(pt, &pvo->pvo_pte.pte);
2445			mtx_unlock(&moea_table_mutex);
2446			if (pvo->pvo_pte.pte.pte_lo & ptebit) {
2447				moea_attr_save(m, ptebit);
2448				return (TRUE);
2449			}
2450		}
2451	}
2452
2453	return (FALSE);
2454}
2455
2456static u_int
2457moea_clear_bit(vm_page_t m, int ptebit)
2458{
2459	u_int	count;
2460	struct	pvo_entry *pvo;
2461	struct	pte *pt;
2462
2463	rw_assert(&pvh_global_lock, RA_WLOCKED);
2464
2465	/*
2466	 * Clear the cached value.
2467	 */
2468	moea_attr_clear(m, ptebit);
2469
2470	/*
2471	 * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
2472	 * we can reset the right ones).  note that since the pvo entries and
2473	 * list heads are accessed via BAT0 and are never placed in the page
2474	 * table, we don't have to worry about further accesses setting the
2475	 * REF/CHG bits.
2476	 */
2477	powerpc_sync();
2478
2479	/*
2480	 * For each pvo entry, clear the pvo's ptebit.  If this pvo has a
2481	 * valid pte clear the ptebit from the valid pte.
2482	 */
2483	count = 0;
2484	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2485		pt = moea_pvo_to_pte(pvo, -1);
2486		if (pt != NULL) {
2487			moea_pte_synch(pt, &pvo->pvo_pte.pte);
2488			if (pvo->pvo_pte.pte.pte_lo & ptebit) {
2489				count++;
2490				moea_pte_clear(pt, PVO_VADDR(pvo), ptebit);
2491			}
2492			mtx_unlock(&moea_table_mutex);
2493		}
2494		pvo->pvo_pte.pte.pte_lo &= ~ptebit;
2495	}
2496
2497	return (count);
2498}
2499
2500/*
2501 * Return true if the physical range is encompassed by the battable[idx]
2502 */
2503static int
2504moea_bat_mapped(int idx, vm_paddr_t pa, vm_size_t size)
2505{
2506	u_int prot;
2507	u_int32_t start;
2508	u_int32_t end;
2509	u_int32_t bat_ble;
2510
2511	/*
2512	 * Return immediately if not a valid mapping
2513	 */
2514	if (!(battable[idx].batu & BAT_Vs))
2515		return (EINVAL);
2516
2517	/*
2518	 * The BAT entry must be cache-inhibited, guarded, and r/w
2519	 * so it can function as an i/o page
2520	 */
2521	prot = battable[idx].batl & (BAT_I|BAT_G|BAT_PP_RW);
2522	if (prot != (BAT_I|BAT_G|BAT_PP_RW))
2523		return (EPERM);
2524
2525	/*
2526	 * The address should be within the BAT range. Assume that the
2527	 * start address in the BAT has the correct alignment (thus
2528	 * not requiring masking)
2529	 */
2530	start = battable[idx].batl & BAT_PBS;
2531	bat_ble = (battable[idx].batu & ~(BAT_EBS)) | 0x03;
2532	end = start | (bat_ble << 15) | 0x7fff;
2533
2534	if ((pa < start) || ((pa + size) > end))
2535		return (ERANGE);
2536
2537	return (0);
2538}
2539
2540boolean_t
2541moea_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size)
2542{
2543	int i;
2544
2545	/*
2546	 * This currently does not work for entries that
2547	 * overlap 256M BAT segments.
2548	 */
2549
2550	for(i = 0; i < 16; i++)
2551		if (moea_bat_mapped(i, pa, size) == 0)
2552			return (0);
2553
2554	return (EFAULT);
2555}
2556
2557/*
2558 * Map a set of physical memory pages into the kernel virtual
2559 * address space. Return a pointer to where it is mapped. This
2560 * routine is intended to be used for mapping device memory,
2561 * NOT real memory.
2562 */
2563void *
2564moea_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size)
2565{
2566
2567	return (moea_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT));
2568}
2569
2570void *
2571moea_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma)
2572{
2573	vm_offset_t va, tmpva, ppa, offset;
2574	int i;
2575
2576	ppa = trunc_page(pa);
2577	offset = pa & PAGE_MASK;
2578	size = roundup(offset + size, PAGE_SIZE);
2579
2580	/*
2581	 * If the physical address lies within a valid BAT table entry,
2582	 * return the 1:1 mapping. This currently doesn't work
2583	 * for regions that overlap 256M BAT segments.
2584	 */
2585	for (i = 0; i < 16; i++) {
2586		if (moea_bat_mapped(i, pa, size) == 0)
2587			return ((void *) pa);
2588	}
2589
2590	va = kva_alloc(size);
2591	if (!va)
2592		panic("moea_mapdev: Couldn't alloc kernel virtual memory");
2593
2594	for (tmpva = va; size > 0;) {
2595		moea_kenter_attr(mmu, tmpva, ppa, ma);
2596		tlbie(tmpva);
2597		size -= PAGE_SIZE;
2598		tmpva += PAGE_SIZE;
2599		ppa += PAGE_SIZE;
2600	}
2601
2602	return ((void *)(va + offset));
2603}
2604
2605void
2606moea_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size)
2607{
2608	vm_offset_t base, offset;
2609
2610	/*
2611	 * If this is outside kernel virtual space, then it's a
2612	 * battable entry and doesn't require unmapping
2613	 */
2614	if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= virtual_end)) {
2615		base = trunc_page(va);
2616		offset = va & PAGE_MASK;
2617		size = roundup(offset + size, PAGE_SIZE);
2618		kva_free(base, size);
2619	}
2620}
2621
2622static void
2623moea_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz)
2624{
2625	struct pvo_entry *pvo;
2626	vm_offset_t lim;
2627	vm_paddr_t pa;
2628	vm_size_t len;
2629
2630	PMAP_LOCK(pm);
2631	while (sz > 0) {
2632		lim = round_page(va);
2633		len = MIN(lim - va, sz);
2634		pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
2635		if (pvo != NULL) {
2636			pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) |
2637			    (va & ADDR_POFF);
2638			moea_syncicache(pa, len);
2639		}
2640		va += len;
2641		sz -= len;
2642	}
2643	PMAP_UNLOCK(pm);
2644}
2645
2646void
2647moea_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va)
2648{
2649
2650	*va = (void *)pa;
2651}
2652
2653extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1];
2654
2655void
2656moea_scan_init(mmu_t mmu)
2657{
2658	struct pvo_entry *pvo;
2659	vm_offset_t va;
2660	int i;
2661
2662	if (!do_minidump) {
2663		/* Initialize phys. segments for dumpsys(). */
2664		memset(&dump_map, 0, sizeof(dump_map));
2665		mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
2666		for (i = 0; i < pregions_sz; i++) {
2667			dump_map[i].pa_start = pregions[i].mr_start;
2668			dump_map[i].pa_size = pregions[i].mr_size;
2669		}
2670		return;
2671	}
2672
2673	/* Virtual segments for minidumps: */
2674	memset(&dump_map, 0, sizeof(dump_map));
2675
2676	/* 1st: kernel .data and .bss. */
2677	dump_map[0].pa_start = trunc_page((uintptr_t)_etext);
2678	dump_map[0].pa_size =
2679	    round_page((uintptr_t)_end) - dump_map[0].pa_start;
2680
2681	/* 2nd: msgbuf and tables (see pmap_bootstrap()). */
2682	dump_map[1].pa_start = (vm_paddr_t)msgbufp->msg_ptr;
2683	dump_map[1].pa_size = round_page(msgbufp->msg_size);
2684
2685	/* 3rd: kernel VM. */
2686	va = dump_map[1].pa_start + dump_map[1].pa_size;
2687	/* Find start of next chunk (from va). */
2688	while (va < virtual_end) {
2689		/* Don't dump the buffer cache. */
2690		if (va >= kmi.buffer_sva && va < kmi.buffer_eva) {
2691			va = kmi.buffer_eva;
2692			continue;
2693		}
2694		pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL);
2695		if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID))
2696			break;
2697		va += PAGE_SIZE;
2698	}
2699	if (va < virtual_end) {
2700		dump_map[2].pa_start = va;
2701		va += PAGE_SIZE;
2702		/* Find last page in chunk. */
2703		while (va < virtual_end) {
2704			/* Don't run into the buffer cache. */
2705			if (va == kmi.buffer_sva)
2706				break;
2707			pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF,
2708			    NULL);
2709			if (pvo == NULL ||
2710			    !(pvo->pvo_pte.pte.pte_hi & PTE_VALID))
2711				break;
2712			va += PAGE_SIZE;
2713		}
2714		dump_map[2].pa_size = va - dump_map[2].pa_start;
2715	}
2716}
2717