1/*-
2 * Copyright (c) 2001 Jake Burkholder.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <machine/asm.h>
28__FBSDID("$FreeBSD$");
29
30#include <sys/errno.h>
31
32#include <machine/asi.h>
33#include <machine/asmacros.h>
34#include <machine/fsr.h>
35#include <machine/intr_machdep.h>
36#include <machine/pcb.h>
37#include <machine/pstate.h>
38#include <machine/wstate.h>
39
40#include "assym.s"
41
42	.register %g2, #ignore
43	.register %g3, #ignore
44	.register %g6, #ignore
45
46/*
47 * Common code for copy routines.
48 *
49 * We use large macros to generate functions for each of the copy routines.
50 * This allows the load and store instructions to be generated for the right
51 * operation, asi or not.  It is possible to write an asi independent function
52 * but this would require 2 expensive wrs in the main loop to switch %asi.
53 * It would also screw up profiling (if we ever get it), but may save some I$.
54 * We assume that either one of dasi and sasi is empty, or that they are both
55 * the same (empty or non-empty).  It is up to the caller to set %asi.
56 */
57
58/*
59 * ASI independent implementation of copystr(9).
60 * Used to implement copyinstr() and copystr().
61 *
62 * Return value is in %g1.
63 */
#define	_COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
	/* Zero-length buffer: copy nothing, store nothing. */ \
	brz	len, 4f ; \
	 mov	src, %g2 ; \
1:	deccc	1, len ; \
	/* Buffer exhausted before the NUL: fail with ENAMETOOLONG. */ \
	bl,a,pn	%xcc, 2f ; \
	 nop ; \
	LD(ub, sa) [src] sasi, %g1 ; \
	ST(b, da) %g1, [dst] dasi ; \
	/* NUL copied: %g1 == 0 is the success return value. */ \
	brz,pn	%g1, 3f ; \
	 inc	src ; \
	ba	%xcc, 1b ; \
	 inc	dst ; \
2:	mov	ENAMETOOLONG, %g1 ; \
3:	sub	src, %g2, %g2 ; \
	/* If done is non-NULL, store the number of bytes consumed. */ \
	brnz,a	done, 4f ; \
	 stx	%g2, [done] ; \
4:
81
82/*
83 * ASI independent implementation of memset(3).
84 * Used to implement bzero(), memset() and aszero().
85 *
86 * If the pattern is non-zero, duplicate it to fill 64 bits.
87 * Store bytes until dst is 8-byte aligned, then store 8 bytes.
88 * It has yet to be determined how much unrolling is beneficial.
89 * Could also read and compare before writing to minimize snoop traffic.
90 *
91 * XXX bzero() should be implemented as
92 * #define bzero(dst, len) (void)memset((dst), 0, (len))
93 * if at all.
94 */
#define	_MEMSET(dst, pat, len, da, dasi) \
	/* len <= 0: nothing to do. */ \
	brlez,pn len, 5f ; \
	 and	pat, 0xff, pat ; \
	/* A zero pattern needs no replication. */ \
	brz,pt	pat, 1f ; \
	 sllx	pat, 8, %g1 ; \
	/* Replicate the low byte of pat into all 8 bytes. */ \
	or	pat, %g1, pat ; \
	sllx	pat, 16, %g1 ; \
	or	pat, %g1, pat ; \
	sllx	pat, 32, %g1 ; \
	or	pat, %g1, pat ; \
	.align 16 ; \
	/* 1: byte stores until dst is 8-byte aligned (or len runs out). */ \
1:	deccc	1, len ; \
	bl,pn	%xcc, 5f ; \
	 btst	7, dst ; \
	/* Aligned: the annulled slot undoes the decrement. */ \
	bz,a,pt	%xcc, 2f ; \
	 inc	1, len ; \
	ST(b, da) pat, [dst] dasi ; \
	ba	%xcc, 1b ; \
	 inc	dst ; \
	.align 16 ; \
	/* 2: unrolled by four - 32 bytes per iteration. */ \
2:	deccc	32, len ; \
	bl,a,pn	%xcc, 3f ; \
	 inc	32, len ; \
	ST(x, da) pat, [dst] dasi ; \
	ST(x, da) pat, [dst + 8] dasi ; \
	ST(x, da) pat, [dst + 16] dasi ; \
	ST(x, da) pat, [dst + 24] dasi ; \
	ba	%xcc, 2b ; \
	 inc	32, dst ; \
	.align 16 ; \
	/* 3: 8-byte stores while at least 8 bytes remain. */ \
3:	deccc	8, len ; \
	bl,a,pn	%xcc, 4f ; \
	 inc	8, len ; \
	ST(x, da) pat, [dst] dasi ; \
	ba	%xcc, 3b ; \
	 inc	8, dst ; \
	.align 16 ; \
	/* 4: trailing bytes. */ \
4:	deccc	1, len ; \
	bl,a,pn	%xcc, 5f ; \
	 nop ; \
	ST(b, da) pat, [dst] dasi ; \
	ba	%xcc, 4b ; \
	 inc	1, dst ; \
5:
139
140/*
141 * ASI independent implementation of memcpy(3).
142 * Used to implement bcopy(), copyin(), copyout(), memcpy(), ascopy(),
143 * ascopyfrom() and ascopyto().
144 *
145 * Transfer bytes until dst is 8-byte aligned.  If src is then also 8 byte
146 * aligned, transfer 8 bytes, otherwise finish with bytes.  The unaligned
147 * case could be optimized, but it is expected that this is the uncommon
148 * case and of questionable value.  The code to do so is also rather large
149 * and ugly.  It has yet to be determined how much unrolling is beneficial.
150 *
151 * XXX bcopy() must also check for overlap.  This is stupid.
152 * XXX bcopy() should be implemented as
153 * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
154 * if at all.
155 */
#define	_MEMCPY(dst, src, len, da, dasi, sa, sasi) \
	/* 1: byte copies until dst is 8-byte aligned (or len runs out). */ \
1:	deccc	1, len ; \
	bl,pn	%xcc, 6f ; \
	 btst	7, dst ; \
	/* dst aligned: the annulled slot undoes the decrement. */ \
	bz,a,pt	%xcc, 2f ; \
	 inc	1, len ; \
	LD(ub, sa) [src] sasi, %g1 ; \
	ST(b, da) %g1, [dst] dasi ; \
	inc	1, src ; \
	ba	%xcc, 1b ; \
	 inc	1, dst ; \
	.align 16 ; \
	/* 2: if src is also 8-byte aligned use 8-byte transfers, */ \
	/* otherwise finish the whole copy with byte transfers. */ \
2:	btst	7, src ; \
	bz,a,pt	%xcc, 3f ; \
	 nop ; \
	ba,a	%xcc, 5f ; \
	.align 16 ; \
	/* 3: unrolled by four - 32 bytes per iteration (uses %g1-%g4). */ \
3:	deccc	32, len ; \
	bl,a,pn	%xcc, 4f ; \
	 inc	32, len ; \
	LD(x, sa) [src] sasi, %g1 ; \
	LD(x, sa) [src + 8] sasi, %g2 ; \
	LD(x, sa) [src + 16] sasi, %g3 ; \
	LD(x, sa) [src + 24] sasi, %g4 ; \
	ST(x, da) %g1, [dst] dasi ; \
	ST(x, da) %g2, [dst + 8] dasi ; \
	ST(x, da) %g3, [dst + 16] dasi ; \
	ST(x, da) %g4, [dst + 24] dasi ; \
	inc	32, src ; \
	ba	%xcc, 3b ; \
	 inc	32, dst ; \
	.align 16 ; \
	/* 4: 8-byte transfers while at least 8 bytes remain. */ \
4:	deccc	8, len ; \
	bl,a,pn	%xcc, 5f ; \
	 inc	8, len ; \
	LD(x, sa) [src] sasi, %g1 ; \
	ST(x, da) %g1, [dst] dasi ; \
	inc	8, src ; \
	ba	%xcc, 4b ; \
	 inc	8, dst ; \
	.align 16 ; \
	/* 5: trailing bytes. */ \
5:	deccc	1, len ; \
	bl,a,pn	%xcc, 6f ; \
	 nop ; \
	LD(ub, sa) [src] sasi, %g1 ; \
	ST(b, da) %g1, [dst] dasi ; \
	inc	src ; \
	ba	%xcc, 5b ; \
	 inc	dst ; \
6:
206
207/*
208 * void ascopy(u_long asi, vm_offset_t src, vm_offset_t dst, size_t len)
209 */
ENTRY(ascopy)
	wr	%o0, 0, %asi		! both sides use the caller's ASI
	_MEMCPY(%o2, %o1, %o3, a, %asi, a, %asi)
	retl
	 nop
END(ascopy)
216
217/*
218 * void ascopyfrom(u_long sasi, vm_offset_t src, caddr_t dst, size_t len)
219 */
ENTRY(ascopyfrom)
	wr	%o0, 0, %asi		! source is read through the given ASI
	_MEMCPY(%o2, %o1, %o3, EMPTY, EMPTY, a, %asi)
	retl
	 nop
END(ascopyfrom)
226
227/*
228 * void ascopyto(caddr_t src, u_long dasi, vm_offset_t dst, size_t len)
229 */
ENTRY(ascopyto)
	wr	%o1, 0, %asi		! destination is written through the given ASI
	_MEMCPY(%o2, %o0, %o3, a, %asi, EMPTY, EMPTY)
	retl
	 nop
END(ascopyto)
236
237/*
238 * void aszero(u_long asi, vm_offset_t pa, size_t len)
239 */
ENTRY(aszero)
	wr	%o0, 0, %asi		! zero %o2 bytes at %o1 through the given ASI
	_MEMSET(%o1, %g0, %o2, a, %asi)
	retl
	 nop
END(aszero)
246
247/*
248 * int bcmp(const void *b1, const void *b2, size_t len)
249 */
ENTRY(bcmp)
	brz,pn	%o2, 2f			! len == 0: fall through, return 0
	 clr	%o3			! %o3 = byte index
1:	ldub	[%o0 + %o3], %o4
	ldub	[%o1 + %o3], %o5
	cmp	%o4, %o5
	bne,pn	%xcc, 2f		! mismatch: %o2 (remaining) is non-zero
	 inc	%o3
	deccc	%o2
	bne,pt	%xcc, 1b
	 nop
2:	retl
	 mov	%o2, %o0		! 0 if equal, non-zero otherwise
END(bcmp)
264
265/*
266 * void bcopy(const void *src, void *dst, size_t len)
267 */
ENTRY(bcopy)
	/*
	 * Check for overlap, and copy backwards if so.
	 * The unsigned compare of (dst - src) with len also covers the
	 * dst < src case, where the subtraction wraps to a huge value
	 * and a forward copy is always safe.
	 */
	sub	%o1, %o0, %g1
	cmp	%g1, %o2
	bgeu,a,pt %xcc, 3f
	 nop

	/*
	 * Copy backwards, one byte at a time, starting one past the end.
	 */
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
1:	deccc	1, %o2
	bl,a,pn	%xcc, 2f
	 nop
	dec	1, %o0
	ldub	[%o0], %g1
	dec	1, %o1
	ba	%xcc, 1b
	 stb	%g1, [%o1]
2:	retl
	 nop

	/*
	 * Do the fast version.
	 */
3:	_MEMCPY(%o1, %o0, %o2, EMPTY, EMPTY, EMPTY, EMPTY)
	retl
	 nop
END(bcopy)
300
301/*
302 * void bzero(void *b, size_t len)
303 */
ENTRY(bzero)
	_MEMSET(%o0, %g0, %o1, EMPTY, EMPTY)	! pattern %g0 takes the zero fast path
	retl
	 nop
END(bzero)
309
310/*
311 * int copystr(const void *src, void *dst, size_t len, size_t *done)
312 */
ENTRY(copystr)
	_COPYSTR(%o0, %o1, %o2, %o3, EMPTY, EMPTY, EMPTY, EMPTY)
	retl
	 mov	%g1, %o0		! 0 or ENAMETOOLONG from the macro
END(copystr)
318
319/*
320 * void *memcpy(void *dst, const void *src, size_t len)
321 */
ENTRY(memcpy)
	mov	%o0, %o3		! work on a copy so %o0 (dst) survives
	_MEMCPY(%o3, %o1, %o2, EMPTY, EMPTY, EMPTY, EMPTY)
	retl
	 nop				! return value: original dst in %o0
END(memcpy)
328
329/*
330 * void *memset(void *b, int c, size_t len)
331 */
ENTRY(memset)
	mov	%o0, %o3		! work on a copy so %o0 (b) survives
	_MEMSET(%o3, %o1, %o2, EMPTY, EMPTY)
	retl
	 nop				! return value: original b in %o0
END(memset)
338
	/*
	 * Start of the region whose faults resume at copy_fault
	 * (NOTE(review): inferred from the label names - confirm against
	 * the trap handlers).
	 */
	.globl	copy_nofault_begin
copy_nofault_begin:
	nop
342
343/*
344 * int copyin(const void *uaddr, void *kaddr, size_t len)
345 */
ENTRY(copyin)
	wr	%g0, ASI_AIUP, %asi	! read source through the as-user ASI
	_MEMCPY(%o1, %o0, %o2, EMPTY, EMPTY, a, %asi)
	retl
	 clr	%o0			! success; faults return via copy_fault
END(copyin)
352
353/*
354 * int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done)
355 */
ENTRY(copyinstr)
	wr	%g0, ASI_AIUP, %asi	! read source through the as-user ASI
	_COPYSTR(%o0, %o1, %o2, %o3, a, %asi, EMPTY, EMPTY)
	retl
	 mov	%g1, %o0		! 0 or ENAMETOOLONG from the macro
END(copyinstr)
362
363/*
364 * int copyout(const void *kaddr, void *uaddr, size_t len)
365 */
ENTRY(copyout)
	wr	%g0, ASI_AIUP, %asi	! write destination through the as-user ASI
	_MEMCPY(%o1, %o0, %o2, a, %asi, EMPTY, EMPTY)
	retl
	 clr	%o0			! success; faults return via copy_fault
END(copyout)
372
	.globl	copy_nofault_end
copy_nofault_end:
	nop

/*
 * Fault return path for the copy_nofault region: abort the copy and
 * return EFAULT to the caller of copyin()/copyout()/copyinstr().
 */
ENTRY(copy_fault)
	retl
	 mov	EFAULT, %o0
END(copy_fault)
381
	.globl	fs_nofault_begin
fs_nofault_begin:
	nop

/*
 * Chatty aliases for fetch, store functions.
 * Note that fuword()/suword() and the pointer accessors are 64-bit.
 */
	.globl	fubyte, fusword, fuword, subyte, susword, suword
	.set	fubyte, fuword8
	.set	fusword, fuword16
	.set	fuword, fuword64
	.set	subyte, suword8
	.set	susword, suword16
	.set	suword, suword64

	.globl	casuword32, casuword, fuptr, suptr
	.set	casuword, casuword64
	.set	fuptr, fuword64
	.set	suptr, suword64
401
/*
 * int32_t casuword32(volatile int32_t *p, int32_t e, int32_t s)
 *
 * Compare-and-swap in user space: if *p == e then *p = s; in either
 * case the value previously at *p is returned.
 */
ENTRY(casuword32)
	casa	[%o0] ASI_AIUP, %o1, %o2
	retl
	 mov	%o2, %o0
END(casuword32)

/*
 * int64_t casuword64(volatile int64_t *p, int64_t e, int64_t s)
 *
 * 64-bit variant of casuword32.
 */
ENTRY(casuword64)
	casxa	[%o0] ASI_AIUP, %o1, %o2
	retl
	 mov	%o2, %o0
END(casuword64)
419
/*
 * int fuword8(const void *base)
 *
 * Fetch an unsigned byte from user space.  Faults in this region are
 * handled by fs_fault, which returns -1.
 */
ENTRY(fuword8)
	retl
	 lduba	[%o0] ASI_AIUP, %o0
END(fuword8)

/*
 * int fuword16(const void *base)
 */
ENTRY(fuword16)
	retl
	 lduha	[%o0] ASI_AIUP, %o0
END(fuword16)

/*
 * int32_t fuword32(const void *base)
 */
ENTRY(fuword32)
	retl
	 lduwa	[%o0] ASI_AIUP, %o0
END(fuword32)

/*
 * int64_t fuword64(const void *base)
 */
ENTRY(fuword64)
	retl
	 ldxa	[%o0] ASI_AIUP, %o0
END(fuword64)
451
/*
 * int suword8(const void *base, int word)
 *
 * Store a byte to user space; returns 0 on success.  Faults in this
 * region are handled by fs_fault, which returns -1.
 */
ENTRY(suword8)
	stba	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suword8)

/*
 * int suword16(const void *base, int word)
 */
ENTRY(suword16)
	stha	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suword16)

/*
 * int suword32(const void *base, int32_t word)
 */
ENTRY(suword32)
	stwa	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suword32)

/*
 * int suword64(const void *base, int64_t word)
 */
ENTRY(suword64)
	stxa	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suword64)
487
	/*
	 * Interrupt-level fetch/store region (NOTE(review): presumably the
	 * trap code treats faults between fs_nofault_intr_begin/_end
	 * differently from the plain fs_nofault range - confirm).
	 */
	.globl	fs_nofault_intr_begin
fs_nofault_intr_begin:
	nop

/*
 * int fuswintr(const void *base)
 *
 * Fetch a 16-bit word from user space; safe to call from interrupt
 * context.
 */
ENTRY(fuswintr)
	retl
	 lduha	[%o0] ASI_AIUP, %o0
END(fuswintr)

/*
 * int suswintr(const void *base, int word)
 *
 * Store a 16-bit word to user space; returns 0 on success.
 */
ENTRY(suswintr)
	stha	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suswintr)

	.globl	fs_nofault_intr_end
fs_nofault_intr_end:
	nop

	.globl	fs_nofault_end
fs_nofault_end:
	nop
516
/*
 * Fault return path for the fs_nofault region: the faulting fetch/store
 * routine returns -1 to its caller.
 */
ENTRY(fs_fault)
	retl
	 mov	-1, %o0
END(fs_fault)
521
	/*
	 * NOTE(review): unlike the other *_begin markers there is no nop
	 * after this label - confirm that is intentional.
	 */
	.globl	fas_nofault_begin
fas_nofault_begin:

/*
 * int fasword8(u_long asi, uint64_t addr, uint8_t *val)
 *
 * Fetch a byte from addr through an arbitrary ASI and store it at the
 * kernel pointer val; returns 0 on success (faults go to fas_fault,
 * which returns -1).
 */
ENTRY(fasword8)
	wr	%o0, 0, %asi
	membar	#Sync
	lduba	[%o1] %asi, %o3
	membar	#Sync
	stb	%o3, [%o2]
	retl
	 clr	%o0
END(fasword8)

/*
 * int fasword16(u_long asi, uint64_t addr, uint16_t *val)
 */
ENTRY(fasword16)
	wr	%o0, 0, %asi
	membar	#Sync
	lduha	[%o1] %asi, %o3
	membar	#Sync
	sth	%o3, [%o2]
	retl
	 clr	%o0
END(fasword16)

/*
 * int fasword32(u_long asi, uint64_t addr, uint32_t *val)
 */
ENTRY(fasword32)
	wr	%o0, 0, %asi
	membar	#Sync
	lduwa	[%o1] %asi, %o3
	membar	#Sync
	stw	%o3, [%o2]
	retl
	 clr	%o0
END(fasword32)
563
	.globl	fas_nofault_end
fas_nofault_end:
	nop

/*
 * Fault return path for the fas_nofault region: return -1.
 */
	.globl	fas_fault
ENTRY(fas_fault)
	retl
	 mov	-1, %o0
END(fas_fault)
573
	/*
	 * Start of the FPU block-operation fault region; its extent is
	 * exported below as fpu_fault_size.
	 */
	.globl	fpu_fault_begin
fpu_fault_begin:
	nop
577
/*
 * void spitfire_block_copy(void *src, void *dst, size_t len)
 *
 * Copy using 64-byte VIS block loads/stores via %f0-%f46.  If the
 * interrupted context had live FP state (FPRS_FEF set in its trap
 * frame), that state is first saved into the PCB and the PCB is marked
 * so it can be restored later.
 * NOTE(review): assumes len is a non-zero multiple of VIS_BLOCKSIZE -
 * confirm against callers.
 */
ENTRY(spitfire_block_copy)
	rdpr	%pstate, %o3
	wrpr	%g0, PSTATE_NORMAL, %pstate	! keep interrupts out while
						! fiddling with FP state

	wr	%g0, ASI_BLK_S, %asi
	wr	%g0, FPRS_FEF, %fprs		! enable the FPU for ourselves

	sub	PCB_REG, TF_SIZEOF, %o4		! trap frame below the pcb
	ldx	[%o4 + TF_FPRS], %o5
	andcc	%o5, FPRS_FEF, %g0		! user FP state live?
	bz,a,pt	%xcc, 1f
	 nop
	stda	%f0, [PCB_REG + PCB_UFP + (0 * VIS_BLOCKSIZE)] %asi
	stda	%f16, [PCB_REG + PCB_UFP + (1 * VIS_BLOCKSIZE)] %asi
	stda	%f32, [PCB_REG + PCB_UFP + (2 * VIS_BLOCKSIZE)] %asi
	stda	%f48, [PCB_REG + PCB_UFP + (3 * VIS_BLOCKSIZE)] %asi
	membar	#Sync

	andn	%o5, FPRS_FEF, %o5		! FP state now lives in the
	stx	%o5, [%o4 + TF_FPRS]		! pcb, not the registers
	ldx	[PCB_REG + PCB_FLAGS], %o4
	or	%o4, PCB_FEF, %o4		! request a later restore
	stx	%o4, [PCB_REG + PCB_FLAGS]

1:	wrpr	%o3, 0, %pstate

	/*
	 * Double-buffered copy: load into the %f0/%f16 banks alternately
	 * while the previous block is staged through %f32-%f46 and
	 * block-stored.
	 */
	ldda	[%o0] %asi, %f0
	add	%o0, VIS_BLOCKSIZE, %o0
	sub	%o2, VIS_BLOCKSIZE, %o2

2:	ldda	[%o0] %asi, %f16
	fsrc1	%f0, %f32
	fsrc1	%f2, %f34
	fsrc1	%f4, %f36
	fsrc1	%f6, %f38
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o1] %asi
	add	%o0, VIS_BLOCKSIZE, %o0
	subcc	%o2, VIS_BLOCKSIZE, %o2
	bz,pn	%xcc, 3f
	 add	%o1, VIS_BLOCKSIZE, %o1
	ldda	[%o0] %asi, %f0
	fsrc1	%f16, %f32
	fsrc1	%f18, %f34
	fsrc1	%f20, %f36
	fsrc1	%f22, %f38
	fsrc1	%f24, %f40
	fsrc1	%f26, %f42
	fsrc1	%f28, %f44
	fsrc1	%f30, %f46
	stda	%f32, [%o1] %asi
	add	%o0, VIS_BLOCKSIZE, %o0
	sub	%o2, VIS_BLOCKSIZE, %o2
	ba,pt	%xcc, 2b
	 add	%o1, VIS_BLOCKSIZE, %o1

	/* Drain the pipeline: store the final block from %f16-%f30. */
3:	membar	#Sync

	stda	%f16, [%o1] %asi
	membar	#Sync

	retl
	 wr	%g0, 0, %fprs		! give the FPU back
END(spitfire_block_copy)
648
/*
 * void zeus_block_copy(void *src, void *dst, size_t len)
 *
 * Block copy tuned for SPARC64-V ("Zeus") class CPUs: software-pipelined
 * 8-byte FP loads with aggressive prefetching, stored with 64-byte VIS
 * block stores.  The FP-state save/restore preamble is identical to
 * spitfire_block_copy().
 * NOTE(review): assumes len is a non-zero multiple of VIS_BLOCKSIZE -
 * confirm against callers.
 */
ENTRY(zeus_block_copy)
	prefetch [%o0 + (0 * VIS_BLOCKSIZE)], 0

	rdpr	%pstate, %o3
	wrpr	%g0, PSTATE_NORMAL, %pstate	! keep interrupts out while
						! fiddling with FP state

	wr	%g0, ASI_BLK_S, %asi
	wr	%g0, FPRS_FEF, %fprs		! enable the FPU for ourselves

	sub	PCB_REG, TF_SIZEOF, %o4		! trap frame below the pcb
	ldx	[%o4 + TF_FPRS], %o5
	andcc	%o5, FPRS_FEF, %g0		! user FP state live?
	bz,a,pt	%xcc, 1f
	 nop
	stda	%f0, [PCB_REG + PCB_UFP + (0 * VIS_BLOCKSIZE)] %asi
	stda	%f16, [PCB_REG + PCB_UFP + (1 * VIS_BLOCKSIZE)] %asi
	stda	%f32, [PCB_REG + PCB_UFP + (2 * VIS_BLOCKSIZE)] %asi
	stda	%f48, [PCB_REG + PCB_UFP + (3 * VIS_BLOCKSIZE)] %asi
	membar	#Sync

	andn	%o5, FPRS_FEF, %o5		! FP state now lives in the
	stx	%o5, [%o4 + TF_FPRS]		! pcb, not the registers
	ldx	[PCB_REG + PCB_FLAGS], %o4
	or	%o4, PCB_FEF, %o4		! request a later restore
	stx	%o4, [PCB_REG + PCB_FLAGS]

1:	wrpr	%o3, 0, %pstate

	/*
	 * Pipeline warm-up: load the first block into %f0-%f14, staging
	 * copies into %f32-%f46, and issue prefetches for upcoming blocks.
	 */
	ldd	[%o0 + (0 * 8)], %f0
	prefetch [%o0 + (1 * VIS_BLOCKSIZE)], 0
	ldd	[%o0 + (1 * 8)], %f2
	prefetch [%o0 + (2 * VIS_BLOCKSIZE)], 0
	fmovd	%f0, %f32
	ldd	[%o0 + (2 * 8)], %f4
	prefetch [%o0 + (3 * VIS_BLOCKSIZE)], 0
	fmovd	%f2, %f34
	ldd	[%o0 + (3 * 8)], %f6
	prefetch [%o0 + (4 * VIS_BLOCKSIZE)], 1
	fmovd	%f4, %f36
	ldd	[%o0 + (4 * 8)], %f8
	prefetch [%o0 + (8 * VIS_BLOCKSIZE)], 1
	fmovd	%f6, %f38
	ldd	[%o0 + (5 * 8)], %f10
	prefetch [%o0 + (12 * VIS_BLOCKSIZE)], 1
	fmovd	%f8, %f40
	ldd	[%o0 + (6 * 8)], %f12
	prefetch [%o0 + (16 * VIS_BLOCKSIZE)], 1
	fmovd	%f10, %f42
	ldd	[%o0 + (7 * 8)], %f14
	ldd	[%o0 + (8 * 8)], %f0
	sub	%o2, VIS_BLOCKSIZE, %o2
	add	%o0, VIS_BLOCKSIZE, %o0
	prefetch [%o0 + (19 * VIS_BLOCKSIZE)], 1
	ba,pt	%xcc, 2f
	 prefetch [%o0 + (23 * VIS_BLOCKSIZE)], 1
	.align	32

	/*
	 * Steady state: store the staged block while loading the next one
	 * and staging the current one, until < VIS_BLOCKSIZE + 8 remain.
	 */
2:	ldd	[%o0 + (1 * 8)], %f2
	fmovd	%f12, %f44
	ldd	[%o0 + (2 * 8)], %f4
	fmovd	%f14, %f46
	stda	%f32, [%o1] %asi
	ldd	[%o0 + (3 * 8)], %f6
	fmovd	%f0, %f32
	ldd	[%o0 + (4 * 8)], %f8
	fmovd	%f2, %f34
	ldd	[%o0 + (5 * 8)], %f10
	fmovd	%f4, %f36
	ldd	[%o0 + (6 * 8)], %f12
	fmovd	%f6, %f38
	ldd	[%o0 + (7 * 8)], %f14
	fmovd	%f8, %f40
	ldd	[%o0 + (8 * 8)], %f0
	fmovd	%f10, %f42
	sub	%o2, VIS_BLOCKSIZE, %o2
	prefetch [%o0 + (3 * VIS_BLOCKSIZE)], 0
	add	%o1, VIS_BLOCKSIZE, %o1
	prefetch [%o0 + (24 * VIS_BLOCKSIZE)], 1
	add	%o0, VIS_BLOCKSIZE, %o0
	cmp	%o2, VIS_BLOCKSIZE + 8
	bgu,pt	%xcc, 2b
	 prefetch [%o0 + (12 * VIS_BLOCKSIZE)], 1

	/* Pipeline drain: the final two blocks, no further prefetching. */
	ldd	[%o0 + (1 * 8)], %f2
	fsrc1	%f12, %f44
	ldd	[%o0 + (2 * 8)], %f4
	fsrc1	%f14, %f46
	stda	%f32, [%o1] %asi
	ldd	[%o0 + (3 * 8)], %f6
	fsrc1	%f0, %f32
	ldd	[%o0 + (4 * 8)], %f8
	fsrc1	%f2, %f34
	ldd	[%o0 + (5 * 8)], %f10
	fsrc1	%f4, %f36
	ldd	[%o0 + (6 * 8)], %f12
	fsrc1	%f6, %f38
	ldd	[%o0 + (7 * 8)], %f14
	fsrc1	%f8, %f40
	add	%o1, VIS_BLOCKSIZE, %o1
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o1] %asi
	membar	#Sync

	retl
	 wr	%g0, 0, %fprs		! give the FPU back
END(zeus_block_copy)
759
/*
 * void spitfire_block_zero(void *dst, size_t len)
 * void zeus_block_zero(void *dst, size_t len)
 *
 * Zero memory with 64-byte VIS block stores, four blocks per iteration.
 * The FP-state save preamble matches spitfire_block_copy().
 * NOTE(review): the loop termination (brnz after subtracting
 * 4 * VIS_BLOCKSIZE) assumes len is a non-zero multiple of
 * 4 * VIS_BLOCKSIZE - confirm against callers.
 */
ALTENTRY(zeus_block_zero)
ENTRY(spitfire_block_zero)
	rdpr	%pstate, %o3
	wrpr	%g0, PSTATE_NORMAL, %pstate	! keep interrupts out while
						! fiddling with FP state

	wr	%g0, ASI_BLK_S, %asi
	wr	%g0, FPRS_FEF, %fprs		! enable the FPU for ourselves

	sub	PCB_REG, TF_SIZEOF, %o4		! trap frame below the pcb
	ldx	[%o4 + TF_FPRS], %o5
	andcc	%o5, FPRS_FEF, %g0		! user FP state live?
	bz,a,pt	%xcc, 1f
	 nop
	stda	%f0, [PCB_REG + PCB_UFP + (0 * VIS_BLOCKSIZE)] %asi
	stda	%f16, [PCB_REG + PCB_UFP + (1 * VIS_BLOCKSIZE)] %asi
	stda	%f32, [PCB_REG + PCB_UFP + (2 * VIS_BLOCKSIZE)] %asi
	stda	%f48, [PCB_REG + PCB_UFP + (3 * VIS_BLOCKSIZE)] %asi
	membar	#Sync

	andn	%o5, FPRS_FEF, %o5		! FP state now lives in the
	stx	%o5, [%o4 + TF_FPRS]		! pcb, not the registers
	ldx	[PCB_REG + PCB_FLAGS], %o4
	or	%o4, PCB_FEF, %o4		! request a later restore
	stx	%o4, [PCB_REG + PCB_FLAGS]

1:	wrpr	%o3, 0, %pstate

	/* A zeroed %f0-%f14 bank is the 64-byte store source. */
	fzero	%f0
	fzero	%f2
	fzero	%f4
	fzero	%f6
	fzero	%f8
	fzero	%f10
	fzero	%f12
	fzero	%f14

1:	stda	%f0, [%o0 + (0 * VIS_BLOCKSIZE)] %asi
	stda	%f0, [%o0 + (1 * VIS_BLOCKSIZE)] %asi
	stda	%f0, [%o0 + (2 * VIS_BLOCKSIZE)] %asi
	stda	%f0, [%o0 + (3 * VIS_BLOCKSIZE)] %asi
	sub	%o1, (4 * VIS_BLOCKSIZE), %o1
	brnz,pt	%o1, 1b
	 add	%o0, (4 * VIS_BLOCKSIZE), %o0
	membar	#Sync

	retl
	 wr	%g0, 0, %fprs		! give the FPU back
END(spitfire_block_zero)
812
	.globl	fpu_fault_end
fpu_fault_end:
	nop

	/*
	 * Extent of the FPU fault region, exported for the trap code
	 * (NOTE(review): inferred from the name - confirm usage).
	 */
	.globl	fpu_fault_size
	.set	fpu_fault_size, fpu_fault_end - fpu_fault_begin
819
/*
 * longjmp(env, val): non-local return to the context saved by setjmp().
 * setjmp() must appear to return val, or 1 if val is 0 - it may never
 * appear to return 0 from a longjmp().
 */
ENTRY(longjmp)
	set	1, %g3			! default return value
	movrnz	%o1, %o1, %g3		! use val only when it is non-zero
	mov	%o0, %g1
	ldx	[%g1 + _JB_FP], %g2	! target frame pointer
1:	cmp	%fp, %g2		! unwind register windows until we
	bl,a,pt	%xcc, 1b		! reach the saved frame
	 restore
	bne,pn	%xcc, 2f		! overshot the frame: corrupt jmp_buf
	 ldx	[%g1 + _JB_SP], %o2
	cmp	%o2, %sp		! sanity-check the saved stack pointer
	blt,pn	%xcc, 2f
	 movge	%xcc, %o2, %sp		! adopt the saved stack pointer
	ldx	[%g1 + _JB_PC], %o7	! return through the saved pc
	retl
	 mov	%g3, %o0
2:	PANIC("longjmp botch", %l1)
END(longjmp)
838
ENTRY(setjmp)
	stx	%sp, [%o0 + _JB_SP]	! save stack pointer,
	stx	%o7, [%o0 + _JB_PC]	! return address,
	stx	%fp, [%o0 + _JB_FP]	! and frame pointer
	retl
	 clr	%o0			! direct invocation returns 0
END(setjmp)
846
/*
 * void ofw_entry(cell_t args[])
 *
 * Call into Open Firmware through the client-interface vector (ofw_vec)
 * with the argument array.  Address masking and interrupts are disabled
 * for the duration of the call; when the kernel has taken over the trap
 * table, the window state is switched to the PROM-compatible value
 * around the call.
 */
ENTRY(ofw_entry)
	save	%sp, -CCFSZ, %sp
	SET(ofw_vec, %l7, %l6)
	ldx	[%l6], %l6		! %l6 = OFW entry point
	rdpr	%pstate, %l7		! save pstate and disable address
	andn	%l7, PSTATE_AM | PSTATE_IE, %l5	! masking and interrupts
	wrpr	%l5, 0, %pstate
	SET(tba_taken_over, %l5, %l4)
	/*
	 * NOTE(review): %l4 holds the ADDRESS of tba_taken_over here, which
	 * is never zero, so this branch (and the matching one below) is
	 * never taken - a load of the variable before the test appears to
	 * be missing.  Confirm against the upstream sources before relying
	 * on this path.
	 */
	brz,pn	%l4, 1f
	 rdpr	%wstate, %l5		! save wstate
	andn	%l5, WSTATE_PROM_MASK, %l3
	wrpr	%l3, WSTATE_PROM_KMIX, %wstate
1:	call	%l6
	 mov	%i0, %o0		! args array is the only argument
	brz,pn	%l4, 1f
	 nop
	wrpr	%g0, %l5, %wstate	! restore the saved wstate
1:	wrpr	%l7, 0, %pstate		! restore the saved pstate
	ret
	 restore %o0, %g0, %o0		! propagate the OFW return value
END(ofw_entry)
871
/*
 * void ofw_exit(cell_t args[])
 *
 * Hand control back to Open Firmware permanently (e.g. for reboot or
 * halt): restore the OFW trap table and window state, switch to a
 * locked-in stack, and call the OFW vector.  This function never
 * returns.
 */
ENTRY(ofw_exit)
	save	%sp, -CCFSZ, %sp
	flushw				! push all register windows to memory
	SET(ofw_tba, %l7, %l5)
	ldx	[%l5], %l5		! %l5 = saved OFW %tba
	rdpr	%pstate, %l7		! disable address masking and
	andn	%l7, PSTATE_AM | PSTATE_IE, %l7	! interrupts
	wrpr	%l7, 0, %pstate
	rdpr	%wstate, %l7		! switch to PROM-compatible wstate
	andn	%l7, WSTATE_PROM_MASK, %l7
	wrpr	%l7, WSTATE_PROM_KMIX, %wstate
	wrpr	%l5, 0, %tba			! restore the OFW trap table
	SET(ofw_vec, %l7, %l6)
	ldx	[%l6], %l6		! %l6 = OFW entry point
	SET(kstack0 + KSTACK_PAGES * PAGE_SIZE - PCB_SIZEOF, %l7, %l0)
	sub	%l0, SPOFF, %fp			! setup a stack in a locked page
	sub	%l0, SPOFF + CCFSZ, %sp
	mov	AA_DMMU_PCXR, %l3		! force primary DMMU context 0
	sethi	%hi(KERNBASE), %l5
	stxa	%g0, [%l3] ASI_DMMU
	flush	%l5				! make the context switch visible
	wrpr	%g0, 0, %tl			! force trap level 0
	call	%l6
	 mov	%i0, %o0
	! never to return
END(ofw_exit)
901
902#ifdef GPROF
903
/*
 * Profiling boundary markers (NOTE(review): presumably these delimit
 * user/trap/interrupt address ranges for the kernel gprof machinery -
 * confirm against the GPROF support code).  They are labels only and
 * are never called as functions.
 */
ENTRY(user)
	nop

ENTRY(btrap)
	nop

ENTRY(etrap)
	nop

ENTRY(bintr)
	nop

ENTRY(eintr)
	nop

/*
 * XXX including sys/gmon.h in genassym.c is not possible due to uintfptr_t
 * badness.
 */
#define	GM_STATE	0x0
#define	GMON_PROF_OFF	3
#define	GMON_PROF_HIRES	4

	/* GCC emits calls to __cyg_profile_func_enter; alias _mcount to it. */
	.globl	_mcount
	.set	_mcount, __cyg_profile_func_enter
929
/*
 * Profiling hook on function entry: tail-call mcount() unless profiling
 * is switched off in _gmonparam.state.
 */
ENTRY(__cyg_profile_func_enter)
	SET(_gmonparam, %o3, %o2)
	lduw	[%o2 + GM_STATE], %o3
	cmp	%o3, GMON_PROF_OFF	! profiling disabled?
	be,a,pn %icc, 1f
	 nop
	SET(mcount, %o3, %o2)
	jmpl	%o2, %g0		! tail call, preserving %o7
	 nop
1:	retl
	 nop
END(__cyg_profile_func_enter)
942
943#ifdef GUPROF
944
/*
 * Profiling hook on function exit: tail-call mexitcount() only when
 * high-resolution profiling is enabled in _gmonparam.state.
 */
ENTRY(__cyg_profile_func_exit)
	SET(_gmonparam, %o3, %o2)
	lduw	[%o2 + GM_STATE], %o3
	cmp	%o3, GMON_PROF_HIRES	! only active in HIRES mode
	be,a,pn %icc, 1f
	 nop
	SET(mexitcount, %o3, %o2)
	jmpl	%o2, %g0		! tail call, preserving %o7
	 nop
1:	retl
	 nop
END(__cyg_profile_func_exit)
957
958#endif /* GUPROF */
959
960#endif /* GPROF */
961