.ident	"sparcv8plus.s, Version 1.4"
.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q. How to compile?
 * A. With SC4.x/SC5.x:
 *
 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    and with gcc:
 *
 *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    or, if the above fails (it does if you have gas installed):
 *
 *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
 *
 *    Quick-n-dirty way to fuse the module into the library,
 *    provided that the library is already configured and built
 *    (in the 0.9.2 case with the no-asm option):
 *
 *	# cd crypto/bn
 *	# cp /some/place/bn_asm.sparc.v8plus.S .
 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 *    Quick-n-dirty way to get rid of it:
 *
 *	# cd crypto/bn
 *	# touch bn_asm.c
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 * Q. V8plus architecture? What kind of beast is that?
 * A. Well, it's rather a programming model than an architecture...
 *    It's actually v9-compliant code run on *any* UltraSPARC CPU
 *    under special conditions, namely when the kernel doesn't
 *    preserve the upper 32 bits of otherwise 64-bit registers
 *    during a context switch.
 *
 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. The original release targeted UltraSPARC only. Now a SuperSPARC
 *    version is provided alongside. Both versions share the
 *    bn_*comba[48] implementations (see the comment later in the
 *    code for an explanation). But what's so special about this
 *    UltraSPARC implementation? Why didn't I let the compiler do
 *    the job? The trouble is that most of the available compilers
 *    (well, SC5.0 is the only exception) don't attempt to take
 *    advantage of UltraSPARC's 64-bitness under 32-bit kernels
 *    even though it's perfectly possible (see the next question).
 *
 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
 *    doesn't work?
 * A. You can't address *all* registers as 64-bit wide:-( The catch
 *    is that you may actually rely upon %o0-%o5 and %g1-%g4 being
 *    fully preserved if you're in a leaf function, i.e. one that
 *    never calls any other function. All functions in this module
 *    are leaf and 10 registers is a handful. As a matter of fact,
 *    the non-"comba" routines don't even require that much, so I
 *    could afford not to allocate a stack frame of their own for
 *    them:-)
 *
 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) A pure 64-bit version is
 *    currently under evaluation and development...
 *
 * Q. What about shared libraries?
 * A. What about 'em? Kidding again:-) The code does *not* contain
 *    any position-dependent code and it's safe to include it in a
 *    shared library as is.
 *
 * Q. How much faster does it go?
 * A. Do you have a good benchmark? In either case, below is what I
 *    observe with the crypto/bn/expspeed.c test program:
 *
 *	v8plus module on U10/300MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
 *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
 *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
 *
 *	v8 module on SS10/60MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
 *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
 *	egcs-1.1.2 -mv8 -O3			+35-45%
 *
 *    As you can see it's damn hard to beat the new Sun C compiler,
 *    and it is first and foremost GNU C users who will appreciate
 *    this assembler implementation:-)
 */
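
/*
 * The v8plus trick in a nutshell (a schematic C equivalent added for
 * clarity, not a literal transcript of the code below): a 32x32->64-bit
 * 'mulx' delivers the carry for free, so every inner step of the word
 * routines boils down to
 *
 *	t     = (unsigned long long)w * a[i] + r[i] + carry;
 *	r[i]  = (unsigned int)t;	stuw: store low 32 bits
 *	carry = t >> 32;		srlx ...,32: high 32 bits
 *
 * with everything kept in the %o/%g registers that a leaf routine may
 * treat as fully 64-bit.
 */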

/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*)	Originally the unrolled loop looked like this:
 *	    for (;;) {
 *		op(p+0); if (--n==0) break;
 *		op(p+1); if (--n==0) break;
 *		op(p+2); if (--n==0) break;
 *		op(p+3); if (--n==0) break;
 *		p+=4;
 *	    }
 *	I unroll according to the following:
 *	    while (n&~3) {
 *		op(p+0); op(p+1); op(p+2); op(p+3);
 *		p+=4; n-=4;
 *	    }
 *	    if (n) {
 *		op(p+0); if (--n==0) return;
 *		op(p+1); if (--n==0) return;
 *		op(p+2); return;
 *	    }
 */

#if defined(__SUNPRO_C) && defined(__sparcv9)
  /* They've said -xarch=v9 at the command line */
  .register	%g2,#scratch
  .register	%g3,#scratch
# define	FRAME_SIZE	-192
#elif defined(__GNUC__) && defined(__arch64__)
  /* They've said -m64 at the command line */
  .register	%g2,#scratch
  .register	%g3,#scratch
# define	FRAME_SIZE	-192
#else
# define	FRAME_SIZE	-96
#endif
/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
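/*
 * Reference C semantics (a sketch of what this routine computes;
 * crypto/bn/bn_asm.c carries the canonical version):
 *
 *	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 *	{
 *		unsigned long long c = 0;
 *
 *		while (num-- > 0) {
 *			c     += (unsigned long long)w * *ap++ + *rp;
 *			*rp++  = (BN_ULONG)c;	low 32 bits back to r[]
 *			c    >>= 32;		high 32 bits become the carry
 *		}
 *		return (BN_ULONG)c;
 *	}
 */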
bn_mul_add_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_add_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
	lduw	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_add_words_tail:
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
bn_mul_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_words_proceeed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_words_proceeed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5

.L_bn_mul_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail
	lduw	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_words_tail:
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)

.align  32
.global	bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 */
bn_sqr_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_sqr_words_proceeed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_sqr_words_proceeed:
	andcc	%o2,-4,%g0
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail
	lduw	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)

.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 */
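/*
 * Reference semantics (a sketch): return the low 32 bits of
 * (((unsigned long long)h << 32) | l) / d, which is exactly what the
 * sllx/or/udivx sequence below computes.
 */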
bn_div_words:
	sllx	%o0,32,%o0
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw	%o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)

.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_add_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_add_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	addccc	%g1,%g2,%g1
	stuw	%g1,[%o0+4]

	inc	16,%o2
	addccc	%g3,%g4,%g3
	stuw	%g3,[%o0+8]

	inc	16,%o0
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_add_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_add_words_tail
	lduw	[%o1],%o4
.L_bn_add_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_add_words_tail:
	lduw	[%o2],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_sub_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_sub_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2
	stuw	%g2,[%o0+4]

	inc	16,%o2
	subccc	%g3,%g4,%g4
	stuw	%g4,[%o0+8]

	inc	16,%o0
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_sub_words_tail:		! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)

/*
 * Code below depends on the fact that the upper parts of %l0-%l7
 * and %i0-%i7 are zeroed by the kernel after a context switch. In
 * previous versions this comment stated that "the trouble is that
 * it's not feasible to implement the mumbo-jumbo in less V9
 * instructions:-(", which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of
 * multicycle non-pairable 'rd %y,%rd' instructions.
 *
 *							Andy.
 */
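
/*
 * How the carry idiom works (a sketch added for clarity): after an
 * 'addcc' whose 64-bit result may carry out, the annulled-branch pair
 *
 *	bcs,a	%xcc,.+8
 *	add	c_3,t_2,c_3
 *
 * executes the 'add' only when the %xcc carry bit is set, i.e. roughly
 * "if (carry) c_3 += (uint64_t)1 << 32" in C terms, since t_2 is
 * preloaded with 1<<32. c_3 thus collects the carries out of the
 * 64-bit c_12 accumulator and is folded back in with 'or c_12,c_3,c_12'.
 */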

/*
 * Here is the register usage map for *all* routines below.
 */
#define t_1	%o0
#define	t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define	a_0	%l0
#define	a_1	%l1
#define	a_2	%l2
#define	a_3	%l3
#define	a_4	%l4
#define	a_5	%l5
#define	a_6	%l6
#define	a_7	%l7

#define	b_0	%i3
#define	b_1	%i4
#define	b_2	%i5
#define	b_3	%o4
#define	b_4	%o5
#define	b_5	%o7
#define	b_6	%g1
#define	b_7	%g4
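
/*
 * The comba routines below follow the mul_add_c()/sqr_add_c()
 * accumulator convention of crypto/bn/bn_asm.c. A C sketch of the step
 * the inline comments refer to (assuming 32-bit BN_ULONG):
 *
 *	mul_add_c(a,b,c1,c2,c3):  add a*b into the (c1,c2,c3) word triple
 *		t   = (unsigned long long)a*b + c1;
 *		c1  = (BN_ULONG)t;
 *		t   = (t >> 32) + c2;
 *		c2  = (BN_ULONG)t;
 *		c3 += (BN_ULONG)(t >> 32);
 *
 *	sqr_add_c2(a,i,j,...) does the same with a[i]*a[j] added twice,
 *	and sqr_add_c(a,i,...) with a[i]*a[i] added once.
 *
 * In the code below c1 and c2 travel together in the 64-bit c_12, while
 * c_3 gathers the carries out of bit 63 (see the idiom described above).
 */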

.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	st	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)

.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!=!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!=!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!=!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)

.align	32

.global bn_sqr_comba8
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	st	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12

	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12

	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12

	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12

	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12

	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)

.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)

.align	32
