155714Skris.ident	"sparcv8plus.s, Version 1.4"
255714Skris.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
355714Skris
455714Skris/*
555714Skris * ====================================================================
655714Skris * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
755714Skris * project.
855714Skris *
955714Skris * Rights for redistribution and usage in source and binary forms are
1055714Skris * granted according to the OpenSSL license. Warranty of any kind is
1155714Skris * disclaimed.
1255714Skris * ====================================================================
1355714Skris */
1455714Skris
1555714Skris/*
1655714Skris * This is my modest contribution to OpenSSL project (see
1755714Skris * http://www.openssl.org/ for more information about it) and is
1855714Skris * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
1955714Skris * module. For updates see http://fy.chalmers.se/~appro/hpe/.
2055714Skris *
2155714Skris * Questions-n-answers.
2255714Skris *
2355714Skris * Q. How to compile?
2455714Skris * A. With SC4.x/SC5.x:
2555714Skris *
2655714Skris *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
2755714Skris *
2855714Skris *    and with gcc:
2955714Skris *
3055714Skris *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
3155714Skris *
3255714Skris *    or if above fails (it does if you have gas installed):
3355714Skris *
3455714Skris *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
3555714Skris *
3655714Skris *    Quick-n-dirty way to fuse the module into the library.
3755714Skris *    Provided that the library is already configured and built
3855714Skris *    (in 0.9.2 case with no-asm option):
3955714Skris *
4055714Skris *	# cd crypto/bn
4155714Skris *	# cp /some/place/bn_asm.sparc.v8plus.S .
4255714Skris *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
4355714Skris *	# make
4455714Skris *	# cd ../..
4555714Skris *	# make; make test
4655714Skris *
4755714Skris *    Quick-n-dirty way to get rid of it:
4855714Skris *
4955714Skris *	# cd crypto/bn
5055714Skris *	# touch bn_asm.c
5155714Skris *	# make
5255714Skris *	# cd ../..
5355714Skris *	# make; make test
5455714Skris *
5555714Skris * Q. V8plus architecture? What kind of beast is that?
5655714Skris * A. Well, it's rather a programming model than an architecture...
5755714Skris *    It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
5855714Skris *    special conditions, namely when kernel doesn't preserve upper
5955714Skris *    32 bits of otherwise 64-bit registers during a context switch.
6055714Skris *
6155714Skris * Q. Why just UltraSPARC? What about SuperSPARC?
6255714Skris * A. Original release did target UltraSPARC only. Now SuperSPARC
6355714Skris *    version is provided along. Both versions share bn_*comba[48]
6455714Skris *    implementations (see comment later in code for explanation).
6555714Skris *    But what's so special about this UltraSPARC implementation?
6655714Skris *    Why didn't I let compiler do the job? Trouble is that most of
6755714Skris *    available compilers (well, SC5.0 is the only exception) don't
6855714Skris *    attempt to take advantage of UltraSPARC's 64-bitness under
6955714Skris *    32-bit kernels even though it's perfectly possible (see next
7055714Skris *    question).
7155714Skris *
7255714Skris * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
7355714Skris *    doesn't work?
7455714Skris * A. You can't address *all* registers as 64-bit wide:-( The catch is
7555714Skris *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
7655714Skris *    preserved if you're in a leaf function, i.e. such never calling
7755714Skris *    any other functions. All functions in this module are leaf and
7855714Skris *    10 registers is a handful. And as a matter of fact non-"comba"
7955714Skris *    routines don't require even that much and I could even afford to
8055714Skris *    not allocate own stack frame for 'em:-)
8155714Skris *
8255714Skris * Q. What about 64-bit kernels?
8355714Skris * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
8455714Skris *    under evaluation and development...
8555714Skris *
8655714Skris * Q. What about shared libraries?
8755714Skris * A. What about 'em? Kidding again:-) Code does *not* contain any
8855714Skris *    code position dependencies and it's safe to include it into
8955714Skris *    shared library as is.
9055714Skris *
9155714Skris * Q. How much faster does it go?
9255714Skris * A. Do you have a good benchmark? In either case below is what I
9355714Skris *    experience with crypto/bn/expspeed.c test program:
9455714Skris *
9555714Skris *	v8plus module on U10/300MHz against bn_asm.c compiled with:
9655714Skris *
9755714Skris *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
9855714Skris *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
9955714Skris *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
10055714Skris *
10155714Skris *	v8 module on SS10/60MHz against bn_asm.c compiled with:
10255714Skris *
10355714Skris *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
10455714Skris *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
10555714Skris *	egcs-1.1.2 -mv8 -O3			+35-45%
10655714Skris *
10755714Skris *    As you can see it's damn hard to beat the new Sun C compiler
10855714Skris *    and it's in first place GNU C users who will appreciate this
10955714Skris *    assembler implementation:-)
11055714Skris */
11155714Skris
11255714Skris/*
11355714Skris * Revision history.
11455714Skris *
11555714Skris * 1.0	- initial release;
11655714Skris * 1.1	- new loop unrolling model(*);
11755714Skris *	- some more fine tuning;
11855714Skris * 1.2	- made gas friendly;
11955714Skris *	- updates to documentation concerning v9;
12055714Skris *	- new performance comparison matrix;
12155714Skris * 1.3	- fixed problem with /usr/ccs/lib/cpp;
12255714Skris * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
12355714Skris *	  resulting in slight overall performance kick;
12455714Skris *	- some retunes;
12555714Skris *	- support for GNU as added;
12655714Skris *
12755714Skris * (*)	Originally unrolled loop looked like this:
12855714Skris *	    for (;;) {
12955714Skris *		op(p+0); if (--n==0) break;
13055714Skris *		op(p+1); if (--n==0) break;
13155714Skris *		op(p+2); if (--n==0) break;
13255714Skris *		op(p+3); if (--n==0) break;
13355714Skris *		p+=4;
13455714Skris *	    }
13555714Skris *	I unroll according to following:
13655714Skris *	    while (n&~3) {
13755714Skris *		op(p+0); op(p+1); op(p+2); op(p+3);
13855714Skris *		p+=4; n-=4;
13955714Skris *	    }
14055714Skris *	    if (n) {
14155714Skris *		op(p+0); if (--n==0) return;
14255714Skris *		op(p+1); if (--n==0) return;
14355714Skris *		op(p+2); return;
14455714Skris *	    }
14555714Skris */
14655714Skris
14755714Skris/*
14855714Skris * GNU assembler can't stand stuw:-(
 *
 * The C preprocessor therefore rewrites each stuw written below into
 * the plain st mnemonic before the assembler sees the code.
14955714Skris */
15055714Skris#define stuw st
15155714Skris
15255714Skris.section	".text",#alloc,#execinstr
15355714Skris.file		"bn_asm.sparc.v8plus.S"
15455714Skris
15555714Skris.align	32
15655714Skris
15755714Skris.global bn_mul_add_words
15855714Skris/*
15955714Skris * BN_ULONG bn_mul_add_words(rp,ap,num,w)
16055714Skris * BN_ULONG *rp,*ap;
16155714Skris * int num;
16255714Skris * BN_ULONG w;
 *
 * For i in [0,num): rp[i] = low 32 bits of (rp[i] + ap[i]*w + carry),
 * where carry is the high 32 bits of the previous sum (0 to start with).
 * Returns the final carry, or 0 without touching memory when num <= 0.
 *
 * Register roles: %o0 = rp, %o1 = ap, %o2 = num (words left), %o3 = w,
 * %o5 = carry, %o4 = 64-bit sum, %g1 = word loaded from rp,
 * %g2 and %g3 = words loaded from ap and their products with w.
 * No register window is allocated - the routine returns through retl.
 *
 * The 64-bit sum cannot overflow: with all three inputs below 2^32,
 * (2^32-1)^2 + 2*(2^32-1) = 2^64-1.
16355714Skris */
16455714Skrisbn_mul_add_words:
	/* Sign-extend the 32-bit int num so the 64-bit register tests below see its real sign. */
165160814Ssimon	sra	%o2,%g0,%o2	! signx %o2
	/* num > 0: continue, preloading ap[0] in the delay slot (annulled when the branch is not taken). */
16655714Skris	brgz,a	%o2,.L_bn_mul_add_words_proceed
16755714Skris	lduw	[%o1],%g2
	/* num <= 0: return 0. */
16855714Skris	retl
16955714Skris	clr	%o0
	/* Never executed (they follow retl and its delay slot) - presumably alignment padding, confirm. */
170160814Ssimon	nop
171160814Ssimon	nop
172160814Ssimon	nop
17355714Skris
17455714Skris.L_bn_mul_add_words_proceed:
	/* Zero the upper 32 bits of w, then test whether at least 4 words remain (num & ~3). */
17555714Skris	srl	%o3,%g0,%o3	! clruw	%o3
17655714Skris	andcc	%o2,-4,%g0
17755714Skris	bz,pn	%icc,.L_bn_mul_add_words_tail
	/* Delay slot, executed on both paths: carry = 0. */
17855714Skris	clr	%o5
17955714Skris
	/* Main loop: four words per iteration. On entry %g2 already holds ap[0]. */
18055714Skris.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	/* Word 0: sum = rp[0] + carry + w*ap[0]. Store the low half, keep the high half as carry. */
18155714Skris	lduw	[%o0],%g1
18255714Skris	lduw	[%o1+4],%g3
18355714Skris	mulx	%o3,%g2,%g2
18455714Skris	add	%g1,%o5,%o4
18555714Skris	nop
18655714Skris	add	%o4,%g2,%o4
18755714Skris	stuw	%o4,[%o0]
18855714Skris	srlx	%o4,32,%o5
18955714Skris
	/* Word 1. The word count is decremented by 4 here, interleaved with the arithmetic. */
19055714Skris	lduw	[%o0+4],%g1
19155714Skris	lduw	[%o1+8],%g2
19255714Skris	mulx	%o3,%g3,%g3
19355714Skris	add	%g1,%o5,%o4
19455714Skris	dec	4,%o2
19555714Skris	add	%o4,%g3,%o4
19655714Skris	stuw	%o4,[%o0+4]
19755714Skris	srlx	%o4,32,%o5
19855714Skris
	/* Word 2. ap is advanced by 16 bytes (4 words) after its last load of this iteration. */
19955714Skris	lduw	[%o0+8],%g1
20055714Skris	lduw	[%o1+12],%g3
20155714Skris	mulx	%o3,%g2,%g2
20255714Skris	add	%g1,%o5,%o4
20355714Skris	inc	16,%o1
20455714Skris	add	%o4,%g2,%o4
20555714Skris	stuw	%o4,[%o0+8]
20655714Skris	srlx	%o4,32,%o5
20755714Skris
	/* Word 3. rp is advanced first, so the store goes to the new rp minus 4. */
20855714Skris	lduw	[%o0+12],%g1
20955714Skris	mulx	%o3,%g3,%g3
21055714Skris	add	%g1,%o5,%o4
21155714Skris	inc	16,%o0
21255714Skris	add	%o4,%g3,%o4
21355714Skris	andcc	%o2,-4,%g0
21455714Skris	stuw	%o4,[%o0-4]
21555714Skris	srlx	%o4,32,%o5
	/* Another full group of 4 left: loop again with the next ap[0] loaded in the annulled delay slot. */
21655714Skris	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
21755714Skris	lduw	[%o1],%g2
21855714Skris
	/* 1 to 3 words left: handle them in the tail, again with ap[0] preloaded. */
21955714Skris	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
22055714Skris	lduw	[%o1],%g2
22155714Skris.L_bn_mul_add_words_return:
	/* Return the carry. */
22255714Skris	retl
22355714Skris	mov	%o5,%o0
22455714Skris
	/* Tail: up to three remaining words. %g2 holds ap[0] on entry. */
22555714Skris.L_bn_mul_add_words_tail:
22655714Skris	lduw	[%o0],%g1
22755714Skris	mulx	%o3,%g2,%g2
22855714Skris	add	%g1,%o5,%o4
22955714Skris	dec	%o2
23055714Skris	add	%o4,%g2,%o4
23155714Skris	srlx	%o4,32,%o5
	/* The store sits in a non-annulled delay slot, so it happens whether or not we return here. */
23255714Skris	brz,pt	%o2,.L_bn_mul_add_words_return
23355714Skris	stuw	%o4,[%o0]
23455714Skris
23555714Skris	lduw	[%o1+4],%g2
23655714Skris	lduw	[%o0+4],%g1
23755714Skris	mulx	%o3,%g2,%g2
23855714Skris	add	%g1,%o5,%o4
23955714Skris	dec	%o2
24055714Skris	add	%o4,%g2,%o4
24155714Skris	srlx	%o4,32,%o5
24255714Skris	brz,pt	%o2,.L_bn_mul_add_words_return
24355714Skris	stuw	%o4,[%o0+4]
24455714Skris
	/* Third and last tail word: the carry goes straight into %o0 in the delay slot of retl. */
24555714Skris	lduw	[%o1+8],%g2
24655714Skris	lduw	[%o0+8],%g1
24755714Skris	mulx	%o3,%g2,%g2
24855714Skris	add	%g1,%o5,%o4
24955714Skris	add	%o4,%g2,%o4
25055714Skris	stuw	%o4,[%o0+8]
25155714Skris	retl
25255714Skris	srlx	%o4,32,%o0
25355714Skris
25455714Skris.type	bn_mul_add_words,#function
25555714Skris.size	bn_mul_add_words,(.-bn_mul_add_words)
25655714Skris
25755714Skris.align	32
25855714Skris
25955714Skris.global bn_mul_words
26055714Skris/*
26155714Skris * BN_ULONG bn_mul_words(rp,ap,num,w)
26255714Skris * BN_ULONG *rp,*ap;
26355714Skris * int num;
26455714Skris * BN_ULONG w;
 *
 * For i in [0,num): rp[i] = low 32 bits of (ap[i]*w + carry), where
 * carry is the high 32 bits of the previous sum (0 to start with).
 * Unlike bn_mul_add_words the old contents of rp are not read.
 * Returns the final carry, or 0 without touching memory when num <= 0.
 *
 * Register roles: %o0 = rp, %o1 = ap, %o2 = num (words left), %o3 = w,
 * %o5 = carry, %o4 = 64-bit sum, %g2 and %g3 = ap words and products.
26555714Skris */
26655714Skrisbn_mul_words:
	/* Sign-extend the 32-bit int num before the 64-bit register test. */
267160814Ssimon	sra	%o2,%g0,%o2	! signx %o2
	/* num > 0: continue with ap[0] preloaded by the annulled delay slot. Otherwise return 0. */
26855714Skris	brgz,a	%o2,.L_bn_mul_words_proceeed
26955714Skris	lduw	[%o1],%g2
27055714Skris	retl
27155714Skris	clr	%o0
	/* Never executed (they follow retl and its delay slot) - presumably alignment padding, confirm. */
272160814Ssimon	nop
273160814Ssimon	nop
274160814Ssimon	nop
27555714Skris
27655714Skris.L_bn_mul_words_proceeed:
	/* Zero the upper 32 bits of w. Fewer than 4 words go straight to the tail. carry = 0 in the delay slot. */
27755714Skris	srl	%o3,%g0,%o3	! clruw	%o3
27855714Skris	andcc	%o2,-4,%g0
27955714Skris	bz,pn	%icc,.L_bn_mul_words_tail
28055714Skris	clr	%o5
28155714Skris
	/* Main loop: four words per iteration. %g2 holds ap[0] on entry. */
28255714Skris.L_bn_mul_words_loop:		! wow! 32 aligned!
	/* Word 0: sum = w*ap[0] + carry. Store the low half, keep the high half as carry. */
28355714Skris	lduw	[%o1+4],%g3
28455714Skris	mulx	%o3,%g2,%g2
28555714Skris	add	%g2,%o5,%o4
28655714Skris	nop
28755714Skris	stuw	%o4,[%o0]
28855714Skris	srlx	%o4,32,%o5
28955714Skris
	/* Word 1, with the word count decremented by 4. */
29055714Skris	lduw	[%o1+8],%g2
29155714Skris	mulx	%o3,%g3,%g3
29255714Skris	add	%g3,%o5,%o4
29355714Skris	dec	4,%o2
29455714Skris	stuw	%o4,[%o0+4]
29555714Skris	srlx	%o4,32,%o5
29655714Skris
	/* Word 2, with ap advanced by 4 words. */
29755714Skris	lduw	[%o1+12],%g3
29855714Skris	mulx	%o3,%g2,%g2
29955714Skris	add	%g2,%o5,%o4
30055714Skris	inc	16,%o1
30155714Skris	stuw	%o4,[%o0+8]
30255714Skris	srlx	%o4,32,%o5
30355714Skris
	/* Word 3. rp is advanced first, so the store goes to the new rp minus 4. */
30455714Skris	mulx	%o3,%g3,%g3
30555714Skris	add	%g3,%o5,%o4
30655714Skris	inc	16,%o0
30755714Skris	stuw	%o4,[%o0-4]
30855714Skris	srlx	%o4,32,%o5
30955714Skris	andcc	%o2,-4,%g0
	/* Another full group of 4 left: loop, loading the next ap[0] in the annulled delay slot. */
31055714Skris	bnz,a,pt	%icc,.L_bn_mul_words_loop
31155714Skris	lduw	[%o1],%g2
	/* Filler executed once on loop exit - presumably alignment padding, confirm. */
31255714Skris	nop
31355714Skris	nop
31455714Skris
	/* 1 to 3 words left: go to the tail with ap[0] preloaded. Otherwise return the carry. */
31555714Skris	brnz,a,pn	%o2,.L_bn_mul_words_tail
31655714Skris	lduw	[%o1],%g2
31755714Skris.L_bn_mul_words_return:
31855714Skris	retl
31955714Skris	mov	%o5,%o0
32055714Skris
	/* Tail: up to three remaining words. %g2 holds ap[0] on entry. */
32155714Skris.L_bn_mul_words_tail:
32255714Skris	mulx	%o3,%g2,%g2
32355714Skris	add	%g2,%o5,%o4
32455714Skris	dec	%o2
32555714Skris	srlx	%o4,32,%o5
	/* The store in the non-annulled delay slot is performed whether or not we return here. */
32655714Skris	brz,pt	%o2,.L_bn_mul_words_return
32755714Skris	stuw	%o4,[%o0]
32855714Skris
32955714Skris	lduw	[%o1+4],%g2
33055714Skris	mulx	%o3,%g2,%g2
33155714Skris	add	%g2,%o5,%o4
33255714Skris	dec	%o2
33355714Skris	srlx	%o4,32,%o5
33455714Skris	brz,pt	%o2,.L_bn_mul_words_return
33555714Skris	stuw	%o4,[%o0+4]
33655714Skris
	/* Third and last tail word: the carry is produced directly in %o0 in the delay slot of retl. */
33755714Skris	lduw	[%o1+8],%g2
33855714Skris	mulx	%o3,%g2,%g2
33955714Skris	add	%g2,%o5,%o4
34055714Skris	stuw	%o4,[%o0+8]
34155714Skris	retl
34255714Skris	srlx	%o4,32,%o0
34355714Skris
34455714Skris.type	bn_mul_words,#function
34555714Skris.size	bn_mul_words,(.-bn_mul_words)
34655714Skris
34755714Skris.align  32
34855714Skris.global	bn_sqr_words
34955714Skris/*
35055714Skris * void bn_sqr_words(r,a,n)
35155714Skris * BN_ULONG *r,*a;
35255714Skris * int n;
 *
 * For i in [0,n): squares the 32-bit word a[i] into a 64-bit product and
 * stores its low half in r[2*i] and its high half in r[2*i+1].
 * Nothing is read or written when n <= 0. %o0 is cleared on every
 * return path even though the prototype above is void.
 *
 * Register roles: %o0 = r, %o1 = a, %o2 = n (words left),
 * %g2 and %g3 = words loaded from a, %o4 = product, %o5 = its high half.
35355714Skris */
35455714Skrisbn_sqr_words:
	/* Sign-extend the 32-bit int n before the 64-bit register test. */
355160814Ssimon	sra	%o2,%g0,%o2	! signx %o2
	/* n > 0: continue with a[0] preloaded by the annulled delay slot. Otherwise return. */
35655714Skris	brgz,a	%o2,.L_bn_sqr_words_proceeed
35755714Skris	lduw	[%o1],%g2
35855714Skris	retl
35955714Skris	clr	%o0
	/* Never executed (they follow retl and its delay slot) - presumably alignment padding, confirm. */
360160814Ssimon	nop
361160814Ssimon	nop
362160814Ssimon	nop
36355714Skris
36455714Skris.L_bn_sqr_words_proceeed:
	/* Fewer than 4 words (n & ~3 is zero) go straight to the tail. */
36555714Skris	andcc	%o2,-4,%g0
36655714Skris	nop
36755714Skris	bz,pn	%icc,.L_bn_sqr_words_tail
36855714Skris	nop
36955714Skris
	/* Main loop: four input words, eight output words per iteration. %g2 holds a[0] on entry. */
37055714Skris.L_bn_sqr_words_loop:		! wow! 32 aligned!
37155714Skris	lduw	[%o1+4],%g3
37255714Skris	mulx	%g2,%g2,%o4
37355714Skris	stuw	%o4,[%o0]
37455714Skris	srlx	%o4,32,%o5
37555714Skris	stuw	%o5,[%o0+4]
37655714Skris	nop
37755714Skris
37855714Skris	lduw	[%o1+8],%g2
37955714Skris	mulx	%g3,%g3,%o4
38055714Skris	dec	4,%o2
38155714Skris	stuw	%o4,[%o0+8]
38255714Skris	srlx	%o4,32,%o5
38355714Skris	stuw	%o5,[%o0+12]
38455714Skris
38555714Skris	lduw	[%o1+12],%g3
38655714Skris	mulx	%g2,%g2,%o4
38755714Skris	srlx	%o4,32,%o5
38855714Skris	stuw	%o4,[%o0+16]
38955714Skris	inc	16,%o1
39055714Skris	stuw	%o5,[%o0+20]
39155714Skris
	/* Fourth word: r is advanced by 32 bytes first, so the stores use negative offsets. */
39255714Skris	mulx	%g3,%g3,%o4
39355714Skris	inc	32,%o0
39455714Skris	stuw	%o4,[%o0-8]
39555714Skris	srlx	%o4,32,%o5
	/* This andcc also writes its result to %g2. That is harmless: %g2 is reloaded in the delay slot of whichever branch below is taken, and is not read if neither is. */
39655714Skris	andcc	%o2,-4,%g2
39755714Skris	stuw	%o5,[%o0-4]
39855714Skris	bnz,a,pt	%icc,.L_bn_sqr_words_loop
39955714Skris	lduw	[%o1],%g2
	/* Filler executed once on loop exit - presumably alignment padding, confirm. */
40055714Skris	nop
40155714Skris
	/* 1 to 3 words left: go to the tail with a[0] preloaded. Otherwise return. */
40255714Skris	brnz,a,pn	%o2,.L_bn_sqr_words_tail
40355714Skris	lduw	[%o1],%g2
40455714Skris.L_bn_sqr_words_return:
40555714Skris	retl
40655714Skris	clr	%o0
40755714Skris
	/* Tail: up to three remaining words. %g2 holds a[0] on entry. */
40855714Skris.L_bn_sqr_words_tail:
40955714Skris	mulx	%g2,%g2,%o4
41055714Skris	dec	%o2
41155714Skris	stuw	%o4,[%o0]
41255714Skris	srlx	%o4,32,%o5
	/* The high-half store in the non-annulled delay slot is performed whether or not we return here. */
41355714Skris	brz,pt	%o2,.L_bn_sqr_words_return
41455714Skris	stuw	%o5,[%o0+4]
41555714Skris
41655714Skris	lduw	[%o1+4],%g2
41755714Skris	mulx	%g2,%g2,%o4
41855714Skris	dec	%o2
41955714Skris	stuw	%o4,[%o0+8]
42055714Skris	srlx	%o4,32,%o5
42155714Skris	brz,pt	%o2,.L_bn_sqr_words_return
42255714Skris	stuw	%o5,[%o0+12]
42355714Skris
42455714Skris	lduw	[%o1+8],%g2
42555714Skris	mulx	%g2,%g2,%o4
42655714Skris	srlx	%o4,32,%o5
42755714Skris	stuw	%o4,[%o0+16]
42855714Skris	stuw	%o5,[%o0+20]
42955714Skris	retl
43055714Skris	clr	%o0
43155714Skris
43255714Skris.type	bn_sqr_words,#function
43355714Skris.size	bn_sqr_words,(.-bn_sqr_words)
43455714Skris
43555714Skris.align	32
43655714Skris.global bn_div_words
43755714Skris/*
43855714Skris * BN_ULONG bn_div_words(h,l,d)
43955714Skris * BN_ULONG h,l,d;
 *
 * Builds the 64-bit dividend (h << 32) | l, divides it by d with a
 * single unsigned 64-bit divide, and returns the low 32 bits of the
 * quotient. Arguments arrive in %o0 = h, %o1 = l, %o2 = d.
 *
 * NOTE(review): unlike w in the multiply routines above, the upper
 * halves of %o1 (l) and %o2 (d) are not cleared before use, so the
 * result relies on the caller passing them zero-extended - confirm
 * against the calling convention in use.
44055714Skris */
44155714Skrisbn_div_words:
	/* %o0 = (h << 32) | l */
44255714Skris	sllx	%o0,32,%o0
44355714Skris	or	%o0,%o1,%o0
	/* 64-bit unsigned quotient, then truncated to 32 bits in the delay slot of retl. */
44455714Skris	udivx	%o0,%o2,%o0
44555714Skris	retl
44655714Skris	srl	%o0,%g0,%o0	! clruw	%o0
44755714Skris
44855714Skris.type	bn_div_words,#function
44955714Skris.size	bn_div_words,(.-bn_div_words)
45055714Skris
45155714Skris.align	32
45255714Skris
45355714Skris.global bn_add_words
45455714Skris/*
45555714Skris * BN_ULONG bn_add_words(rp,ap,bp,n)
45655714Skris * BN_ULONG *rp,*ap,*bp;
45755714Skris * int n;
 *
 * For i in [0,n): rp[i] = ap[i] + bp[i] + carry (32-bit words), with the
 * carry propagated from word to word through the icc carry flag.
 * Returns 1 if the last addition carried out, else 0. Returns 0 without
 * touching memory when n <= 0.
 *
 * Register roles: %o0 = rp, %o1 = ap, %o2 = bp, %o3 = n (words left),
 * %o4, %o5, %g1-%g4 = operand words.
 *
 * The carry flag must survive from one addccc to the next, so the loop
 * bookkeeping uses only instructions that leave the condition codes
 * alone: dec, inc, and, loads, stores and the register-testing
 * brnz/brz branches.
45855714Skris */
45955714Skrisbn_add_words:
	/* Sign-extend the 32-bit int n before the 64-bit register test. */
460160814Ssimon	sra	%o3,%g0,%o3	! signx %o3
	/* n > 0: continue with ap[0] preloaded by the annulled delay slot. Otherwise return 0. */
46155714Skris	brgz,a	%o3,.L_bn_add_words_proceed
46255714Skris	lduw	[%o1],%o4
46355714Skris	retl
46455714Skris	clr	%o0
46555714Skris
46655714Skris.L_bn_add_words_proceed:
	/* Fewer than 4 words go straight to the tail. The delay slot runs on both paths and resets the carry. */
46755714Skris	andcc	%o3,-4,%g0
46855714Skris	bz,pn	%icc,.L_bn_add_words_tail
46955714Skris	addcc	%g0,0,%g0	! clear carry flag
47055714Skris
	/* Main loop: four words per iteration. %o4 holds ap[0] on entry. */
47155714Skris.L_bn_add_words_loop:		! wow! 32 aligned!
47255714Skris	dec	4,%o3
47355714Skris	lduw	[%o2],%o5
47455714Skris	lduw	[%o1+4],%g1
47555714Skris	lduw	[%o2+4],%g2
47655714Skris	lduw	[%o1+8],%g3
47755714Skris	lduw	[%o2+8],%g4
47855714Skris	addccc	%o5,%o4,%o5
47955714Skris	stuw	%o5,[%o0]
48055714Skris
48155714Skris	lduw	[%o1+12],%o4
48255714Skris	lduw	[%o2+12],%o5
48355714Skris	inc	16,%o1
48455714Skris	addccc	%g1,%g2,%g1
48555714Skris	stuw	%g1,[%o0+4]
48655714Skris
48755714Skris	inc	16,%o2
48855714Skris	addccc	%g3,%g4,%g3
48955714Skris	stuw	%g3,[%o0+8]
49055714Skris
	/* Fourth word: rp is advanced first, so the store goes to the new rp minus 4. */
49155714Skris	inc	16,%o0
49255714Skris	addccc	%o5,%o4,%o5
49355714Skris	stuw	%o5,[%o0-4]
	/* Test n & ~3 in a scratch register rather than with andcc, keeping the carry flag intact. */
49455714Skris	and	%o3,-4,%g1
49555714Skris	brnz,a,pt	%g1,.L_bn_add_words_loop
49655714Skris	lduw	[%o1],%o4
49755714Skris
	/* 1 to 3 words left: go to the tail with ap[0] preloaded. */
49855714Skris	brnz,a,pn	%o3,.L_bn_add_words_tail
49955714Skris	lduw	[%o1],%o4
50055714Skris.L_bn_add_words_return:
	/* Return value: 0, replaced by 1 in the delay slot if the carry flag is set. */
50155714Skris	clr	%o0
50255714Skris	retl
50355714Skris	movcs	%icc,1,%o0
	/* Never executed (follows retl and its delay slot) - presumably padding, confirm. */
50455714Skris	nop
50555714Skris
	/* Tail: up to three remaining words. %o4 holds ap[0] on entry. */
50655714Skris.L_bn_add_words_tail:
50755714Skris	lduw	[%o2],%o5
50855714Skris	dec	%o3
50955714Skris	addccc	%o5,%o4,%o5
	/* The store in the non-annulled delay slot is performed whether or not we return here. */
51055714Skris	brz,pt	%o3,.L_bn_add_words_return
51155714Skris	stuw	%o5,[%o0]
51255714Skris
51355714Skris	lduw	[%o1+4],%o4
51455714Skris	lduw	[%o2+4],%o5
51555714Skris	dec	%o3
51655714Skris	addccc	%o5,%o4,%o5
51755714Skris	brz,pt	%o3,.L_bn_add_words_return
51855714Skris	stuw	%o5,[%o0+4]
51955714Skris
52055714Skris	lduw	[%o1+8],%o4
52155714Skris	lduw	[%o2+8],%o5
52255714Skris	addccc	%o5,%o4,%o5
52355714Skris	stuw	%o5,[%o0+8]
52455714Skris	clr	%o0
52555714Skris	retl
52655714Skris	movcs	%icc,1,%o0
52755714Skris
52855714Skris.type	bn_add_words,#function
52955714Skris.size	bn_add_words,(.-bn_add_words)
53055714Skris
.align	32

53155714Skris.global bn_sub_words
53255714Skris/*
53355714Skris * BN_ULONG bn_sub_words(rp,ap,bp,n)
53455714Skris * BN_ULONG *rp,*ap,*bp;
53555714Skris * int n;
 *
 * For i in [0,n): rp[i] = ap[i] - bp[i] - borrow (32-bit words), with
 * the borrow propagated from word to word through the icc carry flag.
 * Returns 1 if the last subtraction borrowed, else 0. Returns 0 without
 * touching memory when n <= 0.
 *
 * Register roles: %o0 = rp, %o1 = ap, %o2 = bp, %o3 = n (words left),
 * %o4, %o5, %g1-%g4 = operand words.
 *
 * The carry flag must survive from one subccc to the next, so the loop
 * bookkeeping uses only instructions that leave the condition codes
 * alone: dec, inc, and, loads, stores and the register-testing
 * brnz/brz branches.
 *
 * The entry point is aligned to 32 bytes like every other routine in
 * this file. Without the directive its position, and that of the loop
 * label 8 instructions further on, depended on where the preceding
 * routine happened to end.
53655714Skris */
53755714Skrisbn_sub_words:
	/* Sign-extend the 32-bit int n before the 64-bit register test. */
538160814Ssimon	sra	%o3,%g0,%o3	! signx %o3
	/* n > 0: continue with ap[0] preloaded by the annulled delay slot. Otherwise return 0. */
53955714Skris	brgz,a	%o3,.L_bn_sub_words_proceed
54055714Skris	lduw	[%o1],%o4
54155714Skris	retl
54255714Skris	clr	%o0
54355714Skris
54455714Skris.L_bn_sub_words_proceed:
	/* Fewer than 4 words go straight to the tail. The delay slot runs on both paths and resets the carry. */
54555714Skris	andcc	%o3,-4,%g0
54655714Skris	bz,pn	%icc,.L_bn_sub_words_tail
54755714Skris	addcc	%g0,0,%g0	! clear carry flag
54855714Skris
	/* Main loop: four words per iteration. %o4 holds ap[0] on entry. */
54955714Skris.L_bn_sub_words_loop:		! wow! 32 aligned!
55055714Skris	dec	4,%o3
55155714Skris	lduw	[%o2],%o5
55255714Skris	lduw	[%o1+4],%g1
55355714Skris	lduw	[%o2+4],%g2
55455714Skris	lduw	[%o1+8],%g3
55555714Skris	lduw	[%o2+8],%g4
55655714Skris	subccc	%o4,%o5,%o5
55755714Skris	stuw	%o5,[%o0]
55855714Skris
55955714Skris	lduw	[%o1+12],%o4
56055714Skris	lduw	[%o2+12],%o5
56155714Skris	inc	16,%o1
56255714Skris	subccc	%g1,%g2,%g2
56355714Skris	stuw	%g2,[%o0+4]
56455714Skris
56555714Skris	inc	16,%o2
56655714Skris	subccc	%g3,%g4,%g4
56755714Skris	stuw	%g4,[%o0+8]
56855714Skris
	/* Fourth word: rp is advanced first, so the store goes to the new rp minus 4. */
56955714Skris	inc	16,%o0
57055714Skris	subccc	%o4,%o5,%o5
57155714Skris	stuw	%o5,[%o0-4]
	/* Test n & ~3 in a scratch register rather than with andcc, keeping the carry flag intact. */
57255714Skris	and	%o3,-4,%g1
57355714Skris	brnz,a,pt	%g1,.L_bn_sub_words_loop
57455714Skris	lduw	[%o1],%o4
57555714Skris
	/* 1 to 3 words left: go to the tail with ap[0] preloaded. */
57655714Skris	brnz,a,pn	%o3,.L_bn_sub_words_tail
57755714Skris	lduw	[%o1],%o4
57855714Skris.L_bn_sub_words_return:
	/* Return value: 0, replaced by 1 in the delay slot if the carry (borrow) flag is set. */
57955714Skris	clr	%o0
58055714Skris	retl
58155714Skris	movcs	%icc,1,%o0
	/* Never executed (follows retl and its delay slot) - presumably padding, confirm. */
58255714Skris	nop
58355714Skris
	/* Tail: up to three remaining words. %o4 holds ap[0] on entry. */
58455714Skris.L_bn_sub_words_tail:
58555714Skris	lduw	[%o2],%o5
58655714Skris	dec	%o3
58755714Skris	subccc	%o4,%o5,%o5
	/* The store in the non-annulled delay slot is performed whether or not we return here. */
58855714Skris	brz,pt	%o3,.L_bn_sub_words_return
58955714Skris	stuw	%o5,[%o0]
59055714Skris
59155714Skris	lduw	[%o1+4],%o4
59255714Skris	lduw	[%o2+4],%o5
59355714Skris	dec	%o3
59455714Skris	subccc	%o4,%o5,%o5
59555714Skris	brz,pt	%o3,.L_bn_sub_words_return
59655714Skris	stuw	%o5,[%o0+4]
59755714Skris
59855714Skris	lduw	[%o1+8],%o4
59955714Skris	lduw	[%o2+8],%o5
60055714Skris	subccc	%o4,%o5,%o5
60155714Skris	stuw	%o5,[%o0+8]
60255714Skris	clr	%o0
60355714Skris	retl
60455714Skris	movcs	%icc,1,%o0
60555714Skris
60655714Skris.type	bn_sub_words,#function
60755714Skris.size	bn_sub_words,(.-bn_sub_words)
60855714Skris
60955714Skris/*
61055714Skris * Code below depends on the fact that upper parts of the %l0-%l7
61155714Skris * and %i0-%i7 are zeroed by kernel after context switch. In
61255714Skris * previous versions this comment stated that "the trouble is that
61355714Skris * it's not feasible to implement the mumbo-jumbo in less V9
61455714Skris * instructions:-(" which apparently isn't true thanks to
61555714Skris * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
61655714Skris * results not from the shorter code, but from elimination of
61755714Skris * multicycle non-pairable 'rd %y,%rd' instructions.
61855714Skris *
61955714Skris *							Andy.
62055714Skris */
62155714Skris
/* Value added to %sp by the save instruction at the top of bn_mul_comba8. */
62255714Skris#define FRAME_SIZE	-96
62355714Skris
62455714Skris/*
62555714Skris * Here is register usage map for *all* routines below.
 *
 * Roles as they appear in bn_mul_comba8:
 *   t_1      - product of the current mulx, and the sum whose low word
 *              is stored to the result
 *   t_2      - the constant 1 << 32 (built with mov 1 / sllx 32)
 *   c_12     - 64-bit running column sum
 *   c_3      - overflow of c_12: t_2 is added to it each time a 64-bit
 *              add carries out, and it is or-ed back into c_12 after
 *              the column's low word has been shifted out
 *   ap(I), bp(I), rp(I) - word I of the a, b and r arrays, addressed
 *              through %i1, %i2 and %i0
 *   a_N, b_N - registers holding the words loaded from ap(N) and bp(N)
62655714Skris */
62755714Skris#define t_1	%o0
62855714Skris#define	t_2	%o1
62955714Skris#define c_12	%o2
63055714Skris#define c_3	%o3
63155714Skris
63255714Skris#define ap(I)	[%i1+4*I]
63355714Skris#define bp(I)	[%i2+4*I]
63455714Skris#define rp(I)	[%i0+4*I]
63555714Skris
63655714Skris#define	a_0	%l0
63755714Skris#define	a_1	%l1
63855714Skris#define	a_2	%l2
63955714Skris#define	a_3	%l3
64055714Skris#define	a_4	%l4
64155714Skris#define	a_5	%l5
64255714Skris#define	a_6	%l6
64355714Skris#define	a_7	%l7
64455714Skris
64555714Skris#define	b_0	%i3
64655714Skris#define	b_1	%i4
64755714Skris#define	b_2	%i5
64855714Skris#define	b_3	%o4
64955714Skris#define	b_4	%o5
65055714Skris#define	b_5	%o7
65155714Skris#define	b_6	%g1
65255714Skris#define	b_7	%g4
65355714Skris
65455714Skris.align	32
65555714Skris.global bn_mul_comba8
65655714Skris/*
65755714Skris * void bn_mul_comba8(r,a,b)
65855714Skris * BN_ULONG *r,*a,*b;
65955714Skris */
66055714Skrisbn_mul_comba8:
66155714Skris	save	%sp,FRAME_SIZE,%sp
66255714Skris	mov	1,t_2
66355714Skris	lduw	ap(0),a_0
66455714Skris	sllx	t_2,32,t_2
66555714Skris	lduw	bp(0),b_0	!=
66655714Skris	lduw	bp(1),b_1
66755714Skris	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
66855714Skris	srlx	t_1,32,c_12
66955714Skris	stuw	t_1,rp(0)	!=!r[0]=c1;
67055714Skris
67155714Skris	lduw	ap(1),a_1
67255714Skris	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
67355714Skris	addcc	c_12,t_1,c_12
67455714Skris	clr	c_3		!=
67555714Skris	bcs,a	%xcc,.+8
67655714Skris	add	c_3,t_2,c_3
67755714Skris	lduw	ap(2),a_2
67855714Skris	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
67955714Skris	addcc	c_12,t_1,t_1
68055714Skris	bcs,a	%xcc,.+8
68155714Skris	add	c_3,t_2,c_3
68255714Skris	srlx	t_1,32,c_12	!=
68355714Skris	stuw	t_1,rp(1)	!r[1]=c2;
68455714Skris	or	c_12,c_3,c_12
68555714Skris
68655714Skris	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
68755714Skris	addcc	c_12,t_1,c_12	!=
68855714Skris	clr	c_3
68955714Skris	bcs,a	%xcc,.+8
69055714Skris	add	c_3,t_2,c_3
69155714Skris	lduw	bp(2),b_2	!=
69255714Skris	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
69355714Skris	addcc	c_12,t_1,c_12
69455714Skris	bcs,a	%xcc,.+8
69555714Skris	add	c_3,t_2,c_3	!=
69655714Skris	lduw	bp(3),b_3
69755714Skris	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
69855714Skris	addcc	c_12,t_1,t_1
69955714Skris	bcs,a	%xcc,.+8	!=
70055714Skris	add	c_3,t_2,c_3
70155714Skris	srlx	t_1,32,c_12
70255714Skris	stuw	t_1,rp(2)	!r[2]=c3;
70355714Skris	or	c_12,c_3,c_12	!=
70455714Skris
70555714Skris	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
70655714Skris	addcc	c_12,t_1,c_12
70755714Skris	clr	c_3
70855714Skris	bcs,a	%xcc,.+8	!=
70955714Skris	add	c_3,t_2,c_3
71055714Skris	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
71155714Skris	addcc	c_12,t_1,c_12
71255714Skris	bcs,a	%xcc,.+8	!=
71355714Skris	add	c_3,t_2,c_3
71455714Skris	lduw	ap(3),a_3
71555714Skris	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
71655714Skris	addcc	c_12,t_1,c_12	!=
71755714Skris	bcs,a	%xcc,.+8
71855714Skris	add	c_3,t_2,c_3
71955714Skris	lduw	ap(4),a_4
72055714Skris	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
72155714Skris	addcc	c_12,t_1,t_1
72255714Skris	bcs,a	%xcc,.+8
72355714Skris	add	c_3,t_2,c_3
72455714Skris	srlx	t_1,32,c_12	!=
72555714Skris	stuw	t_1,rp(3)	!r[3]=c1;
72655714Skris	or	c_12,c_3,c_12
72755714Skris
72855714Skris	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
72955714Skris	addcc	c_12,t_1,c_12	!=
73055714Skris	clr	c_3
73155714Skris	bcs,a	%xcc,.+8
73255714Skris	add	c_3,t_2,c_3
73355714Skris	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
73455714Skris	addcc	c_12,t_1,c_12
73555714Skris	bcs,a	%xcc,.+8
73655714Skris	add	c_3,t_2,c_3
73755714Skris	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
73855714Skris	addcc	c_12,t_1,c_12
73955714Skris	bcs,a	%xcc,.+8
74055714Skris	add	c_3,t_2,c_3
74155714Skris	lduw	bp(4),b_4	!=
74255714Skris	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
74355714Skris	addcc	c_12,t_1,c_12
74455714Skris	bcs,a	%xcc,.+8
74555714Skris	add	c_3,t_2,c_3	!=
74655714Skris	lduw	bp(5),b_5
74755714Skris	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
74855714Skris	addcc	c_12,t_1,t_1
74955714Skris	bcs,a	%xcc,.+8	!=
75055714Skris	add	c_3,t_2,c_3
75155714Skris	srlx	t_1,32,c_12
75255714Skris	stuw	t_1,rp(4)	!r[4]=c2;
75355714Skris	or	c_12,c_3,c_12	!=
75455714Skris
75555714Skris	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
75655714Skris	addcc	c_12,t_1,c_12
75755714Skris	clr	c_3
75855714Skris	bcs,a	%xcc,.+8	!=
75955714Skris	add	c_3,t_2,c_3
76055714Skris	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
76155714Skris	addcc	c_12,t_1,c_12
76255714Skris	bcs,a	%xcc,.+8	!=
76355714Skris	add	c_3,t_2,c_3
76455714Skris	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
76555714Skris	addcc	c_12,t_1,c_12
76655714Skris	bcs,a	%xcc,.+8	!=
76755714Skris	add	c_3,t_2,c_3
76855714Skris	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
76955714Skris	addcc	c_12,t_1,c_12
77055714Skris	bcs,a	%xcc,.+8	!=
77155714Skris	add	c_3,t_2,c_3
77255714Skris	lduw	ap(5),a_5
77355714Skris	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
77455714Skris	addcc	c_12,t_1,c_12	!=
77555714Skris	bcs,a	%xcc,.+8
77655714Skris	add	c_3,t_2,c_3
77755714Skris	lduw	ap(6),a_6
77855714Skris	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
77955714Skris	addcc	c_12,t_1,t_1
78055714Skris	bcs,a	%xcc,.+8
78155714Skris	add	c_3,t_2,c_3
78255714Skris	srlx	t_1,32,c_12	!=
78355714Skris	stuw	t_1,rp(5)	!r[5]=c3;
78455714Skris	or	c_12,c_3,c_12
78555714Skris
78655714Skris	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
78755714Skris	addcc	c_12,t_1,c_12	!=
78855714Skris	clr	c_3
78955714Skris	bcs,a	%xcc,.+8
79055714Skris	add	c_3,t_2,c_3
79155714Skris	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
79255714Skris	addcc	c_12,t_1,c_12
79355714Skris	bcs,a	%xcc,.+8
79455714Skris	add	c_3,t_2,c_3
79555714Skris	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
79655714Skris	addcc	c_12,t_1,c_12
79755714Skris	bcs,a	%xcc,.+8
79855714Skris	add	c_3,t_2,c_3
79955714Skris	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
80055714Skris	addcc	c_12,t_1,c_12
80155714Skris	bcs,a	%xcc,.+8
80255714Skris	add	c_3,t_2,c_3
80355714Skris	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
80455714Skris	addcc	c_12,t_1,c_12
80555714Skris	bcs,a	%xcc,.+8
80655714Skris	add	c_3,t_2,c_3
80755714Skris	lduw	bp(6),b_6	!=
80855714Skris	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
80955714Skris	addcc	c_12,t_1,c_12
81055714Skris	bcs,a	%xcc,.+8
81155714Skris	add	c_3,t_2,c_3	!=
81255714Skris	lduw	bp(7),b_7
81355714Skris	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
81455714Skris	addcc	c_12,t_1,t_1
81555714Skris	bcs,a	%xcc,.+8	!=
81655714Skris	add	c_3,t_2,c_3
81755714Skris	srlx	t_1,32,c_12
81855714Skris	stuw	t_1,rp(6)	!r[6]=c1;
81955714Skris	or	c_12,c_3,c_12	!=
82055714Skris
82155714Skris	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
82255714Skris	addcc	c_12,t_1,c_12
82355714Skris	clr	c_3
82455714Skris	bcs,a	%xcc,.+8	!=
82555714Skris	add	c_3,t_2,c_3
82655714Skris	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
82755714Skris	addcc	c_12,t_1,c_12
82855714Skris	bcs,a	%xcc,.+8	!=
82955714Skris	add	c_3,t_2,c_3
83055714Skris	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
83155714Skris	addcc	c_12,t_1,c_12
83255714Skris	bcs,a	%xcc,.+8	!=
83355714Skris	add	c_3,t_2,c_3
83455714Skris	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
83555714Skris	addcc	c_12,t_1,c_12
83655714Skris	bcs,a	%xcc,.+8	!=
83755714Skris	add	c_3,t_2,c_3
83855714Skris	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
83955714Skris	addcc	c_12,t_1,c_12
84055714Skris	bcs,a	%xcc,.+8	!=
84155714Skris	add	c_3,t_2,c_3
84255714Skris	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
84355714Skris	addcc	c_12,t_1,c_12
84455714Skris	bcs,a	%xcc,.+8	!=
84555714Skris	add	c_3,t_2,c_3
84655714Skris	lduw	ap(7),a_7
84755714Skris	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
84855714Skris	addcc	c_12,t_1,c_12
84955714Skris	bcs,a	%xcc,.+8
85055714Skris	add	c_3,t_2,c_3
85155714Skris	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
85255714Skris	addcc	c_12,t_1,t_1
85355714Skris	bcs,a	%xcc,.+8
85455714Skris	add	c_3,t_2,c_3
85555714Skris	srlx	t_1,32,c_12	!=
85655714Skris	stuw	t_1,rp(7)	!r[7]=c2;
85755714Skris	or	c_12,c_3,c_12
85855714Skris
85955714Skris	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
86055714Skris	addcc	c_12,t_1,c_12
86155714Skris	clr	c_3
86255714Skris	bcs,a	%xcc,.+8
86355714Skris	add	c_3,t_2,c_3	!=
86455714Skris	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
86555714Skris	addcc	c_12,t_1,c_12
86655714Skris	bcs,a	%xcc,.+8
86755714Skris	add	c_3,t_2,c_3	!=
86855714Skris	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
86955714Skris	addcc	c_12,t_1,c_12
87055714Skris	bcs,a	%xcc,.+8
87155714Skris	add	c_3,t_2,c_3	!=
87255714Skris	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
87355714Skris	addcc	c_12,t_1,c_12
87455714Skris	bcs,a	%xcc,.+8
87555714Skris	add	c_3,t_2,c_3	!=
87655714Skris	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
87755714Skris	addcc	c_12,t_1,c_12
87855714Skris	bcs,a	%xcc,.+8
87955714Skris	add	c_3,t_2,c_3	!=
88055714Skris	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
88155714Skris	addcc	c_12,t_1,c_12
88255714Skris	bcs,a	%xcc,.+8
88355714Skris	add	c_3,t_2,c_3	!=
88455714Skris	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
88555714Skris	addcc	c_12,t_1,t_1
88655714Skris	bcs,a	%xcc,.+8
88755714Skris	add	c_3,t_2,c_3	!=
88855714Skris	srlx	t_1,32,c_12
88955714Skris	stuw	t_1,rp(8)	!r[8]=c3;
89055714Skris	or	c_12,c_3,c_12
89155714Skris
89255714Skris	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
89355714Skris	addcc	c_12,t_1,c_12
89455714Skris	clr	c_3
89555714Skris	bcs,a	%xcc,.+8
89655714Skris	add	c_3,t_2,c_3	!=
89755714Skris	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
89855714Skris	addcc	c_12,t_1,c_12
89955714Skris	bcs,a	%xcc,.+8	!=
90055714Skris	add	c_3,t_2,c_3
90155714Skris	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
90255714Skris	addcc	c_12,t_1,c_12
90355714Skris	bcs,a	%xcc,.+8	!=
90455714Skris	add	c_3,t_2,c_3
90555714Skris	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
90655714Skris	addcc	c_12,t_1,c_12
90755714Skris	bcs,a	%xcc,.+8	!=
90855714Skris	add	c_3,t_2,c_3
90955714Skris	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
91055714Skris	addcc	c_12,t_1,c_12
91155714Skris	bcs,a	%xcc,.+8	!=
91255714Skris	add	c_3,t_2,c_3
91355714Skris	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
91455714Skris	addcc	c_12,t_1,t_1
91555714Skris	bcs,a	%xcc,.+8	!=
91655714Skris	add	c_3,t_2,c_3
91755714Skris	srlx	t_1,32,c_12
91855714Skris	stuw	t_1,rp(9)	!r[9]=c1;
91955714Skris	or	c_12,c_3,c_12	!=
92055714Skris
92155714Skris	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
92255714Skris	addcc	c_12,t_1,c_12
92355714Skris	clr	c_3
92455714Skris	bcs,a	%xcc,.+8	!=
92555714Skris	add	c_3,t_2,c_3
92655714Skris	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
92755714Skris	addcc	c_12,t_1,c_12
92855714Skris	bcs,a	%xcc,.+8	!=
92955714Skris	add	c_3,t_2,c_3
93055714Skris	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
93155714Skris	addcc	c_12,t_1,c_12
93255714Skris	bcs,a	%xcc,.+8	!=
93355714Skris	add	c_3,t_2,c_3
93455714Skris	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
93555714Skris	addcc	c_12,t_1,c_12
93655714Skris	bcs,a	%xcc,.+8	!=
93755714Skris	add	c_3,t_2,c_3
93855714Skris	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
93955714Skris	addcc	c_12,t_1,t_1
94055714Skris	bcs,a	%xcc,.+8	!=
94155714Skris	add	c_3,t_2,c_3
94255714Skris	srlx	t_1,32,c_12
94355714Skris	stuw	t_1,rp(10)	!r[10]=c2;
94455714Skris	or	c_12,c_3,c_12	!=
94555714Skris
94655714Skris	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
94755714Skris	addcc	c_12,t_1,c_12
94855714Skris	clr	c_3
94955714Skris	bcs,a	%xcc,.+8	!=
95055714Skris	add	c_3,t_2,c_3
95155714Skris	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
95255714Skris	addcc	c_12,t_1,c_12
95355714Skris	bcs,a	%xcc,.+8	!=
95455714Skris	add	c_3,t_2,c_3
95555714Skris	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
95655714Skris	addcc	c_12,t_1,c_12
95755714Skris	bcs,a	%xcc,.+8	!=
95855714Skris	add	c_3,t_2,c_3
95955714Skris	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
96055714Skris	addcc	c_12,t_1,t_1
96155714Skris	bcs,a	%xcc,.+8	!=
96255714Skris	add	c_3,t_2,c_3
96355714Skris	srlx	t_1,32,c_12
96455714Skris	stuw	t_1,rp(11)	!r[11]=c3;
96555714Skris	or	c_12,c_3,c_12	!=
96655714Skris
96755714Skris	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
96855714Skris	addcc	c_12,t_1,c_12
96955714Skris	clr	c_3
97055714Skris	bcs,a	%xcc,.+8	!=
97155714Skris	add	c_3,t_2,c_3
97255714Skris	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
97355714Skris	addcc	c_12,t_1,c_12
97455714Skris	bcs,a	%xcc,.+8	!=
97555714Skris	add	c_3,t_2,c_3
97655714Skris	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
97755714Skris	addcc	c_12,t_1,t_1
97855714Skris	bcs,a	%xcc,.+8	!=
97955714Skris	add	c_3,t_2,c_3
98055714Skris	srlx	t_1,32,c_12
98155714Skris	stuw	t_1,rp(12)	!r[12]=c1;
98255714Skris	or	c_12,c_3,c_12	!=
98355714Skris
98455714Skris	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
98555714Skris	addcc	c_12,t_1,c_12
98655714Skris	clr	c_3
98755714Skris	bcs,a	%xcc,.+8	!=
98855714Skris	add	c_3,t_2,c_3
98955714Skris	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
99055714Skris	addcc	c_12,t_1,t_1
99155714Skris	bcs,a	%xcc,.+8	!=
99255714Skris	add	c_3,t_2,c_3
99355714Skris	srlx	t_1,32,c_12
99455714Skris	st	t_1,rp(13)	!r[13]=c2;
99555714Skris	or	c_12,c_3,c_12	!=
99655714Skris
99755714Skris	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
99855714Skris	addcc	c_12,t_1,t_1
99955714Skris	srlx	t_1,32,c_12	!=
100055714Skris	stuw	t_1,rp(14)	!r[14]=c3;
100155714Skris	stuw	c_12,rp(15)	!r[15]=c1;
100255714Skris
100355714Skris	ret
100455714Skris	restore	%g0,%g0,%o0	!=
100555714Skris
100655714Skris.type	bn_mul_comba8,#function
100755714Skris.size	bn_mul_comba8,(.-bn_mul_comba8)
100855714Skris
100955714Skris.align	32
101055714Skris
101155714Skris.global bn_mul_comba4
101255714Skris/*
101355714Skris * void bn_mul_comba4(r,a,b)
101455714Skris * BN_ULONG *r,*a,*b;
101555714Skris */
/*
 * Writes the eight words r[0..7] of the product of the four-word
 * operands a[0..3] and b[0..3], one result column at a time; the
 * mul_add_c() note on each mulx names the C step it stands for.
 *
 * rp()/ap()/bp(), a_N, b_N, t_1, t_2, c_12, c_3 and FRAME_SIZE are
 * defined outside this section (presumably cpp macros for registers
 * and word-indexed addresses -- confirm against their definitions).
 *
 * Carry scheme used below:
 *   - t_2 is set to 1<<32 in the prologue and never modified.
 *   - every product is accumulated with addcc.  The annulled branch
 *     "bcs,a %xcc,.+8" lets the "add c_3,t_2,c_3" in its delay slot
 *     execute only when the 64-bit add carried out, and its target is
 *     the instruction after the delay slot, so the pair acts as a
 *     conditional add: c_3 counts carries in units of 1<<32.
 *   - the last add of a column writes t_1 instead of c_12; the low
 *     word of t_1 is stored to r[k], "srlx t_1,32,c_12" keeps the
 *     upper half, and "or c_12,c_3,c_12" merges the counted carries
 *     in as the starting value of the next column.
 *   - c_3 is cleared at the start of every column that counts carries.
 *
 * Operand loads are interleaved with the arithmetic; each word is
 * loaded before the first mulx that reads it.
 */
101655714Skrisbn_mul_comba4:
101755714Skris	save	%sp,FRAME_SIZE,%sp
101855714Skris	lduw	ap(0),a_0
101955714Skris	mov	1,t_2
102055714Skris	lduw	bp(0),b_0
102155714Skris	sllx	t_2,32,t_2	!=
102255714Skris	lduw	bp(1),b_1
/* r[0]: single product, nothing to accumulate yet. */
102355714Skris	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
102455714Skris	srlx	t_1,32,c_12
102555714Skris	stuw	t_1,rp(0)	!=!r[0]=c1;
102655714Skris
/* r[1]: products a[i]*b[j] with i+j == 1. */
102755714Skris	lduw	ap(1),a_1
102855714Skris	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
102955714Skris	addcc	c_12,t_1,c_12
103055714Skris	clr	c_3		!=
103155714Skris	bcs,a	%xcc,.+8
103255714Skris	add	c_3,t_2,c_3
103355714Skris	lduw	ap(2),a_2
103455714Skris	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
103555714Skris	addcc	c_12,t_1,t_1
103655714Skris	bcs,a	%xcc,.+8
103755714Skris	add	c_3,t_2,c_3
103855714Skris	srlx	t_1,32,c_12	!=
103955714Skris	stuw	t_1,rp(1)	!r[1]=c2;
104055714Skris	or	c_12,c_3,c_12
104155714Skris
/* r[2]: products a[i]*b[j] with i+j == 2. */
104255714Skris	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
104355714Skris	addcc	c_12,t_1,c_12	!=
104455714Skris	clr	c_3
104555714Skris	bcs,a	%xcc,.+8
104655714Skris	add	c_3,t_2,c_3
104755714Skris	lduw	bp(2),b_2	!=
104855714Skris	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
104955714Skris	addcc	c_12,t_1,c_12
105055714Skris	bcs,a	%xcc,.+8
105155714Skris	add	c_3,t_2,c_3	!=
105255714Skris	lduw	bp(3),b_3
105355714Skris	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
105455714Skris	addcc	c_12,t_1,t_1
105555714Skris	bcs,a	%xcc,.+8	!=
105655714Skris	add	c_3,t_2,c_3
105755714Skris	srlx	t_1,32,c_12
105855714Skris	stuw	t_1,rp(2)	!r[2]=c3;
105955714Skris	or	c_12,c_3,c_12	!=
106055714Skris
/* r[3]: products a[i]*b[j] with i+j == 3. */
106155714Skris	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
106255714Skris	addcc	c_12,t_1,c_12
106355714Skris	clr	c_3
106455714Skris	bcs,a	%xcc,.+8	!=
106555714Skris	add	c_3,t_2,c_3
106655714Skris	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
106755714Skris	addcc	c_12,t_1,c_12
106855714Skris	bcs,a	%xcc,.+8	!=
106955714Skris	add	c_3,t_2,c_3
107055714Skris	lduw	ap(3),a_3
107155714Skris	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
107255714Skris	addcc	c_12,t_1,c_12	!=
107355714Skris	bcs,a	%xcc,.+8
107455714Skris	add	c_3,t_2,c_3
107555714Skris	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
107655714Skris	addcc	c_12,t_1,t_1	!=
107755714Skris	bcs,a	%xcc,.+8
107855714Skris	add	c_3,t_2,c_3
107955714Skris	srlx	t_1,32,c_12
108055714Skris	stuw	t_1,rp(3)	!=!r[3]=c1;
108155714Skris	or	c_12,c_3,c_12
108255714Skris
/* r[4]: products a[i]*b[j] with i+j == 4. */
108355714Skris	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
108455714Skris	addcc	c_12,t_1,c_12
108555714Skris	clr	c_3		!=
108655714Skris	bcs,a	%xcc,.+8
108755714Skris	add	c_3,t_2,c_3
108855714Skris	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
108955714Skris	addcc	c_12,t_1,c_12	!=
109055714Skris	bcs,a	%xcc,.+8
109155714Skris	add	c_3,t_2,c_3
109255714Skris	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
109355714Skris	addcc	c_12,t_1,t_1	!=
109455714Skris	bcs,a	%xcc,.+8
109555714Skris	add	c_3,t_2,c_3
109655714Skris	srlx	t_1,32,c_12
109755714Skris	stuw	t_1,rp(4)	!=!r[4]=c2;
109855714Skris	or	c_12,c_3,c_12
109955714Skris
/* r[5]: products a[i]*b[j] with i+j == 5. */
110055714Skris	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
110155714Skris	addcc	c_12,t_1,c_12
110255714Skris	clr	c_3		!=
110355714Skris	bcs,a	%xcc,.+8
110455714Skris	add	c_3,t_2,c_3
110555714Skris	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
110655714Skris	addcc	c_12,t_1,t_1	!=
110755714Skris	bcs,a	%xcc,.+8
110855714Skris	add	c_3,t_2,c_3
110955714Skris	srlx	t_1,32,c_12
111055714Skris	stuw	t_1,rp(5)	!=!r[5]=c3;
111155714Skris	or	c_12,c_3,c_12
111255714Skris
/*
 * r[6] and r[7]: last product.  No carry is counted here; the low word
 * of the sum goes to r[6] and its upper half to r[7].
 */
111355714Skris	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
111455714Skris	addcc	c_12,t_1,t_1
111555714Skris	srlx	t_1,32,c_12	!=
111655714Skris	stuw	t_1,rp(6)	!r[6]=c1;
111755714Skris	stuw	c_12,rp(7)	!r[7]=c2;
111855714Skris
/* The restore in the delay slot of ret leaves zero in the caller's %o0. */
111955714Skris	ret
112055714Skris	restore	%g0,%g0,%o0
112155714Skris
112255714Skris.type	bn_mul_comba4,#function
112355714Skris.size	bn_mul_comba4,(.-bn_mul_comba4)
112455714Skris
112555714Skris.align	32
112655714Skris
112755714Skris.global bn_sqr_comba8
/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * (Prototype written by analogy with the bn_sqr_comba4 header in this
 * file; only rp() and ap() are referenced below.)
 *
 * Writes the sixteen words r[0..15] of the square of the eight-word
 * operand a[0..7], one result column at a time.  The sqr_add_c() and
 * sqr_add_c2() notes on each mulx name the C step it stands for.
 *
 * rp()/ap(), a_N, t_1, t_2, c_12, c_3 and FRAME_SIZE are defined
 * outside this section (presumably cpp macros for registers and
 * word-indexed addresses -- confirm against their definitions).
 *
 * Carry scheme used below:
 *   - t_2 is set to 1<<32 in the prologue and never modified.
 *   - every addcc is followed by the annulled branch
 *     "bcs,a %xcc,.+8", whose delay-slot "add c_3,t_2,c_3" executes
 *     only when the 64-bit add carried out; the branch target is the
 *     instruction after the delay slot, so the pair is a conditional
 *     add and c_3 counts carries in units of 1<<32.
 *   - a cross product a[i]*a[j] with i != j (sqr_add_c2) is computed
 *     once and added twice, each add with its own carry check; a
 *     diagonal product a[i]*a[i] (sqr_add_c) is added once.
 *   - the last add of a column writes t_1 instead of c_12; the low
 *     word of t_1 is stored to r[k], "srlx t_1,32,c_12" keeps the
 *     upper half, and "or c_12,c_3,c_12" merges the counted carries
 *     in as the starting value of the next column.
 *   - c_3 is cleared at the start of every column that counts carries.
 *
 * Words of a[] are loaded as the columns progress, each before the
 * first mulx that reads it.  All result words are stored with stuw.
 */
112855714Skrisbn_sqr_comba8:
112955714Skris	save	%sp,FRAME_SIZE,%sp
113055714Skris	mov	1,t_2
113155714Skris	lduw	ap(0),a_0
113255714Skris	sllx	t_2,32,t_2
113355714Skris	lduw	ap(1),a_1
/* r[0]: a[0]*a[0] only, nothing to accumulate yet. */
113455714Skris	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
113555714Skris	srlx	t_1,32,c_12
113655714Skris	stuw	t_1,rp(0)	!r[0]=c1;
113755714Skris
/* r[1]: a[0]*a[1], added twice. */
113855714Skris	lduw	ap(2),a_2
113955714Skris	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
114055714Skris	addcc	c_12,t_1,c_12
114155714Skris	clr	c_3
114255714Skris	bcs,a	%xcc,.+8
114355714Skris	add	c_3,t_2,c_3
114455714Skris	addcc	c_12,t_1,t_1
114555714Skris	bcs,a	%xcc,.+8
114655714Skris	add	c_3,t_2,c_3
114755714Skris	srlx	t_1,32,c_12
114855714Skris	stuw	t_1,rp(1)	!r[1]=c2;
114955714Skris	or	c_12,c_3,c_12
115055714Skris
/* r[2]: products a[i]*a[j] with i+j == 2. */
115155714Skris	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
115255714Skris	addcc	c_12,t_1,c_12
115355714Skris	clr	c_3
115455714Skris	bcs,a	%xcc,.+8
115555714Skris	add	c_3,t_2,c_3
115655714Skris	addcc	c_12,t_1,c_12
115755714Skris	bcs,a	%xcc,.+8
115855714Skris	add	c_3,t_2,c_3
115955714Skris	lduw	ap(3),a_3
116055714Skris	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
116155714Skris	addcc	c_12,t_1,t_1
116255714Skris	bcs,a	%xcc,.+8
116355714Skris	add	c_3,t_2,c_3
116455714Skris	srlx	t_1,32,c_12
116555714Skris	stuw	t_1,rp(2)	!r[2]=c3;
116655714Skris	or	c_12,c_3,c_12
116755714Skris
/* r[3]: products a[i]*a[j] with i+j == 3. */
116855714Skris	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
116955714Skris	addcc	c_12,t_1,c_12
117055714Skris	clr	c_3
117155714Skris	bcs,a	%xcc,.+8
117255714Skris	add	c_3,t_2,c_3
117355714Skris	addcc	c_12,t_1,c_12
117455714Skris	bcs,a	%xcc,.+8
117555714Skris	add	c_3,t_2,c_3
117655714Skris	lduw	ap(4),a_4
117755714Skris	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
117855714Skris	addcc	c_12,t_1,c_12
117955714Skris	bcs,a	%xcc,.+8
118055714Skris	add	c_3,t_2,c_3
118155714Skris	addcc	c_12,t_1,t_1
118255714Skris	bcs,a	%xcc,.+8
118355714Skris	add	c_3,t_2,c_3
118455714Skris	srlx	t_1,32,c_12
118555714Skris	stuw	t_1,rp(3)	!r[3]=c1;
118655714Skris	or	c_12,c_3,c_12
118755714Skris
/* r[4]: products a[i]*a[j] with i+j == 4. */
118855714Skris	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
118955714Skris	addcc	c_12,t_1,c_12
119055714Skris	clr	c_3
119155714Skris	bcs,a	%xcc,.+8
119255714Skris	add	c_3,t_2,c_3
119355714Skris	addcc	c_12,t_1,c_12
119455714Skris	bcs,a	%xcc,.+8
119555714Skris	add	c_3,t_2,c_3
119655714Skris	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
119755714Skris	addcc	c_12,t_1,c_12
119855714Skris	bcs,a	%xcc,.+8
119955714Skris	add	c_3,t_2,c_3
120055714Skris	addcc	c_12,t_1,c_12
120155714Skris	bcs,a	%xcc,.+8
120255714Skris	add	c_3,t_2,c_3
120355714Skris	lduw	ap(5),a_5
120455714Skris	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
120555714Skris	addcc	c_12,t_1,t_1
120655714Skris	bcs,a	%xcc,.+8
120755714Skris	add	c_3,t_2,c_3
120855714Skris	srlx	t_1,32,c_12
120955714Skris	stuw	t_1,rp(4)	!r[4]=c2;
121055714Skris	or	c_12,c_3,c_12
121155714Skris
/* r[5]: products a[i]*a[j] with i+j == 5. */
121255714Skris	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
121355714Skris	addcc	c_12,t_1,c_12
121455714Skris	clr	c_3
121555714Skris	bcs,a	%xcc,.+8
121655714Skris	add	c_3,t_2,c_3
121755714Skris	addcc	c_12,t_1,c_12
121855714Skris	bcs,a	%xcc,.+8
121955714Skris	add	c_3,t_2,c_3
122055714Skris	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
122155714Skris	addcc	c_12,t_1,c_12
122255714Skris	bcs,a	%xcc,.+8
122355714Skris	add	c_3,t_2,c_3
122455714Skris	addcc	c_12,t_1,c_12
122555714Skris	bcs,a	%xcc,.+8
122655714Skris	add	c_3,t_2,c_3
122755714Skris	lduw	ap(6),a_6
122855714Skris	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
122955714Skris	addcc	c_12,t_1,c_12
123055714Skris	bcs,a	%xcc,.+8
123155714Skris	add	c_3,t_2,c_3
123255714Skris	addcc	c_12,t_1,t_1
123355714Skris	bcs,a	%xcc,.+8
123455714Skris	add	c_3,t_2,c_3
123555714Skris	srlx	t_1,32,c_12
123655714Skris	stuw	t_1,rp(5)	!r[5]=c3;
123755714Skris	or	c_12,c_3,c_12
123855714Skris
/* r[6]: products a[i]*a[j] with i+j == 6. */
123955714Skris	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
124055714Skris	addcc	c_12,t_1,c_12
124155714Skris	clr	c_3
124255714Skris	bcs,a	%xcc,.+8
124355714Skris	add	c_3,t_2,c_3
124455714Skris	addcc	c_12,t_1,c_12
124555714Skris	bcs,a	%xcc,.+8
124655714Skris	add	c_3,t_2,c_3
124755714Skris	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
124855714Skris	addcc	c_12,t_1,c_12
124955714Skris	bcs,a	%xcc,.+8
125055714Skris	add	c_3,t_2,c_3
125155714Skris	addcc	c_12,t_1,c_12
125255714Skris	bcs,a	%xcc,.+8
125355714Skris	add	c_3,t_2,c_3
125455714Skris	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
125555714Skris	addcc	c_12,t_1,c_12
125655714Skris	bcs,a	%xcc,.+8
125755714Skris	add	c_3,t_2,c_3
125855714Skris	addcc	c_12,t_1,c_12
125955714Skris	bcs,a	%xcc,.+8
126055714Skris	add	c_3,t_2,c_3
126155714Skris	lduw	ap(7),a_7
126255714Skris	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
126355714Skris	addcc	c_12,t_1,t_1
126455714Skris	bcs,a	%xcc,.+8
126555714Skris	add	c_3,t_2,c_3
126655714Skris	srlx	t_1,32,c_12
126755714Skris	stuw	t_1,rp(6)	!r[6]=c1;
126855714Skris	or	c_12,c_3,c_12
126955714Skris
/* r[7]: products a[i]*a[j] with i+j == 7 (all cross products). */
127055714Skris	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
127155714Skris	addcc	c_12,t_1,c_12
127255714Skris	clr	c_3
127355714Skris	bcs,a	%xcc,.+8
127455714Skris	add	c_3,t_2,c_3
127555714Skris	addcc	c_12,t_1,c_12
127655714Skris	bcs,a	%xcc,.+8
127755714Skris	add	c_3,t_2,c_3
127855714Skris	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
127955714Skris	addcc	c_12,t_1,c_12
128055714Skris	bcs,a	%xcc,.+8
128155714Skris	add	c_3,t_2,c_3
128255714Skris	addcc	c_12,t_1,c_12
128355714Skris	bcs,a	%xcc,.+8
128455714Skris	add	c_3,t_2,c_3
128555714Skris	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
128655714Skris	addcc	c_12,t_1,c_12
128755714Skris	bcs,a	%xcc,.+8
128855714Skris	add	c_3,t_2,c_3
128955714Skris	addcc	c_12,t_1,c_12
129055714Skris	bcs,a	%xcc,.+8
129155714Skris	add	c_3,t_2,c_3
129255714Skris	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
129355714Skris	addcc	c_12,t_1,c_12
129455714Skris	bcs,a	%xcc,.+8
129555714Skris	add	c_3,t_2,c_3
129655714Skris	addcc	c_12,t_1,t_1
129755714Skris	bcs,a	%xcc,.+8
129855714Skris	add	c_3,t_2,c_3
129955714Skris	srlx	t_1,32,c_12
130055714Skris	stuw	t_1,rp(7)	!r[7]=c2;
130155714Skris	or	c_12,c_3,c_12
130255714Skris
/* r[8]: products a[i]*a[j] with i+j == 8. */
130355714Skris	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
130455714Skris	addcc	c_12,t_1,c_12
130555714Skris	clr	c_3
130655714Skris	bcs,a	%xcc,.+8
130755714Skris	add	c_3,t_2,c_3
130855714Skris	addcc	c_12,t_1,c_12
130955714Skris	bcs,a	%xcc,.+8
131055714Skris	add	c_3,t_2,c_3
131155714Skris	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
131255714Skris	addcc	c_12,t_1,c_12
131355714Skris	bcs,a	%xcc,.+8
131455714Skris	add	c_3,t_2,c_3
131555714Skris	addcc	c_12,t_1,c_12
131655714Skris	bcs,a	%xcc,.+8
131755714Skris	add	c_3,t_2,c_3
131855714Skris	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
131955714Skris	addcc	c_12,t_1,c_12
132055714Skris	bcs,a	%xcc,.+8
132155714Skris	add	c_3,t_2,c_3
132255714Skris	addcc	c_12,t_1,c_12
132355714Skris	bcs,a	%xcc,.+8
132455714Skris	add	c_3,t_2,c_3
132555714Skris	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
132655714Skris	addcc	c_12,t_1,t_1
132755714Skris	bcs,a	%xcc,.+8
132855714Skris	add	c_3,t_2,c_3
132955714Skris	srlx	t_1,32,c_12
133055714Skris	stuw	t_1,rp(8)	!r[8]=c3;
133155714Skris	or	c_12,c_3,c_12
133255714Skris
/* r[9]: products a[i]*a[j] with i+j == 9 (all cross products). */
133355714Skris	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
133455714Skris	addcc	c_12,t_1,c_12
133555714Skris	clr	c_3
133655714Skris	bcs,a	%xcc,.+8
133755714Skris	add	c_3,t_2,c_3
133855714Skris	addcc	c_12,t_1,c_12
133955714Skris	bcs,a	%xcc,.+8
134055714Skris	add	c_3,t_2,c_3
134155714Skris	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
134255714Skris	addcc	c_12,t_1,c_12
134355714Skris	bcs,a	%xcc,.+8
134455714Skris	add	c_3,t_2,c_3
134555714Skris	addcc	c_12,t_1,c_12
134655714Skris	bcs,a	%xcc,.+8
134755714Skris	add	c_3,t_2,c_3
134855714Skris	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
134955714Skris	addcc	c_12,t_1,c_12
135055714Skris	bcs,a	%xcc,.+8
135155714Skris	add	c_3,t_2,c_3
135255714Skris	addcc	c_12,t_1,t_1
135355714Skris	bcs,a	%xcc,.+8
135455714Skris	add	c_3,t_2,c_3
135555714Skris	srlx	t_1,32,c_12
135655714Skris	stuw	t_1,rp(9)	!r[9]=c1;
135755714Skris	or	c_12,c_3,c_12
135855714Skris
/* r[10]: products a[i]*a[j] with i+j == 10. */
135955714Skris	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
136055714Skris	addcc	c_12,t_1,c_12
136155714Skris	clr	c_3
136255714Skris	bcs,a	%xcc,.+8
136355714Skris	add	c_3,t_2,c_3
136455714Skris	addcc	c_12,t_1,c_12
136555714Skris	bcs,a	%xcc,.+8
136655714Skris	add	c_3,t_2,c_3
136755714Skris	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
136855714Skris	addcc	c_12,t_1,c_12
136955714Skris	bcs,a	%xcc,.+8
137055714Skris	add	c_3,t_2,c_3
137155714Skris	addcc	c_12,t_1,c_12
137255714Skris	bcs,a	%xcc,.+8
137355714Skris	add	c_3,t_2,c_3
137455714Skris	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
137555714Skris	addcc	c_12,t_1,t_1
137655714Skris	bcs,a	%xcc,.+8
137755714Skris	add	c_3,t_2,c_3
137855714Skris	srlx	t_1,32,c_12
137955714Skris	stuw	t_1,rp(10)	!r[10]=c2;
138055714Skris	or	c_12,c_3,c_12
138155714Skris
/* r[11]: products a[i]*a[j] with i+j == 11 (all cross products). */
138255714Skris	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
138355714Skris	addcc	c_12,t_1,c_12
138455714Skris	clr	c_3
138555714Skris	bcs,a	%xcc,.+8
138655714Skris	add	c_3,t_2,c_3
138755714Skris	addcc	c_12,t_1,c_12
138855714Skris	bcs,a	%xcc,.+8
138955714Skris	add	c_3,t_2,c_3
139055714Skris	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
139155714Skris	addcc	c_12,t_1,c_12
139255714Skris	bcs,a	%xcc,.+8
139355714Skris	add	c_3,t_2,c_3
139455714Skris	addcc	c_12,t_1,t_1
139555714Skris	bcs,a	%xcc,.+8
139655714Skris	add	c_3,t_2,c_3
139755714Skris	srlx	t_1,32,c_12
139855714Skris	stuw	t_1,rp(11)	!r[11]=c3;
139955714Skris	or	c_12,c_3,c_12
140055714Skris
/* r[12]: products a[i]*a[j] with i+j == 12. */
140155714Skris	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
140255714Skris	addcc	c_12,t_1,c_12
140355714Skris	clr	c_3
140455714Skris	bcs,a	%xcc,.+8
140555714Skris	add	c_3,t_2,c_3
140655714Skris	addcc	c_12,t_1,c_12
140755714Skris	bcs,a	%xcc,.+8
140855714Skris	add	c_3,t_2,c_3
140955714Skris	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
141055714Skris	addcc	c_12,t_1,t_1
141155714Skris	bcs,a	%xcc,.+8
141255714Skris	add	c_3,t_2,c_3
141355714Skris	srlx	t_1,32,c_12
141455714Skris	stuw	t_1,rp(12)	!r[12]=c1;
141555714Skris	or	c_12,c_3,c_12
141655714Skris
/* r[13]: a[6]*a[7], added twice. */
141755714Skris	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
141855714Skris	addcc	c_12,t_1,c_12
141955714Skris	clr	c_3
142055714Skris	bcs,a	%xcc,.+8
142155714Skris	add	c_3,t_2,c_3
142255714Skris	addcc	c_12,t_1,t_1
142355714Skris	bcs,a	%xcc,.+8
142455714Skris	add	c_3,t_2,c_3
142555714Skris	srlx	t_1,32,c_12
142655714Skris	stuw	t_1,rp(13)	!r[13]=c2;
142755714Skris	or	c_12,c_3,c_12
142855714Skris
/*
 * r[14] and r[15]: last product a[7]*a[7].  No carry is counted here;
 * the low word of the sum goes to r[14] and its upper half to r[15].
 */
142955714Skris	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
143055714Skris	addcc	c_12,t_1,t_1
143155714Skris	srlx	t_1,32,c_12
143255714Skris	stuw	t_1,rp(14)	!r[14]=c3;
143355714Skris	stuw	c_12,rp(15)	!r[15]=c1;
143455714Skris
/* The restore in the delay slot of ret leaves zero in the caller's %o0. */
143555714Skris	ret
143655714Skris	restore	%g0,%g0,%o0
143755714Skris
143855714Skris.type	bn_sqr_comba8,#function
143955714Skris.size	bn_sqr_comba8,(.-bn_sqr_comba8)
144055714Skris
144155714Skris.align	32
144255714Skris
144355714Skris.global bn_sqr_comba4
144455714Skris/*
144555714Skris * void bn_sqr_comba4(r,a)
144655714Skris * BN_ULONG *r,*a;
144755714Skris */
/*
 * Writes the eight words r[0..7] of the square of the four-word
 * operand a[0..3], one result column at a time.  The sqr_add_c() and
 * sqr_add_c2() notes on each mulx name the C step it stands for.
 *
 * rp()/ap(), a_N, t_1, t_2, c_12, c_3 and FRAME_SIZE are defined
 * outside this section (presumably cpp macros for registers and
 * word-indexed addresses -- confirm against their definitions).
 *
 * Carry scheme used below:
 *   - t_2 is set to 1<<32 in the prologue and never modified.
 *   - every addcc is followed by the annulled branch
 *     "bcs,a %xcc,.+8", whose delay-slot "add c_3,t_2,c_3" executes
 *     only when the 64-bit add carried out; the branch target is the
 *     instruction after the delay slot, so the pair is a conditional
 *     add and c_3 counts carries in units of 1<<32.
 *   - a cross product a[i]*a[j] with i != j (sqr_add_c2) is computed
 *     once and added twice, each add with its own carry check; a
 *     diagonal product a[i]*a[i] (sqr_add_c) is added once.
 *   - the last add of a column writes t_1 instead of c_12; the low
 *     word of t_1 is stored to r[k], "srlx t_1,32,c_12" keeps the
 *     upper half, and "or c_12,c_3,c_12" merges the counted carries
 *     in as the starting value of the next column.
 *   - c_3 is cleared at the start of every column that counts carries.
 */
144855714Skrisbn_sqr_comba4:
144955714Skris	save	%sp,FRAME_SIZE,%sp
145055714Skris	mov	1,t_2
145155714Skris	lduw	ap(0),a_0
145255714Skris	sllx	t_2,32,t_2
145355714Skris	lduw	ap(1),a_1
/* r[0]: a[0]*a[0] only, nothing to accumulate yet. */
145455714Skris	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
145555714Skris	srlx	t_1,32,c_12
145655714Skris	stuw	t_1,rp(0)	!r[0]=c1;
145755714Skris
/* r[1]: a[0]*a[1], added twice. */
145855714Skris	lduw	ap(2),a_2
145955714Skris	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
146055714Skris	addcc	c_12,t_1,c_12
146155714Skris	clr	c_3
146255714Skris	bcs,a	%xcc,.+8
146355714Skris	add	c_3,t_2,c_3
146455714Skris	addcc	c_12,t_1,t_1
146555714Skris	bcs,a	%xcc,.+8
146655714Skris	add	c_3,t_2,c_3
146755714Skris	srlx	t_1,32,c_12
146855714Skris	stuw	t_1,rp(1)	!r[1]=c2;
146955714Skris	or	c_12,c_3,c_12
147055714Skris
/* r[2]: products a[i]*a[j] with i+j == 2. */
147155714Skris	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
147255714Skris	addcc	c_12,t_1,c_12
147355714Skris	clr	c_3
147455714Skris	bcs,a	%xcc,.+8
147555714Skris	add	c_3,t_2,c_3
147655714Skris	addcc	c_12,t_1,c_12
147755714Skris	bcs,a	%xcc,.+8
147855714Skris	add	c_3,t_2,c_3
147955714Skris	lduw	ap(3),a_3
148055714Skris	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
148155714Skris	addcc	c_12,t_1,t_1
148255714Skris	bcs,a	%xcc,.+8
148355714Skris	add	c_3,t_2,c_3
148455714Skris	srlx	t_1,32,c_12
148555714Skris	stuw	t_1,rp(2)	!r[2]=c3;
148655714Skris	or	c_12,c_3,c_12
148755714Skris
/* r[3]: products a[i]*a[j] with i+j == 3 (all cross products). */
148855714Skris	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
148955714Skris	addcc	c_12,t_1,c_12
149055714Skris	clr	c_3
149155714Skris	bcs,a	%xcc,.+8
149255714Skris	add	c_3,t_2,c_3
149355714Skris	addcc	c_12,t_1,c_12
149455714Skris	bcs,a	%xcc,.+8
149555714Skris	add	c_3,t_2,c_3
149655714Skris	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
149755714Skris	addcc	c_12,t_1,c_12
149855714Skris	bcs,a	%xcc,.+8
149955714Skris	add	c_3,t_2,c_3
150055714Skris	addcc	c_12,t_1,t_1
150155714Skris	bcs,a	%xcc,.+8
150255714Skris	add	c_3,t_2,c_3
150355714Skris	srlx	t_1,32,c_12
150455714Skris	stuw	t_1,rp(3)	!r[3]=c1;
150555714Skris	or	c_12,c_3,c_12
150655714Skris
/* r[4]: products a[i]*a[j] with i+j == 4. */
150755714Skris	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
150855714Skris	addcc	c_12,t_1,c_12
150955714Skris	clr	c_3
151055714Skris	bcs,a	%xcc,.+8
151155714Skris	add	c_3,t_2,c_3
151255714Skris	addcc	c_12,t_1,c_12
151355714Skris	bcs,a	%xcc,.+8
151455714Skris	add	c_3,t_2,c_3
151555714Skris	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
151655714Skris	addcc	c_12,t_1,t_1
151755714Skris	bcs,a	%xcc,.+8
151855714Skris	add	c_3,t_2,c_3
151955714Skris	srlx	t_1,32,c_12
152055714Skris	stuw	t_1,rp(4)	!r[4]=c2;
152155714Skris	or	c_12,c_3,c_12
152255714Skris
/* r[5]: a[2]*a[3], added twice. */
152355714Skris	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
152455714Skris	addcc	c_12,t_1,c_12
152555714Skris	clr	c_3
152655714Skris	bcs,a	%xcc,.+8
152755714Skris	add	c_3,t_2,c_3
152855714Skris	addcc	c_12,t_1,t_1
152955714Skris	bcs,a	%xcc,.+8
153055714Skris	add	c_3,t_2,c_3
153155714Skris	srlx	t_1,32,c_12
153255714Skris	stuw	t_1,rp(5)	!r[5]=c3;
153355714Skris	or	c_12,c_3,c_12
153455714Skris
/*
 * r[6] and r[7]: last product a[3]*a[3].  No carry is counted here;
 * the low word of the sum goes to r[6] and its upper half to r[7].
 */
153555714Skris	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
153655714Skris	addcc	c_12,t_1,t_1
153755714Skris	srlx	t_1,32,c_12
153855714Skris	stuw	t_1,rp(6)	!r[6]=c1;
153955714Skris	stuw	c_12,rp(7)	!r[7]=c2;
154055714Skris
/* The restore in the delay slot of ret leaves zero in the caller's %o0. */
154155714Skris	ret
154255714Skris	restore	%g0,%g0,%o0
154355714Skris
154455714Skris.type	bn_sqr_comba4,#function
154555714Skris.size	bn_sqr_comba4,(.-bn_sqr_comba4)
154655714Skris
154755714Skris.align	32
1548