1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim
10238384Sjkim# December 2005
11238384Sjkim#
12238384Sjkim# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13238384Sjkim# for undertaken effort are multiple. First of all, UltraSPARC is not
14238384Sjkim# the whole SPARCv9 universe and other VIS-free implementations deserve
15238384Sjkim# optimized code as much. Secondly, newly introduced UltraSPARC T1,
16238384Sjkim# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
17238384Sjkim# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18238384Sjkim# several integrated RSA/DSA accelerator circuits accessible through
19238384Sjkim# kernel driver [only(*)], but having decent user-land software
20238384Sjkim# implementation is important too. Finally, reasons like desire to
21238384Sjkim# experiment with dedicated squaring procedure. Yes, this module
22238384Sjkim# implements one, because it was easiest to draft it in SPARCv9
23238384Sjkim# instructions...
24238384Sjkim
25238384Sjkim# (*)	Engine accessing the driver in question is on my TODO list.
26238384Sjkim#	For reference, accelerator is estimated to give 6 to 10 times
27238384Sjkim#	improvement on single-threaded RSA sign. It should be noted
28238384Sjkim#	that 6-10x improvement coefficient does not actually mean
29238384Sjkim#	something extraordinary in terms of absolute [single-threaded]
30238384Sjkim#	performance, as SPARCv9 instruction set is by all means least
31238384Sjkim#	suitable for high performance crypto among other 64 bit
32238384Sjkim#	platforms. 6-10x factor simply places T1 in same performance
33238384Sjkim#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34238384Sjkim#	appear impressive at all, but it's the sign operation which is
35238384Sjkim#	far more critical/interesting.
36238384Sjkim
37238384Sjkim# You might notice that inner loops are modulo-scheduled:-) This has
38238384Sjkim# essentially negligible impact on UltraSPARC performance, it's
39238384Sjkim# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40238384Sjkim# the advantage... Currently this module surpasses sparcv9a-mont.pl
41238384Sjkim# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42238384Sjkim# module still has hidden potential [see TODO list there], which is
43238384Sjkim# estimated to be larger than 20%...
44238384Sjkim
# Register allocation and build configuration for the generated routine.
# The heredocs further down are interpolated, so every scalar assigned here
# is expanded to a literal register name (or constant) in the emitted
# assembly text.
#
# Function arguments, named by the %i registers they occupy after `save`:
45238384Sjkim# int bn_mul_mont(
46238384Sjkim$rp="%i0";	# BN_ULONG *rp,
47238384Sjkim$ap="%i1";	# const BN_ULONG *ap,
48238384Sjkim$bp="%i2";	# const BN_ULONG *bp,
49238384Sjkim$np="%i3";	# const BN_ULONG *np,
50238384Sjkim$n0="%i4";	# const BN_ULONG *n0,
51238384Sjkim$num="%i5";	# int num);
52238384Sjkim
# ABI selection: any command-line argument containing -m64 or -xarch=v9
# switches the output to 64-bit. $bias is the constant the assembly adds to
# %sp to obtain the real stack address, and $frame is the size handed to
# `save` (and the offset of the temporary vector above %sp+$bias).
53238384Sjkim$bits=32;
54238384Sjkimfor (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55238384Sjkimif ($bits==64)	{ $bias=2047; $frame=192; }
56238384Sjkimelse		{ $bias=0;    $frame=128; }
57238384Sjkim
# Working registers (%o and %g): two running carries, the top carry bit,
# two accumulators, the 0xffffffff mask used to isolate a 32-bit limb, and
# two temporaries.
58238384Sjkim$car0="%o0";
59238384Sjkim$car1="%o1";
60238384Sjkim$car2="%o2";	# 1 bit
61238384Sjkim$acc0="%o3";
62238384Sjkim$acc1="%o4";
63238384Sjkim$mask="%g1";	# 32 bits, what a waste...
64238384Sjkim$tmp0="%g4";
65238384Sjkim$tmp1="%g5";
66238384Sjkim
# Local (%l) registers: $i and $j are byte offsets used as loop counters
# (they advance by 4 and are compared against num*4), $mul0/$mul1 are the
# two multipliers of the current pass, $tp points into the temporary vector,
# and $apj/$npj/$tpj hold the most recently loaded ap[], np[] and tp[] words.
67238384Sjkim$i="%l0";
68238384Sjkim$j="%l1";
69238384Sjkim$mul0="%l2";
70238384Sjkim$mul1="%l3";
71238384Sjkim$tp="%l4";
72238384Sjkim$apj="%l5";
73238384Sjkim$npj="%l6";
74238384Sjkim$tpj="%l7";
75238384Sjkim
# Name of the global symbol under which the routine is emitted.
76238384Sjkim$fname="bn_mul_mont_int";
77238384Sjkim
# First part of the generated assembly: entry point, generic multiplication
# path and the shared tail.
#
# The heredoc is interpolated: $rp, $num, $mask, ... are replaced by the
# register names assigned above, and "$bias+$frame" turns into a constant
# expression such as "0+128" that is left for the assembler to fold. The one
# backtick-quoted fragment (which condition-code register the ap==bp branch
# tests) is evaluated by the s/`...`/eval/ pass at the end of this script.
#
# Layout of the emitted code:
#   $fname   - returns 0 immediately when num < 4. Otherwise .Lenter opens a
#              register window, scales num to a byte count, lowers %sp by at
#              least that many bytes (aligned down to 1024) to make room for
#              the temporary vector tp[], and branches to .Lbn_sqr_mont when
#              ap == bp (flags set by the earlier `cmp $ap,$bp`).
#   .L1st    - first pass, multiplier bp[0]; results are stored into tp[].
#   .Louter  - one pass per remaining bp[i]; .Linner accumulates into tp[].
#   .Ltail   - .Lsub stores tp[]-np[] into rp[]. A mask built from the top
#              carry bit and the final borrow then selects tp[] or the
#              freshly written rp[] as the source for .Lcopy, which writes
#              the result to rp[] and zeroes tp[]. The routine returns 1.
78238384Sjkim$code=<<___;
79238384Sjkim.section	".text",#alloc,#execinstr
80238384Sjkim
81238384Sjkim.global	$fname
82238384Sjkim.align	32
83238384Sjkim$fname:
84238384Sjkim	cmp	%o5,4			! 128 bits minimum
85238384Sjkim	bge,pt	%icc,.Lenter
86238384Sjkim	sethi	%hi(0xffffffff),$mask
87238384Sjkim	retl
88238384Sjkim	clr	%o0
89238384Sjkim.align	32
90238384Sjkim.Lenter:
91238384Sjkim	save	%sp,-$frame,%sp
92238384Sjkim	sll	$num,2,$num		! num*=4
93238384Sjkim	or	$mask,%lo(0xffffffff),$mask
94238384Sjkim	ld	[$n0],$n0
95238384Sjkim	cmp	$ap,$bp
96238384Sjkim	and	$num,$mask,$num
97238384Sjkim	ld	[$bp],$mul0		! bp[0]
98238384Sjkim	nop
99238384Sjkim
100238384Sjkim	add	%sp,$bias,%o7		! real top of stack
101238384Sjkim	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
102238384Sjkim	sub	%o7,$num,%o7
103238384Sjkim	ld	[$ap+4],$apj		! ap[1]
104238384Sjkim	and	%o7,-1024,%o7
105238384Sjkim	ld	[$np],$car1		! np[0]
106238384Sjkim	sub	%o7,$bias,%sp		! alloca
107238384Sjkim	ld	[$np+4],$npj		! np[1]
108238384Sjkim	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
109238384Sjkim	mov	12,$j
110238384Sjkim
111238384Sjkim	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
112238384Sjkim	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
113238384Sjkim	and	$car0,$mask,$acc0
114238384Sjkim	add	%sp,$bias+$frame,$tp
115238384Sjkim	ld	[$ap+8],$apj		!prologue!
116238384Sjkim
117238384Sjkim	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
118238384Sjkim	and	$mul1,$mask,$mul1
119238384Sjkim
120238384Sjkim	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
121238384Sjkim	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
122238384Sjkim	srlx	$car0,32,$car0
123238384Sjkim	add	$acc0,$car1,$car1
124238384Sjkim	ld	[$np+8],$npj		!prologue!
125238384Sjkim	srlx	$car1,32,$car1
126238384Sjkim	mov	$tmp0,$acc0		!prologue!
127238384Sjkim
128238384Sjkim.L1st:
129238384Sjkim	mulx	$apj,$mul0,$tmp0
130238384Sjkim	mulx	$npj,$mul1,$tmp1
131238384Sjkim	add	$acc0,$car0,$car0
132238384Sjkim	ld	[$ap+$j],$apj		! ap[j]
133238384Sjkim	and	$car0,$mask,$acc0
134238384Sjkim	add	$acc1,$car1,$car1
135238384Sjkim	ld	[$np+$j],$npj		! np[j]
136238384Sjkim	srlx	$car0,32,$car0
137238384Sjkim	add	$acc0,$car1,$car1
138238384Sjkim	add	$j,4,$j			! j++
139238384Sjkim	mov	$tmp0,$acc0
140238384Sjkim	st	$car1,[$tp]
141238384Sjkim	cmp	$j,$num
142238384Sjkim	mov	$tmp1,$acc1
143238384Sjkim	srlx	$car1,32,$car1
144238384Sjkim	bl	%icc,.L1st
145238384Sjkim	add	$tp,4,$tp		! tp++
146238384Sjkim!.L1st
147238384Sjkim
148238384Sjkim	mulx	$apj,$mul0,$tmp0	!epilogue!
149238384Sjkim	mulx	$npj,$mul1,$tmp1
150238384Sjkim	add	$acc0,$car0,$car0
151238384Sjkim	and	$car0,$mask,$acc0
152238384Sjkim	add	$acc1,$car1,$car1
153238384Sjkim	srlx	$car0,32,$car0
154238384Sjkim	add	$acc0,$car1,$car1
155238384Sjkim	st	$car1,[$tp]
156238384Sjkim	srlx	$car1,32,$car1
157238384Sjkim
158238384Sjkim	add	$tmp0,$car0,$car0
159238384Sjkim	and	$car0,$mask,$acc0
160238384Sjkim	add	$tmp1,$car1,$car1
161238384Sjkim	srlx	$car0,32,$car0
162238384Sjkim	add	$acc0,$car1,$car1
163238384Sjkim	st	$car1,[$tp+4]
164238384Sjkim	srlx	$car1,32,$car1
165238384Sjkim
166238384Sjkim	add	$car0,$car1,$car1
167238384Sjkim	st	$car1,[$tp+8]
168238384Sjkim	srlx	$car1,32,$car2
169238384Sjkim
170238384Sjkim	mov	4,$i			! i++
171238384Sjkim	ld	[$bp+4],$mul0		! bp[1]
172238384Sjkim.Louter:
173238384Sjkim	add	%sp,$bias+$frame,$tp
174238384Sjkim	ld	[$ap],$car0		! ap[0]
175238384Sjkim	ld	[$ap+4],$apj		! ap[1]
176238384Sjkim	ld	[$np],$car1		! np[0]
177238384Sjkim	ld	[$np+4],$npj		! np[1]
178238384Sjkim	ld	[$tp],$tmp1		! tp[0]
179238384Sjkim	ld	[$tp+4],$tpj		! tp[1]
180238384Sjkim	mov	12,$j
181238384Sjkim
182238384Sjkim	mulx	$car0,$mul0,$car0
183238384Sjkim	mulx	$apj,$mul0,$tmp0	!prologue!
184238384Sjkim	add	$tmp1,$car0,$car0
185238384Sjkim	ld	[$ap+8],$apj		!prologue!
186238384Sjkim	and	$car0,$mask,$acc0
187238384Sjkim
188238384Sjkim	mulx	$n0,$acc0,$mul1
189238384Sjkim	and	$mul1,$mask,$mul1
190238384Sjkim
191238384Sjkim	mulx	$car1,$mul1,$car1
192238384Sjkim	mulx	$npj,$mul1,$acc1	!prologue!
193238384Sjkim	srlx	$car0,32,$car0
194238384Sjkim	add	$acc0,$car1,$car1
195238384Sjkim	ld	[$np+8],$npj		!prologue!
196238384Sjkim	srlx	$car1,32,$car1
197238384Sjkim	mov	$tmp0,$acc0		!prologue!
198238384Sjkim
199238384Sjkim.Linner:
200238384Sjkim	mulx	$apj,$mul0,$tmp0
201238384Sjkim	mulx	$npj,$mul1,$tmp1
202238384Sjkim	add	$tpj,$car0,$car0
203238384Sjkim	ld	[$ap+$j],$apj		! ap[j]
204238384Sjkim	add	$acc0,$car0,$car0
205238384Sjkim	add	$acc1,$car1,$car1
206238384Sjkim	ld	[$np+$j],$npj		! np[j]
207238384Sjkim	and	$car0,$mask,$acc0
208238384Sjkim	ld	[$tp+8],$tpj		! tp[j]
209238384Sjkim	srlx	$car0,32,$car0
210238384Sjkim	add	$acc0,$car1,$car1
211238384Sjkim	add	$j,4,$j			! j++
212238384Sjkim	mov	$tmp0,$acc0
213238384Sjkim	st	$car1,[$tp]		! tp[j-1]
214238384Sjkim	srlx	$car1,32,$car1
215238384Sjkim	mov	$tmp1,$acc1
216238384Sjkim	cmp	$j,$num
217238384Sjkim	bl	%icc,.Linner
218238384Sjkim	add	$tp,4,$tp		! tp++
219238384Sjkim!.Linner
220238384Sjkim
221238384Sjkim	mulx	$apj,$mul0,$tmp0	!epilogue!
222238384Sjkim	mulx	$npj,$mul1,$tmp1
223238384Sjkim	add	$tpj,$car0,$car0
224238384Sjkim	add	$acc0,$car0,$car0
225238384Sjkim	ld	[$tp+8],$tpj		! tp[j]
226238384Sjkim	and	$car0,$mask,$acc0
227238384Sjkim	add	$acc1,$car1,$car1
228238384Sjkim	srlx	$car0,32,$car0
229238384Sjkim	add	$acc0,$car1,$car1
230238384Sjkim	st	$car1,[$tp]		! tp[j-1]
231238384Sjkim	srlx	$car1,32,$car1
232238384Sjkim
233238384Sjkim	add	$tpj,$car0,$car0
234238384Sjkim	add	$tmp0,$car0,$car0
235238384Sjkim	and	$car0,$mask,$acc0
236238384Sjkim	add	$tmp1,$car1,$car1
237238384Sjkim	add	$acc0,$car1,$car1
238238384Sjkim	st	$car1,[$tp+4]		! tp[j-1]
239238384Sjkim	srlx	$car0,32,$car0
240238384Sjkim	add	$i,4,$i			! i++
241238384Sjkim	srlx	$car1,32,$car1
242238384Sjkim
243238384Sjkim	add	$car0,$car1,$car1
244238384Sjkim	cmp	$i,$num
245238384Sjkim	add	$car2,$car1,$car1
246238384Sjkim	st	$car1,[$tp+8]
247238384Sjkim
248238384Sjkim	srlx	$car1,32,$car2
249238384Sjkim	bl,a	%icc,.Louter
250238384Sjkim	ld	[$bp+$i],$mul0		! bp[i]
251238384Sjkim!.Louter
252238384Sjkim
253238384Sjkim	add	$tp,12,$tp
254238384Sjkim
255238384Sjkim.Ltail:
256238384Sjkim	add	$np,$num,$np
257238384Sjkim	add	$rp,$num,$rp
258238384Sjkim	mov	$tp,$ap
259238384Sjkim	sub	%g0,$num,%o7		! k=-num
260238384Sjkim	ba	.Lsub
261238384Sjkim	subcc	%g0,%g0,%g0		! clear %icc.c
262238384Sjkim.align	16
263238384Sjkim.Lsub:
264238384Sjkim	ld	[$tp+%o7],%o0
265238384Sjkim	ld	[$np+%o7],%o1
266238384Sjkim	subccc	%o0,%o1,%o1		! tp[j]-np[j]
267238384Sjkim	add	$rp,%o7,$i
268238384Sjkim	add	%o7,4,%o7
269238384Sjkim	brnz	%o7,.Lsub
270238384Sjkim	st	%o1,[$i]
271238384Sjkim	subc	$car2,0,$car2		! handle upmost overflow bit
272238384Sjkim	and	$tp,$car2,$ap
273238384Sjkim	andn	$rp,$car2,$np
274238384Sjkim	or	$ap,$np,$ap
275238384Sjkim	sub	%g0,$num,%o7
276238384Sjkim
277238384Sjkim.Lcopy:
278238384Sjkim	ld	[$ap+%o7],%o0		! copy or in-place refresh
279238384Sjkim	st	%g0,[$tp+%o7]		! zap tp
280238384Sjkim	st	%o0,[$rp+%o7]
281238384Sjkim	add	%o7,4,%o7
282238384Sjkim	brnz	%o7,.Lcopy
283238384Sjkim	nop
284238384Sjkim	mov	1,%i0
285238384Sjkim	ret
286238384Sjkim	restore
287238384Sjkim___
288238384Sjkim
289238384Sjkim########
290238384Sjkim######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291238384Sjkim######## code without following dedicated squaring procedure.
292238384Sjkim########
# Second part of the generated assembly: the dedicated squaring path, which
# .Lenter branches to when ap == bp, followed by the symbol metadata.
#
# $sbit aliases %i2, the register that carries bp on entry. Nothing in the
# heredoc below references $bp, and bp[0] has already been loaded into $mul0
# before the branch to .Lbn_sqr_mont, so the register is free to hold the bit
# carried from one 32-bit limb to the next while the cross products are
# doubled (add x,x,x / or $sbit / srlx 32).
293238384Sjkim$sbit="%i2";		# re-use $bp!
294238384Sjkim
# Layout of the emitted code:
#   .Lsqr_1st   - pass with ap[0] as multiplier; results are stored into tp[].
#   .Lsqr_2nd   - pass with ap[1] as multiplier.
#   .Lsqr_outer - passes for the following multipliers ap[i]; .Lsqr_inner1
#                 covers the words below offset $i, where only the np[]
#                 product is added, and .Lsqr_inner2 the words above it,
#                 where the doubled ap[] product is added as well.
#   .Lsqr_last  - final pass, then a branch to the shared .Ltail.
# .type/.size/.asciz close the function. "\@" keeps Perl from interpolating
# "@openssl" as an array inside the identification string.
295238384Sjkim$code.=<<___;
296238384Sjkim.align	32
297238384Sjkim.Lbn_sqr_mont:
298238384Sjkim	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
299238384Sjkim	mulx	$apj,$mul0,$tmp0		!prologue!
300238384Sjkim	and	$car0,$mask,$acc0
301238384Sjkim	add	%sp,$bias+$frame,$tp
302238384Sjkim	ld	[$ap+8],$apj			!prologue!
303238384Sjkim
304238384Sjkim	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
305238384Sjkim	srlx	$car0,32,$car0
306238384Sjkim	and	$mul1,$mask,$mul1
307238384Sjkim
308238384Sjkim	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
309238384Sjkim	mulx	$npj,$mul1,$acc1		!prologue!
310238384Sjkim	and	$car0,1,$sbit
311238384Sjkim	ld	[$np+8],$npj			!prologue!
312238384Sjkim	srlx	$car0,1,$car0
313238384Sjkim	add	$acc0,$car1,$car1
314238384Sjkim	srlx	$car1,32,$car1
315238384Sjkim	mov	$tmp0,$acc0			!prologue!
316238384Sjkim
317238384Sjkim.Lsqr_1st:
318238384Sjkim	mulx	$apj,$mul0,$tmp0
319238384Sjkim	mulx	$npj,$mul1,$tmp1
320238384Sjkim	add	$acc0,$car0,$car0		! ap[j]*a0+c0
321238384Sjkim	add	$acc1,$car1,$car1
322238384Sjkim	ld	[$ap+$j],$apj			! ap[j]
323238384Sjkim	and	$car0,$mask,$acc0
324238384Sjkim	ld	[$np+$j],$npj			! np[j]
325238384Sjkim	srlx	$car0,32,$car0
326238384Sjkim	add	$acc0,$acc0,$acc0
327238384Sjkim	or	$sbit,$acc0,$acc0
328238384Sjkim	mov	$tmp1,$acc1
329238384Sjkim	srlx	$acc0,32,$sbit
330238384Sjkim	add	$j,4,$j				! j++
331238384Sjkim	and	$acc0,$mask,$acc0
332238384Sjkim	cmp	$j,$num
333238384Sjkim	add	$acc0,$car1,$car1
334238384Sjkim	st	$car1,[$tp]
335238384Sjkim	mov	$tmp0,$acc0
336238384Sjkim	srlx	$car1,32,$car1
337238384Sjkim	bl	%icc,.Lsqr_1st
338238384Sjkim	add	$tp,4,$tp			! tp++
339238384Sjkim!.Lsqr_1st
340238384Sjkim
341238384Sjkim	mulx	$apj,$mul0,$tmp0		! epilogue
342238384Sjkim	mulx	$npj,$mul1,$tmp1
343238384Sjkim	add	$acc0,$car0,$car0		! ap[j]*a0+c0
344238384Sjkim	add	$acc1,$car1,$car1
345238384Sjkim	and	$car0,$mask,$acc0
346238384Sjkim	srlx	$car0,32,$car0
347238384Sjkim	add	$acc0,$acc0,$acc0
348238384Sjkim	or	$sbit,$acc0,$acc0
349238384Sjkim	srlx	$acc0,32,$sbit
350238384Sjkim	and	$acc0,$mask,$acc0
351238384Sjkim	add	$acc0,$car1,$car1
352238384Sjkim	st	$car1,[$tp]
353238384Sjkim	srlx	$car1,32,$car1
354238384Sjkim
355238384Sjkim	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
356238384Sjkim	add	$tmp1,$car1,$car1
357238384Sjkim	and	$car0,$mask,$acc0
358238384Sjkim	srlx	$car0,32,$car0
359238384Sjkim	add	$acc0,$acc0,$acc0
360238384Sjkim	or	$sbit,$acc0,$acc0
361238384Sjkim	srlx	$acc0,32,$sbit
362238384Sjkim	and	$acc0,$mask,$acc0
363238384Sjkim	add	$acc0,$car1,$car1
364238384Sjkim	st	$car1,[$tp+4]
365238384Sjkim	srlx	$car1,32,$car1
366238384Sjkim
367238384Sjkim	add	$car0,$car0,$car0
368238384Sjkim	or	$sbit,$car0,$car0
369238384Sjkim	add	$car0,$car1,$car1
370238384Sjkim	st	$car1,[$tp+8]
371238384Sjkim	srlx	$car1,32,$car2
372238384Sjkim
373238384Sjkim	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
374238384Sjkim	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
375238384Sjkim	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
376238384Sjkim	ld	[$ap+4],$mul0			! ap[1]
377238384Sjkim	ld	[$ap+8],$apj			! ap[2]
378238384Sjkim	ld	[$np],$car1			! np[0]
379238384Sjkim	ld	[$np+4],$npj			! np[1]
380238384Sjkim	mulx	$n0,$tmp0,$mul1
381238384Sjkim
382238384Sjkim	mulx	$mul0,$mul0,$car0
383238384Sjkim	and	$mul1,$mask,$mul1
384238384Sjkim
385238384Sjkim	mulx	$car1,$mul1,$car1
386238384Sjkim	mulx	$npj,$mul1,$acc1
387238384Sjkim	add	$tmp0,$car1,$car1
388238384Sjkim	and	$car0,$mask,$acc0
389238384Sjkim	ld	[$np+8],$npj			! np[2]
390238384Sjkim	srlx	$car1,32,$car1
391238384Sjkim	add	$tmp1,$car1,$car1
392238384Sjkim	srlx	$car0,32,$car0
393238384Sjkim	add	$acc0,$car1,$car1
394238384Sjkim	and	$car0,1,$sbit
395238384Sjkim	add	$acc1,$car1,$car1
396238384Sjkim	srlx	$car0,1,$car0
397238384Sjkim	mov	12,$j
398238384Sjkim	st	$car1,[%sp+$bias+$frame]	! tp[0]=
399238384Sjkim	srlx	$car1,32,$car1
400238384Sjkim	add	%sp,$bias+$frame+4,$tp
401238384Sjkim
402238384Sjkim.Lsqr_2nd:
403238384Sjkim	mulx	$apj,$mul0,$acc0
404238384Sjkim	mulx	$npj,$mul1,$acc1
405238384Sjkim	add	$acc0,$car0,$car0
406238384Sjkim	add	$tpj,$car1,$car1
407238384Sjkim	ld	[$ap+$j],$apj			! ap[j]
408238384Sjkim	and	$car0,$mask,$acc0
409238384Sjkim	ld	[$np+$j],$npj			! np[j]
410238384Sjkim	srlx	$car0,32,$car0
411238384Sjkim	add	$acc1,$car1,$car1
412238384Sjkim	ld	[$tp+8],$tpj			! tp[j]
413238384Sjkim	add	$acc0,$acc0,$acc0
414238384Sjkim	add	$j,4,$j				! j++
415238384Sjkim	or	$sbit,$acc0,$acc0
416238384Sjkim	srlx	$acc0,32,$sbit
417238384Sjkim	and	$acc0,$mask,$acc0
418238384Sjkim	cmp	$j,$num
419238384Sjkim	add	$acc0,$car1,$car1
420238384Sjkim	st	$car1,[$tp]			! tp[j-1]
421238384Sjkim	srlx	$car1,32,$car1
422238384Sjkim	bl	%icc,.Lsqr_2nd
423238384Sjkim	add	$tp,4,$tp			! tp++
424238384Sjkim!.Lsqr_2nd
425238384Sjkim
426238384Sjkim	mulx	$apj,$mul0,$acc0
427238384Sjkim	mulx	$npj,$mul1,$acc1
428238384Sjkim	add	$acc0,$car0,$car0
429238384Sjkim	add	$tpj,$car1,$car1
430238384Sjkim	and	$car0,$mask,$acc0
431238384Sjkim	srlx	$car0,32,$car0
432238384Sjkim	add	$acc1,$car1,$car1
433238384Sjkim	add	$acc0,$acc0,$acc0
434238384Sjkim	or	$sbit,$acc0,$acc0
435238384Sjkim	srlx	$acc0,32,$sbit
436238384Sjkim	and	$acc0,$mask,$acc0
437238384Sjkim	add	$acc0,$car1,$car1
438238384Sjkim	st	$car1,[$tp]			! tp[j-1]
439238384Sjkim	srlx	$car1,32,$car1
440238384Sjkim
441238384Sjkim	add	$car0,$car0,$car0
442238384Sjkim	or	$sbit,$car0,$car0
443238384Sjkim	add	$car0,$car1,$car1
444238384Sjkim	add	$car2,$car1,$car1
445238384Sjkim	st	$car1,[$tp+4]
446238384Sjkim	srlx	$car1,32,$car2
447238384Sjkim
448238384Sjkim	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
449238384Sjkim	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
450238384Sjkim	ld	[$ap+8],$mul0			! ap[2]
451238384Sjkim	ld	[$np],$car1			! np[0]
452238384Sjkim	ld	[$np+4],$npj			! np[1]
453238384Sjkim	mulx	$n0,$tmp1,$mul1
454238384Sjkim	and	$mul1,$mask,$mul1
455238384Sjkim	mov	8,$i
456238384Sjkim
457238384Sjkim	mulx	$mul0,$mul0,$car0
458238384Sjkim	mulx	$car1,$mul1,$car1
459238384Sjkim	and	$car0,$mask,$acc0
460238384Sjkim	add	$tmp1,$car1,$car1
461238384Sjkim	srlx	$car0,32,$car0
462238384Sjkim	add	%sp,$bias+$frame,$tp
463238384Sjkim	srlx	$car1,32,$car1
464238384Sjkim	and	$car0,1,$sbit
465238384Sjkim	srlx	$car0,1,$car0
466238384Sjkim	mov	4,$j
467238384Sjkim
468238384Sjkim.Lsqr_outer:
469238384Sjkim.Lsqr_inner1:
470238384Sjkim	mulx	$npj,$mul1,$acc1
471238384Sjkim	add	$tpj,$car1,$car1
472238384Sjkim	add	$j,4,$j
473238384Sjkim	ld	[$tp+8],$tpj
474238384Sjkim	cmp	$j,$i
475238384Sjkim	add	$acc1,$car1,$car1
476238384Sjkim	ld	[$np+$j],$npj
477238384Sjkim	st	$car1,[$tp]
478238384Sjkim	srlx	$car1,32,$car1
479238384Sjkim	bl	%icc,.Lsqr_inner1
480238384Sjkim	add	$tp,4,$tp
481238384Sjkim!.Lsqr_inner1
482238384Sjkim
483238384Sjkim	add	$j,4,$j
484238384Sjkim	ld	[$ap+$j],$apj			! ap[j]
485238384Sjkim	mulx	$npj,$mul1,$acc1
486238384Sjkim	add	$tpj,$car1,$car1
487238384Sjkim	ld	[$np+$j],$npj			! np[j]
488238384Sjkim	add	$acc0,$car1,$car1
489238384Sjkim	ld	[$tp+8],$tpj			! tp[j]
490238384Sjkim	add	$acc1,$car1,$car1
491238384Sjkim	st	$car1,[$tp]
492238384Sjkim	srlx	$car1,32,$car1
493238384Sjkim
494238384Sjkim	add	$j,4,$j
495238384Sjkim	cmp	$j,$num
496238384Sjkim	be,pn	%icc,.Lsqr_no_inner2
497238384Sjkim	add	$tp,4,$tp
498238384Sjkim
499238384Sjkim.Lsqr_inner2:
500238384Sjkim	mulx	$apj,$mul0,$acc0
501238384Sjkim	mulx	$npj,$mul1,$acc1
502238384Sjkim	add	$tpj,$car1,$car1
503238384Sjkim	add	$acc0,$car0,$car0
504238384Sjkim	ld	[$ap+$j],$apj			! ap[j]
505238384Sjkim	and	$car0,$mask,$acc0
506238384Sjkim	ld	[$np+$j],$npj			! np[j]
507238384Sjkim	srlx	$car0,32,$car0
508238384Sjkim	add	$acc0,$acc0,$acc0
509238384Sjkim	ld	[$tp+8],$tpj			! tp[j]
510238384Sjkim	or	$sbit,$acc0,$acc0
511238384Sjkim	add	$j,4,$j				! j++
512238384Sjkim	srlx	$acc0,32,$sbit
513238384Sjkim	and	$acc0,$mask,$acc0
514238384Sjkim	cmp	$j,$num
515238384Sjkim	add	$acc0,$car1,$car1
516238384Sjkim	add	$acc1,$car1,$car1
517238384Sjkim	st	$car1,[$tp]			! tp[j-1]
518238384Sjkim	srlx	$car1,32,$car1
519238384Sjkim	bl	%icc,.Lsqr_inner2
520238384Sjkim	add	$tp,4,$tp			! tp++
521238384Sjkim
522238384Sjkim.Lsqr_no_inner2:
523238384Sjkim	mulx	$apj,$mul0,$acc0
524238384Sjkim	mulx	$npj,$mul1,$acc1
525238384Sjkim	add	$tpj,$car1,$car1
526238384Sjkim	add	$acc0,$car0,$car0
527238384Sjkim	and	$car0,$mask,$acc0
528238384Sjkim	srlx	$car0,32,$car0
529238384Sjkim	add	$acc0,$acc0,$acc0
530238384Sjkim	or	$sbit,$acc0,$acc0
531238384Sjkim	srlx	$acc0,32,$sbit
532238384Sjkim	and	$acc0,$mask,$acc0
533238384Sjkim	add	$acc0,$car1,$car1
534238384Sjkim	add	$acc1,$car1,$car1
535238384Sjkim	st	$car1,[$tp]			! tp[j-1]
536238384Sjkim	srlx	$car1,32,$car1
537238384Sjkim
538238384Sjkim	add	$car0,$car0,$car0
539238384Sjkim	or	$sbit,$car0,$car0
540238384Sjkim	add	$car0,$car1,$car1
541238384Sjkim	add	$car2,$car1,$car1
542238384Sjkim	st	$car1,[$tp+4]
543238384Sjkim	srlx	$car1,32,$car2
544238384Sjkim
545238384Sjkim	add	$i,4,$i				! i++
546238384Sjkim	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
547238384Sjkim	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
548238384Sjkim	ld	[$ap+$i],$mul0			! ap[j]
549238384Sjkim	ld	[$np],$car1			! np[0]
550238384Sjkim	ld	[$np+4],$npj			! np[1]
551238384Sjkim	mulx	$n0,$tmp1,$mul1
552238384Sjkim	and	$mul1,$mask,$mul1
553238384Sjkim	add	$i,4,$tmp0
554238384Sjkim
555238384Sjkim	mulx	$mul0,$mul0,$car0
556238384Sjkim	mulx	$car1,$mul1,$car1
557238384Sjkim	and	$car0,$mask,$acc0
558238384Sjkim	add	$tmp1,$car1,$car1
559238384Sjkim	srlx	$car0,32,$car0
560238384Sjkim	add	%sp,$bias+$frame,$tp
561238384Sjkim	srlx	$car1,32,$car1
562238384Sjkim	and	$car0,1,$sbit
563238384Sjkim	srlx	$car0,1,$car0
564238384Sjkim
565238384Sjkim	cmp	$tmp0,$num			! i<num-1
566238384Sjkim	bl	%icc,.Lsqr_outer
567238384Sjkim	mov	4,$j
568238384Sjkim
569238384Sjkim.Lsqr_last:
570238384Sjkim	mulx	$npj,$mul1,$acc1
571238384Sjkim	add	$tpj,$car1,$car1
572238384Sjkim	add	$j,4,$j
573238384Sjkim	ld	[$tp+8],$tpj
574238384Sjkim	cmp	$j,$i
575238384Sjkim	add	$acc1,$car1,$car1
576238384Sjkim	ld	[$np+$j],$npj
577238384Sjkim	st	$car1,[$tp]
578238384Sjkim	srlx	$car1,32,$car1
579238384Sjkim	bl	%icc,.Lsqr_last
580238384Sjkim	add	$tp,4,$tp
581238384Sjkim!.Lsqr_last
582238384Sjkim
583238384Sjkim	mulx	$npj,$mul1,$acc1
584238384Sjkim	add	$tpj,$car1,$car1
585238384Sjkim	add	$acc0,$car1,$car1
586238384Sjkim	add	$acc1,$car1,$car1
587238384Sjkim	st	$car1,[$tp]
588238384Sjkim	srlx	$car1,32,$car1
589238384Sjkim
590238384Sjkim	add	$car0,$car0,$car0		! recover $car0
591238384Sjkim	or	$sbit,$car0,$car0
592238384Sjkim	add	$car0,$car1,$car1
593238384Sjkim	add	$car2,$car1,$car1
594238384Sjkim	st	$car1,[$tp+4]
595238384Sjkim	srlx	$car1,32,$car2
596238384Sjkim
597238384Sjkim	ba	.Ltail
598238384Sjkim	add	$tp,8,$tp
599238384Sjkim.type	$fname,#function
600238384Sjkim.size	$fname,(.-$fname)
601238384Sjkim.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
602238384Sjkim.align	32
603238384Sjkim___
# Post-process and emit the accumulated assembly text.
#
# Every `...` fragment embedded in $code is evaluated as a Perl expression
# and replaced by its value; in the code above this selects %icc or %xcc for
# the ap==bp branch depending on $bits.
604238384Sjkim$code =~ s/\`([^\`]*)\`/eval($1)/gem;
605238384Sjkimprint $code;
# Buffered write errors (full disk, closed pipe) only surface when the handle
# is closed, so check the result: otherwise the script would exit 0 and leave
# the build with silently truncated assembly.
606238384Sjkimclose STDOUT or die "error closing STDOUT: $!";
607