#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions which are ABI
# neutral. And that's what you find here: ABI-neutral leaf functions.
# In case you wonder what that buys you...
#
#	AIX performance
#
#	Measurements with cc on a 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 Dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	The number of operations per second increases by almost 75%.
#
#	Here are the performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%.
#
#	Mac OS X, Apple G5 1.8GHz (note this is 32-bit code)
#	OpenSSL 0.9.7c 30 Sep 2003
#
#	Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#	Performance increase of ~60%.
#
#	If you have comments or suggestions to improve this code, send
#	me a note at schari@us.ibm.com
#

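# The wrapper takes the target "flavour" as its first argument and the
# output file as its second; only whether the flavour string contains
# "32" or "64" matters below. A typical invocation (an assumption about
# how the build calls this script, not something recorded in this file)
# would look like:
#
#	perl ppc.pl linux32 ppc32.s
#	perl ppc.pl aix64   ppc64.s
#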
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)        Andy Polyakov
#
#	1. Initial version	10/20/02		Suresh Chari
#
#
#	The following file works with the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#	        you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand-optimized assembly code for the following routines
#
#	bn_sqr_comba4
#	bn_sqr_comba8
#	bn_mul_comba4
#	bn_mul_comba8
#	bn_sub_words
#	bn_add_words
#	bn_div_words
#	bn_sqr_words
#	bn_mul_words
#	bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve this code, send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#	        the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly
#
#
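# The sqr_add_c/sqr_add_c2 names in the comments below refer to the
# comba accumulation steps of OpenSSL's generic C code. As a rough,
# illustrative C-like sketch of what each step means (an assumption for
# exposition, not a quote of the C macros):
#
#	sqr_add_c(a,i,c1,c2,c3):    t = (double-width) a[i]*a[i];
#	                            c1 += low(t); c2 += high(t) + carry;
#	                            c3 += carry;
#	sqr_add_c2(a,i,j,c1,c2,c3): t = 2 * (double-width) a[i]*a[j];
#	                            accumulated into (c1,c2,c3) as above.
#
# The triple (c1,c2,c3) rotates through r9,r10,r11 as result words are
# produced and stored.
#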
	xor		r0,r0,r0		# set r0 = 0. Used in the addze
						# instructions below

						#sqr_add_c(a,0,c1,c2,c3)
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5
	$UMULH		r10,r5,r5		#in first iteration. No need
						#to add since c1=c2=c3=0.
						# Note c3(r11) is NOT set to 0
						# but will be.

	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde		r8,r8,r8
	addze		r9,r0			# catch carry if any.
						# r9 = r0(=0) plus carry

	addc		r10,r7,r10		# now add to temp result.
	addze		r11,r8			# r8 added to r11 which is 0
	addze		r9,r9

	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
						#sqr_add_c(a,1,c3,c1,c2)
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10

	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# A possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#

	xor		r0,r0,r0		#set r0 = 0.Used in addze
						#instructions below.

						#sqr_add_c(a,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5		#1st iteration:	no carries.
	$UMULH		r10,r5,r5
	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10		#add the two register number
	adde		r11,r8,r0 		# (r8,r7) to the three register
	addze		r9,r0			# number (r9,r11,r10).NOTE:r0=0

	addc		r10,r7,r10		#add the two register number
	adde		r11,r8,r11 		# (r8,r7) to the three register
	addze		r9,r9			# number (r9,r11,r10).

	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2

						#sqr_add_c(a,1,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
						#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
						#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
						#sqr_add_c(a,4,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
						#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD		r5,`4*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
						#sqr_add_c(a,5,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
						#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD		r5,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD		r5,`5*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
						#sqr_add_c(a,6,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;

						#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD		r5,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
						#sqr_add_c(a,7,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;


	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64-bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
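# As with the squaring routines, mul_add_c(a[i],b[j],c1,c2,c3) in the
# comments below is the comba accumulation step; roughly (an illustrative
# C-like sketch, not the actual C macro):
#
#	t = (double-width) a[i]*b[j];
#	c1 += low(t); c2 += high(t) + carry; c3 += carry;
#
# with (c1,c2,c3) rotating through r10,r11,r12 as result words are stored.
#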
	xor	r0,r0,r0		#r0=0. Used in addze below.
					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6, `1*$BNSZ`(r4)
	$LD	r7, `0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64-bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.

					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
					#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
					#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
					#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
1221160814Ssimon					#mul_add_c(a[0],b[7],c2,c3,c1);
1222160814Ssimon	$LD	r7,`7*$BNSZ`(r5)
1223160814Ssimon	$UMULL	r8,r6,r7
1224160814Ssimon	$UMULH	r9,r6,r7
1225160814Ssimon	addc	r11,r11,r8
1226160814Ssimon	adde	r12,r12,r9
1227160814Ssimon	addze	r10,r0
1228160814Ssimon					#mul_add_c(a[1],b[6],c2,c3,c1);
1229160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1230160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1231160814Ssimon	$UMULL	r8,r6,r7
1232160814Ssimon	$UMULH	r9,r6,r7
1233160814Ssimon	addc	r11,r11,r8
1234160814Ssimon	adde	r12,r12,r9
1235160814Ssimon	addze	r10,r10
1236160814Ssimon					#mul_add_c(a[2],b[5],c2,c3,c1);
1237160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1238160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1239160814Ssimon	$UMULL	r8,r6,r7
1240160814Ssimon	$UMULH	r9,r6,r7
1241160814Ssimon	addc	r11,r11,r8
1242160814Ssimon	adde	r12,r12,r9
1243160814Ssimon	addze	r10,r10
1244160814Ssimon					#mul_add_c(a[3],b[4],c2,c3,c1);
1245160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1246160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1247160814Ssimon	$UMULL	r8,r6,r7
1248160814Ssimon	$UMULH	r9,r6,r7
1249160814Ssimon	addc	r11,r11,r8
1250160814Ssimon	adde	r12,r12,r9
1251160814Ssimon	addze	r10,r10
1252160814Ssimon					#mul_add_c(a[4],b[3],c2,c3,c1);
1253160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1254160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1255160814Ssimon	$UMULL	r8,r6,r7
1256160814Ssimon	$UMULH	r9,r6,r7
1257160814Ssimon	addc	r11,r11,r8
1258160814Ssimon	adde	r12,r12,r9
1259160814Ssimon	addze	r10,r10
1260160814Ssimon					#mul_add_c(a[5],b[2],c2,c3,c1);
1261160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1262160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1263160814Ssimon	$UMULL	r8,r6,r7
1264160814Ssimon	$UMULH	r9,r6,r7
1265160814Ssimon	addc	r11,r11,r8
1266160814Ssimon	adde	r12,r12,r9
1267160814Ssimon	addze	r10,r10
1268160814Ssimon					#mul_add_c(a[6],b[1],c2,c3,c1);
1269160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1270160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1271160814Ssimon	$UMULL	r8,r6,r7
1272160814Ssimon	$UMULH	r9,r6,r7
1273160814Ssimon	addc	r11,r11,r8
1274160814Ssimon	adde	r12,r12,r9
1275160814Ssimon	addze	r10,r10
1276160814Ssimon					#mul_add_c(a[7],b[0],c2,c3,c1);
1277160814Ssimon	$LD	r6,`7*$BNSZ`(r4)
1278160814Ssimon	$LD	r7,`0*$BNSZ`(r5)
1279160814Ssimon	$UMULL	r8,r6,r7
1280160814Ssimon	$UMULH	r9,r6,r7
1281160814Ssimon	addc	r11,r11,r8
1282160814Ssimon	adde	r12,r12,r9
1283160814Ssimon	addze	r10,r10
1284160814Ssimon	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
1285160814Ssimon					#mul_add_c(a[7],b[1],c3,c1,c2);
1286160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1287160814Ssimon	$UMULL	r8,r6,r7
1288160814Ssimon	$UMULH	r9,r6,r7
1289160814Ssimon	addc	r12,r12,r8
1290160814Ssimon	adde	r10,r10,r9
1291160814Ssimon	addze	r11,r0
1292160814Ssimon					#mul_add_c(a[6],b[2],c3,c1,c2);
1293160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1294160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1295160814Ssimon	$UMULL	r8,r6,r7
1296160814Ssimon	$UMULH	r9,r6,r7
1297160814Ssimon	addc	r12,r12,r8
1298160814Ssimon	adde	r10,r10,r9
1299160814Ssimon	addze	r11,r11
1300160814Ssimon					#mul_add_c(a[5],b[3],c3,c1,c2);
1301160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1302160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1303160814Ssimon	$UMULL	r8,r6,r7
1304160814Ssimon	$UMULH	r9,r6,r7
1305160814Ssimon	addc	r12,r12,r8
1306160814Ssimon	adde	r10,r10,r9
1307160814Ssimon	addze	r11,r11
1308160814Ssimon					#mul_add_c(a[4],b[4],c3,c1,c2);
1309160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1310160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1311160814Ssimon	$UMULL	r8,r6,r7
1312160814Ssimon	$UMULH	r9,r6,r7
1313160814Ssimon	addc	r12,r12,r8
1314160814Ssimon	adde	r10,r10,r9
1315160814Ssimon	addze	r11,r11
1316160814Ssimon					#mul_add_c(a[3],b[5],c3,c1,c2);
1317160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1318160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1319160814Ssimon	$UMULL	r8,r6,r7
1320160814Ssimon	$UMULH	r9,r6,r7
1321160814Ssimon	addc	r12,r12,r8
1322160814Ssimon	adde	r10,r10,r9
1323160814Ssimon	addze	r11,r11
1324160814Ssimon					#mul_add_c(a[2],b[6],c3,c1,c2);
1325160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1326160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1327160814Ssimon	$UMULL	r8,r6,r7
1328160814Ssimon	$UMULH	r9,r6,r7
1329160814Ssimon	addc	r12,r12,r8
1330160814Ssimon	adde	r10,r10,r9
1331160814Ssimon	addze	r11,r11
1332160814Ssimon					#mul_add_c(a[1],b[7],c3,c1,c2);
1333160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1334160814Ssimon	$LD	r7,`7*$BNSZ`(r5)
1335160814Ssimon	$UMULL	r8,r6,r7
1336160814Ssimon	$UMULH	r9,r6,r7
1337160814Ssimon	addc	r12,r12,r8
1338160814Ssimon	adde	r10,r10,r9
1339160814Ssimon	addze	r11,r11
1340160814Ssimon	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
1341160814Ssimon					#mul_add_c(a[2],b[7],c1,c2,c3);
1342160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1343160814Ssimon	$UMULL	r8,r6,r7
1344160814Ssimon	$UMULH	r9,r6,r7
1345160814Ssimon	addc	r10,r10,r8
1346160814Ssimon	adde	r11,r11,r9
1347160814Ssimon	addze	r12,r0
1348160814Ssimon					#mul_add_c(a[3],b[6],c1,c2,c3);
1349160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1350160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1351160814Ssimon	$UMULL	r8,r6,r7
1352160814Ssimon	$UMULH	r9,r6,r7
1353160814Ssimon	addc	r10,r10,r8
1354160814Ssimon	adde	r11,r11,r9
1355160814Ssimon	addze	r12,r12
1356160814Ssimon					#mul_add_c(a[4],b[5],c1,c2,c3);
1357160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1358160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1359160814Ssimon	$UMULL	r8,r6,r7
1360160814Ssimon	$UMULH	r9,r6,r7
1361160814Ssimon	addc	r10,r10,r8
1362160814Ssimon	adde	r11,r11,r9
1363160814Ssimon	addze	r12,r12
1364160814Ssimon					#mul_add_c(a[5],b[4],c1,c2,c3);
1365160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1366160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1367160814Ssimon	$UMULL	r8,r6,r7
1368160814Ssimon	$UMULH	r9,r6,r7
1369160814Ssimon	addc	r10,r10,r8
1370160814Ssimon	adde	r11,r11,r9
1371160814Ssimon	addze	r12,r12
1372160814Ssimon					#mul_add_c(a[6],b[3],c1,c2,c3);
1373160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1374160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1375160814Ssimon	$UMULL	r8,r6,r7
1376160814Ssimon	$UMULH	r9,r6,r7
1377160814Ssimon	addc	r10,r10,r8
1378160814Ssimon	adde	r11,r11,r9
1379160814Ssimon	addze	r12,r12
1380160814Ssimon					#mul_add_c(a[7],b[2],c1,c2,c3);
1381160814Ssimon	$LD	r6,`7*$BNSZ`(r4)
1382160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1383160814Ssimon	$UMULL	r8,r6,r7
1384160814Ssimon	$UMULH	r9,r6,r7
1385160814Ssimon	addc	r10,r10,r8
1386160814Ssimon	adde	r11,r11,r9
1387160814Ssimon	addze	r12,r12
1388160814Ssimon	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
1389160814Ssimon					#mul_add_c(a[7],b[3],c2,c3,c1);
1390160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1391160814Ssimon	$UMULL	r8,r6,r7
1392160814Ssimon	$UMULH	r9,r6,r7
1393160814Ssimon	addc	r11,r11,r8
1394160814Ssimon	adde	r12,r12,r9
1395160814Ssimon	addze	r10,r0
1396160814Ssimon					#mul_add_c(a[6],b[4],c2,c3,c1);
1397160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1398160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1399160814Ssimon	$UMULL	r8,r6,r7
1400160814Ssimon	$UMULH	r9,r6,r7
1401160814Ssimon	addc	r11,r11,r8
1402160814Ssimon	adde	r12,r12,r9
1403160814Ssimon	addze	r10,r10
1404160814Ssimon					#mul_add_c(a[5],b[5],c2,c3,c1);
1405160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1406160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1407160814Ssimon	$UMULL	r8,r6,r7
1408160814Ssimon	$UMULH	r9,r6,r7
1409160814Ssimon	addc	r11,r11,r8
1410160814Ssimon	adde	r12,r12,r9
1411160814Ssimon	addze	r10,r10
1412160814Ssimon					#mul_add_c(a[4],b[6],c2,c3,c1);
1413160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1414160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1415160814Ssimon	$UMULL	r8,r6,r7
1416160814Ssimon	$UMULH	r9,r6,r7
1417160814Ssimon	addc	r11,r11,r8
1418160814Ssimon	adde	r12,r12,r9
1419160814Ssimon	addze	r10,r10
1420160814Ssimon					#mul_add_c(a[3],b[7],c2,c3,c1);
1421160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1422160814Ssimon	$LD	r7,`7*$BNSZ`(r5)
1423160814Ssimon	$UMULL	r8,r6,r7
1424160814Ssimon	$UMULH	r9,r6,r7
1425160814Ssimon	addc	r11,r11,r8
1426160814Ssimon	adde	r12,r12,r9
1427160814Ssimon	addze	r10,r10
1428160814Ssimon	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
1429160814Ssimon					#mul_add_c(a[4],b[7],c3,c1,c2);
1430160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1431160814Ssimon	$UMULL	r8,r6,r7
1432160814Ssimon	$UMULH	r9,r6,r7
1433160814Ssimon	addc	r12,r12,r8
1434160814Ssimon	adde	r10,r10,r9
1435160814Ssimon	addze	r11,r0
1436160814Ssimon					#mul_add_c(a[5],b[6],c3,c1,c2);
1437160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1438160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1439160814Ssimon	$UMULL	r8,r6,r7
1440160814Ssimon	$UMULH	r9,r6,r7
1441160814Ssimon	addc	r12,r12,r8
1442160814Ssimon	adde	r10,r10,r9
1443160814Ssimon	addze	r11,r11
1444160814Ssimon					#mul_add_c(a[6],b[5],c3,c1,c2);
1445160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1446160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1447160814Ssimon	$UMULL	r8,r6,r7
1448160814Ssimon	$UMULH	r9,r6,r7
1449160814Ssimon	addc	r12,r12,r8
1450160814Ssimon	adde	r10,r10,r9
1451160814Ssimon	addze	r11,r11
1452160814Ssimon					#mul_add_c(a[7],b[4],c3,c1,c2);
1453160814Ssimon	$LD	r6,`7*$BNSZ`(r4)
1454160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1455160814Ssimon	$UMULL	r8,r6,r7
1456160814Ssimon	$UMULH	r9,r6,r7
1457160814Ssimon	addc	r12,r12,r8
1458160814Ssimon	adde	r10,r10,r9
1459160814Ssimon	addze	r11,r11
1460160814Ssimon	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
1461160814Ssimon					#mul_add_c(a[7],b[5],c1,c2,c3);
1462160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1463160814Ssimon	$UMULL	r8,r6,r7
1464160814Ssimon	$UMULH	r9,r6,r7
1465160814Ssimon	addc	r10,r10,r8
1466160814Ssimon	adde	r11,r11,r9
1467160814Ssimon	addze	r12,r0
1468160814Ssimon					#mul_add_c(a[6],b[6],c1,c2,c3);
1469160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1470160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1471160814Ssimon	$UMULL	r8,r6,r7
1472160814Ssimon	$UMULH	r9,r6,r7
1473160814Ssimon	addc	r10,r10,r8
1474160814Ssimon	adde	r11,r11,r9
1475160814Ssimon	addze	r12,r12
1476160814Ssimon					#mul_add_c(a[5],b[7],c1,c2,c3);
1477160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1478160814Ssimon	$LD	r7,`7*$BNSZ`(r5)
1479160814Ssimon	$UMULL	r8,r6,r7
1480160814Ssimon	$UMULH	r9,r6,r7
1481160814Ssimon	addc	r10,r10,r8
1482160814Ssimon	adde	r11,r11,r9
1483160814Ssimon	addze	r12,r12
1484160814Ssimon	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
1485160814Ssimon					#mul_add_c(a[6],b[7],c2,c3,c1);
1486160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1487160814Ssimon	$UMULL	r8,r6,r7
1488160814Ssimon	$UMULH	r9,r6,r7
1489160814Ssimon	addc	r11,r11,r8
1490160814Ssimon	adde	r12,r12,r9
1491160814Ssimon	addze	r10,r0
1492160814Ssimon					#mul_add_c(a[7],b[6],c2,c3,c1);
1493160814Ssimon	$LD	r6,`7*$BNSZ`(r4)
1494160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1495160814Ssimon	$UMULL	r8,r6,r7
1496160814Ssimon	$UMULH	r9,r6,r7
1497160814Ssimon	addc	r11,r11,r8
1498160814Ssimon	adde	r12,r12,r9
1499160814Ssimon	addze	r10,r10
1500160814Ssimon	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
1501160814Ssimon					#mul_add_c(a[7],b[7],c3,c1,c2);
1502160814Ssimon	$LD	r7,`7*$BNSZ`(r5)
1503160814Ssimon	$UMULL	r8,r6,r7
1504160814Ssimon	$UMULH	r9,r6,r7
1505160814Ssimon	addc	r12,r12,r8
1506160814Ssimon	adde	r10,r10,r9
1507160814Ssimon	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
1508160814Ssimon	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
1509238405Sjkim	blr
1510238405Sjkim	.long	0
1511238405Sjkim	.byte	0,12,0x14,0,0,0,3,0
1512238405Sjkim	.long	0
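#
#	For reference, each mul_add_c(a,b,c0,c1,c2) step in the comba code
#	above mirrors the word-by-word accumulation used by the C version;
#	a rough sketch (illustration only, not the exact OpenSSL macro),
#	where t is the double-width product:
#
#		t   = a * b;                        /* double-width product */
#		c0 += low_half(t);
#		c1 += high_half(t) + carry_out_of(c0);
#		c2 += carry_out_of(c1);
#
#	low_half/high_half/carry_out_of are hypothetical helpers used only
#	to illustrate the carry chain implemented with addc/adde/addze.
#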
1513160814Ssimon
1514160814Ssimon#
1515160814Ssimon#	NOTE:	The following label name should be changed to
#		"bn_sub_words", i.e. remove the first dot,
#		for the gcc compiler. This should be done
#		automatically by the build.
1519160814Ssimon#
1520160814Ssimon#
1521160814Ssimon.align	4
1522160814Ssimon.bn_sub_words:
1523160814Ssimon#
1524160814Ssimon#	Handcoded version of bn_sub_words
1525160814Ssimon#
1526160814Ssimon#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1527160814Ssimon#
1528160814Ssimon#	r3 = r
1529160814Ssimon#	r4 = a
1530160814Ssimon#	r5 = b
1531160814Ssimon#	r6 = n
1532160814Ssimon#
1533160814Ssimon#       Note:	No loop unrolling done since this is not a performance
1534160814Ssimon#               critical loop.
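#
#	For reference, a rough C sketch of the operation implemented below
#	(an illustration only, not the actual OpenSSL C source):
#
#		BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a,
#					BN_ULONG *b, int n)
#		{
#			BN_ULONG borrow = 0;
#			int i;
#			for (i = 0; i < n; i++) {
#				BN_ULONG t = a[i] - b[i] - borrow;
#				borrow = (a[i] < b[i]) ||
#					 (a[i] == b[i] && borrow);
#				r[i] = t;
#			}
#			return borrow;	/* 1 if the subtraction underflowed */
#		}
#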
1535160814Ssimon
1536160814Ssimon	xor	r0,r0,r0	#set r0 = 0
1537160814Ssimon#
1538160814Ssimon#	check for r6 = 0 AND set carry bit.
1539160814Ssimon#
	subfc.	r7,r0,r6        # If r6 is 0 then the result is 0;
				# if r6 > 0 then the result != 0.
				# In either case the carry bit is set.
1543238405Sjkim	beq	Lppcasm_sub_adios
1544160814Ssimon	addi	r4,r4,-$BNSZ
1545160814Ssimon	addi	r3,r3,-$BNSZ
1546160814Ssimon	addi	r5,r5,-$BNSZ
1547160814Ssimon	mtctr	r6
1548160814SsimonLppcasm_sub_mainloop:
1549160814Ssimon	$LDU	r7,$BNSZ(r4)
1550160814Ssimon	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7 + carry bit + onescomplement(r8).
				# If carry = 1 this is r7-r8; otherwise it
				# is r7-r8-1, which is exactly the borrow
				# behaviour we need.
1554160814Ssimon	$STU	r6,$BNSZ(r3)
1555238405Sjkim	bdnz-	Lppcasm_sub_mainloop
1556160814SsimonLppcasm_sub_adios:
1557160814Ssimon	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
1558160814Ssimon	andi.	r3,r3,1         # keep only last bit.
1559238405Sjkim	blr
1560238405Sjkim	.long	0
1561238405Sjkim	.byte	0,12,0x14,0,0,0,4,0
1562238405Sjkim	.long	0
1563160814Ssimon
1564160814Ssimon#
1565160814Ssimon#	NOTE:	The following label name should be changed to
#		"bn_add_words", i.e. remove the first dot,
#		for the gcc compiler. This should be done
#		automatically by the build.
1569160814Ssimon#
1570160814Ssimon
1571160814Ssimon.align	4
1572160814Ssimon.bn_add_words:
1573160814Ssimon#
1574160814Ssimon#	Handcoded version of bn_add_words
1575160814Ssimon#
1576160814Ssimon#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1577160814Ssimon#
1578160814Ssimon#	r3 = r
1579160814Ssimon#	r4 = a
1580160814Ssimon#	r5 = b
1581160814Ssimon#	r6 = n
1582160814Ssimon#
1583160814Ssimon#       Note:	No loop unrolling done since this is not a performance
1584160814Ssimon#               critical loop.
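#
#	For reference, a rough C sketch of the operation implemented below
#	(an illustration only, not the actual OpenSSL C source):
#
#		BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a,
#					BN_ULONG *b, int n)
#		{
#			BN_ULONG carry = 0;
#			int i;
#			for (i = 0; i < n; i++) {
#				BN_ULONG t = a[i] + b[i] + carry;
#				carry = (t < a[i]) || (carry && t == a[i]);
#				r[i] = t;
#			}
#			return carry;	/* carry out of the top word */
#		}
#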
1585160814Ssimon
1586160814Ssimon	xor	r0,r0,r0
1587160814Ssimon#
1588160814Ssimon#	check for r6 = 0. Is this needed?
1589160814Ssimon#
1590160814Ssimon	addic.	r6,r6,0		#test r6 and clear carry bit.
1591238405Sjkim	beq	Lppcasm_add_adios
1592160814Ssimon	addi	r4,r4,-$BNSZ
1593160814Ssimon	addi	r3,r3,-$BNSZ
1594160814Ssimon	addi	r5,r5,-$BNSZ
1595160814Ssimon	mtctr	r6
1596160814SsimonLppcasm_add_mainloop:
1597160814Ssimon	$LDU	r7,$BNSZ(r4)
1598160814Ssimon	$LDU	r8,$BNSZ(r5)
1599160814Ssimon	adde	r8,r7,r8
1600160814Ssimon	$STU	r8,$BNSZ(r3)
1601238405Sjkim	bdnz-	Lppcasm_add_mainloop
1602160814SsimonLppcasm_add_adios:
1603160814Ssimon	addze	r3,r0			#return carry bit.
1604238405Sjkim	blr
1605238405Sjkim	.long	0
1606238405Sjkim	.byte	0,12,0x14,0,0,0,4,0
1607238405Sjkim	.long	0
1608160814Ssimon
1609160814Ssimon#
1610160814Ssimon#	NOTE:	The following label name should be changed to
#		"bn_div_words", i.e. remove the first dot,
#		for the gcc compiler. This should be done
#		automatically by the build.
1614160814Ssimon#
1615160814Ssimon
1616160814Ssimon.align	4
1617160814Ssimon.bn_div_words:
1618160814Ssimon#
#	This is a cleaned-up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC count-leading-zeros instruction instead of
#	a call to num_bits_word. Since this was compiled
#	only at level -O2, it could probably be squeezed further.
1624160814Ssimon#
1625160814Ssimon#	r3 = h
1626160814Ssimon#	r4 = l
1627160814Ssimon#	r5 = d
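#
#	For reference: this routine returns the single-word quotient of the
#	double-word value formed by h (high word) and l (low word), divided
#	by d, assuming the quotient fits in one word (h < d).  A rough C
#	illustration for the 32-bit build, using a 64-bit temporary (the
#	assembly below works with word halves instead; the 64-bit build
#	follows the same idea with a wider temporary):
#
#		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
#		{
#			unsigned long long n;
#			if (d == 0)
#				return (BN_ULONG)-1;
#			n = ((unsigned long long)h << 32) | l;
#			return (BN_ULONG)(n / d);
#		}
#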
1628160814Ssimon
1629160814Ssimon	$UCMPI	0,r5,0			# compare r5 and 0
1630238405Sjkim	bne	Lppcasm_div1		# proceed if d!=0
1631160814Ssimon	li	r3,-1			# d=0 return -1
1632238405Sjkim	blr
1633160814SsimonLppcasm_div1:
1634160814Ssimon	xor	r0,r0,r0		#r0=0
1635160814Ssimon	li	r8,$BITS
1636160814Ssimon	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
1637238405Sjkim	beq	Lppcasm_div2		#proceed if no leading zeros
1638160814Ssimon	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are any bits set above bit r8?
	$TR	16,r9,r0		#if there are, trap (dump core)...
1641160814SsimonLppcasm_div2:
1642160814Ssimon	$UCMP	0,r3,r5			#h>=d?
1643238405Sjkim	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
1644160814Ssimon	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = number of leading zeros in d,
					#i.e. the shift count i used below
1646160814Ssimon	cmpi	0,0,r7,0		# is (i == 0)?
1647238405Sjkim	beq	Lppcasm_div4
1648160814Ssimon	$SHL	r3,r3,r7		# h = (h<< i)
	$SHR	r8,r4,r8		# r8 = (l >> (BN_BITS2-i))
1650160814Ssimon	$SHL	r5,r5,r7		# d<<=i
1651160814Ssimon	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
1652160814Ssimon	$SHL	r4,r4,r7		# l <<=i
1653160814SsimonLppcasm_div4:
1654160814Ssimon	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
1655160814Ssimon					# dl will be computed when needed
1656160814Ssimon					# as it saves registers.
1657160814Ssimon	li	r6,2			#r6=2
1658160814Ssimon	mtctr	r6			#counter will be in count.
1659160814SsimonLppcasm_divouterloop:
1660160814Ssimon	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
1661160814Ssimon	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
1662160814Ssimon					# compute here for innerloop.
1663160814Ssimon	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
1664238405Sjkim	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
1665160814Ssimon
1666160814Ssimon	li	r8,-1
1667160814Ssimon	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
1668160814Ssimon	b	Lppcasm_div6
1669160814SsimonLppcasm_div5:
1670160814Ssimon	$UDIV	r8,r3,r9		#q = h/dh
1671160814SsimonLppcasm_div6:
1672160814Ssimon	$UMULL	r12,r9,r8		#th = q*dh
1673160814Ssimon	$CLRU	r10,r5,`$BITS/2`	#r10=dl
1674160814Ssimon	$UMULL	r6,r8,r10		#tl = q*dl
1675160814Ssimon
1676160814SsimonLppcasm_divinnerloop:
1677160814Ssimon	subf	r10,r12,r3		#t = h -th
	$SHRI	r7,r10,`$BITS/2`	#r7 = high half of t, i.e. t>>BN_BITS4
1679160814Ssimon	addic.	r7,r7,0			#test if r7 == 0. used below.
1680160814Ssimon					# now want to compute
1681160814Ssimon					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1682160814Ssimon					# the following 2 instructions do that
1683160814Ssimon	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
1684160814Ssimon	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare tl and r7 (for tl <= r7 below)
	bne	Lppcasm_divinnerexit	# exit if the high half of t is non-zero
	ble	cr1,Lppcasm_divinnerexit	# exit if tl <= r7
1688160814Ssimon	addi	r8,r8,-1		#q--
1689160814Ssimon	subf	r12,r9,r12		#th -=dh
1690160814Ssimon	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
1691160814Ssimon	subf	r6,r10,r6		#tl -=dl
1692160814Ssimon	b	Lppcasm_divinnerloop
1693160814SsimonLppcasm_divinnerexit:
1694160814Ssimon	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
1695160814Ssimon	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
1696238405Sjkim	$UCMP	cr1,r4,r11		# compare l and tl
1697160814Ssimon	add	r12,r12,r10		# th+=t
1698238405Sjkim	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
1699160814Ssimon	addi	r12,r12,1		# th++
1700160814SsimonLppcasm_div7:
1701160814Ssimon	subf	r11,r11,r4		#r11=l-tl
1702238405Sjkim	$UCMP	cr1,r3,r12		#compare h and th
1703238405Sjkim	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
1704160814Ssimon	addi	r8,r8,-1		# q--
1705160814Ssimon	add	r3,r5,r3		# h+=d
1706160814SsimonLppcasm_div8:
1707160814Ssimon	subf	r12,r12,r3		#r12 = h-th
1708160814Ssimon	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
1709160814Ssimon					# want to compute
1710160814Ssimon					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1711160814Ssimon					# the following 2 instructions will do this.
1712160814Ssimon	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
1713160814Ssimon	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
1714238405Sjkim	bdz	Lppcasm_div9		#if (count==0) break ;
1715160814Ssimon	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
1716160814Ssimon	b	Lppcasm_divouterloop
1717160814SsimonLppcasm_div9:
1718160814Ssimon	or	r3,r8,r0
1719238405Sjkim	blr
1720238405Sjkim	.long	0
1721238405Sjkim	.byte	0,12,0x14,0,0,0,3,0
1722238405Sjkim	.long	0
1723160814Ssimon
1724160814Ssimon#
1725160814Ssimon#	NOTE:	The following label name should be changed to
#		"bn_sqr_words", i.e. remove the first dot,
#		for the gcc compiler. This should be done
#		automatically by the build.
1729160814Ssimon#
1730160814Ssimon.align	4
1731160814Ssimon.bn_sqr_words:
1732160814Ssimon#
1733160814Ssimon#	Optimized version of bn_sqr_words
1734160814Ssimon#
1735160814Ssimon#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1736160814Ssimon#
1737160814Ssimon#	r3 = r
1738160814Ssimon#	r4 = a
1739160814Ssimon#	r5 = n
1740160814Ssimon#
1741160814Ssimon#	r6 = a[i].
1742160814Ssimon#	r7,r8 = product.
1743160814Ssimon#
1744160814Ssimon#	No unrolling done here. Not performance critical.
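#
#	For reference, a rough C sketch of what the loop below computes,
#	using a hypothetical double-width type DWORD for illustration:
#
#		void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#		{
#			int i;
#			for (i = 0; i < n; i++) {
#				DWORD t = (DWORD)a[i] * a[i];
#				r[2*i]   = (BN_ULONG)t;               /* low word  */
#				r[2*i+1] = (BN_ULONG)(t >> BN_BITS2); /* high word */
#			}
#		}
#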
1745160814Ssimon
1746160814Ssimon	addic.	r5,r5,0			#test r5.
1747238405Sjkim	beq	Lppcasm_sqr_adios
1748160814Ssimon	addi	r4,r4,-$BNSZ
1749160814Ssimon	addi	r3,r3,-$BNSZ
1750160814Ssimon	mtctr	r5
1751160814SsimonLppcasm_sqr_mainloop:
1752160814Ssimon					#sqr(r[0],r[1],a[0]);
1753160814Ssimon	$LDU	r6,$BNSZ(r4)
1754160814Ssimon	$UMULL	r7,r6,r6
1755160814Ssimon	$UMULH  r8,r6,r6
1756160814Ssimon	$STU	r7,$BNSZ(r3)
1757160814Ssimon	$STU	r8,$BNSZ(r3)
1758238405Sjkim	bdnz-	Lppcasm_sqr_mainloop
1759160814SsimonLppcasm_sqr_adios:
1760238405Sjkim	blr
1761238405Sjkim	.long	0
1762238405Sjkim	.byte	0,12,0x14,0,0,0,3,0
1763238405Sjkim	.long	0
1764160814Ssimon
1765160814Ssimon#
1766160814Ssimon#	NOTE:	The following label name should be changed to
#		"bn_mul_words", i.e. remove the first dot,
#		for the gcc compiler. This should be done
#		automatically by the build.
1770160814Ssimon#
1771160814Ssimon
1772160814Ssimon.align	4
1773160814Ssimon.bn_mul_words:
1774160814Ssimon#
1775160814Ssimon# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1776160814Ssimon#
1777160814Ssimon# r3 = rp
1778160814Ssimon# r4 = ap
1779160814Ssimon# r5 = num
1780160814Ssimon# r6 = w
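#
#	For reference, a rough C sketch of the operation implemented below
#	(illustration only, using a hypothetical double-width type DWORD):
#
#		BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap,
#					int num, BN_ULONG w)
#		{
#			BN_ULONG carry = 0;
#			int i;
#			for (i = 0; i < num; i++) {
#				DWORD t = (DWORD)ap[i] * w + carry;
#				rp[i] = (BN_ULONG)t;
#				carry = (BN_ULONG)(t >> BN_BITS2);
#			}
#			return carry;
#		}
#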
1781160814Ssimon	xor	r0,r0,r0
1782160814Ssimon	xor	r12,r12,r12		# used for carry
1783160814Ssimon	rlwinm.	r7,r5,30,2,31		# num >> 2
1784238405Sjkim	beq	Lppcasm_mw_REM
1785160814Ssimon	mtctr	r7
1786160814SsimonLppcasm_mw_LOOP:
1787160814Ssimon					#mul(rp[0],ap[0],w,c1);
1788160814Ssimon	$LD	r8,`0*$BNSZ`(r4)
1789160814Ssimon	$UMULL	r9,r6,r8
1790160814Ssimon	$UMULH  r10,r6,r8
1791160814Ssimon	addc	r9,r9,r12
	#addze	r10,r10			#this addze is not needed: the carry
					#is not lost, it is picked up by the
					#adde in the next step below.
1796160814Ssimon	$ST	r9,`0*$BNSZ`(r3)
1797160814Ssimon					#mul(rp[1],ap[1],w,c1);
1798160814Ssimon	$LD	r8,`1*$BNSZ`(r4)
1799160814Ssimon	$UMULL	r11,r6,r8
1800160814Ssimon	$UMULH  r12,r6,r8
1801160814Ssimon	adde	r11,r11,r10
1802160814Ssimon	#addze	r12,r12
1803160814Ssimon	$ST	r11,`1*$BNSZ`(r3)
1804160814Ssimon					#mul(rp[2],ap[2],w,c1);
1805160814Ssimon	$LD	r8,`2*$BNSZ`(r4)
1806160814Ssimon	$UMULL	r9,r6,r8
1807160814Ssimon	$UMULH  r10,r6,r8
1808160814Ssimon	adde	r9,r9,r12
1809160814Ssimon	#addze	r10,r10
1810160814Ssimon	$ST	r9,`2*$BNSZ`(r3)
1811160814Ssimon					#mul_add(rp[3],ap[3],w,c1);
1812160814Ssimon	$LD	r8,`3*$BNSZ`(r4)
1813160814Ssimon	$UMULL	r11,r6,r8
1814160814Ssimon	$UMULH  r12,r6,r8
1815160814Ssimon	adde	r11,r11,r10
1816160814Ssimon	addze	r12,r12			#this spin we collect carry into
1817160814Ssimon					#r12
1818160814Ssimon	$ST	r11,`3*$BNSZ`(r3)
1819160814Ssimon
1820160814Ssimon	addi	r3,r3,`4*$BNSZ`
1821160814Ssimon	addi	r4,r4,`4*$BNSZ`
1822238405Sjkim	bdnz-	Lppcasm_mw_LOOP
1823160814Ssimon
1824160814SsimonLppcasm_mw_REM:
1825160814Ssimon	andi.	r5,r5,0x3
1826238405Sjkim	beq	Lppcasm_mw_OVER
1827160814Ssimon					#mul(rp[0],ap[0],w,c1);
1828160814Ssimon	$LD	r8,`0*$BNSZ`(r4)
1829160814Ssimon	$UMULL	r9,r6,r8
1830160814Ssimon	$UMULH  r10,r6,r8
1831160814Ssimon	addc	r9,r9,r12
1832160814Ssimon	addze	r10,r10
1833160814Ssimon	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0		#move the carry (high word) into r12
1835160814Ssimon
1836160814Ssimon	addi	r5,r5,-1
1837160814Ssimon	cmpli	0,0,r5,0
1838238405Sjkim	beq	Lppcasm_mw_OVER
1839160814Ssimon
1840160814Ssimon
1841160814Ssimon					#mul(rp[1],ap[1],w,c1);
1842160814Ssimon	$LD	r8,`1*$BNSZ`(r4)
1843160814Ssimon	$UMULL	r9,r6,r8
1844160814Ssimon	$UMULH  r10,r6,r8
1845160814Ssimon	addc	r9,r9,r12
1846160814Ssimon	addze	r10,r10
1847160814Ssimon	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0		#move the carry (high word) into r12
1849160814Ssimon
1850160814Ssimon	addi	r5,r5,-1
1851160814Ssimon	cmpli	0,0,r5,0
1852238405Sjkim	beq	Lppcasm_mw_OVER
1853160814Ssimon
1854160814Ssimon					#mul_add(rp[2],ap[2],w,c1);
1855160814Ssimon	$LD	r8,`2*$BNSZ`(r4)
1856160814Ssimon	$UMULL	r9,r6,r8
1857160814Ssimon	$UMULH  r10,r6,r8
1858160814Ssimon	addc	r9,r9,r12
1859160814Ssimon	addze	r10,r10
1860160814Ssimon	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0		#move the carry (high word) into r12
1862160814Ssimon
1863160814SsimonLppcasm_mw_OVER:
1864160814Ssimon	addi	r3,r12,0
1865238405Sjkim	blr
1866238405Sjkim	.long	0
1867238405Sjkim	.byte	0,12,0x14,0,0,0,4,0
1868238405Sjkim	.long	0
1869160814Ssimon
1870160814Ssimon#
1871160814Ssimon#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words", i.e. remove the first dot,
#		for the gcc compiler. This should be done
#		automatically by the build.
1875160814Ssimon#
1876160814Ssimon
1877160814Ssimon.align	4
1878160814Ssimon.bn_mul_add_words:
1879160814Ssimon#
1880160814Ssimon# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1881160814Ssimon#
1882160814Ssimon# r3 = rp
1883160814Ssimon# r4 = ap
1884160814Ssimon# r5 = num
1885160814Ssimon# r6 = w
1886160814Ssimon#
# empirical evidence suggests that the unrolled version performs best.
1888160814Ssimon#
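#	For reference, a rough C sketch of the operation implemented below
#	(illustration only, using a hypothetical double-width type DWORD):
#
#		BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap,
#						int num, BN_ULONG w)
#		{
#			BN_ULONG carry = 0;
#			int i;
#			for (i = 0; i < num; i++) {
#				DWORD t = (DWORD)ap[i] * w + rp[i] + carry;
#				rp[i] = (BN_ULONG)t;
#				carry = (BN_ULONG)(t >> BN_BITS2);
#			}
#			return carry;
#		}
#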
1889160814Ssimon	xor	r0,r0,r0		#r0 = 0
1890160814Ssimon	xor	r12,r12,r12  		#r12 = 0 . used for carry
1891160814Ssimon	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
1893160814Ssimon	mtctr	r7
1894160814SsimonLppcasm_maw_mainloop:
1895160814Ssimon					#mul_add(rp[0],ap[0],w,c1);
1896160814Ssimon	$LD	r8,`0*$BNSZ`(r4)
1897160814Ssimon	$LD	r11,`0*$BNSZ`(r3)
1898160814Ssimon	$UMULL	r9,r6,r8
1899160814Ssimon	$UMULH  r10,r6,r8
1900160814Ssimon	addc	r9,r9,r12		#r12 is carry.
1901160814Ssimon	addze	r10,r10
1902160814Ssimon	addc	r9,r9,r11
	#addze	r10,r10
					#the addze above is not needed: the
					#carry is not lost. It is unaffected
					#by the multiply and is collected by
					#the adde in the next step.
1909160814Ssimon	$ST	r9,`0*$BNSZ`(r3)
1910160814Ssimon
1911160814Ssimon					#mul_add(rp[1],ap[1],w,c1);
1912160814Ssimon	$LD	r8,`1*$BNSZ`(r4)
1913160814Ssimon	$LD	r9,`1*$BNSZ`(r3)
1914160814Ssimon	$UMULL	r11,r6,r8
1915160814Ssimon	$UMULH  r12,r6,r8
1916160814Ssimon	adde	r11,r11,r10		#r10 is carry.
1917160814Ssimon	addze	r12,r12
1918160814Ssimon	addc	r11,r11,r9
1919160814Ssimon	#addze	r12,r12
1920160814Ssimon	$ST	r11,`1*$BNSZ`(r3)
1921160814Ssimon
1922160814Ssimon					#mul_add(rp[2],ap[2],w,c1);
1923160814Ssimon	$LD	r8,`2*$BNSZ`(r4)
1924160814Ssimon	$UMULL	r9,r6,r8
1925160814Ssimon	$LD	r11,`2*$BNSZ`(r3)
1926160814Ssimon	$UMULH  r10,r6,r8
1927160814Ssimon	adde	r9,r9,r12
1928160814Ssimon	addze	r10,r10
1929160814Ssimon	addc	r9,r9,r11
1930160814Ssimon	#addze	r10,r10
1931160814Ssimon	$ST	r9,`2*$BNSZ`(r3)
1932160814Ssimon
1933160814Ssimon					#mul_add(rp[3],ap[3],w,c1);
1934160814Ssimon	$LD	r8,`3*$BNSZ`(r4)
1935160814Ssimon	$UMULL	r11,r6,r8
1936160814Ssimon	$LD	r9,`3*$BNSZ`(r3)
1937160814Ssimon	$UMULH  r12,r6,r8
1938160814Ssimon	adde	r11,r11,r10
1939160814Ssimon	addze	r12,r12
1940160814Ssimon	addc	r11,r11,r9
1941160814Ssimon	addze	r12,r12
1942160814Ssimon	$ST	r11,`3*$BNSZ`(r3)
1943160814Ssimon	addi	r3,r3,`4*$BNSZ`
1944160814Ssimon	addi	r4,r4,`4*$BNSZ`
1945238405Sjkim	bdnz-	Lppcasm_maw_mainloop
1946160814Ssimon
1947160814SsimonLppcasm_maw_leftover:
1948160814Ssimon	andi.	r5,r5,0x3
1949238405Sjkim	beq	Lppcasm_maw_adios
1950160814Ssimon	addi	r3,r3,-$BNSZ
1951160814Ssimon	addi	r4,r4,-$BNSZ
1952160814Ssimon					#mul_add(rp[0],ap[0],w,c1);
1953160814Ssimon	mtctr	r5
1954160814Ssimon	$LDU	r8,$BNSZ(r4)
1955160814Ssimon	$UMULL	r9,r6,r8
1956160814Ssimon	$UMULH  r10,r6,r8
1957160814Ssimon	$LDU	r11,$BNSZ(r3)
1958160814Ssimon	addc	r9,r9,r11
1959160814Ssimon	addze	r10,r10
1960160814Ssimon	addc	r9,r9,r12
1961160814Ssimon	addze	r12,r10
1962160814Ssimon	$ST	r9,0(r3)
1963160814Ssimon
1964238405Sjkim	bdz	Lppcasm_maw_adios
1965160814Ssimon					#mul_add(rp[1],ap[1],w,c1);
1966160814Ssimon	$LDU	r8,$BNSZ(r4)
1967160814Ssimon	$UMULL	r9,r6,r8
1968160814Ssimon	$UMULH  r10,r6,r8
1969160814Ssimon	$LDU	r11,$BNSZ(r3)
1970160814Ssimon	addc	r9,r9,r11
1971160814Ssimon	addze	r10,r10
1972160814Ssimon	addc	r9,r9,r12
1973160814Ssimon	addze	r12,r10
1974160814Ssimon	$ST	r9,0(r3)
1975160814Ssimon
1976238405Sjkim	bdz	Lppcasm_maw_adios
1977160814Ssimon					#mul_add(rp[2],ap[2],w,c1);
1978160814Ssimon	$LDU	r8,$BNSZ(r4)
1979160814Ssimon	$UMULL	r9,r6,r8
1980160814Ssimon	$UMULH  r10,r6,r8
1981160814Ssimon	$LDU	r11,$BNSZ(r3)
1982160814Ssimon	addc	r9,r9,r11
1983160814Ssimon	addze	r10,r10
1984160814Ssimon	addc	r9,r9,r12
1985160814Ssimon	addze	r12,r10
1986160814Ssimon	$ST	r9,0(r3)
1987160814Ssimon
1988160814SsimonLppcasm_maw_adios:
1989160814Ssimon	addi	r3,r12,0
1990238405Sjkim	blr
1991238405Sjkim	.long	0
1992238405Sjkim	.byte	0,12,0x14,0,0,0,4,0
1993238405Sjkim	.long	0
1994160814Ssimon	.align	4
1995160814SsimonEOF
1996238405Sjkim$data =~ s/\`([^\`]*)\`/eval $1/gem;
1997238405Sjkimprint $data;
1998238405Sjkimclose STDOUT;