#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick the target based on the
# file name we are asked to generate.
#
# It should be noted though that this Perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case Perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know, there are several PowerPC ABIs in use. Most notably,
# Linux and AIX use different 32-bit ABIs. The good news is that these
# ABIs are similar enough to implement leaf(!) functions that are ABI
# neutral. And that's what you find here: ABI-neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#	Measurements with cc on a 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 Dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#	compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	The number of operations per second increases by almost 75%.
#
#	Here are performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%.
#
#       Mac OS X, Apple G5 1.8GHz (note this is 32-bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code:
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#       Performance increase of ~60%.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari@us.ibm.com
#

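# Usage: the requested output file name selects both the word size and
# the ABI flavour. The names below are hypothetical; any name matching
# the patterns tested further down will do:
#
#	perl ppc.pl linux_ppc32.s	# 32-bit Linux
#	perl ppc.pl aix_ppc64.s		# 64-bit AIX
#	perl ppc.pl osx_ppc32.s		# 32-bit Mac OS X
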
$opf = shift;

if ($opf =~ /32\.s/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($opf =~ /64\.s/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $opf"; }

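# If an extra argument was supplied, leave STDOUT alone (the caller
# handles redirection); otherwise send STDOUT to the requested file.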
( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";

# function entry points from the AIX code
#
# There are other, more elegant, ways to handle this. We (IBM) chose
# this approach as it plays well with scripts we run to 'namespace'
# OpenSSL, i.e. we add a prefix to all the public symbols so we can
# co-exist in the same process with other implementations of OpenSSL.
# 'Cleverer' ways of doing these substitutions tend to hide data we
# need to be obvious.
#
my @items = ("bn_sqr_comba4",
	     "bn_sqr_comba8",
	     "bn_mul_comba4",
	     "bn_mul_comba8",
	     "bn_sub_words",
	     "bn_add_words",
	     "bn_div_words",
	     "bn_sqr_words",
	     "bn_mul_words",
	     "bn_mul_add_words");

if    ($opf =~ /linux/)	{  do_linux();	}
elsif ($opf =~ /aix/)	{  do_aix();	}
elsif ($opf =~ /osx/)	{  do_osx();	}
else			{  do_bsd();	}

sub do_linux {
    $d=&data();

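    # For the 64-bit ELF ABI each exported function needs an entry in
    # the .opd section: a three-doubleword function descriptor holding
    # the code address, the TOC base and an environment pointer. The
    # substitution below wraps every local ".name" label in such a
    # descriptor plus a global ".name" entry point.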
    if ($BITS==64) {
      foreach $t (@items) {
        $d =~ s/\.$t:/\
\t.section\t".opd","aw"\
\t.align\t3\
\t.globl\t$t\
$t:\
\t.quad\t.$t,.TOC.\@tocbase,0\
\t.size\t$t,24\
\t.previous\n\
\t.type\t.$t,\@function\
\t.globl\t.$t\
.$t:/g;
      }
    }
    else {
      foreach $t (@items) {
        $d=~s/\.$t/$t/g;
      }
    }
    # hide internal labels to avoid pollution of the name table...
    $d=~s/Lppcasm_/.Lppcasm_/gm;
    print $d;
}

sub do_aix {
    # AIX assembler is smart enough to please the linker without
    # making us do something special...
    print &data();
}

# Mac OS X, 32-bit
sub do_osx {
    $d=&data();
    # Change the bn symbol prefix from '.' to '_'
    foreach $t (@items) {
      $d=~s/\.$t/_$t/g;
    }
    # Change .machine to something OS X asm will accept
    $d=~s/\.machine.*/.text/g;
    $d=~s/\#/;/g; # change comment from '#' to ';'
    print $d;
}

# BSD (untested)
sub do_bsd {
    $d=&data();
    foreach $t (@items) {
      $d=~s/\.$t/_$t/g;
    }
    print $d;
}

sub data {
	local($data)=<<EOF;
#--------------------------------------------------------------------
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)        Andy Polyakov
#
#	1. Initial version	10/20/02		Suresh Chari
#
#
#	The following file works for the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand-optimized assembly code for the following routines:
#
#	bn_sqr_comba4
#	bn_sqr_comba8
#	bn_mul_comba4
#	bn_mul_comba8
#	bn_sub_words
#	bn_add_words
#	bn_div_words
#	bn_sqr_words
#	bn_mul_words
#	bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do NOT
#	provide much improvement.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
.set r0,0	# we use it as storage for the value 0
.set SP,1	# preserved
.set RTOC,2	# preserved
.set r3,3	# 1st argument/return value
.set r4,4	# 2nd argument/volatile register
.set r5,5	# 3rd argument/volatile register
.set r6,6	# ...
.set r7,7
.set r8,8
.set r9,9
.set r10,10
.set r11,11
.set r12,12
.set r13,13	# not used, nor any other "below" it...

.set BO_IF_NOT,4
.set BO_IF,12
.set BO_dCTR_NZERO,16
.set BO_dCTR_ZERO,18
.set BO_ALWAYS,20
.set CR0_LT,0
.set CR0_GT,1
.set CR0_EQ,2
.set CR1_FX,4
.set CR1_FEX,5
.set CR1_VX,6
.set LR,8
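
# With the encodings above, "bclr BO_ALWAYS,CR0_LT", used at the end of
# each routine below, is a "branch always" to the link register (the
# condition bit is ignored), i.e. an unconditional return equivalent
# to blr.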

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#		the leading dot, e.g. change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	$ISA

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the low and high words of the double-width product.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly:
#
#
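# For reference, the steps hand-scheduled below are roughly (a sketch
# of the comba primitives, not literal OpenSSL source):
#
#	sqr_add_c(a,i,c1,c2,c3):    (lo,hi) = a[i]*a[i];
#	                            c1 += lo; c2 += hi + carry; c3 += carry;
#	sqr_add_c2(a,i,j,c1,c2,c3): (lo,hi) = 2*a[i]*a[j];
#	                            c1 += lo; c2 += hi + carry; c3 += carry;
#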
	xor		r0,r0,r0		# set r0 = 0; used by the addze
						# instructions below

						#sqr_add_c(a,0,c1,c2,c3)
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5
	$UMULH		r10,r5,r5		#in the first iteration there is
						#nothing to add since c1=c2=c3=0.
						#Note c3(r11) is NOT set to 0
						#here, but will be below.

	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde		r8,r8,r8
	addze		r9,r0			# catch the carry, if any:
						# r9 = r0(=0) + carry

	addc		r10,r7,r10		# now add into the temp result.
	addze		r11,r8			# r11 = r8 + carry (c3 was 0)
	addze		r9,r9

	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
						#sqr_add_c(a,1,c3,c1,c2)
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10

	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000
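						# the zero word after the
						# return marks an empty
						# AIX-style traceback table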

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# It makes heavy use of the carrying add instructions (addc/adde/addze).
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the low and high words of the double-width product.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# A possible optimization, loading all 8 words of a into registers
# up front, does not provide any speedup.
#
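# Unlike bn_sqr_comba4 above, which doubles the product (r7,r8) in
# place before accumulating, this routine adds the product into the
# accumulator twice; the result is the same 2*a[i]*a[j] term.
#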

	xor		r0,r0,r0		#set r0 = 0. Used in addze
						#instructions below.

						#sqr_add_c(a,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5		#1st iteration:	no carries.
	$UMULH		r10,r5,r5
	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10		#add the two-register number
	adde		r11,r8,r0 		# (r8,r7) to the three-register
	addze		r9,r0			# number (r9,r11,r10). NOTE: r0=0

	addc		r10,r7,r10		#add the two-register number
	adde		r11,r8,r11 		# (r8,r7) to the three-register
	addze		r9,r9			# number (r9,r11,r10).

	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2

						#sqr_add_c(a,1,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
						#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
						#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
						#sqr_add_c(a,4,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
						#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD		r5,`4*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
						#sqr_add_c(a,5,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
						#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD		r5,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD		r5,`5*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
						#sqr_add_c(a,6,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;

						#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD		r5,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
						#sqr_add_c(a,7,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST		r9,`15*$BNSZ`(r3)	#r[15]=c1;


	bclr	BO_ALWAYS,CR0_LT

	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the low and high words of the double-width product.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
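# For reference, the step repeated below is roughly (a sketch, not
# literal OpenSSL source):
#
#	mul_add_c(a,b,c1,c2,c3): (lo,hi) = a*b;
#	                         c1 += lo; c2 += hi + carry; c3 += carry;
#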
	xor	r0,r0,r0		#r0=0. Used in addze below.
					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the low and high words of the double-width product.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
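# The products are accumulated in comba (column) order: every a[i]*b[j]
# with i+j == k is added into the current column before r[k] is stored
# and the accumulator roles rotate.
#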
	xor	r0,r0,r0		#r0=0. Used in addze below.

					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
1096160814Ssimon					#mul_add_c(a[1],b[0],c2,c3,c1);
1097160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1098160814Ssimon	$LD	r7,`0*$BNSZ`(r5)
1099160814Ssimon	$UMULL	r8,r6,r7
1100160814Ssimon	$UMULH	r9,r6,r7
1101160814Ssimon	addc	r11,r11,r8
1102160814Ssimon	adde	r12,r12,r9
1103160814Ssimon	addze	r10,r10
1104160814Ssimon	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
1105160814Ssimon					#mul_add_c(a[2],b[0],c3,c1,c2);
1106160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1107160814Ssimon	$UMULL	r8,r6,r7
1108160814Ssimon	$UMULH	r9,r6,r7
1109160814Ssimon	addc	r12,r12,r8
1110160814Ssimon	adde	r10,r10,r9
1111160814Ssimon	addze	r11,r0
1112160814Ssimon					#mul_add_c(a[1],b[1],c3,c1,c2);
1113160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1114160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1115160814Ssimon	$UMULL	r8,r6,r7
1116160814Ssimon	$UMULH	r9,r6,r7
1117160814Ssimon	addc	r12,r12,r8
1118160814Ssimon	adde	r10,r10,r9
1119160814Ssimon	addze	r11,r11
1120160814Ssimon					#mul_add_c(a[0],b[2],c3,c1,c2);
1121160814Ssimon	$LD	r6,`0*$BNSZ`(r4)
1122160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1123160814Ssimon	$UMULL	r8,r6,r7
1124160814Ssimon	$UMULH	r9,r6,r7
1125160814Ssimon	addc	r12,r12,r8
1126160814Ssimon	adde	r10,r10,r9
1127160814Ssimon	addze	r11,r11
1128160814Ssimon	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
1129160814Ssimon					#mul_add_c(a[0],b[3],c1,c2,c3);
1130160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1131160814Ssimon	$UMULL	r8,r6,r7
1132160814Ssimon	$UMULH	r9,r6,r7
1133160814Ssimon	addc	r10,r10,r8
1134160814Ssimon	adde	r11,r11,r9
1135160814Ssimon	addze	r12,r0
1136160814Ssimon					#mul_add_c(a[1],b[2],c1,c2,c3);
1137160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1138160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1139160814Ssimon	$UMULL	r8,r6,r7
1140160814Ssimon	$UMULH	r9,r6,r7
1141160814Ssimon	addc	r10,r10,r8
1142160814Ssimon	adde	r11,r11,r9
1143160814Ssimon	addze	r12,r12
1144160814Ssimon
1145160814Ssimon					#mul_add_c(a[2],b[1],c1,c2,c3);
1146160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1147160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1148160814Ssimon	$UMULL	r8,r6,r7
1149160814Ssimon	$UMULH	r9,r6,r7
1150160814Ssimon	addc	r10,r10,r8
1151160814Ssimon	adde	r11,r11,r9
1152160814Ssimon	addze	r12,r12
1153160814Ssimon					#mul_add_c(a[3],b[0],c1,c2,c3);
1154160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1155160814Ssimon	$LD	r7,`0*$BNSZ`(r5)
1156160814Ssimon	$UMULL	r8,r6,r7
1157160814Ssimon	$UMULH	r9,r6,r7
1158160814Ssimon	addc	r10,r10,r8
1159160814Ssimon	adde	r11,r11,r9
1160160814Ssimon	addze	r12,r12
1161160814Ssimon	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
1162160814Ssimon					#mul_add_c(a[4],b[0],c2,c3,c1);
1163160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1164160814Ssimon	$UMULL	r8,r6,r7
1165160814Ssimon	$UMULH	r9,r6,r7
1166160814Ssimon	addc	r11,r11,r8
1167160814Ssimon	adde	r12,r12,r9
1168160814Ssimon	addze	r10,r0
1169160814Ssimon					#mul_add_c(a[3],b[1],c2,c3,c1);
1170160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1171160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1172160814Ssimon	$UMULL	r8,r6,r7
1173160814Ssimon	$UMULH	r9,r6,r7
1174160814Ssimon	addc	r11,r11,r8
1175160814Ssimon	adde	r12,r12,r9
1176160814Ssimon	addze	r10,r10
1177160814Ssimon					#mul_add_c(a[2],b[2],c2,c3,c1);
1178160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1179160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1180160814Ssimon	$UMULL	r8,r6,r7
1181160814Ssimon	$UMULH	r9,r6,r7
1182160814Ssimon	addc	r11,r11,r8
1183160814Ssimon	adde	r12,r12,r9
1184160814Ssimon	addze	r10,r10
1185160814Ssimon					#mul_add_c(a[1],b[3],c2,c3,c1);
1186160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1187160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1188160814Ssimon	$UMULL	r8,r6,r7
1189160814Ssimon	$UMULH	r9,r6,r7
1190160814Ssimon	addc	r11,r11,r8
1191160814Ssimon	adde	r12,r12,r9
1192160814Ssimon	addze	r10,r10
1193160814Ssimon					#mul_add_c(a[0],b[4],c2,c3,c1);
1194160814Ssimon	$LD	r6,`0*$BNSZ`(r4)
1195160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1196160814Ssimon	$UMULL	r8,r6,r7
1197160814Ssimon	$UMULH	r9,r6,r7
1198160814Ssimon	addc	r11,r11,r8
1199160814Ssimon	adde	r12,r12,r9
1200160814Ssimon	addze	r10,r10
1201160814Ssimon	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
1202160814Ssimon					#mul_add_c(a[0],b[5],c3,c1,c2);
1203160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1204160814Ssimon	$UMULL	r8,r6,r7
1205160814Ssimon	$UMULH	r9,r6,r7
1206160814Ssimon	addc	r12,r12,r8
1207160814Ssimon	adde	r10,r10,r9
1208160814Ssimon	addze	r11,r0
1209160814Ssimon					#mul_add_c(a[1],b[4],c3,c1,c2);
1210160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1211160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1212160814Ssimon	$UMULL	r8,r6,r7
1213160814Ssimon	$UMULH	r9,r6,r7
1214160814Ssimon	addc	r12,r12,r8
1215160814Ssimon	adde	r10,r10,r9
1216160814Ssimon	addze	r11,r11
1217160814Ssimon					#mul_add_c(a[2],b[3],c3,c1,c2);
1218160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1219160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1220160814Ssimon	$UMULL	r8,r6,r7
1221160814Ssimon	$UMULH	r9,r6,r7
1222160814Ssimon	addc	r12,r12,r8
1223160814Ssimon	adde	r10,r10,r9
1224160814Ssimon	addze	r11,r11
1225160814Ssimon					#mul_add_c(a[3],b[2],c3,c1,c2);
1226160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1227160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1228160814Ssimon	$UMULL	r8,r6,r7
1229160814Ssimon	$UMULH	r9,r6,r7
1230160814Ssimon	addc	r12,r12,r8
1231160814Ssimon	adde	r10,r10,r9
1232160814Ssimon	addze	r11,r11
1233160814Ssimon					#mul_add_c(a[4],b[1],c3,c1,c2);
1234160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1235160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1236160814Ssimon	$UMULL	r8,r6,r7
1237160814Ssimon	$UMULH	r9,r6,r7
1238160814Ssimon	addc	r12,r12,r8
1239160814Ssimon	adde	r10,r10,r9
1240160814Ssimon	addze	r11,r11
1241160814Ssimon					#mul_add_c(a[5],b[0],c3,c1,c2);
1242160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1243160814Ssimon	$LD	r7,`0*$BNSZ`(r5)
1244160814Ssimon	$UMULL	r8,r6,r7
1245160814Ssimon	$UMULH	r9,r6,r7
1246160814Ssimon	addc	r12,r12,r8
1247160814Ssimon	adde	r10,r10,r9
1248160814Ssimon	addze	r11,r11
1249160814Ssimon	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
1250160814Ssimon					#mul_add_c(a[6],b[0],c1,c2,c3);
1251160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1252160814Ssimon	$UMULL	r8,r6,r7
1253160814Ssimon	$UMULH	r9,r6,r7
1254160814Ssimon	addc	r10,r10,r8
1255160814Ssimon	adde	r11,r11,r9
1256160814Ssimon	addze	r12,r0
1257160814Ssimon					#mul_add_c(a[5],b[1],c1,c2,c3);
1258160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1259160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1260160814Ssimon	$UMULL	r8,r6,r7
1261160814Ssimon	$UMULH	r9,r6,r7
1262160814Ssimon	addc	r10,r10,r8
1263160814Ssimon	adde	r11,r11,r9
1264160814Ssimon	addze	r12,r12
1265160814Ssimon					#mul_add_c(a[4],b[2],c1,c2,c3);
1266160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1267160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1268160814Ssimon	$UMULL	r8,r6,r7
1269160814Ssimon	$UMULH	r9,r6,r7
1270160814Ssimon	addc	r10,r10,r8
1271160814Ssimon	adde	r11,r11,r9
1272160814Ssimon	addze	r12,r12
1273160814Ssimon					#mul_add_c(a[3],b[3],c1,c2,c3);
1274160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1275160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1276160814Ssimon	$UMULL	r8,r6,r7
1277160814Ssimon	$UMULH	r9,r6,r7
1278160814Ssimon	addc	r10,r10,r8
1279160814Ssimon	adde	r11,r11,r9
1280160814Ssimon	addze	r12,r12
1281160814Ssimon					#mul_add_c(a[2],b[4],c1,c2,c3);
1282160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1283160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1284160814Ssimon	$UMULL	r8,r6,r7
1285160814Ssimon	$UMULH	r9,r6,r7
1286160814Ssimon	addc	r10,r10,r8
1287160814Ssimon	adde	r11,r11,r9
1288160814Ssimon	addze	r12,r12
1289160814Ssimon					#mul_add_c(a[1],b[5],c1,c2,c3);
1290160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1291160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1292160814Ssimon	$UMULL	r8,r6,r7
1293160814Ssimon	$UMULH	r9,r6,r7
1294160814Ssimon	addc	r10,r10,r8
1295160814Ssimon	adde	r11,r11,r9
1296160814Ssimon	addze	r12,r12
1297160814Ssimon					#mul_add_c(a[0],b[6],c1,c2,c3);
1298160814Ssimon	$LD	r6,`0*$BNSZ`(r4)
1299160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1300160814Ssimon	$UMULL	r8,r6,r7
1301160814Ssimon	$UMULH	r9,r6,r7
1302160814Ssimon	addc	r10,r10,r8
1303160814Ssimon	adde	r11,r11,r9
1304160814Ssimon	addze	r12,r12
1305160814Ssimon	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
1306160814Ssimon					#mul_add_c(a[0],b[7],c2,c3,c1);
1307160814Ssimon	$LD	r7,`7*$BNSZ`(r5)
1308160814Ssimon	$UMULL	r8,r6,r7
1309160814Ssimon	$UMULH	r9,r6,r7
1310160814Ssimon	addc	r11,r11,r8
1311160814Ssimon	adde	r12,r12,r9
1312160814Ssimon	addze	r10,r0
1313160814Ssimon					#mul_add_c(a[1],b[6],c2,c3,c1);
1314160814Ssimon	$LD	r6,`1*$BNSZ`(r4)
1315160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1316160814Ssimon	$UMULL	r8,r6,r7
1317160814Ssimon	$UMULH	r9,r6,r7
1318160814Ssimon	addc	r11,r11,r8
1319160814Ssimon	adde	r12,r12,r9
1320160814Ssimon	addze	r10,r10
1321160814Ssimon					#mul_add_c(a[2],b[5],c2,c3,c1);
1322160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1323160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1324160814Ssimon	$UMULL	r8,r6,r7
1325160814Ssimon	$UMULH	r9,r6,r7
1326160814Ssimon	addc	r11,r11,r8
1327160814Ssimon	adde	r12,r12,r9
1328160814Ssimon	addze	r10,r10
1329160814Ssimon					#mul_add_c(a[3],b[4],c2,c3,c1);
1330160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1331160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1332160814Ssimon	$UMULL	r8,r6,r7
1333160814Ssimon	$UMULH	r9,r6,r7
1334160814Ssimon	addc	r11,r11,r8
1335160814Ssimon	adde	r12,r12,r9
1336160814Ssimon	addze	r10,r10
1337160814Ssimon					#mul_add_c(a[4],b[3],c2,c3,c1);
1338160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1339160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1340160814Ssimon	$UMULL	r8,r6,r7
1341160814Ssimon	$UMULH	r9,r6,r7
1342160814Ssimon	addc	r11,r11,r8
1343160814Ssimon	adde	r12,r12,r9
1344160814Ssimon	addze	r10,r10
1345160814Ssimon					#mul_add_c(a[5],b[2],c2,c3,c1);
1346160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1347160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1348160814Ssimon	$UMULL	r8,r6,r7
1349160814Ssimon	$UMULH	r9,r6,r7
1350160814Ssimon	addc	r11,r11,r8
1351160814Ssimon	adde	r12,r12,r9
1352160814Ssimon	addze	r10,r10
1353160814Ssimon					#mul_add_c(a[6],b[1],c2,c3,c1);
1354160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1355160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1356160814Ssimon	$UMULL	r8,r6,r7
1357160814Ssimon	$UMULH	r9,r6,r7
1358160814Ssimon	addc	r11,r11,r8
1359160814Ssimon	adde	r12,r12,r9
1360160814Ssimon	addze	r10,r10
1361160814Ssimon					#mul_add_c(a[7],b[0],c2,c3,c1);
1362160814Ssimon	$LD	r6,`7*$BNSZ`(r4)
1363160814Ssimon	$LD	r7,`0*$BNSZ`(r5)
1364160814Ssimon	$UMULL	r8,r6,r7
1365160814Ssimon	$UMULH	r9,r6,r7
1366160814Ssimon	addc	r11,r11,r8
1367160814Ssimon	adde	r12,r12,r9
1368160814Ssimon	addze	r10,r10
1369160814Ssimon	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
1370160814Ssimon					#mul_add_c(a[7],b[1],c3,c1,c2);
1371160814Ssimon	$LD	r7,`1*$BNSZ`(r5)
1372160814Ssimon	$UMULL	r8,r6,r7
1373160814Ssimon	$UMULH	r9,r6,r7
1374160814Ssimon	addc	r12,r12,r8
1375160814Ssimon	adde	r10,r10,r9
1376160814Ssimon	addze	r11,r0
1377160814Ssimon					#mul_add_c(a[6],b[2],c3,c1,c2);
1378160814Ssimon	$LD	r6,`6*$BNSZ`(r4)
1379160814Ssimon	$LD	r7,`2*$BNSZ`(r5)
1380160814Ssimon	$UMULL	r8,r6,r7
1381160814Ssimon	$UMULH	r9,r6,r7
1382160814Ssimon	addc	r12,r12,r8
1383160814Ssimon	adde	r10,r10,r9
1384160814Ssimon	addze	r11,r11
1385160814Ssimon					#mul_add_c(a[5],b[3],c3,c1,c2);
1386160814Ssimon	$LD	r6,`5*$BNSZ`(r4)
1387160814Ssimon	$LD	r7,`3*$BNSZ`(r5)
1388160814Ssimon	$UMULL	r8,r6,r7
1389160814Ssimon	$UMULH	r9,r6,r7
1390160814Ssimon	addc	r12,r12,r8
1391160814Ssimon	adde	r10,r10,r9
1392160814Ssimon	addze	r11,r11
1393160814Ssimon					#mul_add_c(a[4],b[4],c3,c1,c2);
1394160814Ssimon	$LD	r6,`4*$BNSZ`(r4)
1395160814Ssimon	$LD	r7,`4*$BNSZ`(r5)
1396160814Ssimon	$UMULL	r8,r6,r7
1397160814Ssimon	$UMULH	r9,r6,r7
1398160814Ssimon	addc	r12,r12,r8
1399160814Ssimon	adde	r10,r10,r9
1400160814Ssimon	addze	r11,r11
1401160814Ssimon					#mul_add_c(a[3],b[5],c3,c1,c2);
1402160814Ssimon	$LD	r6,`3*$BNSZ`(r4)
1403160814Ssimon	$LD	r7,`5*$BNSZ`(r5)
1404160814Ssimon	$UMULL	r8,r6,r7
1405160814Ssimon	$UMULH	r9,r6,r7
1406160814Ssimon	addc	r12,r12,r8
1407160814Ssimon	adde	r10,r10,r9
1408160814Ssimon	addze	r11,r11
1409160814Ssimon					#mul_add_c(a[2],b[6],c3,c1,c2);
1410160814Ssimon	$LD	r6,`2*$BNSZ`(r4)
1411160814Ssimon	$LD	r7,`6*$BNSZ`(r5)
1412160814Ssimon	$UMULL	r8,r6,r7
1413160814Ssimon	$UMULH	r9,r6,r7
1414160814Ssimon	addc	r12,r12,r8
1415160814Ssimon	adde	r10,r10,r9
1416160814Ssimon	addze	r11,r11
					#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
					#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
					#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
					#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
					#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
					#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
					#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sub_words:
#
#	Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.

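#	For reference, the C semantics this routine implements (a
#	hedged sketch in comments only, not part of the generated code):
#
#	BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#	{
#		BN_ULONG borrow = 0;
#		for (int i = 0; i < n; i++) {
#			BN_ULONG t = a[i] - b[i] - borrow;
#			borrow = (a[i] < b[i]) ||
#			         (a[i] == b[i] && borrow);
#			r[i] = t;
#		}
#		return borrow;	/* 1 iff a < b as n-word numbers */
#	}
#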
	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	bc	BO_IF,CR0_EQ,Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7 + carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8-1, as we need.
	$STU	r6,$BNSZ(r3)
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1         # keep only last bit.
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000


#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
#	Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.

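#	For reference, the C semantics this routine implements (a
#	hedged sketch in comments only, not part of the generated code):
#
#	BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#	{
#		BN_ULONG carry = 0;
#		for (int i = 0; i < n; i++) {
#			BN_ULONG t = a[i] + b[i] + carry;
#			carry = (t < a[i]) ||
#			        (carry && t == a[i]);
#			r[i] = t;
#		}
#		return carry;
#	}
#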
	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	bc	BO_IF,CR0_EQ,Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0			#return carry bit.
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of a call to BN_num_bits_word. Since this was compiled
#	at only -O2, it can probably be squeezed further.
#
#	r3 = h
#	r4 = l
#	r5 = d

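#	For reference, a hedged C sketch of the semantics (dword is a
#	hypothetical double-width type that need not exist on the
#	target; the code below reaches the same result without a
#	double-width divide, via normalization and two half-word
#	quotient digits):
#
#	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
#	{
#		if (d == 0)
#			return (BN_ULONG)-1;
#		/* caller is expected to keep h < d */
#		return (BN_ULONG)((((dword)h << BN_BITS2) | l) / d);
#	}
#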
	$UCMPI	0,r5,0			# compare r5 and 0
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div1	# proceed if d!=0
	li	r3,-1			# d=0 return -1
	bclr	BO_ALWAYS,CR0_LT
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	bc	BO_IF,CR0_EQ,Lppcasm_div2	#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there are, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	bc	BO_IF,CR0_LT,Lppcasm_div3	#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2 - BN_num_bits_word(d) = i
	cmpi	0,0,r7,0		# is (i == 0)?
	bc	BO_IF,CR0_EQ,Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<< i)
	$SHR	r8,r4,r8		# r8 = l >> (BN_BITS2-i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l <<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#loop counter will be in the CTR register.
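					# Hedged overview (comment
					# only): each of the two CTR
					# iterations produces one
					# BN_BITS4-wide quotient digit.
					# Estimate q = h / dh, decrement
					# q in the inner loop until the
					# q*dl term fits, subtract the
					# partial product from (h,l) and
					# shift the remainder up by
					# BN_BITS4 for the next digit.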
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div5	# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h -th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	1,r6,r7			# compare (tl <= r7)
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
	bc	BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bc	BO_IF_NOT,CR1_FX,Lppcasm_div7  # if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	1,r3,r12		#compare h and th
	bc	BO_IF_NOT,CR1_FX,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	No unrolling done here. Not performance critical.
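#
#	For reference, a hedged C sketch of what the loop computes
#	(lo() and hi() are hypothetical helpers naming the low and
#	high words of the double-width product):
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#	{
#		for (int i = 0; i < n; i++) {
#			r[2*i]   = lo(a[i] * a[i]);
#			r[2*i+1] = hi(a[i] * a[i]);
#		}
#	}
#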
	addic.	r5,r5,0			#test r5.
	bc	BO_IF,CR0_EQ,Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
					#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH  r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000


#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
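#
# For reference, a hedged C sketch of the semantics (lo() and hi()
# are hypothetical helpers naming the two halves of the double-width
# product ap[i]*w):
#
#	BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#	{
#		BN_ULONG carry = 0;
#		for (int i = 0; i < num; i++) {
#			BN_ULONG l = lo(ap[i] * w) + carry;
#			carry = hi(ap[i] * w) + (l < carry);
#			rp[i] = l;
#		}
#		return carry;
#	}
#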
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	bc	BO_IF,CR0_EQ,Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH  r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
					#mul(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH  r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER


					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER

					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# empirical evidence suggests that the unrolled version performs best.
#
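# For reference, a hedged C sketch of the semantics; it differs from
# bn_mul_words only in that the existing rp[i] is accumulated into
# the product (lo() and hi() are hypothetical double-width helpers):
#
#	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#	{
#		BN_ULONG carry = 0;
#		for (int i = 0; i < num; i++) {
#			BN_ULONG l = lo(ap[i] * w) + carry;
#			carry = hi(ap[i] * w) + (l < carry);
#			l += rp[i];
#			carry += (l < rp[i]);
#			rp[i] = l;
#		}
#		return carry;
#	}
#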
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0. used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	bc	BO_IF,CR0_EQ,Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
					#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

					#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH  r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

					#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH  r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

					#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH  r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	bc	BO_IF,CR0_EQ,Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
					#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
					#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
					#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000
	.align	4
EOF
	$data =~ s/\`([^\`]*)\`/eval $1/gem;

	# if some assembler chokes on some simplified mnemonic,
	# this is the spot to fix it up, e.g.:
	# GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
	$data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
	# assembler X doesn't accept li, load immediate value
	#$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
	# assembler Y chokes on apostrophes in comments
	$data =~ s/'//gm;
	return($data);
}