1238384Sjkim#!/usr/bin/env perl
2238384Sjkim#
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5238384Sjkim# project.
6238384Sjkim#
7238384Sjkim# Rights for redistribution and usage in source and binary forms are
8238384Sjkim# granted according to the OpenSSL license. Warranty of any kind is
9238384Sjkim# disclaimed.
10238384Sjkim# ====================================================================
11238384Sjkim
12238384Sjkim
13238384Sjkim# July 1999
14238384Sjkim#
15238384Sjkim# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
16238384Sjkim#
17238384Sjkim# The module is designed to work with either of the "new" MIPS ABI(5),
18238384Sjkim# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
19238384Sjkim# IRIX 5.x not only because it doesn't support new ABIs but also
20238384Sjkim# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21238384Sjkim# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
22238384Sjkim# cause illegal instruction exception:-(
23238384Sjkim#
24238384Sjkim# In addition the code depends on preprocessor flags set up by MIPSpro
25238384Sjkim# compiler driver (either as or cc) and therefore (probably?) can't be
26238384Sjkim# compiled by the GNU assembler. GNU C driver manages fine though...
27238384Sjkim# I mean as long as -mmips-as is specified or is the default option,
28238384Sjkim# because then it simply invokes /usr/bin/as which in turn takes
29238384Sjkim# perfect care of the preprocessor definitions. Another neat feature
30238384Sjkim# offered by the MIPSpro assembler is an optimization pass. This gave
31238384Sjkim# me the opportunity to have the code looking more regular as all those
32238384Sjkim# architecture dependent instruction rescheduling details were left to
33238384Sjkim# the assembler. Cool, huh?
34238384Sjkim#
35238384Sjkim# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36238384Sjkim# goes way over 3 times faster!
37238384Sjkim#
38238384Sjkim#					<appro@fy.chalmers.se>
39238384Sjkim
40238384Sjkim# October 2010
41238384Sjkim#
42238384Sjkim# Adapt the module even for 32-bit ABIs and other OSes. The former was
43238384Sjkim# achieved by mechanical replacement of 64-bit arithmetic instructions
44238384Sjkim# such as dmultu, daddu, etc. with their 32-bit counterparts and
45238384Sjkim# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46238384Sjkim# >3x performance improvement naturally does not apply to 32-bit code
47238384Sjkim# [because there is no instruction 32-bit compiler can't use], one
48238384Sjkim# has to be content with 40-85% improvement depending on benchmark and
49238384Sjkim# key length, more for longer keys.
50238384Sjkim
# Command line: <flavour> [other arguments ...] [<output file>]
#
# The first argument selects the ABI flavour.  The remaining arguments
# are consumed one by one until one looks like a plain file name
# ("name.ext"); that argument becomes the output file.  If the arguments
# run out first, $output ends up false and the generated assembly goes
# to the inherited STDOUT.
$flavour = shift;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named, and fail
# loudly if it cannot be created.  An unchecked open would silently keep
# writing to the original STDOUT and leave the build without its .s file.
if (defined($output) && $output ne "") {
	open(STDOUT, ">", $output) or die "can't open $output: $!";
}
54238384Sjkim
# Pick the instruction mnemonics and sizes that match the flavour.
#
#   $LD/$ST            load/store of one bignum word
#   $MULTU/$DIVU       unsigned multiply/divide on bignum words
#   $ADDU/$SUBU        add/subtract on bignum words
#   $SRL/$SLL          logical shifts on bignum words
#   $PTR_ADD/$PTR_SUB  pointer arithmetic
#   $REG_S/$REG_L      save/restore of a full register
#   $BNSZ, $SZREG      byte size of a bignum word / of a register
#
# Flavours matching /64|n32/ get the 64-bit forms; everything else gets
# the 32-bit forms and starts the output with ".set mips2".
if ($flavour =~ /64|n32/i) {
	($LD,      $ST)      = ("ld",     "sd");
	($MULTU,   $DIVU)    = ("dmultu", "ddivu");
	($ADDU,    $SUBU)    = ("daddu",  "dsubu");
	($SRL,     $SLL)     = ("dsrl",   "dsll");
	($PTR_ADD, $PTR_SUB) = ("daddu",  "dsubu");
	($REG_S,   $REG_L)   = ("sd",     "ld");
	$BNSZ  = 8;
	$SZREG = 8;
} else {
	($LD,      $ST)      = ("lw",    "sw");
	($MULTU,   $DIVU)    = ("multu", "divu");
	($ADDU,    $SUBU)    = ("addu",  "subu");
	($SRL,     $SLL)     = ("srl",   "sll");
	($PTR_ADD, $PTR_SUB) = ("addu",  "subu");
	($REG_S,   $REG_L)   = ("sw",    "lw");
	$BNSZ  = 4;
	$SZREG = 4;
	$code  = ".set	mips2\n";
}
87238384Sjkim
# Register names, following the N32/64 layout used in the original
# module.  Every variable holds the numeric spelling the assembler
# expects, e.g. $a0 is the string '$4'.
#
($zero,$at,$v0,$v1)                     = map { '$' . $_ } 0 .. 3;
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)       = map { '$' . $_ } 4 .. 11;
($t0,$t1,$t2,$t3,$t8,$t9)               = map { '$' . $_ } 12 .. 15, 24, 25;
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)       = map { '$' . $_ } 16 .. 23;
($gp,$sp,$fp,$ra)                       = map { '$' . $_ } 28 .. 31;
# $ta0..$ta3 are alternative names for $a4..$a7.
($ta0,$ta1,$ta2,$ta3)                   = ($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32.  NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).

# For NUBI flavours $gp becomes another name for $v1, so the "save $gp"
# slots in the prologues actually preserve $v1.
if ($flavour =~ /nubi/i) {
	$gp = $v1;
}

# Register that holds the constant -4 used to round word counts down to
# a multiple of four.
$minus4 = $v1;
103238384Sjkim
# Module preamble (version and credit strings in .rdata) followed by
# bn_mul_add_words.
#
# Register roles as used by the emitted code: $a0 = result/accumulator
# array, $a1 = source array, $a2 = number of words, $a3 = multiplier
# word, $v0 = running carry and return value.
#
# The public entry returns 0 unless $a2 is positive; otherwise it
# branches to bn_mul_add_words_internal with $v0 cleared in the branch
# delay slot.
104238384Sjkim$code.=<<___;
105238384Sjkim.rdata
106238384Sjkim.asciiz	"mips3.s, Version 1.2"
107238384Sjkim.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
108238384Sjkim
109238384Sjkim.text
110238384Sjkim.set	noat
111238384Sjkim
112238384Sjkim.align	5
113238384Sjkim.globl	bn_mul_add_words
114238384Sjkim.ent	bn_mul_add_words
115238384Sjkimbn_mul_add_words:
116238384Sjkim	.set	noreorder
117238384Sjkim	bgtz	$a2,bn_mul_add_words_internal
118238384Sjkim	move	$v0,$zero
119238384Sjkim	jr	$ra
120238384Sjkim	move	$a0,$v0
121238384Sjkim.end	bn_mul_add_words
122238384Sjkim
123238384Sjkim.align	5
124238384Sjkim.ent	bn_mul_add_words_internal
125238384Sjkimbn_mul_add_words_internal:
126238384Sjkim___
# NUBI flavours only: allocate a six-register stack frame and save $ra,
# $t3..$t0 and $gp in it.
127238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
128238384Sjkim	.frame	$sp,6*$SZREG,$ra
129238384Sjkim	.mask	0x8000f008,-$SZREG
130238384Sjkim	.set	noreorder
131238384Sjkim	$PTR_SUB $sp,6*$SZREG
132238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
133238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
134238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
135238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
136238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
137238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
138238384Sjkim___
# Body: for every word, the low half of $a1[i] * $a3 plus $a0[i] plus
# the carry is stored back to $a0[i]; the high half plus any overflow
# becomes the next carry in $v0.  The loop handles four words per
# iteration (it runs while $a2 & -4 is positive); the tail handles the
# remaining one to three words.
139238384Sjkim$code.=<<___;
140238384Sjkim	.set	reorder
141238384Sjkim	li	$minus4,-4
142238384Sjkim	and	$ta0,$a2,$minus4
143238384Sjkim	beqz	$ta0,.L_bn_mul_add_words_tail
144238384Sjkim
145238384Sjkim.L_bn_mul_add_words_loop:
146264331Sjkim	$LD	$t0,0($a1)
147238384Sjkim	$MULTU	$t0,$a3
148238384Sjkim	$LD	$t1,0($a0)
149238384Sjkim	$LD	$t2,$BNSZ($a1)
150238384Sjkim	$LD	$t3,$BNSZ($a0)
151238384Sjkim	$LD	$ta0,2*$BNSZ($a1)
152238384Sjkim	$LD	$ta1,2*$BNSZ($a0)
153238384Sjkim	$ADDU	$t1,$v0
154238384Sjkim	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
155238384Sjkim				# values", but it seems to work fine
156238384Sjkim				# even on 64-bit registers.
157238384Sjkim	mflo	$at
158238384Sjkim	mfhi	$t0
159238384Sjkim	$ADDU	$t1,$at
160238384Sjkim	$ADDU	$v0,$t0
161238384Sjkim	 $MULTU	$t2,$a3
162238384Sjkim	sltu	$at,$t1,$at
163238384Sjkim	$ST	$t1,0($a0)
164238384Sjkim	$ADDU	$v0,$at
165238384Sjkim
166238384Sjkim	$LD	$ta2,3*$BNSZ($a1)
167238384Sjkim	$LD	$ta3,3*$BNSZ($a0)
168238384Sjkim	$ADDU	$t3,$v0
169238384Sjkim	sltu	$v0,$t3,$v0
170238384Sjkim	mflo	$at
171238384Sjkim	mfhi	$t2
172238384Sjkim	$ADDU	$t3,$at
173238384Sjkim	$ADDU	$v0,$t2
174238384Sjkim	 $MULTU	$ta0,$a3
175238384Sjkim	sltu	$at,$t3,$at
176238384Sjkim	$ST	$t3,$BNSZ($a0)
177238384Sjkim	$ADDU	$v0,$at
178238384Sjkim
179238384Sjkim	subu	$a2,4
180238384Sjkim	$PTR_ADD $a0,4*$BNSZ
181238384Sjkim	$PTR_ADD $a1,4*$BNSZ
182238384Sjkim	$ADDU	$ta1,$v0
183238384Sjkim	sltu	$v0,$ta1,$v0
184238384Sjkim	mflo	$at
185238384Sjkim	mfhi	$ta0
186238384Sjkim	$ADDU	$ta1,$at
187238384Sjkim	$ADDU	$v0,$ta0
188238384Sjkim	 $MULTU	$ta2,$a3
189238384Sjkim	sltu	$at,$ta1,$at
190238384Sjkim	$ST	$ta1,-2*$BNSZ($a0)
191238384Sjkim	$ADDU	$v0,$at
192238384Sjkim
193238384Sjkim
194238384Sjkim	and	$ta0,$a2,$minus4
195238384Sjkim	$ADDU	$ta3,$v0
196238384Sjkim	sltu	$v0,$ta3,$v0
197238384Sjkim	mflo	$at
198238384Sjkim	mfhi	$ta2
199238384Sjkim	$ADDU	$ta3,$at
200238384Sjkim	$ADDU	$v0,$ta2
201238384Sjkim	sltu	$at,$ta3,$at
202238384Sjkim	$ST	$ta3,-$BNSZ($a0)
203264331Sjkim	.set	noreorder
204264331Sjkim	bgtz	$ta0,.L_bn_mul_add_words_loop
205238384Sjkim	$ADDU	$v0,$at
206238384Sjkim
207238384Sjkim	beqz	$a2,.L_bn_mul_add_words_return
208238384Sjkim	nop
209238384Sjkim
210238384Sjkim.L_bn_mul_add_words_tail:
211238384Sjkim	.set	reorder
212238384Sjkim	$LD	$t0,0($a1)
213238384Sjkim	$MULTU	$t0,$a3
214238384Sjkim	$LD	$t1,0($a0)
215238384Sjkim	subu	$a2,1
216238384Sjkim	$ADDU	$t1,$v0
217238384Sjkim	sltu	$v0,$t1,$v0
218238384Sjkim	mflo	$at
219238384Sjkim	mfhi	$t0
220238384Sjkim	$ADDU	$t1,$at
221238384Sjkim	$ADDU	$v0,$t0
222238384Sjkim	sltu	$at,$t1,$at
223238384Sjkim	$ST	$t1,0($a0)
224238384Sjkim	$ADDU	$v0,$at
225238384Sjkim	beqz	$a2,.L_bn_mul_add_words_return
226238384Sjkim
227238384Sjkim	$LD	$t0,$BNSZ($a1)
228238384Sjkim	$MULTU	$t0,$a3
229238384Sjkim	$LD	$t1,$BNSZ($a0)
230238384Sjkim	subu	$a2,1
231238384Sjkim	$ADDU	$t1,$v0
232238384Sjkim	sltu	$v0,$t1,$v0
233238384Sjkim	mflo	$at
234238384Sjkim	mfhi	$t0
235238384Sjkim	$ADDU	$t1,$at
236238384Sjkim	$ADDU	$v0,$t0
237238384Sjkim	sltu	$at,$t1,$at
238238384Sjkim	$ST	$t1,$BNSZ($a0)
239238384Sjkim	$ADDU	$v0,$at
240238384Sjkim	beqz	$a2,.L_bn_mul_add_words_return
241238384Sjkim
242238384Sjkim	$LD	$t0,2*$BNSZ($a1)
243238384Sjkim	$MULTU	$t0,$a3
244238384Sjkim	$LD	$t1,2*$BNSZ($a0)
245238384Sjkim	$ADDU	$t1,$v0
246238384Sjkim	sltu	$v0,$t1,$v0
247238384Sjkim	mflo	$at
248238384Sjkim	mfhi	$t0
249238384Sjkim	$ADDU	$t1,$at
250238384Sjkim	$ADDU	$v0,$t0
251238384Sjkim	sltu	$at,$t1,$at
252238384Sjkim	$ST	$t1,2*$BNSZ($a0)
253238384Sjkim	$ADDU	$v0,$at
254238384Sjkim
255238384Sjkim.L_bn_mul_add_words_return:
256238384Sjkim	.set	noreorder
257238384Sjkim___
# NUBI flavours only: reload the saved registers and release the frame.
# $ra is not written by the body, so it is not reloaded.
258238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
259238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
260238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
261238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
262238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
263238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
264238384Sjkim	$PTR_ADD $sp,6*$SZREG
265238384Sjkim___
# Return from bn_mul_add_words_internal (carry in $v0, copied to $a0 in
# the delay slot), then bn_mul_words.
#
# bn_mul_words uses these register roles: $a0 = destination array,
# $a1 = source array, $a2 = number of words, $a3 = multiplier word,
# $v0 = running carry and return value.  Unlike bn_mul_add_words the
# old destination contents are not read.  The public entry returns 0
# unless $a2 is positive.
266238384Sjkim$code.=<<___;
267238384Sjkim	jr	$ra
268238384Sjkim	move	$a0,$v0
269238384Sjkim.end	bn_mul_add_words_internal
270238384Sjkim
271238384Sjkim.align	5
272238384Sjkim.globl	bn_mul_words
273238384Sjkim.ent	bn_mul_words
274238384Sjkimbn_mul_words:
275238384Sjkim	.set	noreorder
276238384Sjkim	bgtz	$a2,bn_mul_words_internal
277238384Sjkim	move	$v0,$zero
278238384Sjkim	jr	$ra
279238384Sjkim	move	$a0,$v0
280238384Sjkim.end	bn_mul_words
281238384Sjkim
282238384Sjkim.align	5
283238384Sjkim.ent	bn_mul_words_internal
284238384Sjkimbn_mul_words_internal:
285238384Sjkim___
# NUBI flavours only: allocate a six-register stack frame and save $ra,
# $t3..$t0 and $gp in it.
286238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
287238384Sjkim	.frame	$sp,6*$SZREG,$ra
288238384Sjkim	.mask	0x8000f008,-$SZREG
289238384Sjkim	.set	noreorder
290238384Sjkim	$PTR_SUB $sp,6*$SZREG
291238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
292238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
293238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
294238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
295238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
296238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
297238384Sjkim___
# Body: for every word, the low half of $a1[i] * $a3 is added to the
# carry and stored to $a0[i]; the high half plus the overflow of that
# addition becomes the next carry in $v0.  Four words per loop
# iteration, then a tail for the remaining one to three words.
298238384Sjkim$code.=<<___;
299238384Sjkim	.set	reorder
300238384Sjkim	li	$minus4,-4
301238384Sjkim	and	$ta0,$a2,$minus4
302238384Sjkim	beqz	$ta0,.L_bn_mul_words_tail
303238384Sjkim
304238384Sjkim.L_bn_mul_words_loop:
305264331Sjkim	$LD	$t0,0($a1)
306238384Sjkim	$MULTU	$t0,$a3
307238384Sjkim	$LD	$t2,$BNSZ($a1)
308238384Sjkim	$LD	$ta0,2*$BNSZ($a1)
309238384Sjkim	$LD	$ta2,3*$BNSZ($a1)
310238384Sjkim	mflo	$at
311238384Sjkim	mfhi	$t0
312238384Sjkim	$ADDU	$v0,$at
313238384Sjkim	sltu	$t1,$v0,$at
314238384Sjkim	 $MULTU	$t2,$a3
315238384Sjkim	$ST	$v0,0($a0)
316238384Sjkim	$ADDU	$v0,$t1,$t0
317238384Sjkim
318238384Sjkim	subu	$a2,4
319238384Sjkim	$PTR_ADD $a0,4*$BNSZ
320238384Sjkim	$PTR_ADD $a1,4*$BNSZ
321238384Sjkim	mflo	$at
322238384Sjkim	mfhi	$t2
323238384Sjkim	$ADDU	$v0,$at
324238384Sjkim	sltu	$t3,$v0,$at
325238384Sjkim	 $MULTU	$ta0,$a3
326238384Sjkim	$ST	$v0,-3*$BNSZ($a0)
327238384Sjkim	$ADDU	$v0,$t3,$t2
328238384Sjkim
329238384Sjkim	mflo	$at
330238384Sjkim	mfhi	$ta0
331238384Sjkim	$ADDU	$v0,$at
332238384Sjkim	sltu	$ta1,$v0,$at
333238384Sjkim	 $MULTU	$ta2,$a3
334238384Sjkim	$ST	$v0,-2*$BNSZ($a0)
335238384Sjkim	$ADDU	$v0,$ta1,$ta0
336238384Sjkim
337238384Sjkim	and	$ta0,$a2,$minus4
338238384Sjkim	mflo	$at
339238384Sjkim	mfhi	$ta2
340238384Sjkim	$ADDU	$v0,$at
341238384Sjkim	sltu	$ta3,$v0,$at
342238384Sjkim	$ST	$v0,-$BNSZ($a0)
343264331Sjkim	.set	noreorder
344264331Sjkim	bgtz	$ta0,.L_bn_mul_words_loop
345238384Sjkim	$ADDU	$v0,$ta3,$ta2
346238384Sjkim
347238384Sjkim	beqz	$a2,.L_bn_mul_words_return
348238384Sjkim	nop
349238384Sjkim
350238384Sjkim.L_bn_mul_words_tail:
351238384Sjkim	.set	reorder
352238384Sjkim	$LD	$t0,0($a1)
353238384Sjkim	$MULTU	$t0,$a3
354238384Sjkim	subu	$a2,1
355238384Sjkim	mflo	$at
356238384Sjkim	mfhi	$t0
357238384Sjkim	$ADDU	$v0,$at
358238384Sjkim	sltu	$t1,$v0,$at
359238384Sjkim	$ST	$v0,0($a0)
360238384Sjkim	$ADDU	$v0,$t1,$t0
361238384Sjkim	beqz	$a2,.L_bn_mul_words_return
362238384Sjkim
363238384Sjkim	$LD	$t0,$BNSZ($a1)
364238384Sjkim	$MULTU	$t0,$a3
365238384Sjkim	subu	$a2,1
366238384Sjkim	mflo	$at
367238384Sjkim	mfhi	$t0
368238384Sjkim	$ADDU	$v0,$at
369238384Sjkim	sltu	$t1,$v0,$at
370238384Sjkim	$ST	$v0,$BNSZ($a0)
371238384Sjkim	$ADDU	$v0,$t1,$t0
372238384Sjkim	beqz	$a2,.L_bn_mul_words_return
373238384Sjkim
374238384Sjkim	$LD	$t0,2*$BNSZ($a1)
375238384Sjkim	$MULTU	$t0,$a3
376238384Sjkim	mflo	$at
377238384Sjkim	mfhi	$t0
378238384Sjkim	$ADDU	$v0,$at
379238384Sjkim	sltu	$t1,$v0,$at
380238384Sjkim	$ST	$v0,2*$BNSZ($a0)
381238384Sjkim	$ADDU	$v0,$t1,$t0
382238384Sjkim
383238384Sjkim.L_bn_mul_words_return:
384238384Sjkim	.set	noreorder
385238384Sjkim___
# NUBI flavours only: reload the saved registers and release the frame.
# $ra is not written by the body, so it is not reloaded.
386238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
387238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
388238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
389238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
390238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
391238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
392238384Sjkim	$PTR_ADD $sp,6*$SZREG
393238384Sjkim___
# Return from bn_mul_words_internal (carry in $v0, copied to $a0 in the
# delay slot), then bn_sqr_words.
#
# bn_sqr_words squares each of the $a2 words read via $a1 and stores the
# double-width product as two consecutive words (low half first, then
# high half) via $a0.  $v0 is cleared by the public entry and is never
# written by the body.  The public entry returns immediately unless $a2
# is positive.
394238384Sjkim$code.=<<___;
395238384Sjkim	jr	$ra
396238384Sjkim	move	$a0,$v0
397238384Sjkim.end	bn_mul_words_internal
398238384Sjkim
399238384Sjkim.align	5
400238384Sjkim.globl	bn_sqr_words
401238384Sjkim.ent	bn_sqr_words
402238384Sjkimbn_sqr_words:
403238384Sjkim	.set	noreorder
404238384Sjkim	bgtz	$a2,bn_sqr_words_internal
405238384Sjkim	move	$v0,$zero
406238384Sjkim	jr	$ra
407238384Sjkim	move	$a0,$v0
408238384Sjkim.end	bn_sqr_words
409238384Sjkim
410238384Sjkim.align	5
411238384Sjkim.ent	bn_sqr_words_internal
412238384Sjkimbn_sqr_words_internal:
413238384Sjkim___
# NUBI flavours only: allocate a six-register stack frame and save $ra,
# $t3..$t0 and $gp in it.
414238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
415238384Sjkim	.frame	$sp,6*$SZREG,$ra
416238384Sjkim	.mask	0x8000f008,-$SZREG
417238384Sjkim	.set	noreorder
418238384Sjkim	$PTR_SUB $sp,6*$SZREG
419238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
420238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
421238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
422238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
423238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
424238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
425238384Sjkim___
# Body: four input words per loop iteration (the output pointer advances
# by eight words for every four input words), then a tail for the
# remaining one to three words.
426238384Sjkim$code.=<<___;
427238384Sjkim	.set	reorder
428238384Sjkim	li	$minus4,-4
429238384Sjkim	and	$ta0,$a2,$minus4
430238384Sjkim	beqz	$ta0,.L_bn_sqr_words_tail
431238384Sjkim
432238384Sjkim.L_bn_sqr_words_loop:
433264331Sjkim	$LD	$t0,0($a1)
434238384Sjkim	$MULTU	$t0,$t0
435238384Sjkim	$LD	$t2,$BNSZ($a1)
436238384Sjkim	$LD	$ta0,2*$BNSZ($a1)
437238384Sjkim	$LD	$ta2,3*$BNSZ($a1)
438238384Sjkim	mflo	$t1
439238384Sjkim	mfhi	$t0
440238384Sjkim	$ST	$t1,0($a0)
441238384Sjkim	$ST	$t0,$BNSZ($a0)
442238384Sjkim
443238384Sjkim	$MULTU	$t2,$t2
444238384Sjkim	subu	$a2,4
445238384Sjkim	$PTR_ADD $a0,8*$BNSZ
446238384Sjkim	$PTR_ADD $a1,4*$BNSZ
447238384Sjkim	mflo	$t3
448238384Sjkim	mfhi	$t2
449238384Sjkim	$ST	$t3,-6*$BNSZ($a0)
450238384Sjkim	$ST	$t2,-5*$BNSZ($a0)
451238384Sjkim
452238384Sjkim	$MULTU	$ta0,$ta0
453238384Sjkim	mflo	$ta1
454238384Sjkim	mfhi	$ta0
455238384Sjkim	$ST	$ta1,-4*$BNSZ($a0)
456238384Sjkim	$ST	$ta0,-3*$BNSZ($a0)
457238384Sjkim
458238384Sjkim
459238384Sjkim	$MULTU	$ta2,$ta2
460238384Sjkim	and	$ta0,$a2,$minus4
461238384Sjkim	mflo	$ta3
462238384Sjkim	mfhi	$ta2
463238384Sjkim	$ST	$ta3,-2*$BNSZ($a0)
464238384Sjkim
465238384Sjkim	.set	noreorder
466264331Sjkim	bgtz	$ta0,.L_bn_sqr_words_loop
467264331Sjkim	$ST	$ta2,-$BNSZ($a0)
468238384Sjkim
469238384Sjkim	beqz	$a2,.L_bn_sqr_words_return
470238384Sjkim	nop
471238384Sjkim
472238384Sjkim.L_bn_sqr_words_tail:
473238384Sjkim	.set	reorder
474238384Sjkim	$LD	$t0,0($a1)
475238384Sjkim	$MULTU	$t0,$t0
476238384Sjkim	subu	$a2,1
477238384Sjkim	mflo	$t1
478238384Sjkim	mfhi	$t0
479238384Sjkim	$ST	$t1,0($a0)
480238384Sjkim	$ST	$t0,$BNSZ($a0)
481238384Sjkim	beqz	$a2,.L_bn_sqr_words_return
482238384Sjkim
483238384Sjkim	$LD	$t0,$BNSZ($a1)
484238384Sjkim	$MULTU	$t0,$t0
485238384Sjkim	subu	$a2,1
486238384Sjkim	mflo	$t1
487238384Sjkim	mfhi	$t0
488238384Sjkim	$ST	$t1,2*$BNSZ($a0)
489238384Sjkim	$ST	$t0,3*$BNSZ($a0)
490238384Sjkim	beqz	$a2,.L_bn_sqr_words_return
491238384Sjkim
492238384Sjkim	$LD	$t0,2*$BNSZ($a1)
493238384Sjkim	$MULTU	$t0,$t0
494238384Sjkim	mflo	$t1
495238384Sjkim	mfhi	$t0
496238384Sjkim	$ST	$t1,4*$BNSZ($a0)
497238384Sjkim	$ST	$t0,5*$BNSZ($a0)
498238384Sjkim
499238384Sjkim.L_bn_sqr_words_return:
500238384Sjkim	.set	noreorder
501238384Sjkim___
# NUBI flavours only: reload the saved registers and release the frame.
# $ra is not written by the body, so it is not reloaded.
502238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
503238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
504238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
505238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
506238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
507238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
508238384Sjkim	$PTR_ADD $sp,6*$SZREG
509238384Sjkim___
# Return from bn_sqr_words_internal, then bn_add_words.
#
# Register roles as used by the emitted code: $a0 = destination array,
# $a1 and $a2 = the two source arrays, $a3 = number of words, $v0 =
# running carry and return value.  The public entry returns 0 unless $a3
# is positive.
510238384Sjkim$code.=<<___;
511238384Sjkim	jr	$ra
512238384Sjkim	move	$a0,$v0
513238384Sjkim
514238384Sjkim.end	bn_sqr_words_internal
515238384Sjkim
516238384Sjkim.align	5
517238384Sjkim.globl	bn_add_words
518238384Sjkim.ent	bn_add_words
519238384Sjkimbn_add_words:
520238384Sjkim	.set	noreorder
521238384Sjkim	bgtz	$a3,bn_add_words_internal
522238384Sjkim	move	$v0,$zero
523238384Sjkim	jr	$ra
524238384Sjkim	move	$a0,$v0
525238384Sjkim.end	bn_add_words
526238384Sjkim
527238384Sjkim.align	5
528238384Sjkim.ent	bn_add_words_internal
529238384Sjkimbn_add_words_internal:
530238384Sjkim___
# NUBI flavours only: allocate a six-register stack frame and save $ra,
# $t3..$t0 and $gp in it.
531238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
532238384Sjkim	.frame	$sp,6*$SZREG,$ra
533238384Sjkim	.mask	0x8000f008,-$SZREG
534238384Sjkim	.set	noreorder
535238384Sjkim	$PTR_SUB $sp,6*$SZREG
536238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
537238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
538238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
539238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
540238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
541238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
542238384Sjkim___
# Body: each step stores $a1[i] + $a2[i] + carry via $a0.  The carry out
# of the word addition is captured in $t8/$t9, the carry out of adding
# the incoming carry in $v0, and the two are then summed into $v0.
# Four words per loop iteration, then a tail for one to three words.
543238384Sjkim$code.=<<___;
544238384Sjkim	.set	reorder
545238384Sjkim	li	$minus4,-4
546238384Sjkim	and	$at,$a3,$minus4
547238384Sjkim	beqz	$at,.L_bn_add_words_tail
548238384Sjkim
549238384Sjkim.L_bn_add_words_loop:
550264331Sjkim	$LD	$t0,0($a1)
551238384Sjkim	$LD	$ta0,0($a2)
552238384Sjkim	subu	$a3,4
553238384Sjkim	$LD	$t1,$BNSZ($a1)
554238384Sjkim	and	$at,$a3,$minus4
555238384Sjkim	$LD	$t2,2*$BNSZ($a1)
556238384Sjkim	$PTR_ADD $a2,4*$BNSZ
557238384Sjkim	$LD	$t3,3*$BNSZ($a1)
558238384Sjkim	$PTR_ADD $a0,4*$BNSZ
559238384Sjkim	$LD	$ta1,-3*$BNSZ($a2)
560238384Sjkim	$PTR_ADD $a1,4*$BNSZ
561238384Sjkim	$LD	$ta2,-2*$BNSZ($a2)
562238384Sjkim	$LD	$ta3,-$BNSZ($a2)
563238384Sjkim	$ADDU	$ta0,$t0
564238384Sjkim	sltu	$t8,$ta0,$t0
565238384Sjkim	$ADDU	$t0,$ta0,$v0
566238384Sjkim	sltu	$v0,$t0,$ta0
567238384Sjkim	$ST	$t0,-4*$BNSZ($a0)
568238384Sjkim	$ADDU	$v0,$t8
569238384Sjkim
570238384Sjkim	$ADDU	$ta1,$t1
571238384Sjkim	sltu	$t9,$ta1,$t1
572238384Sjkim	$ADDU	$t1,$ta1,$v0
573238384Sjkim	sltu	$v0,$t1,$ta1
574238384Sjkim	$ST	$t1,-3*$BNSZ($a0)
575238384Sjkim	$ADDU	$v0,$t9
576238384Sjkim
577238384Sjkim	$ADDU	$ta2,$t2
578238384Sjkim	sltu	$t8,$ta2,$t2
579238384Sjkim	$ADDU	$t2,$ta2,$v0
580238384Sjkim	sltu	$v0,$t2,$ta2
581238384Sjkim	$ST	$t2,-2*$BNSZ($a0)
582238384Sjkim	$ADDU	$v0,$t8
583238384Sjkim
584238384Sjkim	$ADDU	$ta3,$t3
585238384Sjkim	sltu	$t9,$ta3,$t3
586238384Sjkim	$ADDU	$t3,$ta3,$v0
587238384Sjkim	sltu	$v0,$t3,$ta3
588238384Sjkim	$ST	$t3,-$BNSZ($a0)
589238384Sjkim
590238384Sjkim	.set	noreorder
591264331Sjkim	bgtz	$at,.L_bn_add_words_loop
592264331Sjkim	$ADDU	$v0,$t9
593238384Sjkim
594238384Sjkim	beqz	$a3,.L_bn_add_words_return
595238384Sjkim	nop
596238384Sjkim
597238384Sjkim.L_bn_add_words_tail:
598238384Sjkim	.set	reorder
599238384Sjkim	$LD	$t0,0($a1)
600238384Sjkim	$LD	$ta0,0($a2)
601238384Sjkim	$ADDU	$ta0,$t0
602238384Sjkim	subu	$a3,1
603238384Sjkim	sltu	$t8,$ta0,$t0
604238384Sjkim	$ADDU	$t0,$ta0,$v0
605238384Sjkim	sltu	$v0,$t0,$ta0
606238384Sjkim	$ST	$t0,0($a0)
607238384Sjkim	$ADDU	$v0,$t8
608238384Sjkim	beqz	$a3,.L_bn_add_words_return
609238384Sjkim
610238384Sjkim	$LD	$t1,$BNSZ($a1)
611238384Sjkim	$LD	$ta1,$BNSZ($a2)
612238384Sjkim	$ADDU	$ta1,$t1
613238384Sjkim	subu	$a3,1
614238384Sjkim	sltu	$t9,$ta1,$t1
615238384Sjkim	$ADDU	$t1,$ta1,$v0
616238384Sjkim	sltu	$v0,$t1,$ta1
617238384Sjkim	$ST	$t1,$BNSZ($a0)
618238384Sjkim	$ADDU	$v0,$t9
619238384Sjkim	beqz	$a3,.L_bn_add_words_return
620238384Sjkim
621238384Sjkim	$LD	$t2,2*$BNSZ($a1)
622238384Sjkim	$LD	$ta2,2*$BNSZ($a2)
623238384Sjkim	$ADDU	$ta2,$t2
624238384Sjkim	sltu	$t8,$ta2,$t2
625238384Sjkim	$ADDU	$t2,$ta2,$v0
626238384Sjkim	sltu	$v0,$t2,$ta2
627238384Sjkim	$ST	$t2,2*$BNSZ($a0)
628238384Sjkim	$ADDU	$v0,$t8
629238384Sjkim
630238384Sjkim.L_bn_add_words_return:
631238384Sjkim	.set	noreorder
632238384Sjkim___
# NUBI flavours only: reload the saved registers and release the frame.
# $ra is not written by the body, so it is not reloaded.
633238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
634238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
635238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
636238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
637238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
638238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
639238384Sjkim	$PTR_ADD $sp,6*$SZREG
640238384Sjkim___
# Return from bn_add_words_internal (carry in $v0, copied to $a0 in the
# delay slot), then bn_sub_words.
#
# Register roles as used by the emitted code: $a0 = destination array,
# $a1 = minuend array, $a2 = subtrahend array, $a3 = number of words,
# $v0 = running borrow and return value.  The public entry returns 0
# unless $a3 is positive.
641238384Sjkim$code.=<<___;
642238384Sjkim	jr	$ra
643238384Sjkim	move	$a0,$v0
644238384Sjkim
645238384Sjkim.end	bn_add_words_internal
646238384Sjkim
647238384Sjkim.align	5
648238384Sjkim.globl	bn_sub_words
649238384Sjkim.ent	bn_sub_words
650238384Sjkimbn_sub_words:
651238384Sjkim	.set	noreorder
652238384Sjkim	bgtz	$a3,bn_sub_words_internal
653238384Sjkim	move	$v0,$zero
654238384Sjkim	jr	$ra
655238384Sjkim	move	$a0,$zero
656238384Sjkim.end	bn_sub_words
657238384Sjkim
658238384Sjkim.align	5
659238384Sjkim.ent	bn_sub_words_internal
660238384Sjkimbn_sub_words_internal:
661238384Sjkim___
# NUBI flavours only: allocate a six-register stack frame and save $ra,
# $t3..$t0 and $gp in it.
662238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
663238384Sjkim	.frame	$sp,6*$SZREG,$ra
664238384Sjkim	.mask	0x8000f008,-$SZREG
665238384Sjkim	.set	noreorder
666238384Sjkim	$PTR_SUB $sp,6*$SZREG
667238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
668238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
669238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
670238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
671238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
672238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
673238384Sjkim___
# Body: each step stores $a1[i] - $a2[i] - borrow via $a0.  The borrow
# out of the word subtraction is captured in $t8/$t9, the borrow out of
# subtracting the incoming borrow in $v0, and the two are then summed
# into $v0.  Four words per loop iteration, then a tail for one to three
# words.
674238384Sjkim$code.=<<___;
675238384Sjkim	.set	reorder
676238384Sjkim	li	$minus4,-4
677238384Sjkim	and	$at,$a3,$minus4
678238384Sjkim	beqz	$at,.L_bn_sub_words_tail
679238384Sjkim
680238384Sjkim.L_bn_sub_words_loop:
681264331Sjkim	$LD	$t0,0($a1)
682238384Sjkim	$LD	$ta0,0($a2)
683238384Sjkim	subu	$a3,4
684238384Sjkim	$LD	$t1,$BNSZ($a1)
685238384Sjkim	and	$at,$a3,$minus4
686238384Sjkim	$LD	$t2,2*$BNSZ($a1)
687238384Sjkim	$PTR_ADD $a2,4*$BNSZ
688238384Sjkim	$LD	$t3,3*$BNSZ($a1)
689238384Sjkim	$PTR_ADD $a0,4*$BNSZ
690238384Sjkim	$LD	$ta1,-3*$BNSZ($a2)
691238384Sjkim	$PTR_ADD $a1,4*$BNSZ
692238384Sjkim	$LD	$ta2,-2*$BNSZ($a2)
693238384Sjkim	$LD	$ta3,-$BNSZ($a2)
694238384Sjkim	sltu	$t8,$t0,$ta0
695238384Sjkim	$SUBU	$ta0,$t0,$ta0
696238384Sjkim	$SUBU	$t0,$ta0,$v0
697238384Sjkim	sgtu	$v0,$t0,$ta0
698238384Sjkim	$ST	$t0,-4*$BNSZ($a0)
699238384Sjkim	$ADDU	$v0,$t8
700238384Sjkim
701238384Sjkim	sltu	$t9,$t1,$ta1
702238384Sjkim	$SUBU	$ta1,$t1,$ta1
703238384Sjkim	$SUBU	$t1,$ta1,$v0
704238384Sjkim	sgtu	$v0,$t1,$ta1
705238384Sjkim	$ST	$t1,-3*$BNSZ($a0)
706238384Sjkim	$ADDU	$v0,$t9
707238384Sjkim
708238384Sjkim
709238384Sjkim	sltu	$t8,$t2,$ta2
710238384Sjkim	$SUBU	$ta2,$t2,$ta2
711238384Sjkim	$SUBU	$t2,$ta2,$v0
712238384Sjkim	sgtu	$v0,$t2,$ta2
713238384Sjkim	$ST	$t2,-2*$BNSZ($a0)
714238384Sjkim	$ADDU	$v0,$t8
715238384Sjkim
716238384Sjkim	sltu	$t9,$t3,$ta3
717238384Sjkim	$SUBU	$ta3,$t3,$ta3
718238384Sjkim	$SUBU	$t3,$ta3,$v0
719238384Sjkim	sgtu	$v0,$t3,$ta3
720238384Sjkim	$ST	$t3,-$BNSZ($a0)
721238384Sjkim
722238384Sjkim	.set	noreorder
723264331Sjkim	bgtz	$at,.L_bn_sub_words_loop
724264331Sjkim	$ADDU	$v0,$t9
725238384Sjkim
726238384Sjkim	beqz	$a3,.L_bn_sub_words_return
727238384Sjkim	nop
728238384Sjkim
729238384Sjkim.L_bn_sub_words_tail:
730238384Sjkim	.set	reorder
731238384Sjkim	$LD	$t0,0($a1)
732238384Sjkim	$LD	$ta0,0($a2)
733238384Sjkim	subu	$a3,1
734238384Sjkim	sltu	$t8,$t0,$ta0
735238384Sjkim	$SUBU	$ta0,$t0,$ta0
736238384Sjkim	$SUBU	$t0,$ta0,$v0
737238384Sjkim	sgtu	$v0,$t0,$ta0
738238384Sjkim	$ST	$t0,0($a0)
739238384Sjkim	$ADDU	$v0,$t8
740238384Sjkim	beqz	$a3,.L_bn_sub_words_return
741238384Sjkim
742238384Sjkim	$LD	$t1,$BNSZ($a1)
743238384Sjkim	subu	$a3,1
744238384Sjkim	$LD	$ta1,$BNSZ($a2)
745238384Sjkim	sltu	$t9,$t1,$ta1
746238384Sjkim	$SUBU	$ta1,$t1,$ta1
747238384Sjkim	$SUBU	$t1,$ta1,$v0
748238384Sjkim	sgtu	$v0,$t1,$ta1
749238384Sjkim	$ST	$t1,$BNSZ($a0)
750238384Sjkim	$ADDU	$v0,$t9
751238384Sjkim	beqz	$a3,.L_bn_sub_words_return
752238384Sjkim
753238384Sjkim	$LD	$t2,2*$BNSZ($a1)
754238384Sjkim	$LD	$ta2,2*$BNSZ($a2)
755238384Sjkim	sltu	$t8,$t2,$ta2
756238384Sjkim	$SUBU	$ta2,$t2,$ta2
757238384Sjkim	$SUBU	$t2,$ta2,$v0
758238384Sjkim	sgtu	$v0,$t2,$ta2
759238384Sjkim	$ST	$t2,2*$BNSZ($a0)
760238384Sjkim	$ADDU	$v0,$t8
761238384Sjkim
762238384Sjkim.L_bn_sub_words_return:
763238384Sjkim	.set	noreorder
764238384Sjkim___
# NUBI flavours only: reload the saved registers and release the frame.
# $ra is not written by the body, so it is not reloaded.
765238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
766238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
767238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
768238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
769238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
770238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
771238384Sjkim	$PTR_ADD $sp,6*$SZREG
772238384Sjkim___
# Return from bn_sub_words_internal (borrow in $v0, copied to $a0 in the
# delay slot), then bn_div_3_words.
#
# The entry stub keeps the incoming $a0 (a pointer) in $a3 and the
# incoming $a1 in $ta2, then loads the word at 0($a3) into $a0 and, in
# the branch delay slot, the word at -BNSZ($a3) into $a1.  If the first
# loaded word equals $a2 the result is -1; otherwise control continues
# in bn_div_3_words_internal with $a0, $a1 and $a2 set up as the
# arguments of bn_div_words_internal.
773238384Sjkim$code.=<<___;
774238384Sjkim	jr	$ra
775238384Sjkim	move	$a0,$v0
776238384Sjkim.end	bn_sub_words_internal
777238384Sjkim
778238384Sjkim.align 5
779238384Sjkim.globl	bn_div_3_words
780238384Sjkim.ent	bn_div_3_words
781238384Sjkimbn_div_3_words:
782238384Sjkim	.set	noreorder
783238384Sjkim	move	$a3,$a0		# we know that bn_div_words does not
784238384Sjkim				# touch $a3, $ta2, $ta3 and preserves $a2
785238384Sjkim				# so that we can save two arguments
786238384Sjkim				# and return address in registers
787238384Sjkim				# instead of stack:-)
788238384Sjkim
789238384Sjkim	$LD	$a0,($a3)
790238384Sjkim	move	$ta2,$a1
791238384Sjkim	bne	$a0,$a2,bn_div_3_words_internal
792238384Sjkim	$LD	$a1,-$BNSZ($a3)
793238384Sjkim	li	$v0,-1
794238384Sjkim	jr	$ra
795238384Sjkim	move	$a0,$v0
796238384Sjkim.end	bn_div_3_words
797238384Sjkim
798238384Sjkim.align	5
799238384Sjkim.ent	bn_div_3_words_internal
800238384Sjkimbn_div_3_words_internal:
801238384Sjkim___
# NUBI flavours only: allocate a six-register stack frame and save $ra,
# $t3..$t0 and $gp in it.
802238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
803238384Sjkim	.frame	$sp,6*$SZREG,$ra
804238384Sjkim	.mask	0x8000f008,-$SZREG
805238384Sjkim	.set	noreorder
806238384Sjkim	$PTR_SUB $sp,6*$SZREG
807238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
808238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
809238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
810238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
811238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
812238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
813238384Sjkim___
# Body: call bn_div_words_internal with the return address parked in
# $ta3, multiply the returned quotient ($v0) by $ta2, and load the word
# at -2*BNSZ($a3) for the comparison.  The inner loop decrements $v0 in
# the delay slot of its back-branch; the `$ADDU $v0,1` that follows
# undoes the last decrement when the loop exits by falling through.
# NOTE(review): the loop appears to correct an over-estimated quotient
# word -- confirm the exact condition against the C reference.
814238384Sjkim$code.=<<___;
815238384Sjkim	.set	reorder
816238384Sjkim	move	$ta3,$ra
817246772Sjkim	bal	bn_div_words_internal
818238384Sjkim	move	$ra,$ta3
819238384Sjkim	$MULTU	$ta2,$v0
820238384Sjkim	$LD	$t2,-2*$BNSZ($a3)
821238384Sjkim	move	$ta0,$zero
822238384Sjkim	mfhi	$t1
823238384Sjkim	mflo	$t0
824238384Sjkim	sltu	$t8,$t1,$a1
825238384Sjkim.L_bn_div_3_words_inner_loop:
826238384Sjkim	bnez	$t8,.L_bn_div_3_words_inner_loop_done
827238384Sjkim	sgeu	$at,$t2,$t0
828238384Sjkim	seq	$t9,$t1,$a1
829238384Sjkim	and	$at,$t9
830238384Sjkim	sltu	$t3,$t0,$ta2
831238384Sjkim	$ADDU	$a1,$a2
832238384Sjkim	$SUBU	$t1,$t3
833238384Sjkim	$SUBU	$t0,$ta2
834238384Sjkim	sltu	$t8,$t1,$a1
835238384Sjkim	sltu	$ta0,$a1,$a2
836238384Sjkim	or	$t8,$ta0
837238384Sjkim	.set	noreorder
838264331Sjkim	beqz	$at,.L_bn_div_3_words_inner_loop
839238384Sjkim	$SUBU	$v0,1
840264331Sjkim	$ADDU	$v0,1
841238384Sjkim	.set	reorder
842238384Sjkim.L_bn_div_3_words_inner_loop_done:
843238384Sjkim	.set	noreorder
844238384Sjkim___
# NUBI flavours only: reload the saved registers and release the frame.
# $ra was already restored from $ta3 right after the call above.
845238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
846238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
847238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
848238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
849238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
850238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
851238384Sjkim	$PTR_ADD $sp,6*$SZREG
852238384Sjkim___
# Return from bn_div_3_words_internal (quotient in $v0, copied to $a0 in
# the delay slot), then bn_div_words.
#
# Register roles as used by the emitted code: $a0 = high word and $a1 =
# low word of the dividend, $a2 = divisor.  The quotient is returned in
# $v0 (copied to $a0) and the remainder is left in $v1 (copied to $a1).
# The public entry returns -1 when $a2 is zero.
853238384Sjkim$code.=<<___;
854238384Sjkim	jr	$ra
855238384Sjkim	move	$a0,$v0
856238384Sjkim.end	bn_div_3_words_internal
857238384Sjkim
858238384Sjkim.align	5
859238384Sjkim.globl	bn_div_words
860238384Sjkim.ent	bn_div_words
861238384Sjkimbn_div_words:
862238384Sjkim	.set	noreorder
863238384Sjkim	bnez	$a2,bn_div_words_internal
864238384Sjkim	li	$v0,-1		# I would rather signal div-by-zero
865238384Sjkim				# which can be done with 'break 7'
866238384Sjkim	jr	$ra
867238384Sjkim	move	$a0,$v0
868238384Sjkim.end	bn_div_words
869238384Sjkim
870238384Sjkim.align	5
871238384Sjkim.ent	bn_div_words_internal
872238384Sjkimbn_div_words_internal:
873238384Sjkim___
# NUBI flavours only: allocate a six-register stack frame and save $ra,
# $t3..$t0 and $gp in it.
874238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
875238384Sjkim	.frame	$sp,6*$SZREG,$ra
876238384Sjkim	.mask	0x8000f008,-$SZREG
877238384Sjkim	.set	noreorder
878238384Sjkim	$PTR_SUB $sp,6*$SZREG
879238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
880238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
881238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
882238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
883238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
884238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
885238384Sjkim___
# Normalisation: unless the divisor already has its top bit set, shift
# it left one bit at a time until it does, counting the shifts in $t9.
# The dividend ($a0:$a1) is then shifted left by the same amount.
# `break 6` is executed if any of the top $t9 bits of $a0 are set, i.e.
# if that shift would lose bits.
886238384Sjkim$code.=<<___;
887238384Sjkim	move	$v1,$zero
888238384Sjkim	bltz	$a2,.L_bn_div_words_body
889238384Sjkim	move	$t9,$v1
890238384Sjkim	$SLL	$a2,1
891238384Sjkim	bgtz	$a2,.-4
892238384Sjkim	addu	$t9,1
893238384Sjkim
894238384Sjkim	.set	reorder
895238384Sjkim	negu	$t1,$t9
896238384Sjkim	li	$t2,-1
897238384Sjkim	$SLL	$t2,$t1
898238384Sjkim	and	$t2,$a0
899238384Sjkim	$SRL	$at,$a1,$t1
900238384Sjkim	.set	noreorder
901264331Sjkim	beqz	$t2,.+12
902264331Sjkim	nop
903238384Sjkim	break	6		# signal overflow
904238384Sjkim	.set	reorder
905238384Sjkim	$SLL	$a0,$t9
906238384Sjkim	$SLL	$a1,$t9
907238384Sjkim	or	$a0,$at
908238384Sjkim___
# Symbolic names for the division body: $QT = current quotient digit,
# $HH = upper half of the running dividend word, $DH = upper half of
# the normalised divisor.
909238384Sjkim$QT=$ta0;
910238384Sjkim$HH=$ta1;
911238384Sjkim$DH=$v1;
# Division proper, done in two rounds that each produce one half-word
# quotient digit.  $a0 is first reduced by $a2 if it is not smaller.
# In each round the digit is estimated as $a0 / $DH (or left at all ones
# in the lower half when $DH equals $HH); the inner loop then decrements
# it while digit * divisor (held in $t1:$t0) is greater than $HH:$t3.
# The first digit is shifted into the upper half of $v0 and the second
# is OR-ed into the lower half.
912238384Sjkim$code.=<<___;
913238384Sjkim.L_bn_div_words_body:
914238384Sjkim	$SRL	$DH,$a2,4*$BNSZ	# bits
915238384Sjkim	sgeu	$at,$a0,$a2
916238384Sjkim	.set	noreorder
917264331Sjkim	beqz	$at,.+12
918264331Sjkim	nop
919238384Sjkim	$SUBU	$a0,$a2
920238384Sjkim	.set	reorder
921238384Sjkim
922238384Sjkim	li	$QT,-1
923238384Sjkim	$SRL	$HH,$a0,4*$BNSZ	# bits
924238384Sjkim	$SRL	$QT,4*$BNSZ	# q=0xffffffff
925238384Sjkim	beq	$DH,$HH,.L_bn_div_words_skip_div1
926238384Sjkim	$DIVU	$zero,$a0,$DH
927238384Sjkim	mflo	$QT
928238384Sjkim.L_bn_div_words_skip_div1:
929238384Sjkim	$MULTU	$a2,$QT
930238384Sjkim	$SLL	$t3,$a0,4*$BNSZ	# bits
931238384Sjkim	$SRL	$at,$a1,4*$BNSZ	# bits
932238384Sjkim	or	$t3,$at
933238384Sjkim	mflo	$t0
934238384Sjkim	mfhi	$t1
935238384Sjkim.L_bn_div_words_inner_loop1:
936238384Sjkim	sltu	$t2,$t3,$t0
937238384Sjkim	seq	$t8,$HH,$t1
938238384Sjkim	sltu	$at,$HH,$t1
939238384Sjkim	and	$t2,$t8
940238384Sjkim	sltu	$v0,$t0,$a2
941238384Sjkim	or	$at,$t2
942238384Sjkim	.set	noreorder
943238384Sjkim	beqz	$at,.L_bn_div_words_inner_loop1_done
944238384Sjkim	$SUBU	$t1,$v0
945238384Sjkim	$SUBU	$t0,$a2
946238384Sjkim	b	.L_bn_div_words_inner_loop1
947238384Sjkim	$SUBU	$QT,1
948238384Sjkim	.set	reorder
949238384Sjkim.L_bn_div_words_inner_loop1_done:
950238384Sjkim
951238384Sjkim	$SLL	$a1,4*$BNSZ	# bits
952238384Sjkim	$SUBU	$a0,$t3,$t0
953238384Sjkim	$SLL	$v0,$QT,4*$BNSZ	# bits
954238384Sjkim
955238384Sjkim	li	$QT,-1
956238384Sjkim	$SRL	$HH,$a0,4*$BNSZ	# bits
957238384Sjkim	$SRL	$QT,4*$BNSZ	# q=0xffffffff
958238384Sjkim	beq	$DH,$HH,.L_bn_div_words_skip_div2
959238384Sjkim	$DIVU	$zero,$a0,$DH
960238384Sjkim	mflo	$QT
961238384Sjkim.L_bn_div_words_skip_div2:
962238384Sjkim	$MULTU	$a2,$QT
963238384Sjkim	$SLL	$t3,$a0,4*$BNSZ	# bits
964238384Sjkim	$SRL	$at,$a1,4*$BNSZ	# bits
965238384Sjkim	or	$t3,$at
966238384Sjkim	mflo	$t0
967238384Sjkim	mfhi	$t1
968238384Sjkim.L_bn_div_words_inner_loop2:
969238384Sjkim	sltu	$t2,$t3,$t0
970238384Sjkim	seq	$t8,$HH,$t1
971238384Sjkim	sltu	$at,$HH,$t1
972238384Sjkim	and	$t2,$t8
973238384Sjkim	sltu	$v1,$t0,$a2
974238384Sjkim	or	$at,$t2
975238384Sjkim	.set	noreorder
976238384Sjkim	beqz	$at,.L_bn_div_words_inner_loop2_done
977238384Sjkim	$SUBU	$t1,$v1
978238384Sjkim	$SUBU	$t0,$a2
979238384Sjkim	b	.L_bn_div_words_inner_loop2
980238384Sjkim	$SUBU	$QT,1
981238384Sjkim	.set	reorder
982238384Sjkim.L_bn_div_words_inner_loop2_done:
983238384Sjkim
984238384Sjkim	$SUBU	$a0,$t3,$t0
985238384Sjkim	or	$v0,$QT
986238384Sjkim	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
987238384Sjkim	$SRL	$a2,$t9		# restore $a2
988238384Sjkim
989238384Sjkim	.set	noreorder
990238384Sjkim	move	$a1,$v1
991238384Sjkim___
# NUBI flavours only: reload the saved registers and release the frame.
# Because $gp is $v1 for NUBI, this reload also overwrites the remainder
# left in $v1; the copy in $a1 is unaffected.
992238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
993238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
994238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
995238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
996238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
997238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
998238384Sjkim	$PTR_ADD $sp,6*$SZREG
999238384Sjkim___
# Return with the quotient in $v0, copied to $a0 in the delay slot.
1000238384Sjkim$code.=<<___;
1001238384Sjkim	jr	$ra
1002238384Sjkim	move	$a0,$v0
1003238384Sjkim.end	bn_div_words_internal
1004238384Sjkim___
# The division-only names are dropped so later code cannot use them by
# accident.
1005238384Sjkimundef $HH; undef $QT; undef $DH;
1006238384Sjkim
# Register aliases used by the bn_mul_comba8/bn_mul_comba4 templates below.
# Each Perl variable holds the *name* of a MIPS register; the heredocs
# interpolate these names into the generated assembly.
#
# a[0..3] and b[0..3] live in temporaries.
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

# a[4..6]/b[4..6] live in $s0..$s5 (saved/restored by the bn_mul_comba8
# prologue/epilogue).  a[7] and b[7] reuse the input pointer registers, so
# they must be the last words loaded through $a1 and $a2 respectively.
($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2

# $t_1/$t_2 receive LO/HI of each product; $c_1..$c_3 form the rotating
# three-word column accumulator.
($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1014238384Sjkim
# ----------------------------------------------------------------------
# bn_mul_comba8: fully unrolled 8x8-word comba (column-wise) multiply.
#
# As used by the template: $a0 is the base for the 16 result stores
# (r[0..15]), $a1 is the base for the a[0..7] loads and $a2 the base for
# the b[0..7] loads.  Words are $BNSZ bytes apart.
#
# Each "mul_add_c(a[i],b[j],cX,cY,cZ)" step reads LO/HI of the previously
# issued $MULTU, adds them into the (cX,cY,cZ) accumulator with explicit
# carry detection via sltu, and issues the $MULTU for the *next* product in
# between so the multiplier latency is overlapped.  The trailing comment on
# each $MULTU therefore names the product being started, not the one being
# accumulated.  The first step of every column writes cZ with sltu instead
# of adding to it, which is how the accumulator word is reset.
#
# The instruction order is deliberate; do not reorder lines in the body.
# ----------------------------------------------------------------------
$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
# NUBI flavour: 12-slot frame saving $ra, $s0-$s5, $t0-$t3 and $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Other flavours: 6-slot frame saving only $s0-$s5.
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
# Body: operand loads interleaved with the first products, then columns
# r[0] .. r[15].  a[7]/b[7] overwrite $a1/$a2, so they are loaded last.
$code.=<<___;

	.set	reorder
	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# line with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				#		<appro\@fy.chalmers.se>

	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2

	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	$LD	$b_4,4*$BNSZ($a2)
	$LD	$b_5,5*$BNSZ($a2)
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	$LD	$b_6,6*$BNSZ($a2)
	$LD	$b_7,7*$BNSZ($a2)
	$ST	$c_1,0($a0)	# r[0]=c1;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# Epilogues mirror the prologues.  Under noreorder the stack adjustment
# sits in the delay slot of "jr $ra".
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# ----------------------------------------------------------------------
# bn_mul_comba4: fully unrolled 4x4-word comba multiply.
#
# Same scheme as bn_mul_comba8: $a0 is the base for the eight result
# stores, $a1 the base for the a[0..3] loads and $a2 the base for the
# b[0..3] loads.  Only the $t0-$t3/$ta0-$ta3 operand aliases are used, so
# no $s registers are saved; a stack frame is built only for the NUBI
# flavour.
#
# The first heredoc also emits ".end bn_mul_comba8", closing the previous
# function, so it must stay directly after the bn_mul_comba8 epilogue.
# ----------------------------------------------------------------------
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# NUBI flavour: 6-slot frame saving $ra, $t0-$t3 and $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body: columns r[0] .. r[7].  As in comba8, the comment on each $MULTU
# names the product being started for the following accumulation step.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# NUBI flavour restores the registers saved in its prologue and pops the
# frame before the shared return below.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Shared return; the nop fills the branch delay slot under noreorder.
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___
1872238384Sjkim
# The squaring routines emitted below take no b[] operand, so the variables
# that name b[0..3] in the multiply routines are reused here as the names
# for a[4..7] (bn_sqr_comba8 loads a[4..7] into them).
1873238384Sjkim($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1874238384Sjkim
# add_c2($hi,$lo,$c0,$c1,$c2,$warm,$an,$bn)
#
# Append to $code the instruction sequence for one "doubled" accumulation
# step of the squaring routines.  The emitted code
#   - fetches the LO/HI halves of the multiplication issued by the
#     *previous* step into $lo/$hi,
#   - adds that double-word product twice into the accumulator formed by
#     $c0 (low word), $c1 and $c2 (carry word), and
#   - issues the multiplication $an*$bn early, so that its result is ready
#     for the step that follows.
# $hi, $lo and $at are clobbered by the emitted code.
#
# No prototype is declared: the sub takes eight arguments, and an empty
# "()" prototype would make any plain add_c2(...) call a compile-time
# error (existing callers use &add_c2(...), which ignores prototypes).
sub add_c2 {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,      # !$warm denotes first call with specific sequence of
                # $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn     # these two are arguments for multiplication which
                # result is used in *next* step [which is why it's
                # commented as "forward multiplication" below];
    )=@_;

# Common part: add the low word twice (collecting both carries, in $at and
# then in $lo), fold the first carry into the high word and add that to $c1.
$code.=<<___;
	mflo	$lo
	mfhi	$hi
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	 $MULTU	$an,$bn			# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___

if ($warm) {
# $c2 already holds carries from earlier steps: accumulate into it.
$code.=<<___;
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
} else {
# First step of a column: $c2 is written (not added to) by the first sltu.
$code.=<<___;
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
}
}
1909238384Sjkim
# Emit bn_sqr_comba8.  As generated below, the routine loads eight words
# a[0..7] through the pointer in $a1 and stores sixteen result words
# r[0..15] through the pointer in $a0.  The result is built one column
# (one result word) at a time; $c_1,$c_2,$c_3 form a three-word accumulator
# whose roles rotate from column to column.
#
# Every $MULTU is issued one step ahead of the mflo/mfhi that consumes it,
# so the operands named on a multiply (or passed as the last two arguments
# of add_c2) belong to the *next* accumulation step.  add_c2 adds the
# pending product twice; the plain mflo/mfhi sequences add it once.
1910276864Sjkim$code.=<<___;
1911276864Sjkim
1912238384Sjkim.align	5
1913238384Sjkim.globl	bn_sqr_comba8
1914238384Sjkim.ent	bn_sqr_comba8
1915238384Sjkimbn_sqr_comba8:
1916238384Sjkim___
# NUBI flavour only: allocate a six-register frame and save $ra, $t3..$t0
# and $gp in it.
1917238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
1918238384Sjkim	.frame	$sp,6*$SZREG,$ra
1919238384Sjkim	.mask	0x8000f008,-$SZREG
1920238384Sjkim	.set	noreorder
1921238384Sjkim	$PTR_SUB $sp,6*$SZREG
1922238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
1923238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
1924238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
1925238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
1926238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
1927238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
1928238384Sjkim___
# Load a[0..7].  Column 0: r[0] is the low word of a[0]*a[0]; the high word
# seeds $c_2.  Column 1: the a[0]*a[1] product is doubled by shifting both
# halves left by one bit; the bit shifted out of each half is captured
# beforehand with slt against $zero (into $c_1 for the high half, into $a2
# for the low half, which is then added to the shifted high half).
1929238384Sjkim$code.=<<___;
1930238384Sjkim	.set	reorder
1931238384Sjkim	$LD	$a_0,0($a1)
1932238384Sjkim	$LD	$a_1,$BNSZ($a1)
1933238384Sjkim	$LD	$a_2,2*$BNSZ($a1)
1934238384Sjkim	$LD	$a_3,3*$BNSZ($a1)
1935238384Sjkim
1936238384Sjkim	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
1937238384Sjkim	$LD	$a_4,4*$BNSZ($a1)
1938238384Sjkim	$LD	$a_5,5*$BNSZ($a1)
1939238384Sjkim	$LD	$a_6,6*$BNSZ($a1)
1940238384Sjkim	$LD	$a_7,7*$BNSZ($a1)
1941238384Sjkim	mflo	$c_1
1942238384Sjkim	mfhi	$c_2
1943238384Sjkim	$ST	$c_1,0($a0)
1944238384Sjkim
1945238384Sjkim	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
1946238384Sjkim	mflo	$t_1
1947238384Sjkim	mfhi	$t_2
1948238384Sjkim	slt	$c_1,$t_2,$zero
1949238384Sjkim	$SLL	$t_2,1
1950238384Sjkim	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
1951238384Sjkim	slt	$a2,$t_1,$zero
1952238384Sjkim	$ADDU	$t_2,$a2
1953238384Sjkim	$SLL	$t_1,1
1954238384Sjkim	$ADDU	$c_2,$t_1
1955238384Sjkim	sltu	$at,$c_2,$t_1
1956238384Sjkim	$ADDU	$c_3,$t_2,$at
1957238384Sjkim	$ST	$c_2,$BNSZ($a0)
1958276864Sjkim___
# Column 2: 2*a[2]*a[0] + a[1]*a[1]; r[2] is stored by the heredoc below.
1959276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1960276864Sjkim		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
1961276864Sjkim$code.=<<___;
1962238384Sjkim	mflo	$t_1
1963238384Sjkim	mfhi	$t_2
1964238384Sjkim	$ADDU	$c_3,$t_1
1965238384Sjkim	sltu	$at,$c_3,$t_1
1966238384Sjkim	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
1967238384Sjkim	$ADDU	$t_2,$at
1968238384Sjkim	$ADDU	$c_1,$t_2
1969238384Sjkim	sltu	$at,$c_1,$t_2
1970238384Sjkim	$ADDU	$c_2,$at
1971238384Sjkim	$ST	$c_3,2*$BNSZ($a0)
1972276864Sjkim___
# Column 3: 2*(a[0]*a[3] + a[1]*a[2]).
1973276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1974276864Sjkim		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
1975276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
1976276864Sjkim		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
1977276864Sjkim$code.=<<___;
1978238384Sjkim	$ST	$c_1,3*$BNSZ($a0)
1979276864Sjkim___
# Column 4: 2*(a[4]*a[0] + a[3]*a[1]) + a[2]*a[2].
1980276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
1981276864Sjkim		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
1982276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
1983276864Sjkim		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
1984276864Sjkim$code.=<<___;
1985238384Sjkim	mflo	$t_1
1986238384Sjkim	mfhi	$t_2
1987238384Sjkim	$ADDU	$c_2,$t_1
1988238384Sjkim	sltu	$at,$c_2,$t_1
1989238384Sjkim	 $MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
1990238384Sjkim	$ADDU	$t_2,$at
1991238384Sjkim	$ADDU	$c_3,$t_2
1992238384Sjkim	sltu	$at,$c_3,$t_2
1993238384Sjkim	$ADDU	$c_1,$at
1994238384Sjkim	$ST	$c_2,4*$BNSZ($a0)
1995276864Sjkim___
# Column 5: 2*(a[0]*a[5] + a[1]*a[4] + a[2]*a[3]).
1996276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1997276864Sjkim		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
1998276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
1999276864Sjkim		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
2000276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2001276864Sjkim		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
2002276864Sjkim$code.=<<___;
2003238384Sjkim	$ST	$c_3,5*$BNSZ($a0)
2004276864Sjkim___
# Column 6: 2*(a[6]*a[0] + a[5]*a[1] + a[4]*a[2]) + a[3]*a[3].
2005276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2006276864Sjkim		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
2007276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2008276864Sjkim		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
2009276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2010276864Sjkim		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
2011276864Sjkim$code.=<<___;
2012238384Sjkim	mflo	$t_1
2013238384Sjkim	mfhi	$t_2
2014238384Sjkim	$ADDU	$c_1,$t_1
2015238384Sjkim	sltu	$at,$c_1,$t_1
2016238384Sjkim	 $MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
2017238384Sjkim	$ADDU	$t_2,$at
2018238384Sjkim	$ADDU	$c_2,$t_2
2019238384Sjkim	sltu	$at,$c_2,$t_2
2020238384Sjkim	$ADDU	$c_3,$at
2021238384Sjkim	$ST	$c_1,6*$BNSZ($a0)
2022276864Sjkim___
# Column 7: 2*(a[0]*a[7] + a[1]*a[6] + a[2]*a[5] + a[3]*a[4]).
2023276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2024276864Sjkim		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
2025276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2026276864Sjkim		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
2027276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2028276864Sjkim		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
2029276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2030276864Sjkim		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
2031276864Sjkim$code.=<<___;
2032238384Sjkim	$ST	$c_2,7*$BNSZ($a0)
2033276864Sjkim___
# Column 8: 2*(a[7]*a[1] + a[6]*a[2] + a[5]*a[3]) + a[4]*a[4].
2034276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2035276864Sjkim		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
2036276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2037276864Sjkim		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
2038276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2039276864Sjkim		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
2040276864Sjkim$code.=<<___;
2041238384Sjkim	mflo	$t_1
2042238384Sjkim	mfhi	$t_2
2043238384Sjkim	$ADDU	$c_3,$t_1
2044238384Sjkim	sltu	$at,$c_3,$t_1
2045238384Sjkim	 $MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
2046238384Sjkim	$ADDU	$t_2,$at
2047238384Sjkim	$ADDU	$c_1,$t_2
2048238384Sjkim	sltu	$at,$c_1,$t_2
2049238384Sjkim	$ADDU	$c_2,$at
2050238384Sjkim	$ST	$c_3,8*$BNSZ($a0)
2051276864Sjkim___
# Column 9: 2*(a[2]*a[7] + a[3]*a[6] + a[4]*a[5]).
2052276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2053276864Sjkim		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
2054276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2055276864Sjkim		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
2056276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2057276864Sjkim		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
2058276864Sjkim$code.=<<___;
2059238384Sjkim	$ST	$c_1,9*$BNSZ($a0)
2060276864Sjkim___
# Column 10: 2*(a[7]*a[3] + a[6]*a[4]) + a[5]*a[5].
2061276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2062276864Sjkim		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
2063276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2064276864Sjkim		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
2065276864Sjkim$code.=<<___;
2066238384Sjkim	mflo	$t_1
2067238384Sjkim	mfhi	$t_2
2068238384Sjkim	$ADDU	$c_2,$t_1
2069238384Sjkim	sltu	$at,$c_2,$t_1
2070238384Sjkim	 $MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
2071238384Sjkim	$ADDU	$t_2,$at
2072238384Sjkim	$ADDU	$c_3,$t_2
2073238384Sjkim	sltu	$at,$c_3,$t_2
2074238384Sjkim	$ADDU	$c_1,$at
2075238384Sjkim	$ST	$c_2,10*$BNSZ($a0)
2076276864Sjkim___
# Column 11: 2*(a[4]*a[7] + a[5]*a[6]).
2077276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2078276864Sjkim		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
2079276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2080276864Sjkim		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
2081276864Sjkim$code.=<<___;
2082238384Sjkim	$ST	$c_3,11*$BNSZ($a0)
2083276864Sjkim___
# Column 12: 2*a[7]*a[5] + a[6]*a[6].
2084276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2085276864Sjkim		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
2086276864Sjkim$code.=<<___;
2087238384Sjkim	mflo	$t_1
2088238384Sjkim	mfhi	$t_2
2089238384Sjkim	$ADDU	$c_1,$t_1
2090238384Sjkim	sltu	$at,$c_1,$t_1
2091238384Sjkim	 $MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
2092238384Sjkim	$ADDU	$t_2,$at
2093238384Sjkim	$ADDU	$c_2,$t_2
2094238384Sjkim	sltu	$at,$c_2,$t_2
2095238384Sjkim	$ADDU	$c_3,$at
2096238384Sjkim	$ST	$c_1,12*$BNSZ($a0)
2097276864Sjkim___
# Column 13: 2*a[6]*a[7].  The heredoc that follows stores r[13], then adds
# the final a[7]*a[7] product once to produce r[14] and r[15].
2098276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2099276864Sjkim		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
2100276864Sjkim$code.=<<___;
2101238384Sjkim	$ST	$c_2,13*$BNSZ($a0)
2102238384Sjkim
2103238384Sjkim	mflo	$t_1
2104238384Sjkim	mfhi	$t_2
2105238384Sjkim	$ADDU	$c_3,$t_1
2106238384Sjkim	sltu	$at,$c_3,$t_1
2107238384Sjkim	$ADDU	$t_2,$at
2108238384Sjkim	$ADDU	$c_1,$t_2
2109238384Sjkim	$ST	$c_3,14*$BNSZ($a0)
2110238384Sjkim	$ST	$c_1,15*$BNSZ($a0)
2111238384Sjkim
2112238384Sjkim	.set	noreorder
2113238384Sjkim___
# NUBI flavour only: reload $t3..$t0 and $gp and release the frame.  $ra is
# saved by the prologue but is not reloaded here.
2114238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
2115238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
2116238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
2117238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
2118238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
2119238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
2120238384Sjkim	$PTR_ADD $sp,6*$SZREG
2121238384Sjkim___
# Return from bn_sqr_comba8 and, in the same heredoc, open the
# bn_sqr_comba4 entry point.
2122238384Sjkim$code.=<<___;
2123238384Sjkim	jr	$ra
2124238384Sjkim	nop
2125238384Sjkim.end	bn_sqr_comba8
2126238384Sjkim
2127238384Sjkim.align	5
2128238384Sjkim.globl	bn_sqr_comba4
2129238384Sjkim.ent	bn_sqr_comba4
2130238384Sjkimbn_sqr_comba4:
2131238384Sjkim___
# Body of bn_sqr_comba4 (its label is emitted by the heredoc just above).
# As generated below, the routine loads four words a[0..3] through the
# pointer in $a1 and stores eight result words r[0..7] through the pointer
# in $a0, one column at a time, with $c_1,$c_2,$c_3 rotating as a
# three-word accumulator.  Every $MULTU is issued one step ahead of the
# mflo/mfhi that consumes it; add_c2 adds the pending product twice, the
# plain mflo/mfhi sequences add it once.
#
# NUBI flavour only: allocate a six-register frame and save $ra, $t3..$t0
# and $gp in it.
2132238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
2133238384Sjkim	.frame	$sp,6*$SZREG,$ra
2134238384Sjkim	.mask	0x8000f008,-$SZREG
2135238384Sjkim	.set	noreorder
2136238384Sjkim	$PTR_SUB $sp,6*$SZREG
2137238384Sjkim	$REG_S	$ra,5*$SZREG($sp)
2138238384Sjkim	$REG_S	$t3,4*$SZREG($sp)
2139238384Sjkim	$REG_S	$t2,3*$SZREG($sp)
2140238384Sjkim	$REG_S	$t1,2*$SZREG($sp)
2141238384Sjkim	$REG_S	$t0,1*$SZREG($sp)
2142238384Sjkim	$REG_S	$gp,0*$SZREG($sp)
2143238384Sjkim___
# Load a[0..3].  Column 0: r[0] is the low word of a[0]*a[0]; the high word
# seeds $c_2.  Column 1: the a[0]*a[1] product is doubled by shifting both
# halves left by one bit; the bit shifted out of each half is captured
# beforehand with slt against $zero (into $c_1 for the high half, into $a2
# for the low half, which is then added to the shifted high half).
2144238384Sjkim$code.=<<___;
2145238384Sjkim	.set	reorder
2146238384Sjkim	$LD	$a_0,0($a1)
2147238384Sjkim	$LD	$a_1,$BNSZ($a1)
2148238384Sjkim	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
2149238384Sjkim	$LD	$a_2,2*$BNSZ($a1)
2150238384Sjkim	$LD	$a_3,3*$BNSZ($a1)
2151238384Sjkim	mflo	$c_1
2152238384Sjkim	mfhi	$c_2
2153238384Sjkim	$ST	$c_1,0($a0)
2154238384Sjkim
2155238384Sjkim	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
2156238384Sjkim	mflo	$t_1
2157238384Sjkim	mfhi	$t_2
2158238384Sjkim	slt	$c_1,$t_2,$zero
2159238384Sjkim	$SLL	$t_2,1
2160238384Sjkim	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
2161238384Sjkim	slt	$a2,$t_1,$zero
2162238384Sjkim	$ADDU	$t_2,$a2
2163238384Sjkim	$SLL	$t_1,1
2164238384Sjkim	$ADDU	$c_2,$t_1
2165238384Sjkim	sltu	$at,$c_2,$t_1
2166238384Sjkim	$ADDU	$c_3,$t_2,$at
2167238384Sjkim	$ST	$c_2,$BNSZ($a0)
2168276864Sjkim___
# Column 2: 2*a[2]*a[0] + a[1]*a[1]; r[2] is stored by the heredoc below.
2169276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2170276864Sjkim		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
2171276864Sjkim$code.=<<___;
2172238384Sjkim	mflo	$t_1
2173238384Sjkim	mfhi	$t_2
2174238384Sjkim	$ADDU	$c_3,$t_1
2175238384Sjkim	sltu	$at,$c_3,$t_1
2176238384Sjkim	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
2177238384Sjkim	$ADDU	$t_2,$at
2178238384Sjkim	$ADDU	$c_1,$t_2
2179238384Sjkim	sltu	$at,$c_1,$t_2
2180238384Sjkim	$ADDU	$c_2,$at
2181238384Sjkim	$ST	$c_3,2*$BNSZ($a0)
2182276864Sjkim___
# Column 3: 2*(a[0]*a[3] + a[1]*a[2]).
2183276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2184276864Sjkim		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
2185276864Sjkim	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2186276864Sjkim		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
2187276864Sjkim$code.=<<___;
2188238384Sjkim	$ST	$c_1,3*$BNSZ($a0)
2189276864Sjkim___
# Column 4: 2*a[3]*a[1] + a[2]*a[2].
2190276864Sjkim	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2191276864Sjkim		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
2192276864Sjkim$code.=<<___;
2193238384Sjkim	mflo	$t_1
2194238384Sjkim	mfhi	$t_2
2195238384Sjkim	$ADDU	$c_2,$t_1
2196238384Sjkim	sltu	$at,$c_2,$t_1
2197238384Sjkim	 $MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
2198238384Sjkim	$ADDU	$t_2,$at
2199238384Sjkim	$ADDU	$c_3,$t_2
2200238384Sjkim	sltu	$at,$c_3,$t_2
2201238384Sjkim	$ADDU	$c_1,$at
2202238384Sjkim	$ST	$c_2,4*$BNSZ($a0)
2203276864Sjkim___
# Column 5: 2*a[2]*a[3].  The heredoc that follows stores r[5], then adds
# the final a[3]*a[3] product once to produce r[6] and r[7].
2204276864Sjkim	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2205276864Sjkim		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
2206276864Sjkim$code.=<<___;
2207238384Sjkim	$ST	$c_3,5*$BNSZ($a0)
2208238384Sjkim
2209238384Sjkim	mflo	$t_1
2210238384Sjkim	mfhi	$t_2
2211238384Sjkim	$ADDU	$c_1,$t_1
2212238384Sjkim	sltu	$at,$c_1,$t_1
2213238384Sjkim	$ADDU	$t_2,$at
2214238384Sjkim	$ADDU	$c_2,$t_2
2215238384Sjkim	$ST	$c_1,6*$BNSZ($a0)
2216238384Sjkim	$ST	$c_2,7*$BNSZ($a0)
2217238384Sjkim
2218238384Sjkim	.set	noreorder
2219238384Sjkim___
# NUBI flavour only: reload $t3..$t0 and $gp and release the frame.  $ra is
# saved by the prologue but is not reloaded here.
2220238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i);
2221238384Sjkim	$REG_L	$t3,4*$SZREG($sp)
2222238384Sjkim	$REG_L	$t2,3*$SZREG($sp)
2223238384Sjkim	$REG_L	$t1,2*$SZREG($sp)
2224238384Sjkim	$REG_L	$t0,1*$SZREG($sp)
2225238384Sjkim	$REG_L	$gp,0*$SZREG($sp)
2226238384Sjkim	$PTR_ADD $sp,6*$SZREG
2227238384Sjkim___
# Return from bn_sqr_comba4 and close its .ent/.end pair.
2228238384Sjkim$code.=<<___;
2229238384Sjkim	jr	$ra
2230238384Sjkim	nop
2231238384Sjkim.end	bn_sqr_comba4
2232238384Sjkim___
# Write the accumulated assembly text to STDOUT and close it.  Output is
# buffered, so a failed write (full disk, closed pipe) may only be reported
# by close; die in that case so that a truncated assembly file is not
# mistaken for a successful run.
print $code;
close STDOUT or die "error closing STDOUT: $!";
2235