#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================


# July 1999
#
# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x, not only because it doesn't support the new ABIs, but also
# because 5.x kernels put the R4x00 CPU into 32-bit mode, so all those
# 64-bit instructions (daddu, dmultu, etc.) found below would only
# cause an illegal instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by MIPSpro
# compiler driver (either as or cc) and therefore (probably?) can't be
# compiled by the GNU assembler. GNU C driver manages fine though...
# I mean as long as -mmips-as is specified or is the default option,
# because then it simply invokes /usr/bin/as which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to have the code looking more regular as all those
# architecture dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
#
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
#					<appro@fy.chalmers.se>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and by
# adjusting offsets denoting multiples of BN_ULONG. The above-mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction a 32-bit compiler can't use]; one
# has to be content with a 40-85% improvement depending on benchmark
# and key length, more for longer keys.

$flavour = shift;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
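
# Invocation sketch (an assumption inferred from the argument handling
# above and from other OpenSSL perlasm drivers, not a documented CLI):
# the first argument selects the ABI flavour, and the first argument
# that looks like a filename becomes the output file, e.g.
#
#	perl mips.pl o32 bn-mips.s	# 32-bit code
#	perl mips.pl 64  bn-mips.s	# 64-bit code, N64 ABI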

if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code=".set	mips2\n";
}

# Below is N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).

$gp=$v1 if ($flavour =~ /nubi/i);

$minus4=$v1;

$code.=<<___;
.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
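				# $ta0 = $a2&~3, i.e. how many words
				# the 4x unrolled loop below consumes;
				# the remaining 0..3 words fall through
				# to the tail code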
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
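				# Together with the $ADDU above this is
				# the usual carry idiom: $t1 += $v0,
				# then $v0 = ($t1 < $v0) recovers the
				# carry out of the addition.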
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	 $MULTU	$t2,$a3
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	$at
	mfhi	$t2
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	 $MULTU	$ta0,$a3
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	$at
	mfhi	$ta0
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	 $MULTU	$ta2,$a3
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	$at
	mfhi	$ta2
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	 $MULTU	$t2,$a3
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
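				# start the next word's accumulator as
				# the carry of the low-word add ($t1)
				# plus the high word of the product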

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$at
	mfhi	$t2
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	 $MULTU	$ta0,$a3
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	$at
	mfhi	$ta0
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	 $MULTU	$ta2,$a3
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	$at
	mfhi	$ta2
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	$t2,$t2
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$t3
	mfhi	$t2
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	$ta0,$ta0
	mflo	$ta1
	mfhi	$ta0
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	$ta2,$ta2
	and	$ta0,$a2,$minus4
	mflo	$ta3
	mfhi	$ta2
	$ST	$ta3,-2*$BNSZ($a0)

	.set	noreorder
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$t0
	mflo	$t1
	mfhi	$t0
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8
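				# the carry for this word is the carry
				# out of a[i]+b[i] ($t8) plus the carry
				# out of adding the incoming carry;
				# their sum never exceeds 1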

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8
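				# borrow handling mirrors the addition:
				# sltu catches a[i]<b[i], sgtu catches
				# underflow from subtracting the
				# incoming borrow; total is at most 1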

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

.align 5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	$ta2,$v0
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	$t1
	mflo	$t0
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1
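				# the three instructions above count
				# the divisor's leading zeros into $t9
				# while normalizing $a2, i.e. shifting
				# it left until its top bit is set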

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
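
# What follows is classic schoolbook long division of the double word
# $a0:$a1 by the now normalized divisor $a2, done in two half-word
# steps: $DIVU on the top halves yields a quotient estimate $QT, and
# each inner loop corrects it downwards until $QT*$a2 no longer
# exceeds the current partial remainder.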
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div1:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div2:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9		# restore $a2

	.set	noreorder
	move	$a1,$v1
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
undef $HH; undef $QT; undef $DH;

($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2

($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);

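# bn_mul_comba8 computes the 8x8-word product column by column (Comba
# multiplication): for each k, all partial products a[i]*b[j] with
# i+j==k are summed into a rotating triple of accumulators
# ($c_1,$c_2,$c_3) before r[k] is stored, so every result word is
# written to memory exactly once.
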
$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
$code.=<<___;

	.set	reorder
	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# line with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				#		<appro\@fy.chalmers.se>

	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2

	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	$LD	$b_4,4*$BNSZ($a2)
	$LD	$b_5,5*$BNSZ($a2)
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	$LD	$b_6,6*$BNSZ($a2)
	$LD	$b_7,7*$BNSZ($a2)
	$ST	$c_1,0($a0)	# r[0]=c1;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

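# add_c2() emits the generic tail of a mul_add_c2() step: it picks the
# previous product up from HI/LO, adds it into the accumulators twice
# (mul_add_c2 accumulates 2*a[i]*b[j]) and starts the next
# multiplication early to hide the multiplier latency. In C terms one
# call is roughly
#
#	c0 += 2*lo;
#	c1 += 2*hi + carries out of c0;
#	c2 += carries out of c1;
#
# with every carry recovered via sltu, as elsewhere in this module.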
sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,      # !$warm denotes first call with specific sequence of
                # $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn     # these two are arguments for multiplication which
                # result is used in *next* step [which is why it's
                # commented as "forward multiplication" below];
    )=@_;
$code.=<<___;
	mflo	$lo
	mfhi	$hi
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	 $MULTU	$an,$bn			# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
$code.=<<___	if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
$code.=<<___	if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
}

$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
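				# the slt/$SLL pairs above double the
				# product $t_2:$t_1 in place, with each
				# slt catching the top bit before it is
				# shifted out so it can be carried into
				# the next word up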
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
print $code;
close STDOUT;