1238384Sjkim#!/usr/bin/env perl
2238384Sjkim#
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim
10238384Sjkim# This module doesn't present direct interest for OpenSSL, because it
11238384Sjkim# doesn't provide better performance for longer keys, at least not on
12238384Sjkim# in-order-execution cores. While 512-bit RSA sign operations can be
13238384Sjkim# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
14238384Sjkim# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
15238384Sjkim# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
16238384Sjkim# verify:-( All comparisons are against bn_mul_mont-free assembler.
17238384Sjkim# The module might be of interest to embedded system developers, as
18238384Sjkim# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
19238384Sjkim# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
20238384Sjkim# code.
21238384Sjkim
22238384Sjkim######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
27238384Sjkim#
# NUBI register names. Each variable holds the assembler operand "$<number>"
# for the register number listed on the right-hand side.
($zero,$at,$t0,$t1,$t2) = map { "\$$_" } (0..2, 24, 25);
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map { "\$$_" } (4..11);
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11) = map { "\$$_" } (12..23);
($gp,$tp,$sp,$fp,$ra) = map { "\$$_" } (3, 28..31);
32238384Sjkim#
# The return value is placed in $a0. The following coding rules facilitate
# interoperability:
35238384Sjkim#
36238384Sjkim# - never ever touch $tp, "thread pointer", former $gp;
37238384Sjkim# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
38238384Sjkim#   old code];
39238384Sjkim# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
40238384Sjkim#
41238384Sjkim# For reference here is register layout for N32/64 MIPS ABIs:
42238384Sjkim#
43238384Sjkim# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
44238384Sjkim# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
45238384Sjkim# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
46238384Sjkim# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
47238384Sjkim# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
48238384Sjkim#
# Target ABI flavour, read from the first command-line argument.
# Supported flavours are o32, n32, 64, nubi32 and nubi64.
$flavour = shift;

# ABI-dependent primitives used by the generated prologue/epilogue and for
# address arithmetic:
#   $PTR_ADD / $PTR_SUB  pointer add / subtract mnemonics
#   $REG_S   / $REG_L    full-register store / load mnemonics
#   $SZREG               size of one saved register, in bytes
#
# Pointers are adjusted with the non-trapping (unsigned) forms. The signed
# "add"/"sub" family raises an Integer Overflow exception when the result
# crosses the signed boundary, which is never wanted for addresses; the
# unsigned forms compute the same value without the trap.
if ($flavour =~ /64|n32/i) {
	$PTR_ADD="daddu";	# incidentally works even on n32
	$PTR_SUB="dsubu";	# incidentally works even on n32
	$REG_S="sd";
	$REG_L="ld";
	$SZREG=8;
} else {
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$REG_S="sw";
	$REG_L="lw";
	$SZREG=4;
}
# Bit mask of saved general-purpose registers for the .mask directive:
# bits 12-23 for NUBI flavours, bits 16-23 otherwise.
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
65238384Sjkim#
66238384Sjkim# <appro@openssl.org>
67238384Sjkim#
68238384Sjkim######################################################################
69238384Sjkim
# Consume the remaining command-line arguments until one looks like an
# output file name ("name.ext"); scanning stops at the first false or
# missing argument, in which case $output ends up false.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found. Re-opening
# STDOUT on an empty name fails and leaves the handle closed, which would
# silently discard the generated assembly; without a name the code is
# written to the inherited STDOUT instead.
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}
72238384Sjkim
# Mnemonics for operating on one BN_ULONG limb, plus the limb size in bytes
# ($BNSZ). 64-bit and n32 flavours use the doubleword forms, everything
# else the word forms.
($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ) = ($flavour =~ /64|n32/i)
	? ("ld","sd","dmultu","daddu","dsubu",8)
	: ("lw","sw","multu","addu","subu",4);
88238384Sjkim
# Argument registers, in prototype order:
#
# int bn_mul_mont(BN_ULONG *rp,
#                 const BN_ULONG *ap,
#                 const BN_ULONG *bp,
#                 const BN_ULONG *np,
#                 const BN_ULONG *n0,
#                 int num);
($rp,$ap,$bp,$np,$n0,$num) = ($a0,$a1,$a2,$a3,$a4,$a5);

# Working registers that need no saving (spare argument and temporary
# registers).
($lo0,$hi0,$lo1,$hi1) = ($a6,$a7,$t1,$t2);

# Working registers mapped onto $s0-$s11. Note that $tp is re-purposed
# here as an alias for $s3.
($aj,$bi,$nj,$tp,$alo,$ahi,$nlo,$nhi,$tj,$i,$j,$m1) =
	($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11);

# Stack frame size in register-sized slots; the generated prologue
# multiplies it by $SZREG.
$FRAMESIZE = 14;
115238384Sjkim
# Assembly template for bn_mul_mont, accumulated in $code.
#
# The public entry point only dispatches: it returns 0 (in both $t0 and $a0)
# when num is below 4 or not below 17, and otherwise branches to
# bn_mul_mont_internal, which sets both $a0 and $t0 to 1 before returning.
# Spans in `...` are left in the text for later evaluation as Perl
# expressions.

# Registers stored by the prologue and reloaded by the epilogue, in frame
# order: entry k (counting from 1) lives at slot ($FRAMESIZE-k).
my @saved_regs = ($fp,$s11,$s10,$s9,$s8,$s7,$s6,$s5,$s4);
push @saved_regs, ($s3,$s2,$s1,$s0) if ($flavour =~ /nubi/i);

# Returns one "$op reg,slot(sp)" line per saved register; $op is the store
# mnemonic for the prologue and the load mnemonic for the epilogue.
my $frame_ops = sub {
	my ($op) = @_;
	my $slot = 0;
	return join("", map {
		$slot++;
		"\t$op\t$_,($FRAMESIZE-$slot)*$SZREG($sp)\n";
	} @saved_regs);
};

$code=<<___;
.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___

# On O32 the fifth and sixth arguments are fetched from the stack.
if ($flavour =~ /o32/i) {
	$code.=<<___;
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
}

$code.=<<___;
	slt	$at,$num,4
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnez	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
.end	bn_mul_mont

.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
___

# Prologue: store the saved registers into the new frame.
$code.=$frame_ops->($REG_S);

$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

	$MULTU	$aj,$bi
	$LD	$alo,$BNSZ($ap)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$MULTU	$lo0,$n0
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	$nj,$m1
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$alo,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	$aj,$bi
	$LD	$nj,($np)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$ADDU	$lo0,$tj
	$MULTU	$lo0,$n0
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	$nj,$m1
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	$nlo
	mfhi	$nhi
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,$sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	$LD	$aj,($ap)
	$PTR_ADD $ap,$BNSZ
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

	li	$a0,1
	li	$t0,1

	.set	noreorder
	move	$sp,$fp
___

# Epilogue: reload the saved registers from the frame.
$code.=$frame_ops->($REG_L);

$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
422238384Sjkim
# Replace every `...` span in the template with the result of evaluating
# its contents as a Perl expression.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface when the handle is closed; a failed
# close means the emitted assembly may be incomplete, so fail loudly rather
# than leave a truncated file behind.
close STDOUT or die "error closing STDOUT: $!";
427