#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys. While 512-bit
# RSA private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, while longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over compiler
# generated code.
#
# The module targets N32 and N64 MIPS ABIs and currently is a bit
# IRIX-centric, i.e. is likely to require adaptation for other OSes.

# Register assignment for the generated routine.  Each variable holds a
# symbolic MIPS register name that is interpolated into the assembly
# template assigned to $code further down (the names are presumably resolved
# by the <regdef.h> that the template includes).
#
# Arguments, in the order of the C prototype:
# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

# Working registers.  lo/hi pairs receive the two halves of a dmultu result.
$lo0="a6";	# running low/high words of the ap[]*bp[i] column
$hi0="a7";
$lo1="v0";	# running low/high words of the np[]*m1 column
$hi1="v1";
$aj="t0";	# current ap[j]
$bi="t1";	# current bp[i]
$nj="t2";	# current np[j]
$tp="t3";	# cursor into the temporary vector kept on the stack
$alo="s0";	# product ap[j]*bp[i], computed one step ahead
$ahi="s1";
$nlo="s2";	# product np[j]*m1, computed one step ahead
$nhi="s3";
$tj="s4";	# current tp[j]; later the end-of-vector address &tp[num]
$i="s5";	# outer loop byte offset into bp[]
$j="s6";	# inner loop byte offset into ap[]/np[]
$fp="t8";	# frame pointer: address of the s0-s7 save area
$m1="t9";	# per-iteration multiplier, low word of (column 0 * n0)

# 8*(2+8) = 80.  Not referenced anywhere else in this script: the prologue
# and epilogue in the template use a literal 64-byte frame instead.
$FRAME=8*(2+8);

# Assembly template for bn_mul_mont.  The register-name variables assigned
# earlier in this script are interpolated into the text; everything between
# the heredoc markers is emitted verbatim by the print at the end of the
# script, so it must not be reformatted or re-ordered (the instructions that
# directly follow branches inside ".set noreorder" regions are placed there
# deliberately -- presumably to fill the branch delay slots).
#
# Outline of the generated routine, as far as the template shows:
#   - prologue: reserves a 64-byte frame; if num < 4 it returns 0 in v0
#     without doing any work;
#   - .Lproceed: replaces the n0 pointer by the word it points to, scales
#     num to a byte count (num*8), takes num*8+16 more bytes off the stack,
#     rounds sp down to a 4096-byte boundary and saves s0-s7 in the frame;
#   - .L1st: first pass over ap[]/np[] using bp[0], storing words at sp;
#   - .Louter/.Linner: one pass per remaining bp[i], accumulating into the
#     words stored at sp;
#   - .Lsub/.Lcopy: stores tp[i]-np[i] into rp, then builds a mask from the
#     top overflow word and the final borrow to copy either tp or that
#     difference into rp, zeroing the temporary words as it goes;
#   - epilogue: restores s0-s7 and returns 1 in v0.
#
# "\@" in the .asciiz line keeps Perl from interpolating an array there.
$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set	noat
.set	reorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
	.set	noreorder
	PTR_SUB	sp,64
	move	$fp,sp
	.frame	$fp,64,ra
	slt	AT,$num,4
	li	v0,0
	beqzl	AT,.Lproceed
	nop
	jr	ra
	PTR_ADD	sp,$fp,64
	.set	reorder
.align	5
.Lproceed:
	ld	$n0,0($n0)
	ld	$bi,0($bp)	# bp[0]
	ld	$aj,0($ap)	# ap[0]
	ld	$nj,0($np)	# np[0]
	PTR_SUB	sp,16		# place for two extra words
	sll	$num,3
	li	AT,-4096
	PTR_SUB	sp,$num
	and	sp,AT

	sd	s0,0($fp)
	sd	s1,8($fp)
	sd	s2,16($fp)
	sd	s3,24($fp)
	sd	s4,32($fp)
	sd	s5,40($fp)
	sd	s6,48($fp)
	sd	s7,56($fp)

	dmultu	$aj,$bi
	ld	$alo,8($ap)
	ld	$nlo,8($np)
	mflo	$lo0
	mfhi	$hi0
	dmultu	$lo0,$n0
	mflo	$m1

	dmultu	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	dmultu	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	dmultu	$nlo,$m1
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT
	mflo	$nlo
	mfhi	$nhi

	move	$tp,sp
	li	$j,16
.align	4
.L1st:
	.set	noreorder
	PTR_ADD	$aj,$ap,$j
	ld	$aj,($aj)
	PTR_ADD	$nj,$np,$j
	ld	$nj,($nj)

	dmultu	$aj,$bi
	daddu	$lo0,$alo,$hi0
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo0,$hi0
	sltu	s7,$lo1,$hi1
	daddu	$hi0,$ahi,AT
	daddu	$hi1,$nhi,s7
	mflo	$alo
	mfhi	$ahi

	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	dmultu	$nj,$m1
	daddu	$hi1,AT
	addu	$j,8
	sd	$lo1,($tp)
	sltu	s7,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	s7,.L1st
	PTR_ADD	$tp,8
	.set	reorder

	daddu	$lo0,$alo,$hi0
	sltu	AT,$lo0,$hi0
	daddu	$hi0,$ahi,AT

	daddu	$lo1,$nlo,$hi1
	sltu	s7,$lo1,$hi1
	daddu	$hi1,$nhi,s7
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT

	sd	$lo1,($tp)

	daddu	$hi1,$hi0
	sltu	AT,$hi1,$hi0
	sd	$hi1,8($tp)
	sd	AT,16($tp)

	li	$i,8
.align	4
.Louter:
	PTR_ADD	$bi,$bp,$i
	ld	$bi,($bi)
	ld	$aj,($ap)
	ld	$alo,8($ap)
	ld	$tj,(sp)

	dmultu	$aj,$bi
	ld	$nj,($np)
	ld	$nlo,8($np)
	mflo	$lo0
	mfhi	$hi0
	daddu	$lo0,$tj
	dmultu	$lo0,$n0
	sltu	AT,$lo0,$tj
	daddu	$hi0,AT
	mflo	$m1

	dmultu	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	dmultu	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	dmultu	$nlo,$m1
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT
	mflo	$nlo
	mfhi	$nhi

	move	$tp,sp
	li	$j,16
	ld	$tj,8($tp)
.align	4
.Linner:
	.set	noreorder
	PTR_ADD	$aj,$ap,$j
	ld	$aj,($aj)
	PTR_ADD	$nj,$np,$j
	ld	$nj,($nj)

	dmultu	$aj,$bi
	daddu	$lo0,$alo,$hi0
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo0,$hi0
	sltu	s7,$lo1,$hi1
	daddu	$hi0,$ahi,AT
	daddu	$hi1,$nhi,s7
	mflo	$alo
	mfhi	$ahi

	daddu	$lo0,$tj
	addu	$j,8
	dmultu	$nj,$m1
	sltu	AT,$lo0,$tj
	daddu	$lo1,$lo0
	daddu	$hi0,AT
	sltu	s7,$lo1,$lo0
	ld	$tj,16($tp)
	daddu	$hi1,s7
	sltu	AT,$j,$num
	mflo	$nlo
	mfhi	$nhi
	sd	$lo1,($tp)
	bnez	AT,.Linner
	PTR_ADD	$tp,8
	.set	reorder

	daddu	$lo0,$alo,$hi0
	sltu	AT,$lo0,$hi0
	daddu	$hi0,$ahi,AT
	daddu	$lo0,$tj
	sltu	s7,$lo0,$tj
	daddu	$hi0,s7

	ld	$tj,16($tp)
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo1,$hi1
	daddu	$hi1,$nhi,AT
	daddu	$lo1,$lo0
	sltu	s7,$lo1,$lo0
	daddu	$hi1,s7
	sd	$lo1,($tp)

	daddu	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	daddu	$lo1,$tj
	sltu	AT,$lo1,$tj
	daddu	$hi1,AT
	sd	$lo1,8($tp)
	sd	$hi1,16($tp)

	addu	$i,8
	sltu	s7,$i,$num
	bnez	s7,.Louter

	.set	noreorder
	PTR_ADD	$tj,sp,$num	# &tp[num]
	move	$tp,sp
	move	$ap,sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	ld	$lo0,($tp)
	ld	$lo1,($np)
	PTR_ADD	$tp,8
	PTR_ADD	$np,8
	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	AT,$lo1,$lo0
	dsubu	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	sd	$lo0,($rp)
	or	$hi0,AT
	sltu	AT,$tp,$tj
	bnez	AT,.Lsub
	PTR_ADD	$rp,8

	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,sp
	PTR_SUB	$rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	ld	$aj,($ap)
	PTR_ADD	$ap,8
	PTR_ADD	$tp,8
	sd	zero,-8($tp)
	sltu	AT,$tp,$tj
	sd	$aj,($rp)
	bnez	AT,.Lcopy
	PTR_ADD	$rp,8

	ld	s0,0($fp)
	ld	s1,8($fp)
	ld	s2,16($fp)
	ld	s3,24($fp)
	ld	s4,32($fp)
	ld	s5,40($fp)
	ld	s6,48($fp)
	ld	s7,56($fp)
	li	v0,1
	jr	ra
	PTR_ADD	sp,$fp,64
	.set	reorder
END(bn_mul_mont)
.rdata
.asciiz	"Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___

# Emit the generated assembly on standard output.
print $code;
# Buffered write errors (full disk, broken pipe) only surface when the handle
# is closed, so a failed close has to abort the run instead of silently
# leaving a truncated assembly file behind for the assembler to pick up.
close STDOUT or die "error closing STDOUT: $!";