#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# Consume command-line arguments until one looks like a plain output file
# name ("name.ext", no directory part); everything in front of it is
# discarded.  $output ends up false when no such argument exists.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the requested file.  When no file name was given the
# inherited STDOUT is left alone, so the generated code is still emitted
# instead of being lost to a failed open.
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

# Symbolic names for the ARM registers used by the generated code.
# r2 is shared by three names: it enters as the bp pointer, is then
# overwritten with the current word bp[i], and finally receives rp for
# the write-back phase.  The bp pointer itself lives in the argument
# block on the stack (see $_bp) whenever r2 holds something else.
$num="r0";	# loaded with the num argument, later holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
#
# Counting in words upwards from &tp[num-1]: tp[num-1] and tp[num] take
# slots 0-1, the ten saved registers r4-r12,lr take slots 2-11, the
# pushed {r0,r2} pair (rp, bp) takes slots 12-13, and the caller's stack
# arguments (&n0, num) follow in slots 14-15.
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
# The num slot is recycled to hold the end-of-bp sentinel once num itself
# has been consumed, hence the alias.
$_num="$num,#15*4";	$_bpend=$_num;

# Assembly text for bn_mul_mont.  Judging by how the code uses its
# inputs, the C-level prototype is presumably
#
#	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#			const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# (TODO confirm against the C header).  What the code itself shows:
#
# In:	r0 = rp, r1 = ap, r2 = bp, r3 = np; the fifth and sixth arguments
#	(pointer to n0, num) are read from the caller's stack.  All arrays
#	are accessed as num 32-bit words.
# Out:	r0 = 0 if num < 2 (nothing is computed), otherwise r0 = 1 and
#	rp[0..num-1] holds the result after the final conditional
#	subtraction of np.
# Stack: 2 words for {r0,r2}, 10 saved registers and num+1 words of
#	scratch (tp[]); everything is released before returning and tp[]
#	is overwritten on the way out.
# Regs:	r4-r12 and lr are saved and restored.
#
# The final copy reads both tp[] and rp[] on every iteration and picks
# the value with a conditional move, so which memory is touched does not
# depend on whether the subtraction borrowed.
$code=<<___;
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	$num,[sp,#3*4]		@ load num
	cmp	$num,#2
	movlt	r0,#0			@ num<2 is not handled: return 0
	addlt	sp,sp,#2*4		@ drop {r0,r2} again
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num-1], outer loop sentinel

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

	@ first pass: tp = (ap*bp[0] + np*m) >> 32, nothing to add in yet
.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	@ remaining passes: tp = (tp + ap*bp[i] + np*m) >> 32
.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num-1]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj			@ was that bp[num-1]?
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ set carry, i.e. "no borrow"
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	@ Carry clear at this point means the subtraction borrowed, so tp
	@ is the result; carry set means the difference already stored in
	@ rp is.  Nothing below may touch the carry flag until the loop
	@ is done.
.Lcopy:	ldr	$tj,[$tp]		@ tp[j]
	ldr	$aj,[$rp]		@ rp[j], i.e. tp[j]-np[j]
	str	sp,[$tp],#4		@ zap tp
	movcc	$aj,$tj			@ borrowed: take tp[j] instead
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	add	sp,$num,#4		@ skip over all num+1 words of tp
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Replace the "bx lr" mnemonic with its instruction encoding so that the
# output is accepted by an assembler told to target plain ARMv4.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Buffered write errors (e.g. a full disk) only surface on close; fail
# loudly rather than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";