#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.
# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small' but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in the "grand" loop body. Measured
#	performance is additionally affected by prologue and epilogue
#	overhead, i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is half the size, the instruction
#	set is not as rich as ARM's: e.g., there are only two 3-operand
#	arithmetic instructions, no rotate-by-constant, and addressing
#	modes are limited. As a result it takes more instructions to do
#	the same job in Thumb, so the code never shrinks to half the
#	size and always runs slower.
# [***]	This is also ~35% better than compiler-generated code. A dual-
#	issue Cortex-A8 core was measured to process an input block in
#	~990 cycles.

# August 2010.
#
# Rescheduling for the dual-issue pipeline resulted in a 13%
# improvement on the Cortex-A8 core, or ~870 cycles per input block
# [13.6 cycles per byte] in absolute terms.

# February 2011.
#
# Profiler-assisted and platform-specific optimization brought a
# further 10% improvement on the Cortex-A8 core, down to 12.2 cycles
# per byte.

# skip non-filename arguments; $output ends up as the first argument
# that looks like a file name
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";
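# (Typical invocation, judging by the argument scan above; the output
#  file name here is only an example:
#      perl sha1-armv4-large.pl sha1-armv4.S)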

$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);
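
# Register allocation: r0-r2 carry the three arguments per the AAPCS
# (context pointer, input pointer, block count), r3-r7 hold the working
# state A..E, r8 the round constant, r9-r12 serve as temporaries and
# r14 (lr) walks the X[] frame on the stack.  The prologue saves r4-r12
# and lr, so the callee-saved range stays intact for the caller.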

# Xupdate emits one schedule step, X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^
# X[i-16],1), stores it through the descending $Xi pointer and starts
# the shared round arithmetic E += K + ROR(A,27) + X[i]; the boolean
# function F is supplied by the caller via $opt1/$opt2.
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	ldr	$t3,[$Xi,#2*4]
	eor	$t0,$t0,$t1
	eor	$t2,$t2,$t3			@ 1 cycle stall
	eor	$t1,$c,$d			@ F_xx_xx
	mov	$t0,$t0,ror#31
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	eor	$t0,$t0,$t2,ror#31
	str	$t0,[$Xi,#-4]!
	$opt1					@ F_xx_xx
	$opt2					@ F_xx_xx
	add	$e,$e,$t0			@ E+=X[i]
___
}
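
# For reference, a minimal Perl model of the same schedule update over
# a 16-word ring (illustration only, not used by the generator; the
# assembly above uses a flat 80-word frame instead of a ring):
sub Xupdate_ref {
my ($X,$i)=@_;		# $X: ref to 16-element array of 32-bit words
my $x=$X->[($i-3)&15]^$X->[($i-8)&15]^$X->[($i-14)&15]^$X->[($i-16)&15];
	$X->[$i&15]=(($x<<1)|($x>>31))&0xffffffff;	# ROL(x,1)
}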

# Rounds 0..15 fetch X[i] straight from the input.  Pre-ARMv7 (or any
# core without unaligned word loads) assembles each big-endian word
# from four byte loads; ARMv7 uses a word load, byte-swapped on
# little-endian, which also handles unaligned input.
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
	ldrb	$t1,[$inp,#2]
	ldrb	$t0,[$inp,#3]
	ldrb	$t2,[$inp,#1]
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	ldrb	$t3,[$inp],#4
	orr	$t0,$t0,$t1,lsl#8
	eor	$t1,$c,$d			@ F_xx_xx
	orr	$t0,$t0,$t2,lsl#16
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	orr	$t0,$t0,$t3,lsl#24
#else
	ldr	$t0,[$inp],#4			@ handles unaligned
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	eor	$t1,$c,$d			@ F_xx_xx
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	$t0,$t0				@ byte swap
#endif
#endif
	and	$t1,$b,$t1,ror#2
	add	$e,$e,$t0			@ E+=X[i]
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	str	$t0,[$Xi,#-4]!
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}
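
# The byte-wise path above builds the word as b0<<24|b1<<16|b2<<8|b3,
# i.e. big-endian as SHA-1 requires.  A one-line Perl model of that
# load (illustration only; $b0..$b3 are hypothetical byte values, $b0
# being the byte at the lowest address):
sub be32_ref { my ($b0,$b1,$b2,$b3)=@_; ($b0<<24)|($b1<<16)|($b2<<8)|$b3 }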

# Rounds 16..19: schedule update plus the same F_00_19 as above.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

# Rounds 20..39 and 60..79: F is a plain B^C^D, one EOR on top of the
# C^D that Xupdate already computed.
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
___
}

# Rounds 40..59: the majority function, computed as (B&(C^D)) + (C&D);
# the two terms never share a set bit, so the ADD acts as an OR.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
	add	$e,$e,$t2,ror#2
___
}
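
# For reference, the standard SHA-1 round functions in the equivalent
# forms generated above (a Perl sketch, illustration only, not used by
# the generator):
sub F_ref {
my ($t,$b,$c,$d)=@_;				# $t: round index 0..79
	return (($c^$d)&$b)^$d if ($t<20);	# == (B&C)|(~B&D)
	return ($b&($c^$d))+($c&$d) if ($t>=40 && $t<60);	# == MAJ(B,C,D)
	return $b^$c^$d;			# rounds 20..39 and 60..79
}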

$code=<<___;
#include "arm_arch.h"

.text

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
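	@ From here on the registers playing C, D and E hold values that
	@ are rotated left by 2 relative to their true values; consumers
	@ apply "ror#2" for free via the barrel shifter, which folds the
	@ per-round B=ROL(B,30) rotation into the operands.  The epilogue
	@ compensates when adding back into the context.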
.L_00_15:
___
for($i=0;$i<5;$i++) {
	&BODY_00_15(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
___
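# The 80-word X[] frame is carved out piecemeal (15+25+20+20 words), so
# that "teq $Xi,sp" doubles as each loop's termination test while $Xi
# descends with pre-decrement stores; .L_done then releases all 80
# words at once.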
	&BODY_00_15(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
$code.=<<___;

	ldr	$K,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
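	@ CMN sp,#0 computes sp+0, which cannot carry out, so C=0 here;
	@ CMP sp,#0 before the 60..79 pass computes sp-0, which cannot
	@ borrow, so C=1 there.  TEQ below leaves C untouched, letting
	@ rounds 20..39 and 60..79 share this loop body, with "bcs"
	@ telling the two passes apart.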
.L_20_39_or_60_79:
___
for($i=0;$i<5;$i++) {
	&BODY_20_39(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
	&BODY_40_59(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
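# (0xe12fff1e is the machine encoding of "bx lr"; emitting it as a raw
# .word lets the file assemble with -march=armv4, where the assembler
# rejects the mnemonic, while the branch is only ever taken on cores
# that do implement BX.)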
print $code;
close STDOUT; # enforce flush