#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.

# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small' but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is twice smaller, they are not as
#	diverse as ARM ones: e.g., there are only two arithmetic
#	instructions with 3 arguments, no [fixed] rotate, addressing
#	modes are limited. As result it takes more instructions to do
#	the same job in Thumb, therefore the code is never twice as
#	small and always slower.
# [***]	which is also ~35% better than compiler generated code. Dual-
#	issue Cortex A8 core was measured to process input block in
#	~990 cycles.

# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
use strict;
use warnings;

# Command line: optional flavour/flag arguments followed by an optional
# output file name.  Skip arguments until one looks like a file name
# ("word.ext"); with no file name the generated code goes to stdout.
my $output;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
if (defined $output) {
	# 3-arg open with an error check; the original 2-arg unchecked
	# open failed silently and was open to mode injection.
	open STDOUT, ">", $output or die "can't open $output: $!";
}

# Register allocation for the generated ARMv4 code.
my $ctx="r0";	# SHA_CTX pointer (1st function argument)
my $inp="r1";	# input pointer   (2nd function argument)
my $len="r2";	# block count     (3rd function argument)
my $a="r3";	# working variables A..E of the SHA-1 state
my $b="r4";
my $c="r5";
my $d="r6";
my $e="r7";
my $K="r8";	# current round constant K_xx_xx
my $t0="r9";	# scratch
my $t1="r10";
my $t2="r11";
my $t3="r12";
my $Xi="r14";	# pointer walking the X[] schedule kept on the stack
my @V=($a,$b,$c,$d,$e);	# rotated after every round

my $code="";	# accumulates the emitted assembly text

# Emit the message-schedule update for rounds 16..79,
#   X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16],1)  (stored at [$Xi,#-4]!),
# interleaved with the common part of the round computation.  $opt1 and
# $opt2 carry the round-specific F() instructions and may be omitted.
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
# Interpolating undef would be "" anyway; default explicitly so the
# generated text is unchanged and no warnings are raised.
$opt1="" unless defined $opt1;
$opt2="" unless defined $opt2;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	ldr	$t3,[$Xi,#2*4]
	eor	$t0,$t0,$t1
	eor	$t2,$t2,$t3			@ 1 cycle stall
	eor	$t1,$c,$d			@ F_xx_xx
	mov	$t0,$t0,ror#31
	add	$e,$e,$a,ror#27		@ E+=ROR(A,27)
	eor	$t0,$t0,$t2,ror#31
	str	$t0,[$Xi,#-4]!
	$opt1					@ F_xx_xx
	$opt2					@ F_xx_xx
	add	$e,$e,$t0			@ E+=X[i]
___
}

# Rounds 0..15: load big-endian X[i] from $inp (byte-by-byte on
# pre-ARMv7 cores, which cannot be assumed to handle unaligned words),
# store it into the schedule and do a round with
# F_00_19(B,C,D) = D ^ (B & (C ^ D)).
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
	ldrb	$t1,[$inp,#2]
	ldrb	$t0,[$inp,#3]
	ldrb	$t2,[$inp,#1]
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	ldrb	$t3,[$inp],#4
	orr	$t0,$t0,$t1,lsl#8
	eor	$t1,$c,$d			@ F_xx_xx
	orr	$t0,$t0,$t2,lsl#16
	add	$e,$e,$a,ror#27		@ E+=ROR(A,27)
	orr	$t0,$t0,$t3,lsl#24
#else
	ldr	$t0,[$inp],#4			@ handles unaligned
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	eor	$t1,$c,$d			@ F_xx_xx
	add	$e,$e,$a,ror#27		@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	$t0,$t0			@ byte swap
#endif
#endif
	and	$t1,$b,$t1,ror#2
	add	$e,$e,$t0			@ E+=X[i]
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	str	$t0,[$Xi,#-4]!
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

# Rounds 16..19: same F_00_19 as above, but X[i] comes from Xupdate.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
	Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

# Rounds 20..39 and 60..79: F_20_39(B,C,D) = B ^ C ^ D.
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
	Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
___
}

# Rounds 40..59: F_40_59(B,C,D) = (B&C)|(B&D)|(C&D), computed here as
# (B&(C^D)) + (C&D), accumulated into E in two adds.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
	Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
	add	$e,$e,$t2,ror#2
___
}

# Function prologue: the X[] schedule lives on the stack, $Xi walks it
# downwards, and B..E are kept pre-rotated by 2 between rounds.
$code=<<___;
#include "arm_arch.h"

.text

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
# Rounds 0..14 as a folded 5x-unrolled loop ...
for (1..5) {
	BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
___
# ... round 15 peeled off, then rounds 16..19.
	BODY_00_15(@V); unshift(@V,pop(@V));
	BODY_16_19(@V); unshift(@V,pop(@V));
	BODY_16_19(@V); unshift(@V,pop(@V));
	BODY_16_19(@V); unshift(@V,pop(@V));
	BODY_16_19(@V); unshift(@V,pop(@V));
$code.=<<___;

	ldr	$K,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
# Rounds 20..39 (and, re-entered with carry set, 60..79).
for (1..5) {
	BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
# Rounds 40..59.
for (1..5) {
	BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Check close so buffered write errors are not silently dropped, and to
# enforce a flush before the assembler consumes the file.
close STDOUT or die "error closing output: $!";