#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.

# Usage: the first command-line argument that looks like a file name
# ("name.ext") is taken as the output file; any arguments preceding it
# are skipped. The generated assembly is written to that file.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually supplied, and
# fail loudly if the file cannot be created instead of silently
# producing nothing. Without a name STDOUT is left untouched.
if (defined($output) && length($output)) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register allocation. $t0/$t3/$t1 deliberately alias the argument
# registers $ctx/$inp/$len: the arguments are saved on the stack at
# function entry (and $inp again in round 15), so r0-r2 double as
# scratch registers inside the rounds.
$ctx="r0";	$t0="r0";
$inp="r1";	$t3="r1";
$len="r2";	$t1="r2";
$T1="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# Rotate/shift amounts of the four SHA-256 sigma functions. The third
# element of @sigma0/@sigma1 is used as a logical shift (lsr), all the
# others as rotations (ror).
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for one SHA-256 round to $code.
#   $i      - round number; selects the X[] stack slot ($i%16) and is
#             substituted into the "#if" lines below, which are C
#             preprocessor conditionals resolved when the generated
#             file is assembled, not Perl conditionals.
#   $a..$h  - names of the registers holding the eight working
#             variables for this round.
# For $i<16 the message word is loaded from $inp as a big-endian
# 32-bit value (byte loads before ARMv7, ldr plus rev on little-endian
# ARMv7). For $i>=16 the word prepared by BODY_16_XX is completed by
# adding $t3. Round 15 saves $inp on the stack because $t3 shares its
# register, and rounds >=15 preload X[($i+2)%16] into $t3 for the
# following BODY_16_XX.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	ldr	$T1,[$inp],#4
#else
	ldrb	$T1,[$inp,#3]			@ $i
	ldrb	$t2,[$inp,#2]
	ldrb	$t1,[$inp,#1]
	ldrb	$t0,[$inp],#4
	orr	$T1,$T1,$t2,lsl#8
	orr	$T1,$T1,$t1,lsl#16
	orr	$T1,$T1,$t0,lsl#24
#endif
___
$code.=<<___;
	mov	$t0,$e,ror#$Sigma1[0]
	ldr	$t2,[$Ktbl],#4			@ *K256++
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$t1,$f,$g
#if $i>=16
	add	$T1,$T1,$t3			@ from BODY_16_xx
#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$T1,$T1
#endif
#if $i==15
	str	$inp,[sp,#17*4]			@ leave room for $t3
#endif
	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
	and	$t1,$t1,$e
	str	$T1,[sp,#`$i%16`*4]
	add	$T1,$T1,$t0
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	add	$T1,$T1,$h
	mov	$h,$a,ror#$Sigma0[0]
	add	$T1,$T1,$t1
	eor	$h,$h,$a,ror#$Sigma0[1]
	add	$T1,$T1,$t2
	eor	$h,$h,$a,ror#$Sigma0[2]	@ Sigma0(a)
#if $i>=15
	ldr	$t3,[sp,#`($i+2)%16`*4]	@ from BODY_16_xx
#endif
	orr	$t0,$a,$b
	and	$t1,$a,$b
	and	$t0,$t0,$c
	add	$h,$h,$T1
	orr	$t0,$t0,$t1			@ Maj(a,b,c)
	add	$d,$d,$T1
	add	$h,$h,$t0
___
}

# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the message-schedule step for round $i to $code and then the
# round itself via BODY_00_15 (same arguments). Expects X[($i+1)%16]
# to be in $t3 already (preloaded by the previous round) and leaves
# sigma1(X[i+14]) in $t3; the final addition of $t3 into $T1 is
# performed inside BODY_00_15.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t3,[sp,#`($i+1)%16`*4]		@ $i
	ldr	$t2,[sp,#`($i+14)%16`*4]
	mov	$t0,$t3,ror#$sigma0[0]
	ldr	$T1,[sp,#`($i+0)%16`*4]
	eor	$t0,$t0,$t3,ror#$sigma0[1]
	ldr	$t1,[sp,#`($i+9)%16`*4]
	eor	$t0,$t0,$t3,lsr#$sigma0[2]	@ sigma0(X[i+1])
	mov	$t3,$t2,ror#$sigma1[0]
	add	$T1,$T1,$t0
	eor	$t3,$t3,$t2,ror#$sigma1[1]
	add	$T1,$T1,$t1
	eor	$t3,$t3,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
	@ add	$T1,$T1,$t3
___
	&BODY_00_15(@_);
}

# Prologue: the K256 constant table followed immediately by the
# function entry. The entry code derives the table address from pc
# (function start minus the 256-byte table), saves ctx/inp/end-of-inp
# plus callee-saved registers, and reserves 16 words for X[].
$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256		@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
___
# Rounds 0..15 are emitted once. Rounds 16..31 are emitted once as
# well and executed repeatedly at run time: the loop below exits when
# the low byte of the last loaded K256 word is 0xf2, i.e. after the
# final table entry 0xc67178f2 has been consumed. @V is rotated after
# every round so each round sees the working variables under their
# new names without any register moves.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue: add the working variables back into the context, loop over
# the remaining input blocks, then tear down the frame and return.
$code.=<<___;
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2
	bne	.Lrounds_16_xx

	ldr	$T1,[sp,#16*4]		@ pull ctx
	ldr	$t0,[$T1,#0]
	ldr	$t1,[$T1,#4]
	ldr	$t2,[$T1,#8]
	add	$A,$A,$t0
	ldr	$t0,[$T1,#12]
	add	$B,$B,$t1
	ldr	$t1,[$T1,#16]
	add	$C,$C,$t2
	ldr	$t2,[$T1,#20]
	add	$D,$D,$t0
	ldr	$t0,[$T1,#24]
	add	$E,$E,$t1
	ldr	$t1,[$T1,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$T1,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size   sha256_block_data_order,.-sha256_block_data_order
.asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Evaluate the `...` arithmetic placeholders embedded in the template.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Closing enforces the flush; check it so that buffered write errors
# are reported rather than yielding a truncated output file.
close STDOUT or die "error closing STDOUT: $!";