#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...

use strict;
use warnings;

# Consume command-line arguments until one looks like an output file name
# ("name.ext"). When such an argument is found, STDOUT is redirected to it;
# otherwise the generated assembly goes to the inherited STDOUT.
my $output;
while (($output = shift @ARGV) && ($output !~ /^\w[\w\-]*\.\w+$/)) {}
if (defined $output && $output ne '') {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Register-name helpers. They are invoked from `...` spans embedded in the
# assembly templates below, which are evaluated just before printing.

# Dlo("qN") -> "d(2N)"; returns "" when the argument has no qN register.
sub Dlo {
    my ($qreg) = @_;
    return $qreg =~ m|q([1]?[0-9])| ? "d" . ($1 * 2) : "";
}

# Dhi("qN") -> "d(2N+1)"; returns "" when the argument has no qN register.
sub Dhi {
    my ($qreg) = @_;
    return $qreg =~ m|q([1]?[0-9])| ? "d" . ($1 * 2 + 1) : "";
}

# Q("dN") -> "q(N/2)" for an even-numbered dN; returns "" when nothing matches.
sub Q {
    my ($dreg) = @_;
    return $dreg =~ m|d([1-3]?[02468])| ? "q" . ($1 / 2) : "";
}

################
# mul_1x1_neon: operands are passed in d16 and d17, the product is returned
# in d0 (see the call sites in bn_GF2m_mul_2x2 below).
#
my $code = <<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon

.type	mul_1x1_neon,%function
.align	5
mul_1x1_neon:
	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided \$a
	vmull.p8	`&Q("d0")`,d16,d17	@ a·bb
	vshl.u64	`&Dlo("q2")`,d16,#16
	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8·bb
	vshl.u64	`&Dlo("q3")`,d16,#24
	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16·bb
	vshr.u64	`&Dlo("q1")`,#8
	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24·bb
	vshl.u64	`&Dhi("q1")`,#24
	veor	d0,`&Dlo("q1")`
	vshr.u64	`&Dlo("q2")`,#16
	veor	d0,`&Dhi("q1")`
	vshl.u64	`&Dhi("q2")`,#16
	veor	d0,`&Dlo("q2")`
	vshr.u64	`&Dlo("q3")`,#24
	veor	d0,`&Dhi("q2")`
	vshl.u64	`&Dhi("q3")`,#8
	veor	d0,`&Dlo("q3")`
	veor	d0,`&Dhi("q3")`
	bx	lr
.size	mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
# private interface to mul_1x1_ialu
#
# Inputs are $a (r1) and $b (r0); $mask (r12) must hold 7<<2 and sp must
# point at 32 bytes of scratch for tab[8], both set up by the caller.
# The 64-bit product is returned in $hi:$lo.
#
# $a and $b are Perl's package globals (exempt from "strict vars"); they
# are kept because the templates refer to them by those names.
$a = "r1";
$b = "r0";

# r4-r9 carry two sets of names: $a0..$a14 while the lookup table is being
# built, and $hi/$lo/$t0/$t1/$i0/$i1 while the product is accumulated.
my ($hi, $lo, $t0, $t1, $i0, $i1) = map { "r$_" } 4 .. 9;
my ($a0, $a1, $a2, $a12, $a4, $a14) = ($hi, $lo, $t0, $t1, $i0, $i1);

my $mask = "r12";

$code .= <<___;
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	$a0,#0
	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
	str	$a0,[sp,#0]		@ tab[0]=0
	add	$a2,$a1,$a1		@ a2=a1<<1
	str	$a1,[sp,#4]		@ tab[1]=a1
	eor	$a12,$a1,$a2		@ a1^a2
	str	$a2,[sp,#8]		@ tab[2]=a2
	mov	$a4,$a1,lsl#2		@ a4=a1<<2
	str	$a12,[sp,#12]		@ tab[3]=a1^a2
	eor	$a14,$a1,$a4		@ a1^a4
	str	$a4,[sp,#16]		@ tab[4]=a4
	eor	$a0,$a2,$a4		@ a2^a4
	str	$a14,[sp,#20]		@ tab[5]=a1^a4
	eor	$a12,$a12,$a4		@ a1^a2^a4
	str	$a0,[sp,#24]		@ tab[6]=a2^a4
	and	$i0,$mask,$b,lsl#2
	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4

	and	$i1,$mask,$b,lsr#1
	ldr	$lo,[sp,$i0]		@ tab[b & 0x7]
	and	$i0,$mask,$b,lsr#4
	ldr	$t1,[sp,$i1]		@ tab[b >> 3 & 0x7]
	and	$i1,$mask,$b,lsr#7
	ldr	$t0,[sp,$i0]		@ tab[b >> 6 & 0x7]
	eor	$lo,$lo,$t1,lsl#3	@ stall
	mov	$hi,$t1,lsr#29
	ldr	$t1,[sp,$i1]		@ tab[b >> 9 & 0x7]

	and	$i0,$mask,$b,lsr#10
	eor	$lo,$lo,$t0,lsl#6
	eor	$hi,$hi,$t0,lsr#26
	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]

	and	$i1,$mask,$b,lsr#13
	eor	$lo,$lo,$t1,lsl#9
	eor	$hi,$hi,$t1,lsr#23
	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]

	and	$i0,$mask,$b,lsr#16
	eor	$lo,$lo,$t0,lsl#12
	eor	$hi,$hi,$t0,lsr#20
	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]

	and	$i1,$mask,$b,lsr#19
	eor	$lo,$lo,$t1,lsl#15
	eor	$hi,$hi,$t1,lsr#17
	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]

	and	$i0,$mask,$b,lsr#22
	eor	$lo,$lo,$t0,lsl#18
	eor	$hi,$hi,$t0,lsr#14
	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]

	and	$i1,$mask,$b,lsr#25
	eor	$lo,$lo,$t1,lsl#21
	eor	$hi,$hi,$t1,lsr#11
	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]

	tst	$a,#1<<30
	and	$i0,$mask,$b,lsr#28
	eor	$lo,$lo,$t0,lsl#24
	eor	$hi,$hi,$t0,lsr#8
	ldr	$t0,[sp,$i0]		@ tab[b >> 30 ]

	eorne	$lo,$lo,$b,lsl#30
	eorne	$hi,$hi,$b,lsr#2
	tst	$a,#1<<31
	eor	$lo,$lo,$t1,lsl#27
	eor	$hi,$hi,$t1,lsr#5
	eorne	$lo,$lo,$b,lsl#31
	eorne	$hi,$hi,$b,lsr#1
	eor	$lo,$lo,$t0,lsl#30
	eor	$hi,$hi,$t0,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void	bn_GF2m_mul_2x2(BN_ULONG *r,
#	BN_ULONG a1,BN_ULONG a0,
#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
#
# On ARMv7 builds the entry point tests bit 0 of OPENSSL_armcap_P and takes
# the NEON path when it is set, falling through to .Lialu otherwise.

my ($A1, $B1, $A0, $B0, $A1B1, $A0B0) = map { "d$_" } 18 .. 23;

$code .= <<___;
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	beq	.Lialu

	veor	$A1,$A1
	vmov.32	$B1,r3,r3		@ two copies of b1
	vmov.32	${A1}[0],r1		@ a1

	veor	$A0,$A0
	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
	vmov.32	${A0}[0],r2		@ a0
	mov	r12,lr

	vmov	d16,$A1
	vmov	d17,$B1
	bl	mul_1x1_neon		@ a1·b1
	vmov	$A1B1,d0

	vmov	d16,$A0
	vmov	d17,$B0
	bl	mul_1x1_neon		@ a0·b0
	vmov	$A0B0,d0

	veor	d16,$A0,$A1
	veor	d17,$B0,$B1
	veor	$A0,$A0B0,$A1B1
	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)

	veor	d0,$A0			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1
	vshl.u64	d1,d0,#32
	vshr.u64	d0,d0,#32
	veor	$A0B0,d1
	veor	$A1B1,d0
	vst1.32	{${A0B0}[0]},[r0,:32]!
	vst1.32	{${A0B0}[1]},[r0,:32]!
	vst1.32	{${A1B1}[0]},[r0,:32]!
	vst1.32	{${A1B1}[1]},[r0,:32]
	bx	r12
.align	4
.Lialu:
#endif
___
my $ret = "r10";	# reassigned 1st argument
$code .= <<___;
	stmdb	sp!,{r4-r10,lr}
	mov	$ret,r0			@ reassign 1st argument
	mov	$b,r3			@ $b=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	$mask,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1·b1
	str	$lo,[$ret,#8]
	str	$hi,[$ret,#12]

	eor	$b,$b,r3		@ flip b0 and b1
	eor	$a,$a,r2		@ flip a0 and a1
	eor	r3,r3,$b
	eor	r2,r2,$a
	eor	$b,$b,r3
	eor	$a,$a,r2
	bl	mul_1x1_ialu		@ a0·b0
	str	$lo,[$ret]
	str	$hi,[$ret,#4]

	eor	$a,$a,r2
	eor	$b,$b,r3
	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
___
# r6-r9 receive the four result words stored so far; they are then folded
# into the middle product held in $hi:$lo.
my @r = map { "r$_" } 6 .. 9;
$code .= <<___;
	ldmia	$ret,{$r[0]-$r[3]}
	eor	$lo,$lo,$hi
	eor	$hi,$hi,$r[1]
	eor	$lo,$lo,$r[0]
	eor	$hi,$hi,$r[2]
	eor	$lo,$lo,$r[3]
	eor	$hi,$hi,$r[3]
	str	$hi,[$ret,#8]
	eor	$lo,$lo,$hi
	add	sp,sp,#32		@ destroy tab[8]
	str	$lo,[$ret,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	5

.comm	OPENSSL_armcap_P,4,4
___

# Expand every `...` span in the templates by evaluating it as Perl (these
# are the Dlo/Dhi/Q register-name helpers). A failing expression aborts the
# run instead of silently emitting an empty operand.
$code =~ s{`([^`]*)`}{
    my $expr  = $1;
    my $value = eval $expr;
    die "failed to evaluate `$expr`: $@" if $@;
    $value;
}gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4

# Write errors may only surface when the buffer is flushed, so both the
# print and the close are checked; a truncated output file must not pass
# for a successful run.
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";	# enforce flush