#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... gcc 4.3 appeared to generate poor code, therefore
# the effort. And indeed, the module delivers 55%-90%(*) improvement
# on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
# This is for 64-bit build. In 32-bit "highgprs" case improvement is
# even higher, for example on z990 it was measured 80%-150%. ECDSA
# sign is modest 9%-12% faster. Keep in mind that these coefficients
# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
# burnt in it...
#
# (*)	gcc 4.1 was observed to deliver better results than gcc 4.3,
#	so that improvement coefficients can vary from one specific
#	setup to another.
#
# Usage: perl s390x-gf2m.pl <flavour> [ignored args ...] [<output-file>]
#
#   <flavour>      a value matching /3[12]/ selects the 32-bit layout
#                  (4-byte size_t, "st"/"stm"/"lm" instructions); anything
#                  else selects the 64-bit layout ("stg"/"stmg"/"lmg").
#   <output-file>  first remaining argument that looks like "name.ext";
#                  when none is given the assembly goes to STDOUT.

use strict;
use warnings;

my $flavour = shift;
$flavour = "" unless defined $flavour;

# $g is the mnemonic suffix that turns st/stm/lm into their 64-bit forms.
my ($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");

# Skip arguments until one looks like a file name (or we run out).
my $output;
while (($output = shift) && ($output !~ /^\w[\w\-]*\.\w+$/)) {}

# Only redirect when a file was actually named, and never ignore a failure:
# silently losing the generated assembly breaks the build much later.
if (defined $output && length $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Offset of the scratch area above the caller-visible frame; the emitted
# code keeps its 16-entry lookup table at $stdframe($sp).
# NOTE(review): presumably the standard s390x ABI frame size - confirm.
my $stdframe = 16*$SIZE_T + 4*8;

# Incoming arguments of bn_GF2m_mul_2x2 as named by the original script:
# rp=%r2, a1=%r3, a0=%r4, b1=%r5, b0=%r6.  Only $rp is referenced by name
# below; the others are reached through $a/$b or reloaded from the
# register save area.
my $rp = "%r2";

my $ra = "%r14";                    # return address
my $sp = "%r15";                    # stack pointer

my @T = ("%r0",  "%r1");            # temporaries, roles swap every round
my @i = ("%r12", "%r13");           # table indices, roles swap every round

# Registers used inside _mul_1x1.  $a/$b are Perl's package globals (exempt
# from strict); $a aliases $lo and $mask aliases $a8, exactly as the emitted
# code expects.
my ($a1, $a2, $a4, $a8, $a12, $a48) = map("%r$_", (6..11));
my ($lo, $hi);
($lo, $hi, $b) = map("%r$_", (3..5));
$a = $lo;
my $mask = $a8;

my $code = "";

# _mul_1x1: 64x64-bit carry-less multiply of $a by $b, result in $hi:$lo.
# Builds a 16-entry table of multiples of $a (top three bits of $a masked
# off and handled separately via the "broadcast" terms) at $stdframe($sp),
# then consumes $b four bits at a time.  The index is pre-scaled by 8
# (mask 0xf<<3) so it can be used directly as a byte offset.
$code.=<<___;
.text

.type	_mul_1x1,\@function
.align	16
_mul_1x1:
	lgr	$a1,$a
	sllg	$a2,$a,1
	sllg	$a4,$a,2
	sllg	$a8,$a,3

	srag	$lo,$a1,63			# broadcast 63rd bit
	nihh	$a1,0x1fff
	srag	$i[0],$a2,63			# broadcast 62nd bit
	nihh	$a2,0x3fff
	srag	$i[1],$a4,63			# broadcast 61st bit
	nihh	$a4,0x7fff
	ngr	$lo,$b
	ngr	$i[0],$b
	ngr	$i[1],$b

	lghi	$T[0],0
	lgr	$a12,$a1
	stg	$T[0],`$stdframe+0*8`($sp)	# tab[0]=0
	xgr	$a12,$a2
	stg	$a1,`$stdframe+1*8`($sp)	# tab[1]=a1
	lgr	$a48,$a4
	stg	$a2,`$stdframe+2*8`($sp)	# tab[2]=a2
	xgr	$a48,$a8
	stg	$a12,`$stdframe+3*8`($sp)	# tab[3]=a1^a2
	xgr	$a1,$a4

	stg	$a4,`$stdframe+4*8`($sp)	# tab[4]=a4
	xgr	$a2,$a4
	stg	$a1,`$stdframe+5*8`($sp)	# tab[5]=a1^a4
	xgr	$a12,$a4
	stg	$a2,`$stdframe+6*8`($sp)	# tab[6]=a2^a4
	xgr	$a1,$a48
	stg	$a12,`$stdframe+7*8`($sp)	# tab[7]=a1^a2^a4
	xgr	$a2,$a48

	stg	$a8,`$stdframe+8*8`($sp)	# tab[8]=a8
	xgr	$a12,$a48
	stg	$a1,`$stdframe+9*8`($sp)	# tab[9]=a1^a8
	xgr	$a1,$a4
	stg	$a2,`$stdframe+10*8`($sp)	# tab[10]=a2^a8
	xgr	$a2,$a4
	stg	$a12,`$stdframe+11*8`($sp)	# tab[11]=a1^a2^a8

	xgr	$a12,$a4
	stg	$a48,`$stdframe+12*8`($sp)	# tab[12]=a4^a8
	srlg	$hi,$lo,1
	stg	$a1,`$stdframe+13*8`($sp)	# tab[13]=a1^a4^a8
	sllg	$lo,$lo,63
	stg	$a2,`$stdframe+14*8`($sp)	# tab[14]=a2^a4^a8
	srlg	$T[0],$i[0],2
	stg	$a12,`$stdframe+15*8`($sp)	# tab[15]=a1^a2^a4^a8

	lghi	$mask,`0xf<<3`
	sllg	$a1,$i[0],62
	sllg	$i[0],$b,3
	srlg	$T[1],$i[1],3
	ngr	$i[0],$mask
	sllg	$a2,$i[1],61
	srlg	$i[1],$b,4-3
	xgr	$hi,$T[0]
	ngr	$i[1],$mask
	xgr	$lo,$a1
	xgr	$hi,$T[1]
	xgr	$lo,$a2

	xg	$lo,$stdframe($i[0],$sp)
	srlg	$i[0],$b,8-3
	ngr	$i[0],$mask
___

# Nibbles 1..13 of $b: each round looks up the current nibble while already
# extracting the index for nibble $n+2, then folds the table entry, shifted
# by 4*$n bits, into $hi:$lo.  $n is declared outside the loop on purpose:
# its final value (14) is used by the tail below.
my $n;
for ($n = 1; $n < 14; $n++) {
$code.=<<___;
	lg	$T[1],$stdframe($i[1],$sp)
	srlg	$i[1],$b,`($n+2)*4`-3
	sllg	$T[0],$T[1],`$n*4`
	ngr	$i[1],$mask
	srlg	$T[1],$T[1],`64-$n*4`
	xgr	$lo,$T[0]
	xgr	$hi,$T[1]
___
    # Swap the two index registers and the two temporaries for the next round.
    push(@i, shift(@i));
    push(@T, shift(@T));
}

# Tail: nibbles 14 and 15 (no further index extraction needed), followed by
# the prologue of the public entry point, which saves %r3-%r15, allocates
# $stdframe+128 bytes (128 = 16 table entries * 8) and stores the back chain.
$code.=<<___;
	lg	$T[1],$stdframe($i[1],$sp)
	sllg	$T[0],$T[1],`$n*4`
	srlg	$T[1],$T[1],`64-$n*4`
	xgr	$lo,$T[0]
	xgr	$hi,$T[1]

	lg	$T[0],$stdframe($i[0],$sp)
	sllg	$T[1],$T[0],`($n+1)*4`
	srlg	$T[0],$T[0],`64-($n+1)*4`
	xgr	$lo,$T[1]
	xgr	$hi,$T[0]

	br	$ra
.size	_mul_1x1,.-_mul_1x1

.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@function
.align	16
bn_GF2m_mul_2x2:
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	%r1,-$stdframe-128
	la	%r0,0($sp)
	la	$sp,0(%r1,$sp)			# alloca
	st${g}	%r0,0($sp)			# back chain
___

if ($SIZE_T == 8) {
    # 64-bit build: three 1x1 products combined Karatsuba-style.  The
    # operands for the second and third product are reloaded from the
    # register save area written by the prologue.
    my @r = map("%r$_", (6..9));
$code.=<<___;
	bras	$ra,_mul_1x1			# a1*b1
	stmg	$lo,$hi,16($rp)

	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
	bras	$ra,_mul_1x1			# a0*b0
	stmg	$lo,$hi,0($rp)

	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
	bras	$ra,_mul_1x1			# (a0+a1)*(b0+b1)
	lmg	$r[0],$r[3],0($rp)

	xgr	$lo,$hi
	xgr	$hi,$r[1]
	xgr	$lo,$r[0]
	xgr	$hi,$r[2]
	xgr	$lo,$r[3]
	xgr	$hi,$r[3]
	xgr	$lo,$hi
	stg	$hi,16($rp)
	stg	$lo,8($rp)
___
}
else {
    # 32-bit build: the two 32-bit halves of each operand are packed into
    # one 64-bit register, so a single 1x1 product suffices; each result
    # word is rotated by 32 bits before being stored.
$code.=<<___;
	sllg	%r3,%r3,32
	sllg	%r5,%r5,32
	or	%r3,%r4
	or	%r5,%r6
	bras	$ra,_mul_1x1
	rllg	$lo,$lo,32
	rllg	$hi,$hi,32
	stmg	$lo,$hi,0($rp)
___
}

# Common epilogue: restore %r6-%r15 (including the stack pointer) and return.
$code.=<<___;
	lm${g}	%r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
	br	$ra
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string	"GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Fold every `...` arithmetic expression into its numeric value.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;

# Buffered write errors (disk full, etc.) only surface at close time.
close STDOUT or die "error closing STDOUT: $!";