1238384Sjkim#!/usr/bin/env perl 2238384Sjkim 3238384Sjkim# ==================================================================== 4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and 6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further 7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/. 8238384Sjkim# ==================================================================== 9238384Sjkim 10238384Sjkim# SHA1 block procedure for s390x. 11238384Sjkim 12238384Sjkim# April 2007. 13238384Sjkim# 14238384Sjkim# Performance is >30% better than gcc 3.3 generated code. But the real 15238384Sjkim# twist is that SHA1 hardware support is detected and utilized. In 16238384Sjkim# which case performance can reach further >4.5x for larger chunks. 17238384Sjkim 18238384Sjkim# January 2009. 19238384Sjkim# 20238384Sjkim# Optimize Xupdate for amount of memory references and reschedule 21238384Sjkim# instructions to favour dual-issue z10 pipeline. On z10 hardware is 22238384Sjkim# "only" ~2.3x faster than software. 23238384Sjkim 24238384Sjkim# November 2010. 25238384Sjkim# 26238384Sjkim# Adapt for -m31 build. If kernel supports what's called "highgprs" 27238384Sjkim# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit 28238384Sjkim# instructions and achieve "64-bit" performance even in 31-bit legacy 29238384Sjkim# application context. The feature is not specific to any particular 30238384Sjkim# processor, as long as it's "z-CPU". Latter implies that the code 31238384Sjkim# remains z/Architecture specific. 32238384Sjkim 33238384Sjkim$kimdfunc=1; # magic function code for kimd instruction 34238384Sjkim 35238384Sjkim$flavour = shift; 36238384Sjkim 37238384Sjkimif ($flavour =~ /3[12]/) { 38238384Sjkim $SIZE_T=4; 39238384Sjkim $g=""; 40238384Sjkim} else { 41238384Sjkim $SIZE_T=8; 42238384Sjkim $g="g"; 43238384Sjkim} 44238384Sjkim 45238384Sjkimwhile (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 46238384Sjkimopen STDOUT,">$output"; 47238384Sjkim 48238384Sjkim$K_00_39="%r0"; $K=$K_00_39; 49238384Sjkim$K_40_79="%r1"; 50238384Sjkim$ctx="%r2"; $prefetch="%r2"; 51238384Sjkim$inp="%r3"; 52238384Sjkim$len="%r4"; 53238384Sjkim 54238384Sjkim$A="%r5"; 55238384Sjkim$B="%r6"; 56238384Sjkim$C="%r7"; 57238384Sjkim$D="%r8"; 58238384Sjkim$E="%r9"; @V=($A,$B,$C,$D,$E); 59238384Sjkim$t0="%r10"; 60238384Sjkim$t1="%r11"; 61238384Sjkim@X=("%r12","%r13","%r14"); 62238384Sjkim$sp="%r15"; 63238384Sjkim 64238384Sjkim$stdframe=16*$SIZE_T+4*8; 65238384Sjkim$frame=$stdframe+16*4; 66238384Sjkim 67238384Sjkimsub Xupdate { 68238384Sjkimmy $i=shift; 69238384Sjkim 70238384Sjkim$code.=<<___ if ($i==15); 71238384Sjkim lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up 72238384Sjkim lr $X[0],$X[2] 73238384Sjkim___ 74238384Sjkimreturn if ($i&1); # Xupdate is vectorized and executed every 2nd cycle 75238384Sjkim$code.=<<___ if ($i<16); 76238384Sjkim lg $X[0],`$i*4`($inp) ### Xload($i) 77238384Sjkim rllg $X[1],$X[0],32 78238384Sjkim___ 79238384Sjkim$code.=<<___ if ($i>=16); 80238384Sjkim xgr $X[0],$prefetch ### Xupdate($i) 81238384Sjkim lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) 82238384Sjkim xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) 83238384Sjkim xgr $X[0],$prefetch 84238384Sjkim rll $X[0],$X[0],1 85238384Sjkim rllg $X[1],$X[0],32 86238384Sjkim rll $X[1],$X[1],1 87238384Sjkim rllg $X[0],$X[1],32 88238384Sjkim lr $X[2],$X[1] # feedback 89238384Sjkim___ 90238384Sjkim$code.=<<___ if ($i<=70); 91238384Sjkim stg $X[0],`$stdframe+4*($i%16)`($sp) 92238384Sjkim___ 93238384Sjkimunshift(@X,pop(@X)); 94238384Sjkim} 95238384Sjkim 96238384Sjkimsub BODY_00_19 { 97238384Sjkimmy ($i,$a,$b,$c,$d,$e)=@_; 98238384Sjkimmy $xi=$X[1]; 99238384Sjkim 100238384Sjkim &Xupdate($i); 101238384Sjkim$code.=<<___; 102238384Sjkim alr $e,$K ### $i 103238384Sjkim rll $t1,$a,5 104238384Sjkim lr $t0,$d 105238384Sjkim xr $t0,$c 106238384Sjkim alr $e,$t1 107238384Sjkim nr $t0,$b 108238384Sjkim alr $e,$xi 109238384Sjkim xr $t0,$d 110238384Sjkim rll $b,$b,30 111238384Sjkim alr $e,$t0 112238384Sjkim___ 113238384Sjkim} 114238384Sjkim 115238384Sjkimsub BODY_20_39 { 116238384Sjkimmy ($i,$a,$b,$c,$d,$e)=@_; 117238384Sjkimmy $xi=$X[1]; 118238384Sjkim 119238384Sjkim &Xupdate($i); 120238384Sjkim$code.=<<___; 121238384Sjkim alr $e,$K ### $i 122238384Sjkim rll $t1,$a,5 123238384Sjkim lr $t0,$b 124238384Sjkim alr $e,$t1 125238384Sjkim xr $t0,$c 126238384Sjkim alr $e,$xi 127238384Sjkim xr $t0,$d 128238384Sjkim rll $b,$b,30 129238384Sjkim alr $e,$t0 130238384Sjkim___ 131238384Sjkim} 132238384Sjkim 133238384Sjkimsub BODY_40_59 { 134238384Sjkimmy ($i,$a,$b,$c,$d,$e)=@_; 135238384Sjkimmy $xi=$X[1]; 136238384Sjkim 137238384Sjkim &Xupdate($i); 138238384Sjkim$code.=<<___; 139238384Sjkim alr $e,$K ### $i 140238384Sjkim rll $t1,$a,5 141238384Sjkim lr $t0,$b 142238384Sjkim alr $e,$t1 143238384Sjkim or $t0,$c 144238384Sjkim lr $t1,$b 145238384Sjkim nr $t0,$d 146238384Sjkim nr $t1,$c 147238384Sjkim alr $e,$xi 148238384Sjkim or $t0,$t1 149238384Sjkim rll $b,$b,30 150238384Sjkim alr $e,$t0 151238384Sjkim___ 152238384Sjkim} 153238384Sjkim 154238384Sjkim$code.=<<___; 155238384Sjkim.text 156238384Sjkim.align 64 157238384Sjkim.type Ktable,\@object 158238384SjkimKtable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6 159238384Sjkim .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0 160238384Sjkim.size Ktable,.-Ktable 161238384Sjkim.globl sha1_block_data_order 162238384Sjkim.type sha1_block_data_order,\@function 163238384Sjkimsha1_block_data_order: 164238384Sjkim___ 165238384Sjkim$code.=<<___ if ($kimdfunc); 166238384Sjkim larl %r1,OPENSSL_s390xcap_P 167238384Sjkim lg %r0,0(%r1) 168238384Sjkim tmhl %r0,0x4000 # check for message-security assist 169238384Sjkim jz .Lsoftware 170238384Sjkim lghi %r0,0 171238384Sjkim la %r1,`2*$SIZE_T`($sp) 172238384Sjkim .long 0xb93e0002 # kimd %r0,%r2 173238384Sjkim lg %r0,`2*$SIZE_T`($sp) 174238384Sjkim tmhh %r0,`0x8000>>$kimdfunc` 175238384Sjkim jz .Lsoftware 176238384Sjkim lghi %r0,$kimdfunc 177238384Sjkim lgr %r1,$ctx 178238384Sjkim lgr %r2,$inp 179238384Sjkim sllg %r3,$len,6 180238384Sjkim .long 0xb93e0002 # kimd %r0,%r2 181238384Sjkim brc 1,.-4 # pay attention to "partial completion" 182238384Sjkim br %r14 183238384Sjkim.align 16 184238384Sjkim.Lsoftware: 185238384Sjkim___ 186238384Sjkim$code.=<<___; 187238384Sjkim lghi %r1,-$frame 188238384Sjkim st${g} $ctx,`2*$SIZE_T`($sp) 189238384Sjkim stm${g} %r6,%r15,`6*$SIZE_T`($sp) 190238384Sjkim lgr %r0,$sp 191238384Sjkim la $sp,0(%r1,$sp) 192238384Sjkim st${g} %r0,0($sp) 193238384Sjkim 194238384Sjkim larl $t0,Ktable 195238384Sjkim llgf $A,0($ctx) 196238384Sjkim llgf $B,4($ctx) 197238384Sjkim llgf $C,8($ctx) 198238384Sjkim llgf $D,12($ctx) 199238384Sjkim llgf $E,16($ctx) 200238384Sjkim 201238384Sjkim lg $K_00_39,0($t0) 202238384Sjkim lg $K_40_79,8($t0) 203238384Sjkim 204238384Sjkim.Lloop: 205238384Sjkim rllg $K_00_39,$K_00_39,32 206238384Sjkim___ 207238384Sjkimfor ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } 208238384Sjkim$code.=<<___; 209238384Sjkim rllg $K_00_39,$K_00_39,32 210238384Sjkim___ 211238384Sjkimfor (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 212238384Sjkim$code.=<<___; $K=$K_40_79; 213238384Sjkim rllg $K_40_79,$K_40_79,32 214238384Sjkim___ 215238384Sjkimfor (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 216238384Sjkim$code.=<<___; 217238384Sjkim rllg $K_40_79,$K_40_79,32 218238384Sjkim___ 219238384Sjkimfor (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 220238384Sjkim$code.=<<___; 221238384Sjkim 222238384Sjkim l${g} $ctx,`$frame+2*$SIZE_T`($sp) 223238384Sjkim la $inp,64($inp) 224238384Sjkim al $A,0($ctx) 225238384Sjkim al $B,4($ctx) 226238384Sjkim al $C,8($ctx) 227238384Sjkim al $D,12($ctx) 228238384Sjkim al $E,16($ctx) 229238384Sjkim st $A,0($ctx) 230238384Sjkim st $B,4($ctx) 231238384Sjkim st $C,8($ctx) 232238384Sjkim st $D,12($ctx) 233238384Sjkim st $E,16($ctx) 234238384Sjkim brct${g} $len,.Lloop 235238384Sjkim 236238384Sjkim lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) 237238384Sjkim br %r14 238238384Sjkim.size sha1_block_data_order,.-sha1_block_data_order 239238384Sjkim.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" 240238384Sjkim.comm OPENSSL_s390xcap_P,16,8 241238384Sjkim___ 242238384Sjkim 243238384Sjkim$code =~ s/\`([^\`]*)\`/eval $1/gem; 244238384Sjkim 245238384Sjkimprint $code; 246238384Sjkimclose STDOUT; 247