#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX
# P4		9.8		-
# Opteron	6.6		-
# Core2		6.7		6.1/+10%	-
# Atom		11.0		9.7/+13%	-
# Westmere	7.1		5.6/+27%	-
# Sandy Bridge	7.9		6.3/+25%	5.2/+51%

# Command line: [flavour] output. If the first argument contains a dot
# it is taken to be the output file name and no flavour is assumed.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets need extra prologue/epilogue code to preserve %xmm6-10.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Enable the AVX code path only if the assembler in use can encode it:
# GNU as >= 2.19, NASM >= 2.09, ml64 >= 10 or clang/LLVM >= 3.0.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ &&
	   $2>=3.0);

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be set up rather than
# silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Register allocation for the integer-only code path: three temporaries,
# a pair of message-word registers that swap roles every round, and the
# working variables A..E (continued below).
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

# Emit one of rounds 0..19 of the integer-only code path.
# Arguments: round index $i, then the five working registers in their
# current rotation. For $i<15 the next input word is loaded from $inp,
# byte-swapped and saved in the 16-word buffer at (%rsp); for $i>=15 the
# next word is derived from four buffer entries and rotated left by 1.
# The round adds 0x5a827999, the current word, rol(a,5) and ((c^d)&b)^d
# to e and rotates b left by 30. @xi is swapped on exit, so the word
# prepared here is $xi[0] of the following round.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}

# Emit one of rounds 20..39 or 60..79 of the integer-only code path
# (round function b^c^d). The constant is 0x6ed9eba1 for $i<40 and
# 0xca62c1d6 otherwise. The next message word is still computed for
# $i<79, but is written back to the stack buffer only while $i<76;
# round 79 prepares no further word.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}

# Emit one of rounds 40..59 of the integer-only code path. The round
# function is accumulated as two separate addends, (c&d) in $t0 and
# ((c^d)&b) in $t1, together with the constant 0x8f1bbcdc.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
unshift(@xi,pop(@xi));
}

# Public entry point. Dispatches on OPENSSL_ia32cap_P: without the SSSE3
# bit the integer-only code at .Lialu is used; with it, control goes to
# the SSSE3 code, or to the AVX code when the assembler supports AVX and
# both the AVX and "Intel CPU" bits are set.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# Unroll all 80 rounds; @V is rotated after every round so that each
# round sees the working variables under their new roles.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
{{{
# SSSE3 code path. @X is a ring of eight %xmm registers holding message
# schedule vectors, @Tx three %xmm temporaries, @V/@T the integer working
# registers and temporaries; $Xi counts schedule vectors and $j rounds.
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

# Rotate helpers used by the body_XX_XX snippets; plain rol/ror here.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

# Catch-all for calls such as &paddd(dst,src): the name of the undefined
# sub becomes the mnemonic and the arguments are emitted in reverse
# (source-first) order. A purely numeric last argument is prefixed with
# '$' to make it an immediate.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Emit the SIMD computation of one message schedule vector for rounds
# 16..31, interleaved with four integer rounds. $body is a reference to
# a sub returning the Perl snippets of one round, one per instruction;
# they are evaluated a few at a time between the SIMD instructions.
# Advances $Xi and rotates @X and @Tx on exit.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Same contract as Xupdate_ssse3_16_31, for schedule vectors from round
# 32 on. The round constant in @Tx is either carried over or, whenever
# $Xi is a multiple of 5, reloaded from the table at $K_XX_XX.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Emit the last X[]+K transfer of a block together with four integer
# rounds, then the end-of-input test (branch to .Ldone_ssse3) and the
# loads that start fetching the next 64-byte block. Resets $Xi to 0.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

# Emit four integer rounds interleaved with the byte swap and X[]+K
# transfer of one vector of the block loaded by Xuplast_ssse3_80.
# $body is a reference to a sub returning the Perl snippets of one
# round. Advances $Xi.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

# Emit four integer rounds with no SIMD work mixed in; used after the
# last input block, where there is nothing left to prefetch.
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

# The body_XX_XX subs return one round as a list of Perl snippets, one
# per emitted instruction. The snippets are strings, not code, so that
# they are evaluated in the caller's scope, picking up whichever @V, @T,
# $j, $_rol and $_ror are visible there; the last snippet of each round
# also rotates @V and @T.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor ($c,$d);',
	'&mov (@T[1],$a);',	# $b in next round
	'&$_rol ($a,5);',
	'&and (@T[0],$c);',	# ($b&($c^$d))
	'&xor ($c,$d);',	# restore $c
	'&xor (@T[0],$d);',
	'&add ($e,$a);',
	'&$_ror ($b,$j?7:2);',	# $b>>>2
	'&add ($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor (@T[0],$d);',	# ($b^$d)
	'&mov (@T[1],$a);',	# $b in next round
	'&$_rol ($a,5);',
	'&xor (@T[0],$c);',	# ($b^$d^$c)
	'&add ($e,$a);',
	'&$_ror ($b,7);',	# $b>>>2
	'&add ($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov (@T[1],$c);',
	'&xor ($c,$d);',
	'&add ($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and (@T[1],$d);',
	'&and (@T[0],$c);',	# ($b&($c^$d))
	'&$_ror ($b,7);',	# $b>>>2
	'&add ($e,@T[1]);',
	'&mov (@T[1],$a);',	# $b in next round
	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&xor ($c,$d);',	# restore $c
	'&add ($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
$code.=<<___;
.align	16
.Loop_ssse3:
___
	# 17 calls of four rounds each cover rounds 0..67.
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				# The last 12 rounds are generated twice, once
				# for the loop and once for the tail, so the
				# generator state is saved and replayed.
				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

# AVX code path, generated only when the assembler can encode AVX. It
# has the same structure as the SSSE3 path, using three-operand VEX
# instructions and shld/shrd for the integer rotates.
if ($avx) {
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument
	vzeroupper

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

# AVX counterpart of Xupdate_ssse3_16_31: one schedule vector for rounds
# 16..31 interleaved with four integer rounds taken from $body.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# AVX counterpart of Xupdate_ssse3_32_79.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# AVX counterpart of Xuplast_ssse3_80: last X[]+K transfer of a block,
# end-of-input test (branch to .Ldone_avx) and the loads for the next
# block. Resets $Xi to 0.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  # VEX-encoded store, like every other X[]+K transfer in the AVX
	  # path, so that no legacy-SSE instruction is mixed into it.
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
eval(shift(@insns)); 972238405Sjkim eval(shift(@insns)); 973238405Sjkim eval(shift(@insns)); 974238405Sjkim &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU 975238405Sjkim eval(shift(@insns)); 976238405Sjkim eval(shift(@insns)); 977238405Sjkim 978238405Sjkim foreach (@insns) { eval; } 979238405Sjkim $Xi++; 980238405Sjkim} 981238405Sjkim 982238405Sjkimsub Xtail_avx() 983238405Sjkim{ use integer; 984238405Sjkim my $body = shift; 985238405Sjkim my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 986238405Sjkim my ($a,$b,$c,$d,$e); 987238405Sjkim 988238405Sjkim foreach (@insns) { eval; } 989238405Sjkim} 990238405Sjkim 991238405Sjkim$code.=<<___; 992238405Sjkim.align 16 993238405Sjkim.Loop_avx: 994238405Sjkim___ 995238405Sjkim &Xupdate_avx_16_31(\&body_00_19); 996238405Sjkim &Xupdate_avx_16_31(\&body_00_19); 997238405Sjkim &Xupdate_avx_16_31(\&body_00_19); 998238405Sjkim &Xupdate_avx_16_31(\&body_00_19); 999238405Sjkim &Xupdate_avx_32_79(\&body_00_19); 1000238405Sjkim &Xupdate_avx_32_79(\&body_20_39); 1001238405Sjkim &Xupdate_avx_32_79(\&body_20_39); 1002238405Sjkim &Xupdate_avx_32_79(\&body_20_39); 1003238405Sjkim &Xupdate_avx_32_79(\&body_20_39); 1004238405Sjkim &Xupdate_avx_32_79(\&body_20_39); 1005238405Sjkim &Xupdate_avx_32_79(\&body_40_59); 1006238405Sjkim &Xupdate_avx_32_79(\&body_40_59); 1007238405Sjkim &Xupdate_avx_32_79(\&body_40_59); 1008238405Sjkim &Xupdate_avx_32_79(\&body_40_59); 1009238405Sjkim &Xupdate_avx_32_79(\&body_40_59); 1010238405Sjkim &Xupdate_avx_32_79(\&body_20_39); 1011238405Sjkim &Xuplast_avx_80(\&body_20_39); # can jump to "done" 1012238405Sjkim 1013238405Sjkim $saved_j=$j; @saved_V=@V; 1014238405Sjkim 1015238405Sjkim &Xloop_avx(\&body_20_39); 1016238405Sjkim &Xloop_avx(\&body_20_39); 1017238405Sjkim &Xloop_avx(\&body_20_39); 1018238405Sjkim 1019238405Sjkim$code.=<<___; 1020238405Sjkim add 0($ctx),$A # update context 1021238405Sjkim add 4($ctx),@T[0] 1022238405Sjkim add 8($ctx),$C 1023238405Sjkim add 12($ctx),$D 
1024238405Sjkim mov $A,0($ctx) 1025238405Sjkim add 16($ctx),$E 1026238405Sjkim mov @T[0],4($ctx) 1027238405Sjkim mov @T[0],$B # magic seed 1028238405Sjkim mov $C,8($ctx) 1029238405Sjkim mov $D,12($ctx) 1030238405Sjkim mov $E,16($ctx) 1031238405Sjkim jmp .Loop_avx 1032238405Sjkim 1033238405Sjkim.align 16 1034238405Sjkim.Ldone_avx: 1035238405Sjkim___ 1036238405Sjkim $j=$saved_j; @V=@saved_V; 1037238405Sjkim 1038238405Sjkim &Xtail_avx(\&body_20_39); 1039238405Sjkim &Xtail_avx(\&body_20_39); 1040238405Sjkim &Xtail_avx(\&body_20_39); 1041238405Sjkim 1042238405Sjkim$code.=<<___; 1043264331Sjkim vzeroupper 1044238405Sjkim 1045238405Sjkim add 0($ctx),$A # update context 1046238405Sjkim add 4($ctx),@T[0] 1047238405Sjkim add 8($ctx),$C 1048238405Sjkim mov $A,0($ctx) 1049238405Sjkim add 12($ctx),$D 1050238405Sjkim mov @T[0],4($ctx) 1051238405Sjkim add 16($ctx),$E 1052238405Sjkim mov $C,8($ctx) 1053238405Sjkim mov $D,12($ctx) 1054238405Sjkim mov $E,16($ctx) 1055238405Sjkim___ 1056238405Sjkim$code.=<<___ if ($win64); 1057238405Sjkim movaps 64+0(%rsp),%xmm6 1058238405Sjkim movaps 64+16(%rsp),%xmm7 1059238405Sjkim movaps 64+32(%rsp),%xmm8 1060238405Sjkim movaps 64+48(%rsp),%xmm9 1061238405Sjkim movaps 64+64(%rsp),%xmm10 1062238405Sjkim___ 1063238405Sjkim$code.=<<___; 1064238405Sjkim lea `64+($win64?5*16:0)`(%rsp),%rsi 1065238405Sjkim mov 0(%rsi),%r12 1066238405Sjkim mov 8(%rsi),%rbp 1067238405Sjkim mov 16(%rsi),%rbx 1068238405Sjkim lea 24(%rsi),%rsp 1069238405Sjkim.Lepilogue_avx: 1070238405Sjkim ret 1071238405Sjkim.size sha1_block_data_order_avx,.-sha1_block_data_order_avx 1072238405Sjkim___ 1073238405Sjkim} 1074238405Sjkim$code.=<<___; 1075238405Sjkim.align 64 1076238405SjkimK_XX_XX: 1077238405Sjkim.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1078238405Sjkim.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 1079238405Sjkim.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 1080238405Sjkim.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 
1081238405Sjkim.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask 1082238405Sjkim___ 1083238405Sjkim}}} 1084238405Sjkim$code.=<<___; 1085183234Ssimon.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1086238405Sjkim.align 64 1087183234Ssimon___ 1088183234Ssimon 1089238405Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1090238405Sjkim# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1091238405Sjkimif ($win64) { 1092238405Sjkim$rec="%rcx"; 1093238405Sjkim$frame="%rdx"; 1094238405Sjkim$context="%r8"; 1095238405Sjkim$disp="%r9"; 1096238405Sjkim 1097238405Sjkim$code.=<<___; 1098238405Sjkim.extern __imp_RtlVirtualUnwind 1099238405Sjkim.type se_handler,\@abi-omnipotent 1100238405Sjkim.align 16 1101238405Sjkimse_handler: 1102238405Sjkim push %rsi 1103238405Sjkim push %rdi 1104238405Sjkim push %rbx 1105238405Sjkim push %rbp 1106238405Sjkim push %r12 1107238405Sjkim push %r13 1108238405Sjkim push %r14 1109238405Sjkim push %r15 1110238405Sjkim pushfq 1111238405Sjkim sub \$64,%rsp 1112238405Sjkim 1113238405Sjkim mov 120($context),%rax # pull context->Rax 1114238405Sjkim mov 248($context),%rbx # pull context->Rip 1115238405Sjkim 1116238405Sjkim lea .Lprologue(%rip),%r10 1117238405Sjkim cmp %r10,%rbx # context->Rip<.Lprologue 1118238405Sjkim jb .Lcommon_seh_tail 1119238405Sjkim 1120238405Sjkim mov 152($context),%rax # pull context->Rsp 1121238405Sjkim 1122238405Sjkim lea .Lepilogue(%rip),%r10 1123238405Sjkim cmp %r10,%rbx # context->Rip>=.Lepilogue 1124238405Sjkim jae .Lcommon_seh_tail 1125238405Sjkim 1126238405Sjkim mov `16*4`(%rax),%rax # pull saved stack pointer 1127238405Sjkim lea 32(%rax),%rax 1128238405Sjkim 1129238405Sjkim mov -8(%rax),%rbx 1130238405Sjkim mov -16(%rax),%rbp 1131238405Sjkim mov -24(%rax),%r12 1132238405Sjkim mov -32(%rax),%r13 1133238405Sjkim mov %rbx,144($context) # restore context->Rbx 1134238405Sjkim mov %rbp,160($context) # restore context->Rbp 1135238405Sjkim mov %r12,216($context) # 
restore context->R12 1136238405Sjkim mov %r13,224($context) # restore context->R13 1137238405Sjkim 1138238405Sjkim jmp .Lcommon_seh_tail 1139238405Sjkim.size se_handler,.-se_handler 1140238405Sjkim 1141238405Sjkim.type ssse3_handler,\@abi-omnipotent 1142238405Sjkim.align 16 1143238405Sjkimssse3_handler: 1144238405Sjkim push %rsi 1145238405Sjkim push %rdi 1146238405Sjkim push %rbx 1147238405Sjkim push %rbp 1148238405Sjkim push %r12 1149238405Sjkim push %r13 1150238405Sjkim push %r14 1151238405Sjkim push %r15 1152238405Sjkim pushfq 1153238405Sjkim sub \$64,%rsp 1154238405Sjkim 1155238405Sjkim mov 120($context),%rax # pull context->Rax 1156238405Sjkim mov 248($context),%rbx # pull context->Rip 1157238405Sjkim 1158238405Sjkim mov 8($disp),%rsi # disp->ImageBase 1159238405Sjkim mov 56($disp),%r11 # disp->HandlerData 1160238405Sjkim 1161238405Sjkim mov 0(%r11),%r10d # HandlerData[0] 1162238405Sjkim lea (%rsi,%r10),%r10 # prologue label 1163238405Sjkim cmp %r10,%rbx # context->Rip<prologue label 1164238405Sjkim jb .Lcommon_seh_tail 1165238405Sjkim 1166238405Sjkim mov 152($context),%rax # pull context->Rsp 1167238405Sjkim 1168238405Sjkim mov 4(%r11),%r10d # HandlerData[1] 1169238405Sjkim lea (%rsi,%r10),%r10 # epilogue label 1170238405Sjkim cmp %r10,%rbx # context->Rip>=epilogue label 1171238405Sjkim jae .Lcommon_seh_tail 1172238405Sjkim 1173238405Sjkim lea 64(%rax),%rsi 1174238405Sjkim lea 512($context),%rdi # &context.Xmm6 1175238405Sjkim mov \$10,%ecx 1176238405Sjkim .long 0xa548f3fc # cld; rep movsq 1177238405Sjkim lea `24+64+5*16`(%rax),%rax # adjust stack pointer 1178238405Sjkim 1179238405Sjkim mov -8(%rax),%rbx 1180238405Sjkim mov -16(%rax),%rbp 1181238405Sjkim mov -24(%rax),%r12 1182238405Sjkim mov %rbx,144($context) # restore context->Rbx 1183238405Sjkim mov %rbp,160($context) # restore context->Rbp 1184238405Sjkim mov %r12,216($context) # restore cotnext->R12 1185238405Sjkim 1186238405Sjkim.Lcommon_seh_tail: 1187238405Sjkim mov 8(%rax),%rdi 1188238405Sjkim mov 
16(%rax),%rsi 1189238405Sjkim mov %rax,152($context) # restore context->Rsp 1190238405Sjkim mov %rsi,168($context) # restore context->Rsi 1191238405Sjkim mov %rdi,176($context) # restore context->Rdi 1192238405Sjkim 1193238405Sjkim mov 40($disp),%rdi # disp->ContextRecord 1194238405Sjkim mov $context,%rsi # context 1195238405Sjkim mov \$154,%ecx # sizeof(CONTEXT) 1196238405Sjkim .long 0xa548f3fc # cld; rep movsq 1197238405Sjkim 1198238405Sjkim mov $disp,%rsi 1199238405Sjkim xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1200238405Sjkim mov 8(%rsi),%rdx # arg2, disp->ImageBase 1201238405Sjkim mov 0(%rsi),%r8 # arg3, disp->ControlPc 1202238405Sjkim mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1203238405Sjkim mov 40(%rsi),%r10 # disp->ContextRecord 1204238405Sjkim lea 56(%rsi),%r11 # &disp->HandlerData 1205238405Sjkim lea 24(%rsi),%r12 # &disp->EstablisherFrame 1206238405Sjkim mov %r10,32(%rsp) # arg5 1207238405Sjkim mov %r11,40(%rsp) # arg6 1208238405Sjkim mov %r12,48(%rsp) # arg7 1209238405Sjkim mov %rcx,56(%rsp) # arg8, (NULL) 1210238405Sjkim call *__imp_RtlVirtualUnwind(%rip) 1211238405Sjkim 1212238405Sjkim mov \$1,%eax # ExceptionContinueSearch 1213238405Sjkim add \$64,%rsp 1214238405Sjkim popfq 1215238405Sjkim pop %r15 1216238405Sjkim pop %r14 1217238405Sjkim pop %r13 1218238405Sjkim pop %r12 1219238405Sjkim pop %rbp 1220238405Sjkim pop %rbx 1221238405Sjkim pop %rdi 1222238405Sjkim pop %rsi 1223238405Sjkim ret 1224238405Sjkim.size ssse3_handler,.-ssse3_handler 1225238405Sjkim 1226238405Sjkim.section .pdata 1227238405Sjkim.align 4 1228238405Sjkim .rva .LSEH_begin_sha1_block_data_order 1229238405Sjkim .rva .LSEH_end_sha1_block_data_order 1230238405Sjkim .rva .LSEH_info_sha1_block_data_order 1231238405Sjkim .rva .LSEH_begin_sha1_block_data_order_ssse3 1232238405Sjkim .rva .LSEH_end_sha1_block_data_order_ssse3 1233238405Sjkim .rva .LSEH_info_sha1_block_data_order_ssse3 1234238405Sjkim___ 1235238405Sjkim$code.=<<___ if ($avx); 1236238405Sjkim .rva 
.LSEH_begin_sha1_block_data_order_avx 1237238405Sjkim .rva .LSEH_end_sha1_block_data_order_avx 1238238405Sjkim .rva .LSEH_info_sha1_block_data_order_avx 1239238405Sjkim___ 1240238405Sjkim$code.=<<___; 1241238405Sjkim.section .xdata 1242238405Sjkim.align 8 1243238405Sjkim.LSEH_info_sha1_block_data_order: 1244238405Sjkim .byte 9,0,0,0 1245238405Sjkim .rva se_handler 1246238405Sjkim.LSEH_info_sha1_block_data_order_ssse3: 1247238405Sjkim .byte 9,0,0,0 1248238405Sjkim .rva ssse3_handler 1249238405Sjkim .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] 1250238405Sjkim___ 1251238405Sjkim$code.=<<___ if ($avx); 1252238405Sjkim.LSEH_info_sha1_block_data_order_avx: 1253238405Sjkim .byte 9,0,0,0 1254238405Sjkim .rva ssse3_handler 1255238405Sjkim .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 1256238405Sjkim___ 1257238405Sjkim} 1258238405Sjkim 1259183234Ssimon#################################################################### 1260183234Ssimon 1261183234Ssimon$code =~ s/\`([^\`]*)\`/eval $1/gem; 1262183234Ssimonprint $code; 1263183234Ssimonclose STDOUT; 1264