# sha1-x86_64.pl revision 299964
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX
# P4		9.8		-
# Opteron	6.6		-
# Core2		6.7		6.1/+10%	-
# Atom		11.0		9.7/+13%	-
# Westmere	7.1		5.6/+27%	-
# Sandy Bridge	7.9		6.3/+25%	5.2/+51%

# Command-line convention shared by the perlasm family: either
# "flavour output" or just "output" (flavour deduced by xlate).
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler toolchain for AVX support; the AVX code path is
# only emitted when one of these version checks passes.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ &&
	   $2>=3.0);

# All generated text printed to STDOUT is piped through the xlate
# script, which translates AT&T perlasm into the target assembler dialect.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

# Emit one scalar round for rounds 0..19: K=0x5a827999, and the "Ch"
# function computed as ((c^d)&b)^d (see the mov/xor/and/xor sequence).
# Round 0 bswap-loads the first word; rounds <15 also load/bswap/store
# the next input word; rounds >=15 run the Xupdate recurrence over the
# 16-dword ring buffer kept on the stack.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}

# Rounds 20..39 and 60..79 share the parity function b^c^d; the round
# constant is selected by $i<40 (0x6ed9eba1 vs 0xca62c1d6). The final
# round (79) omits the message-schedule update/store.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}

# Rounds 40..59: K=0x8f1bbcdc, majority function computed additively
# as (c&d) + (b&(c^d)), which the two and/add pairs below implement.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
unshift(@xi,pop(@xi));
}

# Public entry point: reads OPENSSL_ia32cap_P and dispatches to the
# SSSE3 path (and, when assembled in, the AVX path on Intel CPUs with
# the AVX bit set); otherwise falls through to the integer-only .Lialu
# implementation below.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

# Any &mnemonic(...) call without an explicit sub lands here and is
# appended to $code as "mnemonic\targs" with the argument order
# reversed (32-bit perlasm style); a bare numeric last argument gets
# an immediate '$' prefix.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# SIMD message-schedule update for Wt, t=16..31, interleaved with four
# scalar round bodies pulled from $body via eval(shift(@insns)).
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# SIMD message-schedule update for Wt, t=32..79, interleaved with four
# scalar round bodies; also rotates/loads the K_XX_XX constant every
# fifth call ($Xi%5).
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Last schedule transfer of a block; either jumps to .Ldone_ssse3 when
# input is exhausted or preloads/byte-swaps the next 64-byte block.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

# Rounds 0..15 of the *next* block, overlapped with byte-swapping and
# K-addition of that block's remaining input words.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

# Pure scalar tail: just emit the four round bodies, no SIMD work left.
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

# Scalar round bodies for the SIMD paths, returned as lists of code
# strings so the Xupdate subs can interleave them instruction by
# instruction. Each list ends by rotating @V/@T for the next round.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

# Generate the full 80-round SSSE3 loop: 16 Xupdate calls cover rounds
# 0..63 (schedule for 16..79), Xuplast covers the last transfer, then
# three Xloop calls run rounds 64..79 overlapped with next-block setup.
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
	$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

# AVX code path: same structure as the SSSE3 path above, but using
# non-destructive 3-operand VEX instructions (and shld/shrd for the
# scalar rotates).
if ($avx) {
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument
	vzeroupper

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

# AVX counterpart of Xupdate_ssse3_16_31.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);	# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);	# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);	# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);	# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# AVX counterpart of Xupdate_ssse3_32_79.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);	# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);	# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# AVX counterpart of Xuplast_ssse3_80.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

# AVX counterpart of Xloop_ssse3.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

# AVX counterpart of Xtail_ssse3: scalar-only tail rounds.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

# Generate the full 80-round AVX loop (same schedule as .Loop_ssse3).
$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_avx

.align	16
.Ldone_avx:
___
	$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
}

# Constant pool: the four SHA-1 round constants (broadcast to 4 dwords
# each) followed by the pshufb byte-swap mask.
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	64(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}

####################################################################

# Resolve the `...` constant-expression escapes and emit everything
# through the xlate pipe opened above.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;