sha1-586.pl revision 299964
#!/usr/bin/env perl

# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
# functions were re-implemented to address P4 performance issue [see
# commentary below], and in 2006 the rest was rewritten in order to
# gain freedom to liberate licensing terms.

# January, September 2004.
#
# It was noted that Intel IA-32 C compiler generates code which
# performs ~30% *faster* on P4 CPU than original *hand-coded*
# SHA1 assembler implementation. To address this problem (and
# prove that humans are still better than machines:-), the
# original code was overhauled, which resulted in the following
# performance changes:
#
#               compared with original  compared with Intel cc
#               assembler impl.         generated code
# Pentium       -16%                    +48%
# PIII/AMD      +8%                     +16%
# P4            +85%(!)                 +45%
#
# As you can see Pentium came out as the loser:-( Yet I reckoned that
# the improvement on P4 outweighs the loss and incorporated this
# re-tuned code to 0.9.7 and later.
# ----------------------------------------------------------------
#                                       <appro@fy.chalmers.se>

# August 2009.
#
# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
# '(c&d) + (b&(c^d))', which allows accumulating partial results
# and lightens "pressure" on scratch registers. This resulted in
# >12% performance improvement on contemporary AMD cores (with no
# degradation on other CPUs:-).
# Also, the code was revised to maximize
# "distance" between instructions producing input to 'lea' instruction
# and the 'lea' instruction itself, which is essential for Intel Atom
# core and resulted in ~15% improvement.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
# and in SSE2 context was first explored by Dean Gaudet in 2004, see
# http://arctic.org/~dean/crypto/sha1.html. Since then several things
# have changed that made it interesting again:
#
# a) XMM units became faster and wider;
# b) instruction set became more versatile;
# c) an important observation was made by Max Locktyukhin, which made
#    it possible to reduce amount of instructions required to perform
#    the operation in question, for further details see
#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.

# April 2011.
#
# Add AVX code path, probably most controversial... The thing is that
# switch to AVX alone improves performance by as little as 4% in
# comparison to SSSE3 code path. But below result doesn't look like
# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
# pair of micro-ops, and it's the additional micro-ops, two per round,
# that make it run slower than Core2 and Westmere. But 'sh[rl]d' is
# decoded as single micro-op by Sandy Bridge and it's replacing 'ro[rl]'
# with equivalent 'sh[rl]d' that is responsible for the impressive 5.1
# cycles per processed byte. But 'sh[rl]d' is not something that used
# to be fast, nor does it appear to be fast in upcoming Bulldozer
# [according to its optimization manual]. Which is why AVX code path
# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
# makes no sense to keep the AVX code path. If somebody feels that
# strongly, it's probably more appropriate to discuss possibility of
# using vector rotate XOP on AMD...

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#               x86             SSSE3           AVX
# Pentium       15.7            -
# PIII          11.5            -
# P4            10.6            -
# AMD K8        7.1             -
# Core2         7.3             6.1/+20%        -
# Atom          12.5            9.5(*)/+32%     -
# Westmere      7.3             5.6/+30%        -
# Sandy Bridge  8.8             6.2/+40%        5.1(**)/+70%
#
# (*)   Loop is 1056 instructions long and expected result is ~8.25.
#       It remains mystery [to me] why ILP is limited to 1.7.
#
# (**)  As per above comment, the result is for AVX *plus* sh[rl]d.

# Make x86asm.pl loadable from this script's own directory or from
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The first command-line argument is handed to asm_init unchanged; the
# third asm_init parameter is true when the last argument is "386".
&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");

# $xmm gates generation of the SSSE3 code path, $ymm that of the AVX
# code path.  $xmm is requested on the command line; $ymm is set as soon
# as one of the probes below finds an assembler recent enough for AVX.
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
        `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
            =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
        $1>=2.19);  # first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
        $1>=2.03);  # first version supporting AVX

# The major version may have more than one digit ("clang version 10.0.0"),
# so match [0-9]+ and leave the lower bound to the numeric comparison.
$ymm=1 if ($xmm && !$ymm &&
        `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+\.[0-9]+)/ &&
        $2>=3.0);   # first version supporting AVX

&external_label("OPENSSL_ia32cap_P") if ($xmm);


# Register names used by the integer-only code path.  @V is the list
# handed to the BODY_* generators and is rotated by one after each round.
$A="eax";
$B="ebx";
$C="ecx";
$D="edx";
$E="edi";
$T="esi";
$tmp1="ebp";

@V=($A,$B,$C,$D,$E,$T);

$alt=0; # 1 denotes alternative IALU implementation, which performs
        # 8% *worse* on P4, same on Westmere and Atom, 2% better on
        # Sandy Bridge...
# Note on call syntax: every instruction helper is invoked as &name(...).
# For and/xor/or the sigil is mandatory, because Perl would otherwise
# parse those names as its own operators.

# Emit round $n of rounds 0..15.  The message word is read directly from
# the stack copy of the input (slot $n%16); no Xupdate is performed.
# Arguments: round number, then the register names currently acting as
# a, b, c, d, e and f.
sub BODY_00_15
{
    local($n,$a,$b,$c,$d,$e,$f)=@_;
    my $final=($n==15);                     # last round of this group

    &comment("00_15 $n");

    &mov($f,$c);                            # f to hold F_00_19(b,c,d)
    if ($n==0) {
        &mov($tmp1,$a);
    }
    else {
        &mov($a,$tmp1);
    }
    &rotl($tmp1,5);                         # tmp1=ROTATE(a,5)
    &xor($f,$d);
    &add($tmp1,$e);                         # tmp1+=e;
    &mov($e,&swtmp($n%16));                 # e becomes volatile and is loaded
                                            # with xi, also note that e becomes
                                            # f in next round...
    &and($f,$b);
    &rotr($b,2);                            # b=ROTATE(b,30)
    &xor($f,$d);                            # f holds F_00_19(b,c,d)
    &lea($tmp1,&DWP(0x5a827999,$tmp1,$e));  # tmp1+=K_00_19+xi

    if ($final) {
        &mov($e,&swtmp(($n+1)%16));         # pre-fetch f for next round
        &add($f,$tmp1);                     # f+=tmp1
        if ($alt) {
            &mov($tmp1,$a);
        }
    }
    else {
        &add($tmp1,$f);                     # f becomes a in next round
    }
}

# Emit round $n of rounds 16..19: same F_00_19 and K_00_19 as above, but
# the message word is now produced by Xupdate from slots $n+2, $n+8 and
# $n+13 (mod 16) and written back to slot $n%16.
sub BODY_16_19
{
    local($n,$a,$b,$c,$d,$e,$f)=@_;

    &comment("16_19 $n");

    if ($alt) {
        my $rot=($n==16)?2:7;

        &xor($c,$d);
        &xor($f,&swtmp(($n+2)%16));         # f to hold Xupdate(xi,xa,xb,xc,xd)
        &and($tmp1,$c);                     # tmp1 to hold F_00_19(b,c,d), b&=c^d
        &xor($f,&swtmp(($n+8)%16));
        &xor($tmp1,$d);                     # tmp1=F_00_19(b,c,d)
        &xor($f,&swtmp(($n+13)%16));        # f holds xa^xb^xc^xd
        &rotl($f,1);                        # f=ROTATE(f,1)
        &add($e,$tmp1);                     # e+=F_00_19(b,c,d)
        &xor($c,$d);                        # restore $c
        &mov($tmp1,$a);                     # b in next round
        &rotr($b,$rot);                     # b=ROTATE(b,30)
        &mov(&swtmp($n%16),$f);             # xi=f
        &rotl($a,5);                        # ROTATE(a,5)
        &lea($f,&DWP(0x5a827999,$f,$e));    # f+=F_00_19(b,c,d)+e
        &mov($e,&swtmp(($n+1)%16));         # pre-fetch f for next round
        &add($f,$a);                        # f+=ROTATE(a,5)
    }
    else {
        &mov($tmp1,$c);                     # tmp1 to hold F_00_19(b,c,d)
        &xor($f,&swtmp(($n+2)%16));         # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);
        &xor($f,&swtmp(($n+8)%16));
        &and($tmp1,$b);
        &xor($f,&swtmp(($n+13)%16));        # f holds xa^xb^xc^xd
        &rotl($f,1);                        # f=ROTATE(f,1)
        &xor($tmp1,$d);                     # tmp1=F_00_19(b,c,d)
        &add($e,$tmp1);                     # e+=F_00_19(b,c,d)
        &mov($tmp1,$a);
        &rotr($b,2);                        # b=ROTATE(b,30)
        &mov(&swtmp($n%16),$f);             # xi=f
        &rotl($tmp1,5);                     # ROTATE(a,5)
        &lea($f,&DWP(0x5a827999,$f,$e));    # f+=F_00_19(b,c,d)+e
        &mov($e,&swtmp(($n+1)%16));         # pre-fetch f for next round
        &add($f,$tmp1);                     # f+=ROTATE(a,5)
    }
}

# Emit round $n using F_20_39 (b^c^d).  The round constant is picked
# from $n: 0x6ed9eba1 below round 40, 0xca62c1d6 otherwise.  The
# write-back of xi stops at round 77 and the pre-fetch at round 79.
sub BODY_20_39
{
    local($n,$a,$b,$c,$d,$e,$f)=@_;
    local $K=($n>=40)?0xca62c1d6:0x6ed9eba1;

    &comment("20_39 $n");

    if ($alt) {
        &xor($tmp1,$c);                     # tmp1 to hold F_20_39(b,c,d), b^=c
        &xor($f,&swtmp(($n+2)%16));         # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);                     # tmp1 holds F_20_39(b,c,d)
        &xor($f,&swtmp(($n+8)%16));
        &add($e,$tmp1);                     # e+=F_20_39(b,c,d)
        &xor($f,&swtmp(($n+13)%16));        # f holds xa^xb^xc^xd
        &rotl($f,1);                        # f=ROTATE(f,1)
        &mov($tmp1,$a);                     # b in next round
        &rotr($b,7);                        # b=ROTATE(b,30)
        if ($n<77) {
            &mov(&swtmp($n%16),$f);         # xi=f
        }
        &rotl($a,5);                        # ROTATE(a,5)
        if ($n==39) {
            &xor($b,$c);                    # warm up for BODY_40_59
            &and($tmp1,$b);
        }
        &lea($f,&DWP($K,$f,$e));            # f+=e+K_XX_YY
        if ($n<79) {
            &mov($e,&swtmp(($n+1)%16));     # pre-fetch f for next round
        }
        &add($f,$a);                        # f+=ROTATE(a,5)
        if ($n==79) {
            &rotr($a,5);
        }
    }
    else {
        &mov($tmp1,$b);                     # tmp1 to hold F_20_39(b,c,d)
        &xor($f,&swtmp(($n+2)%16));         # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$c);
        &xor($f,&swtmp(($n+8)%16));
        &xor($tmp1,$d);                     # tmp1 holds F_20_39(b,c,d)
        &xor($f,&swtmp(($n+13)%16));        # f holds xa^xb^xc^xd
        &rotl($f,1);                        # f=ROTATE(f,1)
        &add($e,$tmp1);                     # e+=F_20_39(b,c,d)
        &rotr($b,2);                        # b=ROTATE(b,30)
        &mov($tmp1,$a);
        &rotl($tmp1,5);                     # ROTATE(a,5)
        if ($n<77) {
            &mov(&swtmp($n%16),$f);         # xi=f
        }
        &lea($f,&DWP($K,$f,$e));            # f+=e+K_XX_YY
        if ($n<79) {
            &mov($e,&swtmp(($n+1)%16));     # pre-fetch f for next round
        }
        &add($f,$tmp1);                     # f+=ROTATE(a,5)
    }
}

# Emit round $n of rounds 40..59.  F_40_59 is accumulated in two parts,
# b&(c^d) and c&d, with constant 0x8f1bbcdc.
sub BODY_40_59
{
    local($n,$a,$b,$c,$d,$e,$f)=@_;

    &comment("40_59 $n");

    if ($alt) {
        &add($e,$tmp1);                     # e+=b&(c^d)
        &xor($f,&swtmp(($n+2)%16));         # f to hold Xupdate(xi,xa,xb,xc,xd)
        &mov($tmp1,$d);
        &xor($f,&swtmp(($n+8)%16));
        &xor($c,$d);                        # restore $c
        &xor($f,&swtmp(($n+13)%16));        # f holds xa^xb^xc^xd
        &rotl($f,1);                        # f=ROTATE(f,1)
        &and($tmp1,$c);
        &rotr($b,7);                        # b=ROTATE(b,30)
        &add($e,$tmp1);                     # e+=c&d
        &mov($tmp1,$a);                     # b in next round
        &mov(&swtmp($n%16),$f);             # xi=f
        &rotl($a,5);                        # ROTATE(a,5)
        if ($n<59) {
            &xor($b,$c);
            &and($tmp1,$b);                 # tmp1 to hold F_40_59(b,c,d)
        }
        &lea($f,&DWP(0x8f1bbcdc,$f,$e));    # f+=K_40_59+e+(b&(c^d))
        &mov($e,&swtmp(($n+1)%16));         # pre-fetch f for next round
        &add($f,$a);                        # f+=ROTATE(a,5)
    }
    else {
        &mov($tmp1,$c);                     # tmp1 to hold F_40_59(b,c,d)
        &xor($f,&swtmp(($n+2)%16));         # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);
        &xor($f,&swtmp(($n+8)%16));
        &and($tmp1,$b);
        &xor($f,&swtmp(($n+13)%16));        # f holds xa^xb^xc^xd
        &rotl($f,1);                        # f=ROTATE(f,1)
        &add($tmp1,$e);                     # b&(c^d)+=e
        &rotr($b,2);                        # b=ROTATE(b,30)
        &mov($e,$a);                        # e becomes volatile
        &rotl($e,5);                        # ROTATE(a,5)
        &mov(&swtmp($n%16),$f);             # xi=f
        &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1)); # f+=K_40_59+e+(b&(c^d))
        &mov($tmp1,$c);
        &add($f,$e);                        # f+=ROTATE(a,5)
        &and($tmp1,$d);
        &mov($e,&swtmp(($n+1)%16));         # pre-fetch f for next round
        &add($f,$tmp1);                     # f+=c&d
    }
}

# Entry point.  When SIMD paths are generated, the capability words in
# OPENSSL_ia32cap_P are examined first and control is passed to the
# SSSE3 or AVX shortcut label if the required bits are set; otherwise
# execution falls through to the integer-only code at label "x86".
&function_begin("sha1_block_data_order");
if ($xmm) {
    &static_label("ssse3_shortcut");
    if ($ymm) {
        &static_label("avx_shortcut");
    }
    &static_label("K_XX_XX");

    &call(&label("pic_point"));             # make it PIC!
    &set_label("pic_point");
    &blindpop($tmp1);
    &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
    &lea($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));

    &mov($A,&DWP(0,$T));
    &mov($D,&DWP(4,$T));
    &test($D,1<<9);                         # check SSSE3 bit
    &jz(&label("x86"));
    &test($A,1<<24);                        # check FXSR bit
    &jz(&label("x86"));
    if ($ymm) {
        &and($D,1<<28);                     # mask AVX bit
        &and($A,1<<30);                     # mask "Intel CPU" bit
        &or($A,$D);
        &cmp($A,(1<<28)|(1<<30));           # both bits must be set
        &je(&label("avx_shortcut"));
    }
    &jmp(&label("ssse3_shortcut"));
    &set_label("x86",16);
}
&mov($tmp1,&wparam(0));                     # SHA_CTX *c
&mov($T,&wparam(1));                        # const void *input
&mov($A,&wparam(2));                        # size_t num
&stack_push(16+3);                          # allocate X[16]
&shl($A,6);
&add($A,$T);
&mov(&wparam(2),$A);                        # pointer beyond the end of input
&mov($E,&DWP(16,$tmp1));                    # pre-load E
&jmp(&label("loop"));

&set_label("loop",16);

# copy input chunk to X, but reversing byte order!
340 for ($i=0; $i<16; $i+=4) 341 { 342 &mov($A,&DWP(4*($i+0),$T)); 343 &mov($B,&DWP(4*($i+1),$T)); 344 &mov($C,&DWP(4*($i+2),$T)); 345 &mov($D,&DWP(4*($i+3),$T)); 346 &bswap($A); 347 &bswap($B); 348 &bswap($C); 349 &bswap($D); 350 &mov(&swtmp($i+0),$A); 351 &mov(&swtmp($i+1),$B); 352 &mov(&swtmp($i+2),$C); 353 &mov(&swtmp($i+3),$D); 354 } 355 &mov(&wparam(1),$T); # redundant in 1st spin 356 357 &mov($A,&DWP(0,$tmp1)); # load SHA_CTX 358 &mov($B,&DWP(4,$tmp1)); 359 &mov($C,&DWP(8,$tmp1)); 360 &mov($D,&DWP(12,$tmp1)); 361 # E is pre-loaded 362 363 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } 364 for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); } 365 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 366 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 367 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 368 369 (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check 370 371 &mov($tmp1,&wparam(0)); # re-load SHA_CTX* 372 &mov($D,&wparam(1)); # D is last "T" and is discarded 373 374 &add($E,&DWP(0,$tmp1)); # E is last "A"... 375 &add($T,&DWP(4,$tmp1)); 376 &add($A,&DWP(8,$tmp1)); 377 &add($B,&DWP(12,$tmp1)); 378 &add($C,&DWP(16,$tmp1)); 379 380 &mov(&DWP(0,$tmp1),$E); # update SHA_CTX 381 &add($D,64); # advance input pointer 382 &mov(&DWP(4,$tmp1),$T); 383 &cmp($D,&wparam(2)); # have we reached the end yet? 384 &mov(&DWP(8,$tmp1),$A); 385 &mov($E,$C); # C is last "E" which needs to be "pre-loaded" 386 &mov(&DWP(12,$tmp1),$B); 387 &mov($T,$D); # input pointer 388 &mov(&DWP(16,$tmp1),$C); 389 &jb(&label("loop")); 390 391 &stack_pop(16+3); 392&function_end("sha1_block_data_order"); 393 394if ($xmm) { 395###################################################################### 396# The SSSE3 implementation. 397# 398# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last 399# 32 elements of the message schedule or Xupdate outputs. 
First 4 400# quadruples are simply byte-swapped input, next 4 are calculated 401# according to method originally suggested by Dean Gaudet (modulo 402# being implemented in SSSE3). Once 8 quadruples or 32 elements are 403# collected, it switches to routine proposed by Max Locktyukhin. 404# 405# Calculations inevitably require temporary reqisters, and there are 406# no %xmm registers left to spare. For this reason part of the ring 407# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring 408# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - 409# X[-5], and X[4] - X[-4]... 410# 411# Another notable optimization is aggressive stack frame compression 412# aiming to minimize amount of 9-byte instructions... 413# 414# Yet another notable optimization is "jumping" $B variable. It means 415# that there is no register permanently allocated for $B value. This 416# allowed to eliminate one instruction from body_20_39... 417# 418my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded 419my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 420my @V=($A,$B,$C,$D,$E); 421my $j=0; # hash round 422my @T=($T,$tmp1); 423my $inp; 424 425my $_rol=sub { &rol(@_) }; 426my $_ror=sub { &ror(@_) }; 427 428&function_begin("_sha1_block_data_order_ssse3"); 429 &call (&label("pic_point")); # make it PIC! 
430 &set_label("pic_point"); 431 &blindpop($tmp1); 432 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); 433&set_label("ssse3_shortcut"); 434 435 &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19 436 &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39 437 &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59 438 &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79 439 &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask 440 441 &mov ($E,&wparam(0)); # load argument block 442 &mov ($inp=@T[1],&wparam(1)); 443 &mov ($D,&wparam(2)); 444 &mov (@T[0],"esp"); 445 446 # stack frame layout 447 # 448 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area 449 # X[4]+K X[5]+K X[6]+K X[7]+K 450 # X[8]+K X[9]+K X[10]+K X[11]+K 451 # X[12]+K X[13]+K X[14]+K X[15]+K 452 # 453 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area 454 # X[4] X[5] X[6] X[7] 455 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 456 # 457 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants 458 # K_40_59 K_40_59 K_40_59 K_40_59 459 # K_60_79 K_60_79 K_60_79 K_60_79 460 # K_00_19 K_00_19 K_00_19 K_00_19 461 # pbswap mask 462 # 463 # +192 ctx # argument block 464 # +196 inp 465 # +200 end 466 # +204 esp 467 &sub ("esp",208); 468 &and ("esp",-64); 469 470 &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants 471 &movdqa (&QWP(112+16,"esp"),@X[5]); 472 &movdqa (&QWP(112+32,"esp"),@X[6]); 473 &shl ($D,6); # len*64 474 &movdqa (&QWP(112+48,"esp"),@X[3]); 475 &add ($D,$inp); # end of input 476 &movdqa (&QWP(112+64,"esp"),@X[2]); 477 &add ($inp,64); 478 &mov (&DWP(192+0,"esp"),$E); # save argument block 479 &mov (&DWP(192+4,"esp"),$inp); 480 &mov (&DWP(192+8,"esp"),$D); 481 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp 482 483 &mov ($A,&DWP(0,$E)); # load context 484 &mov ($B,&DWP(4,$E)); 485 &mov ($C,&DWP(8,$E)); 486 &mov ($D,&DWP(12,$E)); 487 &mov ($E,&DWP(16,$E)); 488 &mov (@T[0],$B); # magic seed 489 490 &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] 491 &movdqu (@X[-3&7],&QWP(-48,$inp)); 492 &movdqu 
(@X[-2&7],&QWP(-32,$inp)); 493 &movdqu (@X[-1&7],&QWP(-16,$inp)); 494 &pshufb (@X[-4&7],@X[2]); # byte swap 495 &pshufb (@X[-3&7],@X[2]); 496 &pshufb (@X[-2&7],@X[2]); 497 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot 498 &pshufb (@X[-1&7],@X[2]); 499 &paddd (@X[-4&7],@X[3]); # add K_00_19 500 &paddd (@X[-3&7],@X[3]); 501 &paddd (@X[-2&7],@X[3]); 502 &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU 503 &psubd (@X[-4&7],@X[3]); # restore X[] 504 &movdqa (&QWP(0+16,"esp"),@X[-3&7]); 505 &psubd (@X[-3&7],@X[3]); 506 &movdqa (&QWP(0+32,"esp"),@X[-2&7]); 507 &psubd (@X[-2&7],@X[3]); 508 &movdqa (@X[0],@X[-3&7]); 509 &jmp (&label("loop")); 510 511###################################################################### 512# SSE instruction sequence is first broken to groups of indepentent 513# instructions, independent in respect to their inputs and shifter 514# (not all architectures have more than one). Then IALU instructions 515# are "knitted in" between the SSE groups. Distance is maintained for 516# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer 517# [which allegedly also implements SSSE3]... 518# 519# Temporary registers usage. X[2] is volatile at the entry and at the 520# end is restored from backtrace ring buffer. X[3] is expected to 521# contain current K_XX_XX constant and is used to caclulate X[-1]+K 522# from previous round, it becomes volatile the moment the value is 523# saved to stack for transfer to IALU. X[4] becomes volatile whenever 524# X[-4] is accumulated and offloaded to backtrace ring buffer, at the 525# end it is loaded with next K_XX_XX [which becomes X[3] in next 526# round]... 
527# 528sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 529{ use integer; 530 my $body = shift; 531 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions 532 my ($a,$b,$c,$d,$e); 533 534 eval(shift(@insns)); 535 eval(shift(@insns)); 536 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" 537 &movdqa (@X[2],@X[-1&7]); 538 eval(shift(@insns)); 539 eval(shift(@insns)); 540 541 &paddd (@X[3],@X[-1&7]); 542 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer 543 eval(shift(@insns)); 544 eval(shift(@insns)); 545 &psrldq (@X[2],4); # "X[-3]", 3 dwords 546 eval(shift(@insns)); 547 eval(shift(@insns)); 548 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" 549 eval(shift(@insns)); 550 eval(shift(@insns)); 551 552 &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" 553 eval(shift(@insns)); 554 eval(shift(@insns)); 555 eval(shift(@insns)); 556 eval(shift(@insns)); 557 558 &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" 559 eval(shift(@insns)); 560 eval(shift(@insns)); 561 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU 562 eval(shift(@insns)); 563 eval(shift(@insns)); 564 565 &movdqa (@X[4],@X[0]); 566 &movdqa (@X[2],@X[0]); 567 eval(shift(@insns)); 568 eval(shift(@insns)); 569 eval(shift(@insns)); 570 eval(shift(@insns)); 571 572 &pslldq (@X[4],12); # "X[0]"<<96, extract one dword 573 &paddd (@X[0],@X[0]); 574 eval(shift(@insns)); 575 eval(shift(@insns)); 576 eval(shift(@insns)); 577 eval(shift(@insns)); 578 579 &psrld (@X[2],31); 580 eval(shift(@insns)); 581 eval(shift(@insns)); 582 &movdqa (@X[3],@X[4]); 583 eval(shift(@insns)); 584 eval(shift(@insns)); 585 586 &psrld (@X[4],30); 587 &por (@X[0],@X[2]); # "X[0]"<<<=1 588 eval(shift(@insns)); 589 eval(shift(@insns)); 590 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer 591 eval(shift(@insns)); 592 eval(shift(@insns)); 593 594 &pslld (@X[3],2); 595 &pxor (@X[0],@X[4]); 596 eval(shift(@insns)); 597 eval(shift(@insns)); 598 &movdqa 
(@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX 599 eval(shift(@insns)); 600 eval(shift(@insns)); 601 602 &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 603 &movdqa (@X[1],@X[-2&7]) if ($Xi<7); 604 eval(shift(@insns)); 605 eval(shift(@insns)); 606 607 foreach (@insns) { eval; } # remaining instructions [if any] 608 609 $Xi++; push(@X,shift(@X)); # "rotate" X[] 610} 611 612sub Xupdate_ssse3_32_79() 613{ use integer; 614 my $body = shift; 615 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions 616 my ($a,$b,$c,$d,$e); 617 618 &movdqa (@X[2],@X[-1&7]) if ($Xi==8); 619 eval(shift(@insns)); # body_20_39 620 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" 621 &palignr(@X[2],@X[-2&7],8); # compose "X[-6]" 622 eval(shift(@insns)); 623 eval(shift(@insns)); 624 eval(shift(@insns)); # rol 625 626 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" 627 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer 628 eval(shift(@insns)); 629 eval(shift(@insns)); 630 if ($Xi%5) { 631 &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... 632 } else { # ... 
or load next one 633 &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); 634 } 635 &paddd (@X[3],@X[-1&7]); 636 eval(shift(@insns)); # ror 637 eval(shift(@insns)); 638 639 &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" 640 eval(shift(@insns)); # body_20_39 641 eval(shift(@insns)); 642 eval(shift(@insns)); 643 eval(shift(@insns)); # rol 644 645 &movdqa (@X[2],@X[0]); 646 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU 647 eval(shift(@insns)); 648 eval(shift(@insns)); 649 eval(shift(@insns)); # ror 650 eval(shift(@insns)); 651 652 &pslld (@X[0],2); 653 eval(shift(@insns)); # body_20_39 654 eval(shift(@insns)); 655 &psrld (@X[2],30); 656 eval(shift(@insns)); 657 eval(shift(@insns)); # rol 658 eval(shift(@insns)); 659 eval(shift(@insns)); 660 eval(shift(@insns)); # ror 661 eval(shift(@insns)); 662 663 &por (@X[0],@X[2]); # "X[0]"<<<=2 664 eval(shift(@insns)); # body_20_39 665 eval(shift(@insns)); 666 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer 667 eval(shift(@insns)); 668 eval(shift(@insns)); # rol 669 eval(shift(@insns)); 670 eval(shift(@insns)); 671 eval(shift(@insns)); # ror 672 &movdqa (@X[3],@X[0]) if ($Xi<19); 673 eval(shift(@insns)); 674 675 foreach (@insns) { eval; } # remaining instructions 676 677 $Xi++; push(@X,shift(@X)); # "rotate" X[] 678} 679 680sub Xuplast_ssse3_80() 681{ use integer; 682 my $body = shift; 683 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 684 my ($a,$b,$c,$d,$e); 685 686 eval(shift(@insns)); 687 &paddd (@X[3],@X[-1&7]); 688 eval(shift(@insns)); 689 eval(shift(@insns)); 690 eval(shift(@insns)); 691 eval(shift(@insns)); 692 693 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU 694 695 foreach (@insns) { eval; } # remaining instructions 696 697 &mov ($inp=@T[1],&DWP(192+4,"esp")); 698 &cmp ($inp,&DWP(192+8,"esp")); 699 &je (&label("done")); 700 701 &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19 702 &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask 703 
&movdqu (@X[-4&7],&QWP(0,$inp)); # load input 704 &movdqu (@X[-3&7],&QWP(16,$inp)); 705 &movdqu (@X[-2&7],&QWP(32,$inp)); 706 &movdqu (@X[-1&7],&QWP(48,$inp)); 707 &add ($inp,64); 708 &pshufb (@X[-4&7],@X[2]); # byte swap 709 &mov (&DWP(192+4,"esp"),$inp); 710 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot 711 712 $Xi=0; 713} 714 715sub Xloop_ssse3() 716{ use integer; 717 my $body = shift; 718 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 719 my ($a,$b,$c,$d,$e); 720 721 eval(shift(@insns)); 722 eval(shift(@insns)); 723 &pshufb (@X[($Xi-3)&7],@X[2]); 724 eval(shift(@insns)); 725 eval(shift(@insns)); 726 &paddd (@X[($Xi-4)&7],@X[3]); 727 eval(shift(@insns)); 728 eval(shift(@insns)); 729 eval(shift(@insns)); 730 eval(shift(@insns)); 731 &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU 732 eval(shift(@insns)); 733 eval(shift(@insns)); 734 &psubd (@X[($Xi-4)&7],@X[3]); 735 736 foreach (@insns) { eval; } 737 $Xi++; 738} 739 740sub Xtail_ssse3() 741{ use integer; 742 my $body = shift; 743 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 744 my ($a,$b,$c,$d,$e); 745 746 foreach (@insns) { eval; } 747} 748 749sub body_00_19 () { 750 ( 751 '($a,$b,$c,$d,$e)=@V;'. 752 '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer 753 '&xor ($c,$d);', 754 '&mov (@T[1],$a);', # $b in next round 755 '&$_rol ($a,5);', 756 '&and (@T[0],$c);', # ($b&($c^$d)) 757 '&xor ($c,$d);', # restore $c 758 '&xor (@T[0],$d);', 759 '&add ($e,$a);', 760 '&$_ror ($b,$j?7:2);', # $b>>>2 761 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' 762 ); 763} 764 765sub body_20_39 () { 766 ( 767 '($a,$b,$c,$d,$e)=@V;'. 
768 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer 769 '&xor (@T[0],$d);', # ($b^$d) 770 '&mov (@T[1],$a);', # $b in next round 771 '&$_rol ($a,5);', 772 '&xor (@T[0],$c);', # ($b^$d^$c) 773 '&add ($e,$a);', 774 '&$_ror ($b,7);', # $b>>>2 775 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' 776 ); 777} 778 779sub body_40_59 () { 780 ( 781 '($a,$b,$c,$d,$e)=@V;'. 782 '&mov (@T[1],$c);', 783 '&xor ($c,$d);', 784 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer 785 '&and (@T[1],$d);', 786 '&and (@T[0],$c);', # ($b&($c^$d)) 787 '&$_ror ($b,7);', # $b>>>2 788 '&add ($e,@T[1]);', 789 '&mov (@T[1],$a);', # $b in next round 790 '&$_rol ($a,5);', 791 '&add ($e,@T[0]);', 792 '&xor ($c,$d);', # restore $c 793 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' 794 ); 795} 796 797&set_label("loop",16); 798 &Xupdate_ssse3_16_31(\&body_00_19); 799 &Xupdate_ssse3_16_31(\&body_00_19); 800 &Xupdate_ssse3_16_31(\&body_00_19); 801 &Xupdate_ssse3_16_31(\&body_00_19); 802 &Xupdate_ssse3_32_79(\&body_00_19); 803 &Xupdate_ssse3_32_79(\&body_20_39); 804 &Xupdate_ssse3_32_79(\&body_20_39); 805 &Xupdate_ssse3_32_79(\&body_20_39); 806 &Xupdate_ssse3_32_79(\&body_20_39); 807 &Xupdate_ssse3_32_79(\&body_20_39); 808 &Xupdate_ssse3_32_79(\&body_40_59); 809 &Xupdate_ssse3_32_79(\&body_40_59); 810 &Xupdate_ssse3_32_79(\&body_40_59); 811 &Xupdate_ssse3_32_79(\&body_40_59); 812 &Xupdate_ssse3_32_79(\&body_40_59); 813 &Xupdate_ssse3_32_79(\&body_20_39); 814 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" 815 816 $saved_j=$j; @saved_V=@V; 817 818 &Xloop_ssse3(\&body_20_39); 819 &Xloop_ssse3(\&body_20_39); 820 &Xloop_ssse3(\&body_20_39); 821 822 &mov (@T[1],&DWP(192,"esp")); # update context 823 &add ($A,&DWP(0,@T[1])); 824 &add (@T[0],&DWP(4,@T[1])); # $b 825 &add ($C,&DWP(8,@T[1])); 826 &mov (&DWP(0,@T[1]),$A); 827 &add ($D,&DWP(12,@T[1])); 828 &mov (&DWP(4,@T[1]),@T[0]); 829 &add ($E,&DWP(16,@T[1])); 830 &mov (&DWP(8,@T[1]),$C); 831 &mov ($B,@T[0]); 832 &mov 
(&DWP(12,@T[1]),$D); 833 &mov (&DWP(16,@T[1]),$E); 834 &movdqa (@X[0],@X[-3&7]); 835 836 &jmp (&label("loop")); 837 838&set_label("done",16); $j=$saved_j; @V=@saved_V; 839 840 &Xtail_ssse3(\&body_20_39); 841 &Xtail_ssse3(\&body_20_39); 842 &Xtail_ssse3(\&body_20_39); 843 844 &mov (@T[1],&DWP(192,"esp")); # update context 845 &add ($A,&DWP(0,@T[1])); 846 &mov ("esp",&DWP(192+12,"esp")); # restore %esp 847 &add (@T[0],&DWP(4,@T[1])); # $b 848 &add ($C,&DWP(8,@T[1])); 849 &mov (&DWP(0,@T[1]),$A); 850 &add ($D,&DWP(12,@T[1])); 851 &mov (&DWP(4,@T[1]),@T[0]); 852 &add ($E,&DWP(16,@T[1])); 853 &mov (&DWP(8,@T[1]),$C); 854 &mov (&DWP(12,@T[1]),$D); 855 &mov (&DWP(16,@T[1]),$E); 856 857&function_end("_sha1_block_data_order_ssse3"); 858 859if ($ymm) { 860my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded 861my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 862my @V=($A,$B,$C,$D,$E); 863my $j=0; # hash round 864my @T=($T,$tmp1); 865my $inp; 866 867my $_rol=sub { &shld(@_[0],@_) }; 868my $_ror=sub { &shrd(@_[0],@_) }; 869 870&function_begin("_sha1_block_data_order_avx"); 871 &call (&label("pic_point")); # make it PIC! 
872 &set_label("pic_point"); 873 &blindpop($tmp1); 874 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); 875&set_label("avx_shortcut"); 876 &vzeroall(); 877 878 &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19 879 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39 880 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59 881 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79 882 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask 883 884 &mov ($E,&wparam(0)); # load argument block 885 &mov ($inp=@T[1],&wparam(1)); 886 &mov ($D,&wparam(2)); 887 &mov (@T[0],"esp"); 888 889 # stack frame layout 890 # 891 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area 892 # X[4]+K X[5]+K X[6]+K X[7]+K 893 # X[8]+K X[9]+K X[10]+K X[11]+K 894 # X[12]+K X[13]+K X[14]+K X[15]+K 895 # 896 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area 897 # X[4] X[5] X[6] X[7] 898 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 899 # 900 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants 901 # K_40_59 K_40_59 K_40_59 K_40_59 902 # K_60_79 K_60_79 K_60_79 K_60_79 903 # K_00_19 K_00_19 K_00_19 K_00_19 904 # pbswap mask 905 # 906 # +192 ctx # argument block 907 # +196 inp 908 # +200 end 909 # +204 esp 910 &sub ("esp",208); 911 &and ("esp",-64); 912 913 &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants 914 &vmovdqa(&QWP(112+16,"esp"),@X[5]); 915 &vmovdqa(&QWP(112+32,"esp"),@X[6]); 916 &shl ($D,6); # len*64 917 &vmovdqa(&QWP(112+48,"esp"),@X[3]); 918 &add ($D,$inp); # end of input 919 &vmovdqa(&QWP(112+64,"esp"),@X[2]); 920 &add ($inp,64); 921 &mov (&DWP(192+0,"esp"),$E); # save argument block 922 &mov (&DWP(192+4,"esp"),$inp); 923 &mov (&DWP(192+8,"esp"),$D); 924 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp 925 926 &mov ($A,&DWP(0,$E)); # load context 927 &mov ($B,&DWP(4,$E)); 928 &mov ($C,&DWP(8,$E)); 929 &mov ($D,&DWP(12,$E)); 930 &mov ($E,&DWP(16,$E)); 931 &mov (@T[0],$B); # magic seed 932 933 &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] 934 &vmovdqu(@X[-3&7],&QWP(-48,$inp)); 935 
# Remainder of the AVX prologue: finish loading the first 64-byte block,
# byte-swap it with the mask in @X[2], pre-add K_00_19 to the first three
# vectors and park the sums in the XMM->IALU transfer area at the bottom of
# the frame, then enter the main loop.
    &vmovdqu(@X[-2&7],&QWP(-32,$inp));
    &vmovdqu(@X[-1&7],&QWP(-16,$inp));
    &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
    &vpshufb(@X[-3&7],@X[-3&7],@X[2]);
    &vpshufb(@X[-2&7],@X[-2&7],@X[2]);
    &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
    &vpshufb(@X[-1&7],@X[-1&7],@X[2]);
    &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19
    &vpaddd (@X[1],@X[-3&7],@X[3]);
    &vpaddd (@X[2],@X[-2&7],@X[3]);
    &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
    &vmovdqa(&QWP(0+16,"esp"),@X[1]);
    &vmovdqa(&QWP(0+32,"esp"),@X[2]);
    &jmp (&label("loop"));

# Xupdate_avx_16_31($body)
#
# Emits four invocations' worth of round code produced by the generator
# $body (a code ref returning a list of instruction strings), interleaved
# with the AVX instructions that derive the next message-schedule vector in
# @X[0].  Along the way it
#   - saves @X[-4&7] to the backtrace area at +64,
#   - adds the current constant to @X[-1&7] and stores the sum to the
#     XMM->IALU transfer area at +0,
#   - reloads a backtrace vector when $Xi>5,
#   - fetches the K_XX_XX vector selected by $Xi/5 into @X[4].
# On return $Xi has been incremented and @X rotated by one position.
# The order of the statements is the order of the emitted instructions.
sub Xupdate_avx_16_31() # recall that $Xi starts with 4
{ use integer; # '/' and '%' below are integer operations
    my $body = shift;
    my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
    # NOTE(review): these lexicals are not referenced directly in this sub;
    # presumably the strings returned by $body use them when eval'ed --
    # confirm against the body_* generators before touching them.
    my ($a,$b,$c,$d,$e);

    eval(shift(@insns));
    eval(shift(@insns));
    &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
    eval(shift(@insns));
    eval(shift(@insns));

    &vpaddd (@X[3],@X[3],@X[-1&7]); # constant in @X[3] += newest X[] vector
    &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
    eval(shift(@insns));
    eval(shift(@insns));
    &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords
    eval(shift(@insns));
    eval(shift(@insns));
    &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
    eval(shift(@insns));
    eval(shift(@insns));

    &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
    eval(shift(@insns));
    eval(shift(@insns));
    &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
    eval(shift(@insns));
    eval(shift(@insns));

    &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));

    # Rotate left by one: the bits shifted out are collected in @X[2] here,
    # the doubling happens via vpaddd and the two halves are OR-ed below.
    &vpsrld (@X[2],@X[0],31);
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));

    &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword
    &vpaddd (@X[0],@X[0],@X[0]);
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));

    &vpsrld (@X[3],@X[4],30);
    &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));

    &vpslld (@X[4],@X[4],2);
    &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
    eval(shift(@insns));
    eval(shift(@insns));
    &vpxor (@X[0],@X[0],@X[3]);
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));

    &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2
    eval(shift(@insns));
    eval(shift(@insns));
    # Offset 112-16 is the borrowed backtrace slot that holds K_00_19;
    # the following 16-byte slots hold K_20_39, K_40_59 and K_60_79.
    &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
    eval(shift(@insns));
    eval(shift(@insns));

    foreach (@insns) { eval; } # remaining instructions [if any]

    $Xi++; push(@X,shift(@X)); # "rotate" X[]
}

# Xupdate_avx_32_79($body)
#
# Same contract as Xupdate_avx_16_31: four invocations of $body are
# interleaved with the AVX instructions that compute the next
# message-schedule vector in @X[0], here as a XOR of four earlier vectors
# rotated left by 2.  The constant for the next step is either copied from
# @X[3] or, when $Xi is a multiple of 5, loaded from the frame.  The
# backtrace reload at the end is emitted only while $Xi<19.
# On return $Xi has been incremented and @X rotated by one position.
sub Xupdate_avx_32_79()
{ use integer; # '/' and '%' below are integer operations
    my $body = shift;
    my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
    # NOTE(review): presumably consumed by the eval'ed strings from $body --
    # confirm against the body_* generators.
    my ($a,$b,$c,$d,$e);

    &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
    &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
    eval(shift(@insns)); # body_20_39
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns)); # rol

    &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
    &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
    eval(shift(@insns));
    eval(shift(@insns));
    if ($Xi%5) {
        &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
    } else { # ... or load next one
        &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
    }
    &vpaddd (@X[3],@X[3],@X[-1&7]); # constant in @X[3] += newest X[] vector
    eval(shift(@insns)); # ror
    eval(shift(@insns));

    &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]"
    eval(shift(@insns)); # body_20_39
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns)); # rol

    &vpsrld (@X[2],@X[0],30); # high two bits of each dword, for the rotate below
    &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns)); # ror
    eval(shift(@insns));

    &vpslld (@X[0],@X[0],2);
    eval(shift(@insns)); # body_20_39
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns)); # rol
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns)); # ror
    eval(shift(@insns));

    &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2
    eval(shift(@insns)); # body_20_39
    eval(shift(@insns));
    &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
    eval(shift(@insns));
    eval(shift(@insns)); # rol
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns)); # ror
    eval(shift(@insns));

    foreach (@insns) { eval; } # remaining instructions

    $Xi++; push(@X,shift(@X)); # "rotate" X[]
}

# Xuplast_avx_80($body)
#
# Emits four invocations of $body together with the last X[]+K addition and
# store.  It then emits the end-of-input test: the saved input pointer
# (+196) is compared with the saved end pointer (+200) and control branches
# to "done" when they are equal.  On the fall-through path it reloads
# K_00_19 and the byte-swap mask, loads the next 64-byte block, advances and
# saves the input pointer, byte-swaps the first vector and re-seeds the
# borrowed K_00_19 slot.  Resets $Xi to 0 for the Xloop_avx calls that follow.
sub Xuplast_avx_80()
{ use integer;
    my $body = shift;
    my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
    # NOTE(review): presumably consumed by the eval'ed strings from $body --
    # confirm against the body_* generators.
    my ($a,$b,$c,$d,$e);

    eval(shift(@insns));
    &vpaddd (@X[3],@X[3],@X[-1&7]);
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));

    &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU

    foreach (@insns) { eval; } # remaining instructions

    &mov ($inp=@T[1],&DWP(192+4,"esp")); # saved input pointer
    &cmp ($inp,&DWP(192+8,"esp")); # reached the saved end pointer?
    &je (&label("done"));

    &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
    &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
    &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
    &vmovdqu(@X[-3&7],&QWP(16,$inp));
    &vmovdqu(@X[-2&7],&QWP(32,$inp));
    &vmovdqu(@X[-1&7],&QWP(48,$inp));
    &add ($inp,64);
    &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
    &mov (&DWP(192+4,"esp"),$inp); # save advanced input pointer
    &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot

    $Xi=0;
}

# Xloop_avx($body)
#
# Emits four invocations of $body interleaved with preparation of the block
# loaded by Xuplast_avx_80: byte-swap vector ($Xi-3)&7, add the constant in
# @X[3] to vector ($Xi-4)&7 and store the sum at 16*$Xi in the transfer
# area.  Increments $Xi; @X is not rotated here.
sub Xloop_avx()
{ use integer;
    my $body = shift;
    my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
    # NOTE(review): presumably consumed by the eval'ed strings from $body --
    # confirm against the body_* generators.
    my ($a,$b,$c,$d,$e);

    eval(shift(@insns));
    eval(shift(@insns));
    &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); # byte swap
    eval(shift(@insns));
    eval(shift(@insns));
    &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]); # add constant held in @X[3]
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU
    eval(shift(@insns));
    eval(shift(@insns));

    foreach (@insns) { eval; }
    $Xi++;
}

# Xtail_avx($body)
#
# Emits four invocations of $body with no vector work interleaved; used on
# the "done" path where no further input block is prepared.
sub Xtail_avx()
{ use integer;
    my $body = shift;
    my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
    # NOTE(review): presumably consumed by the eval'ed strings from $body --
    # confirm against the body_* generators.
    my ($a,$b,$c,$d,$e);

    foreach (@insns) { eval; }
}

# Main loop emission.  Each helper call below emits four invocations of the
# given round generator, so the 17 calls up to Xuplast_avx_80 plus the three
# Xloop_avx (or Xtail_avx) calls add up to 80.
&set_label("loop",16);
    &Xupdate_avx_16_31(\&body_00_19);
    &Xupdate_avx_16_31(\&body_00_19);
    &Xupdate_avx_16_31(\&body_00_19);
    &Xupdate_avx_16_31(\&body_00_19);
    &Xupdate_avx_32_79(\&body_00_19);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xuplast_avx_80(\&body_20_39); # can jump to "done"

    # Snapshot the generator state at the point where the emitted code may
    # branch to "done"; it is restored at the "done" label below so that both
    # continuations are generated from the same state.
    $saved_j=$j; @saved_V=@V;

    &Xloop_avx(\&body_20_39);
    &Xloop_avx(\&body_20_39);
    &Xloop_avx(\&body_20_39);

    # Add the previous context words (ctx pointer saved at +192) to the
    # working values and write the sums back.
    &mov (@T[1],&DWP(192,"esp")); # update context
    &add ($A,&DWP(0,@T[1]));
    &add (@T[0],&DWP(4,@T[1])); # $b
    &add ($C,&DWP(8,@T[1]));
    &mov (&DWP(0,@T[1]),$A);
    &add ($D,&DWP(12,@T[1]));
    &mov (&DWP(4,@T[1]),@T[0]);
    &add ($E,&DWP(16,@T[1]));
    &mov (&DWP(8,@T[1]),$C);
    &mov ($B,@T[0]); # updated b goes back into $B for the next iteration
    &mov (&DWP(12,@T[1]),$D);
    &mov (&DWP(16,@T[1]),$E);

    &jmp (&label("loop"));

# Target of the branch emitted by Xuplast_avx_80: finish the remaining
# rounds without preparing another block, clear the vector registers,
# restore the caller's %esp and store the final context.
&set_label("done",16); $j=$saved_j; @V=@saved_V;

    &Xtail_avx(\&body_20_39);
    &Xtail_avx(\&body_20_39);
    &Xtail_avx(\&body_20_39);

    &vzeroall();

    &mov (@T[1],&DWP(192,"esp")); # update context
    &add ($A,&DWP(0,@T[1]));
    # The frame is not addressed through %esp after this point; the context
    # is reached through @T[1].
    &mov ("esp",&DWP(192+12,"esp")); # restore %esp
    &add (@T[0],&DWP(4,@T[1])); # $b
    &add ($C,&DWP(8,@T[1]));
    &mov (&DWP(0,@T[1]),$A);
    &add ($D,&DWP(12,@T[1]));
    &mov (&DWP(4,@T[1]),@T[0]);
    &add ($E,&DWP(16,@T[1]));
    &mov (&DWP(8,@T[1]),$C);
    &mov (&DWP(12,@T[1]),$D);
    &mov (&DWP(16,@T[1]),$E);
&function_end("_sha1_block_data_order_avx");
}
# Constant table addressed by the prologue: four round constants, each
# replicated across four dwords, followed by the byte-swap shuffle mask.
# The label is aligned to 64.
&set_label("K_XX_XX",64);
&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();