#!/usr/bin/env perl

# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
# functions were re-implemented to address P4 performance issue [see
# commentary below], and in 2006 the rest was rewritten in order to
# gain freedom to liberate licensing terms.

# January, September 2004.
#
# It was noted that Intel IA-32 C compiler generates code which
# performs ~30% *faster* on P4 CPU than original *hand-coded*
# SHA1 assembler implementation. To address this problem (and
# prove that humans are still better than machines:-), the
# original code was overhauled, which resulted in following
# performance changes:
#
#               compared with original  compared with Intel cc
#               assembler impl.         generated code
# Pentium       -16%                    +48%
# PIII/AMD      +8%                     +16%
# P4            +85%(!)                 +45%
#
# As you can see Pentium came out as loser:-( Yet I reckoned that
# improvement on P4 outweighs the loss and incorporated this
# re-tuned code to 0.9.7 and later.
# ----------------------------------------------------------------
#                                       <appro@fy.chalmers.se>

# August 2009.
#
# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
# '(c&d) + (b&(c^d))', which allows to accumulate partial results
# and lighten "pressure" on scratch registers. This resulted in
# >12% performance improvement on contemporary AMD cores (with no
# degradation on other CPUs:-). Also, the code was revised to maximize
# "distance" between instructions producing input to 'lea' instruction
# and the 'lea' instruction itself, which is essential for Intel Atom
# core and resulted in ~15% improvement.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
# and in SSE2 context was first explored by Dean Gaudet in 2004, see
# http://arctic.org/~dean/crypto/sha1.html. Since then several things
# have changed that made it interesting again:
#
# a) XMM units became faster and wider;
# b) instruction set became more versatile;
# c) an important observation was made by Max Locktyukhin, which made
#    it possible to reduce amount of instructions required to perform
#    the operation in question, for further details see
#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.

# April 2011.
#
# Add AVX code path, probably most controversial... The thing is that
# switch to AVX alone improves performance by as little as 4% in
# comparison to SSSE3 code path. But below result doesn't look like
# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
# a pair of µ-ops, and it's the additional µ-ops, two per round, that
# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
# cycles per processed byte. But 'sh[rl]d' is not something that used
# to be fast, nor does it appear to be fast in upcoming Bulldozer
# [according to its optimization manual]. Which is why AVX code path
# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
# makes no sense to keep the AVX code path. If somebody feels that
# strongly, it's probably more appropriate to discuss possibility of
# using vector rotate XOP on AMD...

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#               x86             SSSE3           AVX
# Pentium       15.7            -
# PIII          11.5            -
# P4            10.6            -
# AMD K8        7.1             -
# Core2         7.3             6.1/+20%        -
# Atom          12.5            9.5(*)/+32%     -
# Westmere      7.3             5.6/+30%        -
# Sandy Bridge  8.8             6.2/+40%        5.1(**)/+70%
#
# (*)   Loop is 1056 instructions long and expected result is ~8.25.
#       It remains a mystery [to me] why ILP is limited to 1.7.
#
# (**)  As per above comment, the result is for AVX *plus* sh[rl]d.

# Locate the directory this script lives in so that the perlasm helper
# library can be found both next to the script and in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# First argument selects the assembler flavour; a trailing "386" argument
# is passed on to asm_init as a flag.
&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");

# $xmm - set when -DOPENSSL_IA32_SSE2 is among the arguments; enables the
#        SIMD code paths and the run-time dispatch on OPENSSL_ia32cap_P.
# $ymm - set when, in addition, the assembler is detected to be recent
#        enough to assemble AVX.
# NOTE(review): the detected version is compared as a decimal number, so
# a version string such as "2.9" compares greater than 2.19 - confirm
# that this is acceptable for the assemblers in use.
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
		$1>=2.19);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
		$1>=2.03);	# first version supporting AVX

&external_label("OPENSSL_ia32cap_P") if ($xmm);


# Register allocation for the integer-only code path.
$A="eax";
$B="ebx";
$C="ecx";
$D="edx";
$E="edi";
$T="esi";
$tmp1="ebp";

# Rotating register file: after every round the last element is moved to
# the front [see the unshift(@V,pop(@V)) calls in the round loops].
@V=($A,$B,$C,$D,$E,$T);

$alt=0;	# 1 denotes alternative IALU implementation, which performs
	# 8% *worse* on P4, same on Westmere and Atom, 2% better on
	# Sandy Bridge...

# BODY_00_15($n,$a,$b,$c,$d,$e,$f)
#
# Emit round $n [0..15]. $a..$e name the registers currently holding the
# working variables, $f is the sixth register of @V, which accumulates
# the result and becomes "a" in next round. The message word is read
# from the stack slot &swtmp($n%16).
sub BODY_00_15
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("00_15 $n");

	&mov($f,$c);			# f to hold F_00_19(b,c,d)
	 if ($n==0)  { &mov($tmp1,$a); }
	 else        { &mov($a,$tmp1); }
	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
	 &xor($f,$d);
	&add($tmp1,$e);			# tmp1+=e;
	 &mov($e,&swtmp($n%16));	# e becomes volatile and is loaded
					# with xi, also note that e becomes
					# f in next round...
	&and($f,$b);
	&rotr($b,2);			# b=ROTATE(b,30)
	 &xor($f,$d);			# f holds F_00_19(b,c,d)
	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi

	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
		      &add($f,$tmp1); }	# f+=tmp1
	else        { &add($tmp1,$f); }	# f becomes a in next round
	&mov($tmp1,$a)			if ($alt && $n==15);
	}

# BODY_16_19($n,$a,$b,$c,$d,$e,$f)
#
# Emit round $n [16..19]: same round function as BODY_00_15, but the
# message word is produced by Xupdate from stack slots ($n+2)%16,
# ($n+8)%16 and ($n+13)%16 and written back to slot $n%16. $alt selects
# between two instruction schedules.
sub BODY_16_19
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("16_19 $n");

if ($alt) {
	&xor($c,$d);
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&and($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d), b&=c^d
	 &xor($f,&swtmp(($n+8)%16));
	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &add($e,$tmp1);		# e+=F_00_19(b,c,d)
	&xor($c,$d);			# restore $c
	 &mov($tmp1,$a);		# b in next round
	&rotr($b,$n==16?2:7);		# b=ROTATE(b,30)
	 &mov(&swtmp($n%16),$f);	# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$a);			# f+=ROTATE(a,5)
} else {
	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);
	 &xor($f,&swtmp(($n+8)%16));
	&and($tmp1,$b);
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &xor($tmp1,$d);		# tmp1=F_00_19(b,c,d)
	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
	 &mov($tmp1,$a);
	&rotr($b,2);			# b=ROTATE(b,30)
	 &mov(&swtmp($n%16),$f);	# xi=f
	&rotl($tmp1,5);			# ROTATE(a,5)
	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$tmp1);		# f+=ROTATE(a,5)
}
	}

# BODY_20_39($n,$a,$b,$c,$d,$e,$f)
#
# Emit round $n using the F_20_39 round function. It is called for
# rounds 20..39 and again for rounds 60..79; the round constant $K is
# chosen from $n. The store of xi is skipped for $n>=77 and the
# pre-fetch of the next word for $n==79.
sub BODY_20_39
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;
	local $K=($n<40)?0x6ed9eba1:0xca62c1d6;

	&comment("20_39 $n");

if ($alt) {
	&xor($tmp1,$c);			# tmp1 to hold F_20_39(b,c,d), b^=c
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+8)%16));
	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &mov($tmp1,$a);		# b in next round
	&rotr($b,7);			# b=ROTATE(b,30)
	 &mov(&swtmp($n%16),$f)		if($n<77);# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	 &xor($b,$c)			if($n==39);# warm up for BODY_40_59
	&and($tmp1,$b)			if($n==39);
	 &lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
	&mov($e,&swtmp(($n+1)%16))	if($n<79);# pre-fetch f for next round
	 &add($f,$a);			# f+=ROTATE(a,5)
	&rotr($a,5)			if ($n==79);
} else {
	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$c);
	 &xor($f,&swtmp(($n+8)%16));
	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &add($e,$tmp1);		# e+=F_20_39(b,c,d)
	&rotr($b,2);			# b=ROTATE(b,30)
	 &mov($tmp1,$a);
	&rotl($tmp1,5);			# ROTATE(a,5)
	 &mov(&swtmp($n%16),$f)		if($n<77);# xi=f
	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
	 &mov($e,&swtmp(($n+1)%16))	if($n<79);# pre-fetch f for next round
	&add($f,$tmp1);			# f+=ROTATE(a,5)
}
	}

# BODY_40_59($n,$a,$b,$c,$d,$e,$f)
#
# Emit round $n [40..59]. F_40_59 is accumulated in two parts, b&(c^d)
# and c&d, as the inline comments show.
sub BODY_40_59
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("40_59 $n");

if ($alt) {
	&add($e,$tmp1);			# e+=b&(c^d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&mov($tmp1,$d);
	 &xor($f,&swtmp(($n+8)%16));
	&xor($c,$d);			# restore $c
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &and($tmp1,$c);
	&rotr($b,7);			# b=ROTATE(b,30)
	 &add($e,$tmp1);		# e+=c&d
	&mov($tmp1,$a);			# b in next round
	 &mov(&swtmp($n%16),$f);	# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	 &xor($b,$c)			if ($n<59);
	&and($tmp1,$b)			if ($n<59);# tmp1 to hold F_40_59(b,c,d)
	 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$a);			# f+=ROTATE(a,5)
} else {
	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);
	 &xor($f,&swtmp(($n+8)%16));
	&and($tmp1,$b);
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &add($tmp1,$e);		# b&(c^d)+=e
	&rotr($b,2);			# b=ROTATE(b,30)
	 &mov($e,$a);			# e becomes volatile
	&rotl($e,5);			# ROTATE(a,5)
	 &mov(&swtmp($n%16),$f);	# xi=f
	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
	 &mov($tmp1,$c);
	&add($f,$e);			# f+=ROTATE(a,5)
	 &and($tmp1,$d);
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$tmp1);		# f+=c&d
}
	}

# sha1_block_data_order: the integer-only implementation. When $xmm is
# set, the entry first inspects OPENSSL_ia32cap_P and jumps to the
# "ssse3_shortcut" [or, if $ymm is set, "avx_shortcut"] label when the
# required capability bits are present, falling through to "x86"
# otherwise.
&function_begin("sha1_block_data_order");
if ($xmm) {
  &static_label("ssse3_shortcut");
  &static_label("avx_shortcut")		if ($ymm);
  &static_label("K_XX_XX");

	&call	(&label("pic_point"));	# make it PIC!
  &set_label("pic_point");
	&blindpop($tmp1);
	&picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));

	&mov	($A,&DWP(0,$T));
	&mov	($D,&DWP(4,$T));
	&test	($D,1<<9);		# check SSSE3 bit
	&jz	(&label("x86"));
	&test	($A,1<<24);		# check FXSR bit
	&jz	(&label("x86"));
	if ($ymm) {
		&and	($D,1<<28);		# mask AVX bit
		&and	($A,1<<30);		# mask "Intel CPU" bit
		&or	($A,$D);
		&cmp	($A,1<<28|1<<30);
		&je	(&label("avx_shortcut"));
	}
	&jmp	(&label("ssse3_shortcut"));
  &set_label("x86",16);
}
	&mov($tmp1,&wparam(0));	# SHA_CTX *c
	&mov($T,&wparam(1));	# const void *input
	&mov($A,&wparam(2));	# size_t num
	&stack_push(16+3);	# allocate X[16]
	&shl($A,6);
	&add($A,$T);
	&mov(&wparam(2),$A);	# pointer beyond the end of input
	&mov($E,&DWP(16,$tmp1));# pre-load E
	&jmp(&label("loop"));

&set_label("loop",16);

	# copy input chunk to X, but reversing byte order!
	for ($i=0; $i<16; $i+=4)
		{
		&mov($A,&DWP(4*($i+0),$T));
		&mov($B,&DWP(4*($i+1),$T));
		&mov($C,&DWP(4*($i+2),$T));
		&mov($D,&DWP(4*($i+3),$T));
		&bswap($A);
		&bswap($B);
		&bswap($C);
		&bswap($D);
		&mov(&swtmp($i+0),$A);
		&mov(&swtmp($i+1),$B);
		&mov(&swtmp($i+2),$C);
		&mov(&swtmp($i+3),$D);
		}
	&mov(&wparam(1),$T);	# redundant in 1st spin

	&mov($A,&DWP(0,$tmp1));	# load SHA_CTX
	&mov($B,&DWP(4,$tmp1));
	&mov($C,&DWP(8,$tmp1));
	&mov($D,&DWP(12,$tmp1));
	# E is pre-loaded

	# 80 rounds; @V is rotated by one position after every round.
	for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
	for(;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
	for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
	for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
	for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

	(($V[5] eq $D) and ($V[0] eq $E)) or die;	# double-check

	&mov($tmp1,&wparam(0));	# re-load SHA_CTX*
	&mov($D,&wparam(1));	# D is last "T" and is discarded

	&add($E,&DWP(0,$tmp1));	# E is last "A"...
	&add($T,&DWP(4,$tmp1));
	&add($A,&DWP(8,$tmp1));
	&add($B,&DWP(12,$tmp1));
	&add($C,&DWP(16,$tmp1));

	&mov(&DWP(0,$tmp1),$E);	# update SHA_CTX
	 &add($D,64);		# advance input pointer
	&mov(&DWP(4,$tmp1),$T);
	 &cmp($D,&wparam(2));	# have we reached the end yet?
	&mov(&DWP(8,$tmp1),$A);
	 &mov($E,$C);		# C is last "E" which needs to be "pre-loaded"
	&mov(&DWP(12,$tmp1),$B);
	 &mov($T,$D);		# input pointer
	&mov(&DWP(16,$tmp1),$C);
	&jb(&label("loop"));

	&stack_pop(16+3);
&function_end("sha1_block_data_order");

if ($xmm) {
######################################################################
# The SSSE3 implementation.
#
# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
# 32 elements of the message schedule or Xupdate outputs. First 4
# quadruples are simply byte-swapped input, next 4 are calculated
# according to method originally suggested by Dean Gaudet (modulo
# being implemented in SSSE3). Once 8 quadruples or 32 elements are
# collected, it switches to routine proposed by Max Locktyukhin.
#
# Calculations inevitably require temporary registers, and there are
# no %xmm registers left to spare. For this reason part of the ring
# buffer, X[2..4] to be specific, is offloaded to 3 quadruples ring
# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
# X[-5], and X[4] - X[-4]...
#
# Another notable optimization is aggressive stack frame compression
# aiming to minimize amount of 9-byte instructions...
#
# Yet another notable optimization is "jumping" $B variable. It means
# that there is no register permanently allocated for $B value. This
# allowed to eliminate one instruction from body_20_39...
#
# Generator state shared by the SSSE3 helper subs and by the code
# strings they eval:
my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0;			# hash round
my @T=($T,$tmp1);
my $inp;

# Rotate emitters are called through these references from the eval'ed
# round bodies.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

&function_begin("_sha1_block_data_order_ssse3");
	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("ssse3_shortcut");

	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
	&movdqa	(@X[5],&QWP(32,$tmp1));		# K_40_59
	&movdqa	(@X[6],&QWP(48,$tmp1));		# K_60_79
	&movdqa	(@X[2],&QWP(64,$tmp1));		# pbswap mask

	&mov	($E,&wparam(0));		# load argument block
	&mov	($inp=@T[1],&wparam(1));
	&mov	($D,&wparam(2));
	&mov	(@T[0],"esp");

	# stack frame layout
	#
	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
	#
	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
	#	X[4]	X[5]	X[6]	X[7]
	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
	#
	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
	#	K_40_59	K_40_59	K_40_59	K_40_59
	#	K_60_79	K_60_79	K_60_79	K_60_79
	#	K_00_19	K_00_19	K_00_19	K_00_19
	#	pbswap mask
	#
	# +192	ctx				# argument block
	# +196	inp
	# +200	end
	# +204	esp
	&sub	("esp",208);
	&and	("esp",-64);

	&movdqa	(&QWP(112+0,"esp"),@X[4]);	# copy constants
	&movdqa	(&QWP(112+16,"esp"),@X[5]);
	&movdqa	(&QWP(112+32,"esp"),@X[6]);
	&shl	($D,6);				# len*64
	&movdqa	(&QWP(112+48,"esp"),@X[3]);
	&add	($D,$inp);			# end of input
	&movdqa	(&QWP(112+64,"esp"),@X[2]);
	&add	($inp,64);
	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
	&mov	(&DWP(192+4,"esp"),$inp);
	&mov	(&DWP(192+8,"esp"),$D);
	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp

	&mov	($A,&DWP(0,$E));		# load context
	&mov	($B,&DWP(4,$E));
	&mov	($C,&DWP(8,$E));
	&mov	($D,&DWP(12,$E));
	&mov	($E,&DWP(16,$E));
	&mov	(@T[0],$B);			# magic seed

	&movdqu	(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
	&movdqu	(@X[-3&7],&QWP(-48,$inp));
	&movdqu	(@X[-2&7],&QWP(-32,$inp));
	&movdqu	(@X[-1&7],&QWP(-16,$inp));
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&pshufb	(@X[-3&7],@X[2]);
	&pshufb	(@X[-2&7],@X[2]);
	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
	&pshufb	(@X[-1&7],@X[2]);
	&paddd	(@X[-4&7],@X[3]);		# add K_00_19
	&paddd	(@X[-3&7],@X[3]);
	&paddd	(@X[-2&7],@X[3]);
	&movdqa	(&QWP(0,"esp"),@X[-4&7]);	# X[]+K xfer to IALU
	&psubd	(@X[-4&7],@X[3]);		# restore X[]
	&movdqa	(&QWP(0+16,"esp"),@X[-3&7]);
	&psubd	(@X[-3&7],@X[3]);
	&movdqa	(&QWP(0+32,"esp"),@X[-2&7]);
	&psubd	(@X[-2&7],@X[3]);
	&movdqa	(@X[0],@X[-3&7]);
	&jmp	(&label("loop"));

######################################################################
# SSE instruction sequence is first broken to groups of independent
# instructions, independent in respect to their inputs and shifter
# (not all architectures have more than one). Then IALU instructions
# are "knitted in" between the SSE groups. Distance is maintained for
# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer
# [which allegedly also implements SSSE3]...
#
# Temporary registers usage. X[2] is volatile at the entry and at the
# end is restored from backtrace ring buffer. X[3] is expected to
# contain current K_XX_XX constant and is used to calculate X[-1]+K
# from previous round, it becomes volatile the moment the value is
# saved to stack for transfer to IALU. X[4] becomes volatile whenever
# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
# end it is loaded with next K_XX_XX [which becomes X[3] in next
# round]...
524238405Sjkim# 525238405Sjkimsub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 526238405Sjkim{ use integer; 527238405Sjkim my $body = shift; 528238405Sjkim my @insns = (&$body,&$body,&$body,&$body); # 40 instructions 529238405Sjkim my ($a,$b,$c,$d,$e); 530238405Sjkim 531238405Sjkim eval(shift(@insns)); 532238405Sjkim eval(shift(@insns)); 533238405Sjkim &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" 534238405Sjkim &movdqa (@X[2],@X[-1&7]); 535238405Sjkim eval(shift(@insns)); 536238405Sjkim eval(shift(@insns)); 537238405Sjkim 538238405Sjkim &paddd (@X[3],@X[-1&7]); 539238405Sjkim &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer 540238405Sjkim eval(shift(@insns)); 541238405Sjkim eval(shift(@insns)); 542238405Sjkim &psrldq (@X[2],4); # "X[-3]", 3 dwords 543238405Sjkim eval(shift(@insns)); 544238405Sjkim eval(shift(@insns)); 545238405Sjkim &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" 546238405Sjkim eval(shift(@insns)); 547238405Sjkim eval(shift(@insns)); 548238405Sjkim 549238405Sjkim &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" 550238405Sjkim eval(shift(@insns)); 551238405Sjkim eval(shift(@insns)); 552238405Sjkim eval(shift(@insns)); 553238405Sjkim eval(shift(@insns)); 554238405Sjkim 555238405Sjkim &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" 556238405Sjkim eval(shift(@insns)); 557238405Sjkim eval(shift(@insns)); 558238405Sjkim &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU 559238405Sjkim eval(shift(@insns)); 560238405Sjkim eval(shift(@insns)); 561238405Sjkim 562238405Sjkim &movdqa (@X[4],@X[0]); 563238405Sjkim &movdqa (@X[2],@X[0]); 564238405Sjkim eval(shift(@insns)); 565238405Sjkim eval(shift(@insns)); 566238405Sjkim eval(shift(@insns)); 567238405Sjkim eval(shift(@insns)); 568238405Sjkim 569238405Sjkim &pslldq (@X[4],12); # "X[0]"<<96, extract one dword 570238405Sjkim &paddd (@X[0],@X[0]); 571238405Sjkim eval(shift(@insns)); 572238405Sjkim eval(shift(@insns)); 573238405Sjkim eval(shift(@insns)); 
574238405Sjkim eval(shift(@insns)); 575238405Sjkim 576238405Sjkim &psrld (@X[2],31); 577238405Sjkim eval(shift(@insns)); 578238405Sjkim eval(shift(@insns)); 579238405Sjkim &movdqa (@X[3],@X[4]); 580238405Sjkim eval(shift(@insns)); 581238405Sjkim eval(shift(@insns)); 582238405Sjkim 583238405Sjkim &psrld (@X[4],30); 584238405Sjkim &por (@X[0],@X[2]); # "X[0]"<<<=1 585238405Sjkim eval(shift(@insns)); 586238405Sjkim eval(shift(@insns)); 587238405Sjkim &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer 588238405Sjkim eval(shift(@insns)); 589238405Sjkim eval(shift(@insns)); 590238405Sjkim 591238405Sjkim &pslld (@X[3],2); 592238405Sjkim &pxor (@X[0],@X[4]); 593238405Sjkim eval(shift(@insns)); 594238405Sjkim eval(shift(@insns)); 595238405Sjkim &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX 596238405Sjkim eval(shift(@insns)); 597238405Sjkim eval(shift(@insns)); 598238405Sjkim 599238405Sjkim &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 600238405Sjkim &movdqa (@X[1],@X[-2&7]) if ($Xi<7); 601238405Sjkim eval(shift(@insns)); 602238405Sjkim eval(shift(@insns)); 603238405Sjkim 604238405Sjkim foreach (@insns) { eval; } # remaining instructions [if any] 605238405Sjkim 606238405Sjkim $Xi++; push(@X,shift(@X)); # "rotate" X[] 607238405Sjkim} 608238405Sjkim 609238405Sjkimsub Xupdate_ssse3_32_79() 610238405Sjkim{ use integer; 611238405Sjkim my $body = shift; 612238405Sjkim my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions 613238405Sjkim my ($a,$b,$c,$d,$e); 614238405Sjkim 615238405Sjkim &movdqa (@X[2],@X[-1&7]) if ($Xi==8); 616238405Sjkim eval(shift(@insns)); # body_20_39 617238405Sjkim &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" 618238405Sjkim &palignr(@X[2],@X[-2&7],8); # compose "X[-6]" 619238405Sjkim eval(shift(@insns)); 620238405Sjkim eval(shift(@insns)); 621238405Sjkim eval(shift(@insns)); # rol 622238405Sjkim 623238405Sjkim &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" 624238405Sjkim &movdqa 
(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer 625238405Sjkim eval(shift(@insns)); 626238405Sjkim eval(shift(@insns)); 627238405Sjkim if ($Xi%5) { 628238405Sjkim &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... 629238405Sjkim } else { # ... or load next one 630238405Sjkim &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); 631238405Sjkim } 632238405Sjkim &paddd (@X[3],@X[-1&7]); 633238405Sjkim eval(shift(@insns)); # ror 634238405Sjkim eval(shift(@insns)); 635238405Sjkim 636238405Sjkim &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" 637238405Sjkim eval(shift(@insns)); # body_20_39 638238405Sjkim eval(shift(@insns)); 639238405Sjkim eval(shift(@insns)); 640238405Sjkim eval(shift(@insns)); # rol 641238405Sjkim 642238405Sjkim &movdqa (@X[2],@X[0]); 643238405Sjkim &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU 644238405Sjkim eval(shift(@insns)); 645238405Sjkim eval(shift(@insns)); 646238405Sjkim eval(shift(@insns)); # ror 647238405Sjkim eval(shift(@insns)); 648238405Sjkim 649238405Sjkim &pslld (@X[0],2); 650238405Sjkim eval(shift(@insns)); # body_20_39 651238405Sjkim eval(shift(@insns)); 652238405Sjkim &psrld (@X[2],30); 653238405Sjkim eval(shift(@insns)); 654238405Sjkim eval(shift(@insns)); # rol 655238405Sjkim eval(shift(@insns)); 656238405Sjkim eval(shift(@insns)); 657238405Sjkim eval(shift(@insns)); # ror 658238405Sjkim eval(shift(@insns)); 659238405Sjkim 660238405Sjkim &por (@X[0],@X[2]); # "X[0]"<<<=2 661238405Sjkim eval(shift(@insns)); # body_20_39 662238405Sjkim eval(shift(@insns)); 663238405Sjkim &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer 664238405Sjkim eval(shift(@insns)); 665238405Sjkim eval(shift(@insns)); # rol 666238405Sjkim eval(shift(@insns)); 667238405Sjkim eval(shift(@insns)); 668238405Sjkim eval(shift(@insns)); # ror 669238405Sjkim &movdqa (@X[3],@X[0]) if ($Xi<19); 670238405Sjkim eval(shift(@insns)); 671238405Sjkim 672238405Sjkim foreach (@insns) { eval; } # remaining 
# End of the SIMD update routine whose body begins above this point:
# advance the SIMD step counter and rotate the ring of register names.
	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

# Xuplast_ssse3_80(\&body)
#
# Last SIMD step of a block.  Interleaves the four rounds returned by
# four calls of $body with the final X[]+K addition and its store to the
# XMM->IALU transfer area, then emits the end-of-input test: the saved
# input pointer (frame slot 192+4) is compared with the saved end pointer
# (192+8) and the generated code branches to "done" when they are equal.
# Otherwise it reloads K_00_19 and the byte-swap mask from the frame,
# loads the next 64 bytes of input, byte-swaps the first vector, advances
# and saves the input pointer, and resets $Xi to 0 for the Xloop_ssse3
# calls that follow.
#
# The empty prototype is harmless only because every call site uses the
# &name(...) form, which bypasses prototype checking.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed round strings

	eval(shift(@insns));
	  &paddd	(@X[3],@X[-1&7]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	# $inp becomes the register name currently held in @T[1]
	&mov	($inp=@T[1],&DWP(192+4,"esp"));
	&cmp	($inp,&DWP(192+8,"esp"));
	&je	(&label("done"));

	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
	&movdqu	(@X[-4&7],&QWP(0,$inp));	# load input
	&movdqu	(@X[-3&7],&QWP(16,$inp));
	&movdqu	(@X[-2&7],&QWP(32,$inp));
	&movdqu	(@X[-1&7],&QWP(48,$inp));
	&add	($inp,64);
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&mov	(&DWP(192+4,"esp"),$inp);
	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot

  $Xi=0;
}

# Xloop_ssse3(\&body)
#
# Called three times per block, after Xuplast_ssse3_80.  While four
# scalar rounds from $body are emitted, byte-swaps the next input vector,
# adds the constant held in @X[3] to the current vector, stores the sum
# to the X[]+K transfer area, and subtracts the constant again so the
# register keeps the plain X[] value.  Increments $Xi.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed round strings

	eval(shift(@insns));
	eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@X[3]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@X[3]);

	foreach (@insns) { eval; }
  $Xi++;
}

# Xtail_ssse3(\&body)
#
# Emits four scalar rounds from $body with no SIMD work interleaved.
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed round strings

	foreach (@insns) { eval; }
}

# body_00_19 / body_20_39 / body_40_59
#
# Each returns the list of code strings that make up ONE scalar round.
# The strings are eval'ed one by one by the Xupdate/Xuplast/Xloop/Xtail
# routines so that SIMD instructions can be placed between them.  The
# first string binds ($a,$b,$c,$d,$e) to the current @V; the last one
# rotates @V and @T for the next round.  $j is the round counter and
# selects the X[]+K slot (4*($j&15)) in the transfer area.
#
# $a is rotated left by 5 in place and becomes $b of the next round, so
# rotating $b right by 7 there gives the net ">>>2"; only when $j is 0
# has $b not been pre-rotated, hence the 2 in body_00_19.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
	'&xor ($c,$d);',
	'&mov (@T[1],$a);',	# $b in next round
	'&$_rol ($a,5);',
	'&and (@T[0],$c);',	# ($b&($c^$d))
	'&xor ($c,$d);',	# restore $c
	'&xor (@T[0],$d);',
	'&add ($e,$a);',
	'&$_ror ($b,$j?7:2);',	# $b>>>2
	'&add ($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
	'&xor (@T[0],$d);',	# ($b^$d)
	'&mov (@T[1],$a);',	# $b in next round
	'&$_rol ($a,5);',
	'&xor (@T[0],$c);',	# ($b^$d^$c)
	'&add ($e,$a);',
	'&$_ror ($b,7);',	# $b>>>2
	'&add ($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov (@T[1],$c);',
	'&xor ($c,$d);',
	'&add ($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
	'&and (@T[1],$d);',
	'&and (@T[0],$c);',	# ($b&($c^$d))
	'&$_ror ($b,7);',	# $b>>>2
	'&add ($e,@T[1]);',
	'&mov (@T[1],$a);',	# $b in next round
	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&xor ($c,$d);',	# restore $c
	'&add ($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

# Main SSSE3 loop: 20 calls x 4 rounds = 80 rounds per block.  Five calls
# each use body_00_19, body_20_39 and body_40_59; the final five calls
# (one Xupdate, Xuplast and three Xloop) reuse body_20_39.
&set_label("loop",16);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	# Snapshot the generator state at the branch point; it is restored
	# where the "done" label is emitted.
	$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
# Remainder of the SSSE3 context update on the loop path.  @T[1] holds
# the context pointer; @T[0] carries the updated $b value, which is
# stored and then copied back into $B for the next block.
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	($B,@T[0]);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);
	&movdqa	(@X[0],@X[-3&7]);

	&jmp	(&label("loop"));

# "done" is reached from the branch emitted by Xuplast_ssse3_80, so the
# generator state ($j, @V) is rewound to the snapshot taken right after
# that call before the tail rounds are generated.
&set_label("done",16);		$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);

&function_end("_sha1_block_data_order_ssse3");

# AVX code path, generated only when $ymm is set.  The lexicals below are
# private to this path and are captured by the *_avx subs defined inside
# the block.
if ($ymm) {
my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0;			# hash round
my @T=($T,$tmp1);
my $inp;

# Rotates are emitted as double shifts with the same register passed as
# both operands: (@_[0],@_) expands to (reg,reg,count).
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

&function_begin("_sha1_block_data_order_avx");
	&call	(&label("pic_point"));	# make it PIC!
# AVX prologue, continued: recover the address of pic_point from the
# return address pushed by the preceding call, then turn it into the
# address of the K_XX_XX constant table.
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
# NOTE(review): presumably an alternate entry used when $tmp1 already
# points at K_XX_XX -- confirm against the code that jumps here.
&set_label("avx_shortcut");
	&vzeroall();

	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask

	&mov	($E,&wparam(0));		# load argument block
	&mov	($inp=@T[1],&wparam(1));
	&mov	($D,&wparam(2));
	&mov	(@T[0],"esp");

	# stack frame layout
	#
	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
	#
	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
	#	X[4]	X[5]	X[6]	X[7]
	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
	#
	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
	#	K_40_59	K_40_59	K_40_59	K_40_59
	#	K_60_79	K_60_79	K_60_79	K_60_79
	#	K_00_19	K_00_19	K_00_19	K_00_19
	#	pbswap mask
	#
	# +192	ctx				# argument block
	# +196	inp
	# +200	end
	# +204	esp
	&sub	("esp",208);			# reserve the frame ...
	&and	("esp",-64);			# ... and align it to 64 bytes

	&vmovdqa(&QWP(112+0,"esp"),@X[4]);	# copy constants
	&vmovdqa(&QWP(112+16,"esp"),@X[5]);
	&vmovdqa(&QWP(112+32,"esp"),@X[6]);
	&shl	($D,6);				# len*64
	&vmovdqa(&QWP(112+48,"esp"),@X[3]);
	&add	($D,$inp);			# end of input
# AVX prologue, continued: finish copying constants to the frame, save
# the argument block, load the five context words and the first 64 bytes
# of input, and pre-compute X[]+K for the first three vectors.
	&vmovdqa(&QWP(112+64,"esp"),@X[2]);
	&add	($inp,64);
	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
	&mov	(&DWP(192+4,"esp"),$inp);
	&mov	(&DWP(192+8,"esp"),$D);
	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp

	&mov	($A,&DWP(0,$E));		# load context
	&mov	($B,&DWP(4,$E));
	&mov	($C,&DWP(8,$E));
	&mov	($D,&DWP(12,$E));
	&mov	($E,&DWP(16,$E));
	&mov	(@T[0],$B);			# magic seed

	# $inp was already advanced by 64, hence the negative offsets
	&vmovdqu(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
	&vmovdqu(@X[-3&7],&QWP(-48,$inp));
	&vmovdqu(@X[-2&7],&QWP(-32,$inp));
	&vmovdqu(@X[-1&7],&QWP(-16,$inp));
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&vpshufb(@X[-3&7],@X[-3&7],@X[2]);
	&vpshufb(@X[-2&7],@X[-2&7],@X[2]);
	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
	&vpshufb(@X[-1&7],@X[-1&7],@X[2]);
	&vpaddd	(@X[0],@X[-4&7],@X[3]);		# add K_00_19
	&vpaddd	(@X[1],@X[-3&7],@X[3]);
	&vpaddd	(@X[2],@X[-2&7],@X[3]);
	&vmovdqa(&QWP(0,"esp"),@X[0]);		# X[]+K xfer to IALU
	&vmovdqa(&QWP(0+16,"esp"),@X[1]);
	&vmovdqa(&QWP(0+32,"esp"),@X[2]);
	&jmp	(&label("loop"));

# Xupdate_avx_16_31(\&body)
#
# One SIMD message-schedule step of the first kind, interleaved with the
# four scalar rounds returned by four calls of $body.  Following the
# inline comments, "X[0]" is built from "X[-3]", "X[-8]", "X[-14]" and
# "X[-16]" and rotated left by 1, with a separate correction derived from
# "X[0]"<<96.  Along the way the routine saves the oldest vector to the
# backtrace area, adds K to the newest finished vector and stores the sum
# to the X[]+K transfer area, and loads the K constant selected by $Xi/5
# into @X[4].  After the rotation at the end that register is @X[3],
# i.e. the K operand of the next step.  For $Xi==4 the K slot is 112-16,
# the borrowed backtrace slot holding K_00_19.
#
# The empty prototype is harmless only because every call site uses the
# &name(...) form, which bypasses prototype checking.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed round strings

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@X[2],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[2],@X[2],@X[-2&7]);		# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@X[2],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@X[4],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@X[3],@X[4],30);
	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@X[4],@X[4],2);
	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

# Xupdate_avx_32_79(\&body)
#
# One SIMD message-schedule step of the second kind, interleaved with
# four scalar rounds from $body.  Following the inline comments, "X[0]"
# becomes ("X[-32]"^"X[-16]"^"X[-28]"^"X[-6]") rotated left by 2.  @X[4]
# receives either a copy of the current K (@X[3]) or, when $Xi is a
# multiple of 5, the next K from the frame.  The backtrace restore is
# skipped once $Xi reaches 19.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed round strings

	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	 if ($Xi%5) {
	  &vmovdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
	 } else {			# ... or load next one
	  &vmovdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	 }
	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@X[2],@X[0],30);
	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

# Xuplast_avx_80(\&body)
#
# Last SIMD step of a block: emits the final X[]+K addition and store
# interleaved with four scalar rounds, then the end-of-input test (saved
# input pointer at 192+4 against end pointer at 192+8) branching to
# "done".  On the fall-through path it reloads K_00_19 and the byte-swap
# mask, loads the next 64 bytes of input, byte-swaps the first vector,
# advances and saves the input pointer, and resets $Xi to 0.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed round strings

	 eval(shift(@insns));
	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	# $inp becomes the register name currently held in @T[1]
	&mov	($inp=@T[1],&DWP(192+4,"esp"));
	&cmp	($inp,&DWP(192+8,"esp"));
	&je	(&label("done"));

	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
	&vmovdqu(@X[-4&7],&QWP(0,$inp));	# load input
	&vmovdqu(@X[-3&7],&QWP(16,$inp));
	&vmovdqu(@X[-2&7],&QWP(32,$inp));
	&vmovdqu(@X[-1&7],&QWP(48,$inp));
	&add	($inp,64);
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&mov	(&DWP(192+4,"esp"),$inp);
	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot

  $Xi=0;
}

# Xloop_avx(\&body)
#
# Called three times per block, after Xuplast_avx_80.  While four scalar
# rounds from $body are emitted, byte-swaps the next input vector and
# writes X[]+K for the current one into a separate register (@X[$Xi&7]),
# which is then stored to the transfer area.  Increments $Xi.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed round strings

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb	(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa	(&QWP(0+16*$Xi,"esp"),@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

# Xtail_avx(\&body)
#
# Emits four scalar rounds from $body with no SIMD work interleaved.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed round strings

	foreach (@insns) { eval; }
}

# Main AVX loop: 20 calls x 4 rounds = 80 rounds per block.  Five calls
# each use body_00_19, body_20_39 and body_40_59; the final five calls
# (one Xupdate, Xuplast and three Xloop) reuse body_20_39.
&set_label("loop",16);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

	# Snapshot the generator state at the branch point; it is restored
	# where the "done" label is emitted.
	$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
# Remainder of the AVX context update on the loop path.  @T[1] holds the
# context pointer; @T[0] carries the updated $b value, which is copied
# back into $B for the next block.
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	($B,@T[0]);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);

	&jmp	(&label("loop"));

# "done" is reached from the branch emitted by Xuplast_avx_80, so the
# generator state ($j, @V) is rewound to the snapshot taken right after
# that call before the tail rounds are generated.
&set_label("done",16);		$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

	&vzeroall();

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);
&function_end("_sha1_block_data_order_avx");
}
# Constant table addressed PC-relatively by the SIMD prologues.  Each
# round constant is replicated four times to fill one 128-bit vector;
# the last row is the byte-swap shuffle mask.
&set_label("K_XX_XX",64);
&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1);	# K_20_39
&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();