1183234Ssimon#!/usr/bin/env perl
2183234Ssimon#
3183234Ssimon# ====================================================================
4183234Ssimon# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5183234Ssimon# project. The module is, however, dual licensed under OpenSSL and
6183234Ssimon# CRYPTOGAMS licenses depending on where you obtain it. For further
7183234Ssimon# details see http://www.openssl.org/~appro/cryptogams/.
8183234Ssimon# ====================================================================
9183234Ssimon#
10183234Ssimon# sha1_block procedure for x86_64.
11183234Ssimon#
12183234Ssimon# It was brought to my attention that on EM64T compiler-generated code
13183234Ssimon# was far behind 32-bit assembler implementation. This is unlike on
14183234Ssimon# Opteron where compiler-generated code was only 15% behind 32-bit
15183234Ssimon# assembler, which originally made it hard to motivate the effort.
16183234Ssimon# There was a suggestion to mechanically translate 32-bit code, but I
17183234Ssimon# dismissed it, reasoning that x86_64 offers enough register bank
18183234Ssimon# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19238405Sjkim# implementation:-) However! While 64-bit code does perform better
20183234Ssimon# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21183234Ssimon# x86_64 does offer larger *addressable* bank, but out-of-order core
22183234Ssimon# reaches for even more registers through dynamic aliasing, and EM64T
23183234Ssimon# core must have managed to run-time optimize even 32-bit code just as
24183234Ssimon# good as 64-bit one. Performance improvement is summarized in the
25183234Ssimon# following table:
26183234Ssimon#
27183234Ssimon#		gcc 3.4		32-bit asm	cycles/byte
28183234Ssimon# Opteron	+45%		+20%		6.8
29183234Ssimon# Xeon P4	+65%		+0%		9.9
30183234Ssimon# Core2		+60%		+10%		7.0
31183234Ssimon
32238405Sjkim# August 2009.
33238405Sjkim#
34238405Sjkim# The code was revised to minimize code size and to maximize
35238405Sjkim# "distance" between instructions producing input to 'lea'
36238405Sjkim# instruction and the 'lea' instruction itself, which is essential
37238405Sjkim# for Intel Atom core.
38183234Ssimon
39238405Sjkim# October 2010.
40238405Sjkim#
41238405Sjkim# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42238405Sjkim# is to offload message schedule denoted by Wt in NIST specification,
43238405Sjkim# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44238405Sjkim# for background and implementation details. The only difference from
45238405Sjkim# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46238405Sjkim# to free temporary registers.
47238405Sjkim
48238405Sjkim# April 2011.
49238405Sjkim#
50238405Sjkim# Add AVX code path. See sha1-586.pl for further information.
51238405Sjkim
52238405Sjkim######################################################################
53238405Sjkim# Current performance is summarized in the following table. Numbers are
54238405Sjkim# CPU clock cycles spent to process single byte (less is better).
55238405Sjkim#
56238405Sjkim#		x86_64		SSSE3		AVX
57238405Sjkim# P4		9.8		-
58238405Sjkim# Opteron	6.6		-
59238405Sjkim# Core2		6.7		6.1/+10%	-
60238405Sjkim# Atom		11.0		9.7/+13%	-
61238405Sjkim# Westmere	7.1		5.6/+27%	-
62238405Sjkim# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
63238405Sjkim
# Command line: [flavour] output-file.  A lone first argument containing a
# dot is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 flavours (nasm/masm/mingw64, or a .asm output file) get the extra
# code that is emitted under "if ($win64)" further down.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator, either next to this script or in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# The AVX code path is generated only when the assembler that will consume
# the output reports a version new enough: GNU as >= 2.19, NASM >= 2.09,
# ml64 >= 10, or clang/LLVM >= 3.0.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
# Accept multi-digit major versions: with a single [3-9] digit, clang/LLVM
# 10 and later failed to match and were treated as lacking AVX support.
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ &&
	   $2>=3.0);

# Everything printed to STDOUT from here on is piped through the translator.
# The script path is quoted so that a directory containing blanks survives
# the shell, and a failure to start the pipe is fatal instead of silently
# producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
89238405Sjkim
# Registers in which the three arguments arrive (1st, 2nd, 3rd arg) ...
($ctx,$inp,$num)=("%rdi","%rsi","%rdx");

# ... reassigned in order to produce more compact code
($ctx,$inp,$num)=("%r8","%r9","%r10");

# Scratch registers used by the integer-only round bodies.
($t0,$t1,$t2)=("%eax","%ebx","%ecx");

# Pair of registers alternating between "current" and "next" schedule word.
@xi=("%edx","%ebp");

# The five SHA-1 working variables and the list that is rotated per round.
($A,$B,$C,$D,$E)=("%esi","%edi","%r11d","%r12d","%r13d");
@V=($A,$B,$C,$D,$E);
110183234Ssimon
# Append the integer-only code for round $i (0..19) to $code.
#
#   $i           round number
#   $a..$e       registers currently holding the five working variables
#
# The round adds 0x5a827999 + X[$i] + rol($a,5) + (($c^$d)&$b)^$d to $e and
# rotates $b left by 30.  $xi[0] holds X[$i]; the word for the following
# round is prepared in $xi[1] (byte-swapped from the input for rounds below
# 15, computed from the 16-word stack buffer afterwards) and written to its
# stack slot.  On return the two @xi registers have traded places.
sub BODY_00_19 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    # Nothing has pre-loaded X[0] for the very first round.
    if ($i==0) {
	$code.=<<___;
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
    }

    if ($i<15) {
	# Next word still comes straight from the input block.
	$code.=<<___;
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
    } else {
	# Next word is derived from four earlier words of the buffer.
	$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
    }

    # "next" becomes "current" for the following round
    my $last=pop(@xi);
    unshift(@xi,$last);
}
154183234Ssimon
# Append the integer-only code for a "parity" round to $code; used for
# rounds 20..39 and again for 60..79.
#
#   $i           round number; selects the constant (0x6ed9eba1 below
#                round 40, 0xca62c1d6 otherwise)
#   $a..$e       registers currently holding the five working variables
#
# The round adds K + X[$i] + rol($a,5) + ($b^$c^$d) to $e and rotates $b
# left by 30.  Up to round 78 the following schedule word is computed in
# $xi[1]; it is written back to the stack buffer only below round 76, as
# the schedule never reads further back than three words.  Round 79 emits
# no schedule code at all.  On return the two @xi registers have traded
# places.
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;
    my $K;
    if ($i<40)	{ $K=0x6ed9eba1; }
    else	{ $K=0xca62c1d6; }

    if ($i<79) {
	$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
	if ($i<76) {
	    $code.=<<___;
	mov	$xi[1],`4*($j%16)`(%rsp)
___
	}
    } elsif ($i==79) {
	$code.=<<___;
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
    }

    # "next" becomes "current" for the following round
    my $last=pop(@xi);
    unshift(@xi,$last);
}
191183234Ssimon
# Append the integer-only code for round $i (40..59) to $code.
#
#   $i           round number
#   $a..$e       registers currently holding the five working variables
#
# The round adds 0x8f1bbcdc + X[$i] + rol($a,5) to $e together with the two
# terms ($c&$d) and (($c^$d)&$b), which are accumulated separately, and
# rotates $b left by 30.  The following schedule word is computed in
# $xi[1] and written back to the stack buffer.  On return the two @xi
# registers have traded places.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    my $round=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
    $code.=$round;

    # "next" becomes "current" for the following round
    my $last=pop(@xi);
    unshift(@xi,$last);
}
217183234Ssimon
# Public entry point.  It loads the first two 32-bit words of
# OPENSSL_ia32cap_P and falls back to the integer-only code at .Lialu when
# bit 9 of the second word (the SSSE3 bit) is clear.
218238405Sjkim$code.=<<___;
219238405Sjkim.text
220238405Sjkim.extern	OPENSSL_ia32cap_P
221183234Ssimon
222238405Sjkim.globl	sha1_block_data_order
223238405Sjkim.type	sha1_block_data_order,\@function,3
224238405Sjkim.align	16
225238405Sjkimsha1_block_data_order:
226238405Sjkim	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
227238405Sjkim	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
228238405Sjkim	test	\$`1<<9`,%r8d		# check SSSE3 bit
229238405Sjkim	jz	.Lialu
230238405Sjkim___
# Only when the assembler can encode AVX: branch to the AVX code if both
# bit 28 of the second word and bit 30 of the first word are set.
231238405Sjkim$code.=<<___ if ($avx);
232238405Sjkim	and	\$`1<<28`,%r8d		# mask AVX bit
233238405Sjkim	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
234238405Sjkim	or	%r9d,%r8d
235238405Sjkim	cmp	\$`1<<28|1<<30`,%r8d
236238405Sjkim	je	_avx_shortcut
237238405Sjkim___
# Otherwise take the SSSE3 code.  The integer-only prologue at .Lialu saves
# %rbx, %rbp, %r12 and %r13, reserves a 64-byte aligned frame holding the
# 16-word schedule buffer, keeps the caller's %rsp at 16*4(%rsp), and loads
# the five chaining values from the context.
238238405Sjkim$code.=<<___;
239238405Sjkim	jmp	_ssse3_shortcut
240238405Sjkim
241238405Sjkim.align	16
242238405Sjkim.Lialu:
243238405Sjkim	push	%rbx
244238405Sjkim	push	%rbp
245238405Sjkim	push	%r12
246238405Sjkim	push	%r13
247238405Sjkim	mov	%rsp,%r11
248238405Sjkim	mov	%rdi,$ctx	# reassigned argument
249238405Sjkim	sub	\$`8+16*4`,%rsp
250238405Sjkim	mov	%rsi,$inp	# reassigned argument
251238405Sjkim	and	\$-64,%rsp
252238405Sjkim	mov	%rdx,$num	# reassigned argument
253238405Sjkim	mov	%r11,`16*4`(%rsp)
254238405Sjkim.Lprologue:
255238405Sjkim
256238405Sjkim	mov	0($ctx),$A
257238405Sjkim	mov	4($ctx),$B
258238405Sjkim	mov	8($ctx),$C
259238405Sjkim	mov	12($ctx),$D
260238405Sjkim	mov	16($ctx),$E
261238405Sjkim	jmp	.Lloop
262238405Sjkim
263238405Sjkim.align	16
264238405Sjkim.Lloop:
265238405Sjkim___
# Fully unrolled 80 rounds.  @V is rotated right after every round, so the
# register roles shift instead of the values being moved between registers.
266183234Ssimonfor($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
267183234Ssimonfor(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
268183234Ssimonfor(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
269183234Ssimonfor(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Fold the working variables back into the context, advance the input
# pointer by one 64-byte block and loop while the block counter is not
# zero.  The pointer is advanced with lea, between the sub and the jnz, so
# that the flags set by the sub are still intact at the branch.  Afterwards
# the saved registers and the caller's stack pointer are restored.
270183234Ssimon$code.=<<___;
271238405Sjkim	add	0($ctx),$A
272238405Sjkim	add	4($ctx),$B
273238405Sjkim	add	8($ctx),$C
274238405Sjkim	add	12($ctx),$D
275238405Sjkim	add	16($ctx),$E
276238405Sjkim	mov	$A,0($ctx)
277238405Sjkim	mov	$B,4($ctx)
278238405Sjkim	mov	$C,8($ctx)
279238405Sjkim	mov	$D,12($ctx)
280238405Sjkim	mov	$E,16($ctx)
281183234Ssimon
282238405Sjkim	sub	\$1,$num
283183234Ssimon	lea	`16*4`($inp),$inp
284183234Ssimon	jnz	.Lloop
285238405Sjkim
286238405Sjkim	mov	`16*4`(%rsp),%rsi
287238405Sjkim	mov	(%rsi),%r13
288238405Sjkim	mov	8(%rsi),%r12
289238405Sjkim	mov	16(%rsi),%rbp
290238405Sjkim	mov	24(%rsi),%rbx
291238405Sjkim	lea	32(%rsi),%rsp
292238405Sjkim.Lepilogue:
293238405Sjkim	ret
294238405Sjkim.size	sha1_block_data_order,.-sha1_block_data_order
295183234Ssimon___
296238405Sjkim{{{
# Generator state for the SSSE3 code path.
#   $Xi       counter advanced by the Xupdate_*/Xloop_* helpers; starts at 4
#   @X        eight XMM registers holding schedule words, rotated as a ring
#   @Tx       three XMM temporaries, likewise rotated
#   @V        the five working variables; note that the assignment below
#             also re-points the file-level $A..$E at these registers
#   @T        two scalar temporaries, rotated every round
#   $j        round counter used by the body_* snippets to pick a stack slot
#   $K_XX_XX  register holding the address of the K_XX_XX constant table
297238405Sjkimmy $Xi=4;
298238405Sjkimmy @X=map("%xmm$_",(4..7,0..3));
299238405Sjkimmy @Tx=map("%xmm$_",(8..10));
300238405Sjkimmy @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
301238405Sjkimmy @T=("%esi","%edi");
302238405Sjkimmy $j=0;
303238405Sjkimmy $K_XX_XX="%r11";
304238405Sjkim
# Rotate helpers used by the body_* snippets; this code path forwards them
# unchanged to rol/ror.
305238405Sjkimmy $_rol=sub { &rol(@_) };
306238405Sjkimmy $_ror=sub { &ror(@_) };
307238405Sjkim
# Function prologue: save %rbx, %rbp and %r12 and reserve 64 bytes of stack
# for the X[]+K transfer area, plus room for five XMM registers on Win64.
308183234Ssimon$code.=<<___;
309238405Sjkim.type	sha1_block_data_order_ssse3,\@function,3
310238405Sjkim.align	16
311238405Sjkimsha1_block_data_order_ssse3:
312238405Sjkim_ssse3_shortcut:
313238405Sjkim	push	%rbx
314238405Sjkim	push	%rbp
315238405Sjkim	push	%r12
316238405Sjkim	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
317238405Sjkim___
# Win64 only: %xmm6-%xmm10 are saved above the transfer area.
318238405Sjkim$code.=<<___ if ($win64);
319238405Sjkim	movaps	%xmm6,64+0(%rsp)
320238405Sjkim	movaps	%xmm7,64+16(%rsp)
321238405Sjkim	movaps	%xmm8,64+32(%rsp)
322238405Sjkim	movaps	%xmm9,64+48(%rsp)
323238405Sjkim	movaps	%xmm10,64+64(%rsp)
324238405Sjkim.Lprologue_ssse3:
325238405Sjkim___
# Turn the block count into an end-of-input pointer, load the context and
# the first 64-byte block, byte-swap it and hand the first three X[]+K
# vectors over to the integer unit through the stack.
326238405Sjkim$code.=<<___;
327238405Sjkim	mov	%rdi,$ctx	# reassigned argument
328238405Sjkim	mov	%rsi,$inp	# reassigned argument
329238405Sjkim	mov	%rdx,$num	# reassigned argument
330238405Sjkim
331238405Sjkim	shl	\$6,$num
332238405Sjkim	add	$inp,$num
333238405Sjkim	lea	K_XX_XX(%rip),$K_XX_XX
334238405Sjkim
335238405Sjkim	mov	0($ctx),$A		# load context
336238405Sjkim	mov	4($ctx),$B
337238405Sjkim	mov	8($ctx),$C
338238405Sjkim	mov	12($ctx),$D
339238405Sjkim	mov	$B,@T[0]		# magic seed
340238405Sjkim	mov	16($ctx),$E
341238405Sjkim
342238405Sjkim	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
343238405Sjkim	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
344238405Sjkim	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
345238405Sjkim	movdqu	16($inp),@X[-3&7]
346238405Sjkim	movdqu	32($inp),@X[-2&7]
347238405Sjkim	movdqu	48($inp),@X[-1&7]
348238405Sjkim	pshufb	@X[2],@X[-4&7]		# byte swap
349238405Sjkim	add	\$64,$inp
350238405Sjkim	pshufb	@X[2],@X[-3&7]
351238405Sjkim	pshufb	@X[2],@X[-2&7]
352238405Sjkim	pshufb	@X[2],@X[-1&7]
353238405Sjkim	paddd	@Tx[1],@X[-4&7]		# add K_00_19
354238405Sjkim	paddd	@Tx[1],@X[-3&7]
355238405Sjkim	paddd	@Tx[1],@X[-2&7]
356238405Sjkim	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
357238405Sjkim	psubd	@Tx[1],@X[-4&7]		# restore X[]
358238405Sjkim	movdqa	@X[-3&7],16(%rsp)
359238405Sjkim	psubd	@Tx[1],@X[-3&7]
360238405Sjkim	movdqa	@X[-2&7],32(%rsp)
361238405Sjkim	psubd	@Tx[1],@X[-2&7]
362238405Sjkim	jmp	.Loop_ssse3
363238405Sjkim___
364238405Sjkim
# Catch-all for calls to otherwise undefined subs: a [simplified] thunk that
# lets instructions be written in 32-bit perlasm style, i.e. as
# &mnemonic(dst,src,...).  The name of the called sub becomes the mnemonic,
# the operands are emitted in reverse order, and a purely numeric last
# argument is turned into an immediate by prefixing it with '$'.  One
# tab-separated line is appended to $code per call.
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip package qualifier

    my @operands = reverse @_;			# last argument goes first
    $operands[0] = "\$$operands[0]"
	if ($operands[0]*1 eq $operands[0]);	# number => immediate

    $code .= "\t$mnemonic\t".join(',',@operands)."\n";
}
371238405Sjkim
# Emit the SSSE3 computation of the next four schedule words, interleaved
# with four scalar rounds.
#
# $body is a code ref (body_00_19 at the call sites in this file) that
# returns a list of Perl snippets; it is called four times and the
# resulting snippets are eval'ed a few at a time between the SIMD
# instructions, so the relative order of the statements below determines
# the instruction schedule.  The lexicals $a..$e are the variables those
# snippets assign to.
# Side effects: appends to $code, stores the X[]+K vector prepared earlier
# to the 16*(($Xi-1)&3)(%rsp) slot, fetches the K constant for the next
# group from 16*($Xi/5)($K_XX_XX), increments $Xi and rotates @X and @Tx.
372238405Sjkimsub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
373238405Sjkim{ use integer;
374238405Sjkim  my $body = shift;
375238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
376238405Sjkim  my ($a,$b,$c,$d,$e);
377238405Sjkim
378238405Sjkim	&movdqa	(@X[0],@X[-3&7]);
379238405Sjkim	 eval(shift(@insns));
380238405Sjkim	 eval(shift(@insns));
381238405Sjkim	&movdqa	(@Tx[0],@X[-1&7]);
382238405Sjkim	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
383238405Sjkim	 eval(shift(@insns));
384238405Sjkim	 eval(shift(@insns));
385238405Sjkim
386238405Sjkim	  &paddd	(@Tx[1],@X[-1&7]);
387238405Sjkim	 eval(shift(@insns));
388238405Sjkim	 eval(shift(@insns));
389238405Sjkim	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
390238405Sjkim	 eval(shift(@insns));
391238405Sjkim	 eval(shift(@insns));
392238405Sjkim	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
393238405Sjkim	 eval(shift(@insns));
394238405Sjkim	 eval(shift(@insns));
395238405Sjkim
396238405Sjkim	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
397238405Sjkim	 eval(shift(@insns));
398238405Sjkim	 eval(shift(@insns));
399238405Sjkim	 eval(shift(@insns));
400238405Sjkim	 eval(shift(@insns));
401238405Sjkim
402238405Sjkim	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
403238405Sjkim	 eval(shift(@insns));
404238405Sjkim	 eval(shift(@insns));
405238405Sjkim	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
406238405Sjkim	 eval(shift(@insns));
407238405Sjkim	 eval(shift(@insns));
408238405Sjkim
409238405Sjkim	&movdqa	(@Tx[2],@X[0]);
410238405Sjkim	&movdqa	(@Tx[0],@X[0]);
411238405Sjkim	 eval(shift(@insns));
412238405Sjkim	 eval(shift(@insns));
413238405Sjkim	 eval(shift(@insns));
414238405Sjkim	 eval(shift(@insns));
415238405Sjkim
416238405Sjkim	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
417238405Sjkim	&paddd	(@X[0],@X[0]);
418238405Sjkim	 eval(shift(@insns));
419238405Sjkim	 eval(shift(@insns));
420238405Sjkim	 eval(shift(@insns));
421238405Sjkim	 eval(shift(@insns));
422238405Sjkim
423238405Sjkim	&psrld	(@Tx[0],31);
424238405Sjkim	 eval(shift(@insns));
425238405Sjkim	 eval(shift(@insns));
426238405Sjkim	&movdqa	(@Tx[1],@Tx[2]);
427238405Sjkim	 eval(shift(@insns));
428238405Sjkim	 eval(shift(@insns));
429238405Sjkim
430238405Sjkim	&psrld	(@Tx[2],30);
431238405Sjkim	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
432238405Sjkim	 eval(shift(@insns));
433238405Sjkim	 eval(shift(@insns));
434238405Sjkim	 eval(shift(@insns));
435238405Sjkim	 eval(shift(@insns));
436238405Sjkim
437238405Sjkim	&pslld	(@Tx[1],2);
438238405Sjkim	&pxor	(@X[0],@Tx[2]);
439238405Sjkim	 eval(shift(@insns));
440238405Sjkim	 eval(shift(@insns));
441238405Sjkim	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
442238405Sjkim	 eval(shift(@insns));
443238405Sjkim	 eval(shift(@insns));
444238405Sjkim
445238405Sjkim	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
446238405Sjkim
447238405Sjkim	 foreach (@insns) { eval; }	# remaining instructions [if any]
448238405Sjkim
449238405Sjkim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
450238405Sjkim		push(@Tx,shift(@Tx));
451238405Sjkim}
452238405Sjkim
# Emit the SSSE3 computation of the next four schedule words using the
# wider recurrence (inputs named "X[-32]", "X[-28]", "X[-16]" and "X[-6]"
# in the comments below, combined with a rotate by 2), interleaved with
# four scalar rounds.
#
# $body is a code ref returning a list of Perl snippets (body_00_19,
# body_20_39 or body_40_59 at the call sites in this file); it is called
# four times and the snippets are eval'ed between the SIMD instructions.
# The lexicals $a..$e are the variables those snippets assign to.
# The K constant is carried over from the previous group unless $Xi is a
# multiple of 5, in which case the next one is loaded from the table.
# Side effects: appends to $code, stores X[]+K to the
# 16*(($Xi-1)&3)(%rsp) slot, increments $Xi and rotates @X and @Tx.
453238405Sjkimsub Xupdate_ssse3_32_79()
454238405Sjkim{ use integer;
455238405Sjkim  my $body = shift;
456238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
457238405Sjkim  my ($a,$b,$c,$d,$e);
458238405Sjkim
459238405Sjkim	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
460238405Sjkim	 eval(shift(@insns));		# body_20_39
461238405Sjkim	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
462238405Sjkim	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
463238405Sjkim	 eval(shift(@insns));
464238405Sjkim	 eval(shift(@insns));
465238405Sjkim	 eval(shift(@insns));		# rol
466238405Sjkim
467238405Sjkim	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
468238405Sjkim	 eval(shift(@insns));
# Consume one more snippet here only when it is not a rotate.
469238405Sjkim	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
470238405Sjkim	if ($Xi%5) {
471238405Sjkim	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
472238405Sjkim	} else {			# ... or load next one
473238405Sjkim	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
474238405Sjkim	}
475238405Sjkim	  &paddd	(@Tx[1],@X[-1&7]);
476238405Sjkim	 eval(shift(@insns));		# ror
477238405Sjkim	 eval(shift(@insns));
478238405Sjkim
479238405Sjkim	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
480238405Sjkim	 eval(shift(@insns));		# body_20_39
481238405Sjkim	 eval(shift(@insns));
482238405Sjkim	 eval(shift(@insns));
483238405Sjkim	 eval(shift(@insns));		# rol
484238405Sjkim
485238405Sjkim	&movdqa	(@Tx[0],@X[0]);
486238405Sjkim	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
487238405Sjkim	 eval(shift(@insns));
488238405Sjkim	 eval(shift(@insns));
489238405Sjkim	 eval(shift(@insns));		# ror
490238405Sjkim	 eval(shift(@insns));
491238405Sjkim
492238405Sjkim	&pslld	(@X[0],2);
493238405Sjkim	 eval(shift(@insns));		# body_20_39
494238405Sjkim	 eval(shift(@insns));
495238405Sjkim	&psrld	(@Tx[0],30);
496238405Sjkim	 eval(shift(@insns));
497238405Sjkim	 eval(shift(@insns));		# rol
498238405Sjkim	 eval(shift(@insns));
499238405Sjkim	 eval(shift(@insns));
500238405Sjkim	 eval(shift(@insns));		# ror
501238405Sjkim	 eval(shift(@insns));
502238405Sjkim
503238405Sjkim	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
504238405Sjkim	 eval(shift(@insns));		# body_20_39
505238405Sjkim	 eval(shift(@insns));
506238405Sjkim	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
507238405Sjkim	 eval(shift(@insns));
508238405Sjkim	 eval(shift(@insns));		# rol
509238405Sjkim	 eval(shift(@insns));
510238405Sjkim	 eval(shift(@insns));
511238405Sjkim	 eval(shift(@insns));		# rol
512238405Sjkim	 eval(shift(@insns));
513238405Sjkim
514238405Sjkim	 foreach (@insns) { eval; }	# remaining instructions
515238405Sjkim
516238405Sjkim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
517238405Sjkim		push(@Tx,shift(@Tx));
518238405Sjkim}
519238405Sjkim
# Emit the final X[]+K transfer of a block together with four scalar
# rounds, then the end-of-input test and the start of the next block.
#
# $body is a code ref returning a list of Perl snippets; it is called four
# times and the snippets are eval'ed around the SIMD instructions.  The
# lexicals $a..$e are the variables those snippets assign to.
# After the rounds the input pointer is compared with the end pointer and
# a branch to .Ldone_ssse3 is emitted for the last block.  The code that
# follows the branch reloads the byte-swap mask and K_00_19, loads the next
# 64-byte block, byte-swaps its first vector and advances the input
# pointer.
# Side effects: appends to $code, rotates @Tx back by one and resets $Xi
# to 0.
520238405Sjkimsub Xuplast_ssse3_80()
521238405Sjkim{ use integer;
522238405Sjkim  my $body = shift;
523238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
524238405Sjkim  my ($a,$b,$c,$d,$e);
525238405Sjkim
526238405Sjkim	 eval(shift(@insns));
527238405Sjkim	  &paddd	(@Tx[1],@X[-1&7]);
528238405Sjkim	 eval(shift(@insns));
529238405Sjkim	 eval(shift(@insns));
530238405Sjkim	 eval(shift(@insns));
531238405Sjkim	 eval(shift(@insns));
532238405Sjkim
533238405Sjkim	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
534238405Sjkim
535238405Sjkim	 foreach (@insns) { eval; }		# remaining instructions
536238405Sjkim
537238405Sjkim	&cmp	($inp,$num);
538238405Sjkim	&je	(".Ldone_ssse3");
539238405Sjkim
540238405Sjkim	unshift(@Tx,pop(@Tx));
541238405Sjkim
542238405Sjkim	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
543238405Sjkim	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
544238405Sjkim	&movdqu	(@X[-4&7],"0($inp)");		# load input
545238405Sjkim	&movdqu	(@X[-3&7],"16($inp)");
546238405Sjkim	&movdqu	(@X[-2&7],"32($inp)");
547238405Sjkim	&movdqu	(@X[-1&7],"48($inp)");
548238405Sjkim	&pshufb	(@X[-4&7],@X[2]);		# byte swap
549238405Sjkim	&add	($inp,64);
550238405Sjkim
551238405Sjkim  $Xi=0;
552238405Sjkim}
553238405Sjkim
# Emit four scalar rounds interleaved with the preparation of one vector of
# the next block: byte-swap the following input vector, add the round
# constant to the current one, store the sum to the 16*$Xi(%rsp) transfer
# slot and subtract the constant again to restore X[].
#
# $body is a code ref returning a list of Perl snippets; it is called four
# times and the snippets are eval'ed between the SIMD instructions.  The
# lexicals $a..$e are the variables those snippets assign to.
# Side effects: appends to $code and increments $Xi; unlike the Xupdate_*
# helpers it does not rotate @X or @Tx.
554238405Sjkimsub Xloop_ssse3()
555238405Sjkim{ use integer;
556238405Sjkim  my $body = shift;
557238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
558238405Sjkim  my ($a,$b,$c,$d,$e);
559238405Sjkim
560238405Sjkim	 eval(shift(@insns));
561238405Sjkim	 eval(shift(@insns));
562238405Sjkim	&pshufb	(@X[($Xi-3)&7],@X[2]);
563238405Sjkim	 eval(shift(@insns));
564238405Sjkim	 eval(shift(@insns));
565238405Sjkim	&paddd	(@X[($Xi-4)&7],@Tx[1]);
566238405Sjkim	 eval(shift(@insns));
567238405Sjkim	 eval(shift(@insns));
568238405Sjkim	 eval(shift(@insns));
569238405Sjkim	 eval(shift(@insns));
570238405Sjkim	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
571238405Sjkim	 eval(shift(@insns));
572238405Sjkim	 eval(shift(@insns));
573238405Sjkim	&psubd	(@X[($Xi-4)&7],@Tx[1]);
574238405Sjkim
575238405Sjkim	foreach (@insns) { eval; }
576238405Sjkim  $Xi++;
577238405Sjkim}
578238405Sjkim
# Emit four scalar rounds with no SIMD work in between; used for the last
# rounds of the final block, where no further input has to be prepared.
#
# The argument is a code ref returning a list of Perl snippets.  It is
# invoked four times and every snippet is eval'ed in order.  The lexicals
# $a..$e must stay declared here, as they are the variables the snippets
# assign to.
sub Xtail_ssse3()
{ use integer;
  my ($round) = @_;
  my ($a,$b,$c,$d,$e);
  my @queue = map { $round->() } 1..4;		# 32 instructions

	eval for @queue;
}
587238405Sjkim
# Scalar round template for the SIMD code paths, rounds 0..19.
#
# Returns a list of ten Perl snippets (strings).  Each one, when eval'ed in
# a scope that provides $a..$e, @V, @T, $j, $_rol and $_ror, emits one
# instruction.  The first snippet also reloads ($a,$b,$c,$d,$e) from @V,
# and the last one advances $j and rotates @V and @T by one position.
# X[]+K for the round is added from the 4*($j&15)(%rsp) stack slot.
# The rotate count applied to $b is 2 while $j is still 0 and 7 afterwards.
588238405Sjkimsub body_00_19 () {
589238405Sjkim	(
590238405Sjkim	'($a,$b,$c,$d,$e)=@V;'.
591238405Sjkim	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
592238405Sjkim	'&xor	($c,$d);',
593238405Sjkim	'&mov	(@T[1],$a);',	# $b in next round
594238405Sjkim	'&$_rol	($a,5);',
595238405Sjkim	'&and	(@T[0],$c);',	# ($b&($c^$d))
596238405Sjkim	'&xor	($c,$d);',	# restore $c
597238405Sjkim	'&xor	(@T[0],$d);',
598238405Sjkim	'&add	($e,$a);',
599238405Sjkim	'&$_ror	($b,$j?7:2);',	# $b>>>2
600238405Sjkim	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
601238405Sjkim	);
602238405Sjkim}
603238405Sjkim
# Scalar round template for the SIMD code paths, rounds 20..39 and 60..79.
#
# Returns a list of eight Perl snippets (strings).  Each one, when eval'ed
# in a scope that provides $a..$e, @V, @T, $j, $_rol and $_ror, emits one
# instruction.  The first snippet also reloads ($a,$b,$c,$d,$e) from @V and
# post-increments $j while selecting the 4*($j&15)(%rsp) slot holding
# X[]+K.  The last snippet rotates @V and @T by one position.
604238405Sjkimsub body_20_39 () {
605238405Sjkim	(
606238405Sjkim	'($a,$b,$c,$d,$e)=@V;'.
607238405Sjkim	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
608238405Sjkim	'&xor	(@T[0],$d);',	# ($b^$d)
609238405Sjkim	'&mov	(@T[1],$a);',	# $b in next round
610238405Sjkim	'&$_rol	($a,5);',
611238405Sjkim	'&xor	(@T[0],$c);',	# ($b^$d^$c)
612238405Sjkim	'&add	($e,$a);',
613238405Sjkim	'&$_ror	($b,7);',	# $b>>>2
614238405Sjkim	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
615238405Sjkim	);
616238405Sjkim}
617238405Sjkim
# Scalar round template for the SIMD code paths, rounds 40..59.
#
# Returns a list of twelve Perl snippets (strings).  Each one, when eval'ed
# in a scope that provides $a..$e, @V, @T, $j, $_rol and $_ror, emits one
# instruction.  The first snippet also reloads ($a,$b,$c,$d,$e) from @V,
# and the last one rotates @V and @T by one position.  $j is
# post-incremented where the 4*($j&15)(%rsp) slot holding X[]+K is added.
# The terms ($c&$d) and ($b&($c^$d)) are added to $e one after the other.
618238405Sjkimsub body_40_59 () {
619238405Sjkim	(
620238405Sjkim	'($a,$b,$c,$d,$e)=@V;'.
621238405Sjkim	'&mov	(@T[1],$c);',
622238405Sjkim	'&xor	($c,$d);',
623238405Sjkim	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
624238405Sjkim	'&and	(@T[1],$d);',
625238405Sjkim	'&and	(@T[0],$c);',	# ($b&($c^$d))
626238405Sjkim	'&$_ror	($b,7);',	# $b>>>2
627238405Sjkim	'&add	($e,@T[1]);',
628238405Sjkim	'&mov	(@T[1],$a);',	# $b in next round
629238405Sjkim	'&$_rol	($a,5);',
630238405Sjkim	'&add	($e,@T[0]);',
631238405Sjkim	'&xor	($c,$d);',	# restore $c
632238405Sjkim	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
633238405Sjkim	);
634238405Sjkim}
# Main SSSE3 loop: seventeen helper calls of four rounds each cover rounds
# 0..67 of the current block.  The first four use the 16..31 schedule
# update, the remainder the 32..79 one; the scalar template changes every
# five calls (00_19, 20_39, 40_59, 20_39).
635238405Sjkim$code.=<<___;
636238405Sjkim.align	16
637238405Sjkim.Loop_ssse3:
638238405Sjkim___
639238405Sjkim	&Xupdate_ssse3_16_31(\&body_00_19);
640238405Sjkim	&Xupdate_ssse3_16_31(\&body_00_19);
641238405Sjkim	&Xupdate_ssse3_16_31(\&body_00_19);
642238405Sjkim	&Xupdate_ssse3_16_31(\&body_00_19);
643238405Sjkim	&Xupdate_ssse3_32_79(\&body_00_19);
644238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
645238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
646238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
647238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
648238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
649238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
650238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
651238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
652238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
653238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
654238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
655238405Sjkim	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
656238405Sjkim
# The last twelve rounds are generated twice, once for the "more input"
# path and once for the "done" path.  The generator state is saved here so
# that the second copy starts from the same round counter and register
# rotation.
657238405Sjkim				$saved_j=$j; @saved_V=@V;
658238405Sjkim
659238405Sjkim	&Xloop_ssse3(\&body_20_39);
660238405Sjkim	&Xloop_ssse3(\&body_20_39);
661238405Sjkim	&Xloop_ssse3(\&body_20_39);
662238405Sjkim
# "More input" path: fold the result into the context and loop.
663238405Sjkim$code.=<<___;
664238405Sjkim	add	0($ctx),$A			# update context
665238405Sjkim	add	4($ctx),@T[0]
666238405Sjkim	add	8($ctx),$C
667238405Sjkim	add	12($ctx),$D
668238405Sjkim	mov	$A,0($ctx)
669238405Sjkim	add	16($ctx),$E
670238405Sjkim	mov	@T[0],4($ctx)
671238405Sjkim	mov	@T[0],$B			# magic seed
672238405Sjkim	mov	$C,8($ctx)
673238405Sjkim	mov	$D,12($ctx)
674238405Sjkim	mov	$E,16($ctx)
675238405Sjkim	jmp	.Loop_ssse3
676238405Sjkim
677238405Sjkim.align	16
678238405Sjkim.Ldone_ssse3:
679238405Sjkim___
# "Done" path: rewind the generator state and emit the same twelve rounds
# without any input preparation.
680238405Sjkim				$j=$saved_j; @V=@saved_V;
681238405Sjkim
682238405Sjkim	&Xtail_ssse3(\&body_20_39);
683238405Sjkim	&Xtail_ssse3(\&body_20_39);
684238405Sjkim	&Xtail_ssse3(\&body_20_39);
685238405Sjkim
686238405Sjkim$code.=<<___;
687238405Sjkim	add	0($ctx),$A			# update context
688238405Sjkim	add	4($ctx),@T[0]
689238405Sjkim	add	8($ctx),$C
690238405Sjkim	mov	$A,0($ctx)
691238405Sjkim	add	12($ctx),$D
692238405Sjkim	mov	@T[0],4($ctx)
693238405Sjkim	add	16($ctx),$E
694238405Sjkim	mov	$C,8($ctx)
695238405Sjkim	mov	$D,12($ctx)
696238405Sjkim	mov	$E,16($ctx)
697238405Sjkim___
# Win64 only: restore the XMM registers saved in the prologue.
698238405Sjkim$code.=<<___ if ($win64);
699238405Sjkim	movaps	64+0(%rsp),%xmm6
700238405Sjkim	movaps	64+16(%rsp),%xmm7
701238405Sjkim	movaps	64+32(%rsp),%xmm8
702238405Sjkim	movaps	64+48(%rsp),%xmm9
703238405Sjkim	movaps	64+64(%rsp),%xmm10
704238405Sjkim___
# Epilogue: reload %r12, %rbp and %rbx from above the local frame and
# release the stack.
705238405Sjkim$code.=<<___;
706238405Sjkim	lea	`64+($win64?5*16:0)`(%rsp),%rsi
707238405Sjkim	mov	0(%rsi),%r12
708238405Sjkim	mov	8(%rsi),%rbp
709238405Sjkim	mov	16(%rsi),%rbx
710238405Sjkim	lea	24(%rsi),%rsp
711238405Sjkim.Lepilogue_ssse3:
712238405Sjkim	ret
713238405Sjkim.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
714238405Sjkim___
715238405Sjkim
716238405Sjkimif ($avx) {
# Generator state for the AVX code path; same layout as for the SSSE3 path.
#   $Xi       counter advanced by the Xupdate_* helpers; starts at 4
#   @X        eight XMM registers holding schedule words, rotated as a ring
#   @Tx       three XMM temporaries, likewise rotated
#   @V        the five working variables; the assignment below also
#             re-points the file-level $A..$E at these registers
#   @T        two scalar temporaries, rotated every round
#   $j        round counter used by the body_* snippets to pick a stack slot
#   $K_XX_XX  register holding the address of the K_XX_XX constant table
717238405Sjkimmy $Xi=4;
718238405Sjkimmy @X=map("%xmm$_",(4..7,0..3));
719238405Sjkimmy @Tx=map("%xmm$_",(8..10));
720238405Sjkimmy @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
721238405Sjkimmy @T=("%esi","%edi");
722238405Sjkimmy $j=0;
723238405Sjkimmy $K_XX_XX="%r11";
724238405Sjkim
# Rotate helpers used by the body_* snippets; this code path emits
# shld/shrd with the register given as both operands instead of rol/ror.
725238405Sjkimmy $_rol=sub { &shld(@_[0],@_) };
726238405Sjkimmy $_ror=sub { &shrd(@_[0],@_) };
727238405Sjkim
# Function prologue: save %rbx, %rbp and %r12 and reserve 64 bytes of stack
# for the X[]+K transfer area, plus room for five XMM registers on Win64.
728238405Sjkim$code.=<<___;
729238405Sjkim.type	sha1_block_data_order_avx,\@function,3
730238405Sjkim.align	16
731238405Sjkimsha1_block_data_order_avx:
732238405Sjkim_avx_shortcut:
733238405Sjkim	push	%rbx
734238405Sjkim	push	%rbp
735238405Sjkim	push	%r12
736238405Sjkim	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
737238405Sjkim___
# Win64 only: %xmm6-%xmm10 are saved above the transfer area.
738238405Sjkim$code.=<<___ if ($win64);
739238405Sjkim	movaps	%xmm6,64+0(%rsp)
740238405Sjkim	movaps	%xmm7,64+16(%rsp)
741238405Sjkim	movaps	%xmm8,64+32(%rsp)
742238405Sjkim	movaps	%xmm9,64+48(%rsp)
743238405Sjkim	movaps	%xmm10,64+64(%rsp)
744238405Sjkim.Lprologue_avx:
745238405Sjkim___
# Turn the block count into an end-of-input pointer, load the context and
# the first 64-byte block, byte-swap it and hand the first three X[]+K
# vectors over to the integer unit through the stack.  The three-operand
# vpaddd writes the sums to separate registers, so no subtraction is needed
# to restore X[].
746238405Sjkim$code.=<<___;
747238405Sjkim	mov	%rdi,$ctx	# reassigned argument
748238405Sjkim	mov	%rsi,$inp	# reassigned argument
749238405Sjkim	mov	%rdx,$num	# reassigned argument
750264331Sjkim	vzeroupper
751238405Sjkim
752238405Sjkim	shl	\$6,$num
753238405Sjkim	add	$inp,$num
754238405Sjkim	lea	K_XX_XX(%rip),$K_XX_XX
755238405Sjkim
756238405Sjkim	mov	0($ctx),$A		# load context
757238405Sjkim	mov	4($ctx),$B
758238405Sjkim	mov	8($ctx),$C
759238405Sjkim	mov	12($ctx),$D
760238405Sjkim	mov	$B,@T[0]		# magic seed
761238405Sjkim	mov	16($ctx),$E
762238405Sjkim
763238405Sjkim	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
764238405Sjkim	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
765238405Sjkim	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
766238405Sjkim	vmovdqu	16($inp),@X[-3&7]
767238405Sjkim	vmovdqu	32($inp),@X[-2&7]
768238405Sjkim	vmovdqu	48($inp),@X[-1&7]
769238405Sjkim	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
770238405Sjkim	add	\$64,$inp
771238405Sjkim	vpshufb	@X[2],@X[-3&7],@X[-3&7]
772238405Sjkim	vpshufb	@X[2],@X[-2&7],@X[-2&7]
773238405Sjkim	vpshufb	@X[2],@X[-1&7],@X[-1&7]
774238405Sjkim	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
775238405Sjkim	vpaddd	@Tx[1],@X[-3&7],@X[1]
776238405Sjkim	vpaddd	@Tx[1],@X[-2&7],@X[2]
777238405Sjkim	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
778238405Sjkim	vmovdqa	@X[1],16(%rsp)
779238405Sjkim	vmovdqa	@X[2],32(%rsp)
780238405Sjkim	jmp	.Loop_avx
781238405Sjkim___
782238405Sjkim
# Emit the AVX computation of the next four schedule words, interleaved
# with four scalar rounds.  The three-operand instruction forms make the
# register copies of the SSSE3 variant unnecessary.
#
# $body is a code ref returning a list of Perl snippets; it is called four
# times and the snippets are eval'ed a few at a time between the SIMD
# instructions, so the relative order of the statements below determines
# the instruction schedule.  The lexicals $a..$e are the variables those
# snippets assign to.
# Side effects: appends to $code, stores the X[]+K vector prepared earlier
# to the 16*(($Xi-1)&3)(%rsp) slot, fetches the K constant for the next
# group from 16*($Xi/5)($K_XX_XX), increments $Xi and rotates @X and @Tx.
783238405Sjkimsub Xupdate_avx_16_31()		# recall that $Xi starts with 4
784238405Sjkim{ use integer;
785238405Sjkim  my $body = shift;
786238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
787238405Sjkim  my ($a,$b,$c,$d,$e);
788238405Sjkim
789238405Sjkim	 eval(shift(@insns));
790238405Sjkim	 eval(shift(@insns));
791238405Sjkim	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
792238405Sjkim	 eval(shift(@insns));
793238405Sjkim	 eval(shift(@insns));
794238405Sjkim
795238405Sjkim	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
796238405Sjkim	 eval(shift(@insns));
797238405Sjkim	 eval(shift(@insns));
798238405Sjkim	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
799238405Sjkim	 eval(shift(@insns));
800238405Sjkim	 eval(shift(@insns));
801238405Sjkim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
802238405Sjkim	 eval(shift(@insns));
803238405Sjkim	 eval(shift(@insns));
804238405Sjkim
805238405Sjkim	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
806238405Sjkim	 eval(shift(@insns));
807238405Sjkim	 eval(shift(@insns));
808238405Sjkim	 eval(shift(@insns));
809238405Sjkim	 eval(shift(@insns));
810238405Sjkim
811238405Sjkim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
812238405Sjkim	 eval(shift(@insns));
813238405Sjkim	 eval(shift(@insns));
814238405Sjkim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
815238405Sjkim	 eval(shift(@insns));
816238405Sjkim	 eval(shift(@insns));
817238405Sjkim
818238405Sjkim	&vpsrld	(@Tx[0],@X[0],31);
819238405Sjkim	 eval(shift(@insns));
820238405Sjkim	 eval(shift(@insns));
821238405Sjkim	 eval(shift(@insns));
822238405Sjkim	 eval(shift(@insns));
823238405Sjkim
824238405Sjkim	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
825238405Sjkim	&vpaddd	(@X[0],@X[0],@X[0]);
826238405Sjkim	 eval(shift(@insns));
827238405Sjkim	 eval(shift(@insns));
828238405Sjkim	 eval(shift(@insns));
829238405Sjkim	 eval(shift(@insns));
830238405Sjkim
831238405Sjkim	&vpsrld	(@Tx[1],@Tx[2],30);
832238405Sjkim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
833238405Sjkim	 eval(shift(@insns));
834238405Sjkim	 eval(shift(@insns));
835238405Sjkim	 eval(shift(@insns));
836238405Sjkim	 eval(shift(@insns));
837238405Sjkim
838238405Sjkim	&vpslld	(@Tx[2],@Tx[2],2);
839238405Sjkim	&vpxor	(@X[0],@X[0],@Tx[1]);
840238405Sjkim	 eval(shift(@insns));
841238405Sjkim	 eval(shift(@insns));
842238405Sjkim	 eval(shift(@insns));
843238405Sjkim	 eval(shift(@insns));
844238405Sjkim
845238405Sjkim	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
846238405Sjkim	 eval(shift(@insns));
847238405Sjkim	 eval(shift(@insns));
848238405Sjkim	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
849238405Sjkim	 eval(shift(@insns));
850238405Sjkim	 eval(shift(@insns));
851238405Sjkim
852238405Sjkim
853238405Sjkim	 foreach (@insns) { eval; }	# remaining instructions [if any]
854238405Sjkim
855238405Sjkim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
856238405Sjkim		push(@Tx,shift(@Tx));
857238405Sjkim}
858238405Sjkim
# Xupdate_avx_32_79($body) -- emit one AVX message-schedule update,
# interleaved with scalar round instructions.
#
#   $body  code reference; it is invoked four times and the instruction
#          strings it returns are queued in @insns, then eval'ed a few at a
#          time between the vector instructions below so that both streams
#          are interleaved in the generated code.
#
# Vector work, as spelled out by the inline comments:
#   X[0] = (X[-32] ^ X[-16] ^ X[-28] ^ X[-6]) <<< 2
# The constant is either copied from Tx[1] into Tx[2] or, when $Xi is a
# multiple of 5, loaded from 16*($Xi/5)($K_XX_XX).  Tx[1]+X[-1] is stored at
# 16*(($Xi-1)&3)(%rsp) for the scalar code.  On exit $Xi is incremented and
# @X and @Tx are each rotated by one element.
#
# NOTE(review): $a..$e are declared but not used directly in this sub;
# presumably the eval'ed strings produced by $body refer to them -- confirm
# against the body_* subs before renaming or removing them.
859238405Sjkimsub Xupdate_avx_32_79()
860238405Sjkim{ use integer;
861238405Sjkim  my $body = shift;
862238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
863238405Sjkim  my ($a,$b,$c,$d,$e);
864238405Sjkim
865238405Sjkim	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
866238405Sjkim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
867238405Sjkim	 eval(shift(@insns));		# body_20_39
868238405Sjkim	 eval(shift(@insns));
869238405Sjkim	 eval(shift(@insns));
870238405Sjkim	 eval(shift(@insns));		# rol
871238405Sjkim
872238405Sjkim	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
873238405Sjkim	 eval(shift(@insns));
	# Issue one more queued instruction here unless the next one in the
	# queue is a rotate (its text matches &ror or &rol).
874238405Sjkim	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
875238405Sjkim	if ($Xi%5) {
876238405Sjkim	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
877238405Sjkim	} else {			# ... or load next one
878238405Sjkim	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
879238405Sjkim	}
880238405Sjkim	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
881238405Sjkim	 eval(shift(@insns));		# ror
882238405Sjkim	 eval(shift(@insns));
883238405Sjkim
884238405Sjkim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
885238405Sjkim	 eval(shift(@insns));		# body_20_39
886238405Sjkim	 eval(shift(@insns));
887238405Sjkim	 eval(shift(@insns));
888238405Sjkim	 eval(shift(@insns));		# rol
889238405Sjkim
890238405Sjkim	&vpsrld	(@Tx[0],@X[0],30);
891238405Sjkim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
892238405Sjkim	 eval(shift(@insns));
893238405Sjkim	 eval(shift(@insns));
894238405Sjkim	 eval(shift(@insns));		# ror
895238405Sjkim	 eval(shift(@insns));
896238405Sjkim
897238405Sjkim	&vpslld	(@X[0],@X[0],2);
898238405Sjkim	 eval(shift(@insns));		# body_20_39
899238405Sjkim	 eval(shift(@insns));
900238405Sjkim	 eval(shift(@insns));
901238405Sjkim	 eval(shift(@insns));		# rol
902238405Sjkim	 eval(shift(@insns));
903238405Sjkim	 eval(shift(@insns));
904238405Sjkim	 eval(shift(@insns));		# ror
905238405Sjkim	 eval(shift(@insns));
906238405Sjkim
907238405Sjkim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
908238405Sjkim	 eval(shift(@insns));		# body_20_39
909238405Sjkim	 eval(shift(@insns));
910238405Sjkim	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
911238405Sjkim	 eval(shift(@insns));
912238405Sjkim	 eval(shift(@insns));		# rol
913238405Sjkim	 eval(shift(@insns));
914238405Sjkim	 eval(shift(@insns));
915238405Sjkim	 eval(shift(@insns));		# rol
916238405Sjkim	 eval(shift(@insns));
917238405Sjkim
918238405Sjkim	 foreach (@insns) { eval; }	# remaining instructions
919238405Sjkim
920238405Sjkim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
921238405Sjkim		push(@Tx,shift(@Tx));
922238405Sjkim}
923238405Sjkim
# Xuplast_avx_80($body) -- last step that still has vector work pending for
# the current block in the AVX path; may branch to .Ldone_avx.
#
#   $body  code reference; it is invoked four times and the instruction
#          strings it returns are queued in @insns and eval'ed around the
#          vector instructions below.
#
# Emitted code, in order:
#   1. add X[-1] into Tx[1] and store the sum at 16*(($Xi-1)&3)(%rsp) for
#      the scalar code;
#   2. all remaining queued instructions from $body;
#   3. compare $inp with $num and jump to .Ldone_avx when they are equal;
#   4. otherwise reload the byte-swap mask (64($K_XX_XX)) into X[2] and
#      K_00_19 (0($K_XX_XX)) into Tx[1], load the next 64 input bytes into
#      X[-4..-1], byte-swap the first of them and advance $inp by 64.
# Generator state: @Tx is rotated back by one element and $Xi is reset to 0.
#
# The stack store uses the VEX-encoded vmovdqa like every other XMM move in
# the AVX path, rather than the legacy-SSE movdqa encoding; the store itself
# is the same.
#
# NOTE(review): $a..$e are declared but not used directly in this sub;
# presumably the eval'ed strings produced by $body refer to them -- confirm
# against the body_* subs before renaming or removing them.
924238405Sjkimsub Xuplast_avx_80()
925238405Sjkim{ use integer;
926238405Sjkim  my $body = shift;
927238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
928238405Sjkim  my ($a,$b,$c,$d,$e);
929238405Sjkim
930238405Sjkim	 eval(shift(@insns));
931238405Sjkim	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
932238405Sjkim	 eval(shift(@insns));
933238405Sjkim	 eval(shift(@insns));
934238405Sjkim	 eval(shift(@insns));
935238405Sjkim	 eval(shift(@insns));
936238405Sjkim
937238405Sjkim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
938238405Sjkim
939238405Sjkim	 foreach (@insns) { eval; }		# remaining instructions
940238405Sjkim
941238405Sjkim	&cmp	($inp,$num);
942238405Sjkim	&je	(".Ldone_avx");
943238405Sjkim
	# Undo one step of the @Tx rotation performed by the Xupdate_* subs.
944238405Sjkim	unshift(@Tx,pop(@Tx));
945238405Sjkim
946238405Sjkim	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
947238405Sjkim	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
948238405Sjkim	&vmovdqu(@X[-4&7],"0($inp)");		# load input
949238405Sjkim	&vmovdqu(@X[-3&7],"16($inp)");
950238405Sjkim	&vmovdqu(@X[-2&7],"32($inp)");
951238405Sjkim	&vmovdqu(@X[-1&7],"48($inp)");
952238405Sjkim	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
953238405Sjkim	&add	($inp,64);
954238405Sjkim
955238405Sjkim  $Xi=0;
956238405Sjkim}
957238405Sjkim
# Xloop_avx($body) -- prepare one vector of freshly loaded input for the
# next block while the scalar instructions from $body are emitted.
#
#   $body  code reference; it is invoked four times and the instruction
#          strings it returns are queued in @insns and eval'ed around the
#          vector instructions below.
#
# Vector work: byte-swap X[($Xi-3)&7] with the mask held in X[2], add Tx[1]
# to X[($Xi-4)&7] giving X[$Xi&7], and store that sum at 16*$Xi(%rsp).
# $Xi is incremented on exit; @X and @Tx are not rotated here.
#
# NOTE(review): $a..$e are declared but not used directly in this sub;
# presumably the eval'ed strings produced by $body refer to them -- confirm
# against the body_* subs before renaming or removing them.
958238405Sjkimsub Xloop_avx()
959238405Sjkim{ use integer;
960238405Sjkim  my $body = shift;
961238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
962238405Sjkim  my ($a,$b,$c,$d,$e);
963238405Sjkim
964238405Sjkim	 eval(shift(@insns));
965238405Sjkim	 eval(shift(@insns));
966238405Sjkim	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
967238405Sjkim	 eval(shift(@insns));
968238405Sjkim	 eval(shift(@insns));
969238405Sjkim	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
970238405Sjkim	 eval(shift(@insns));
971238405Sjkim	 eval(shift(@insns));
972238405Sjkim	 eval(shift(@insns));
973238405Sjkim	 eval(shift(@insns));
974238405Sjkim	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
975238405Sjkim	 eval(shift(@insns));
976238405Sjkim	 eval(shift(@insns));
977238405Sjkim
	# Emit whatever is still queued.
978238405Sjkim	foreach (@insns) { eval; }
979238405Sjkim  $Xi++;
980238405Sjkim}
981238405Sjkim
# Xtail_avx($body) -- emit four invocations' worth of instructions from
# $body with no vector instructions interleaved.
#
#   $body  code reference; it is invoked four times and every returned
#          instruction string is eval'ed in order.
#
# NOTE(review): $a..$e are declared but not used directly in this sub;
# presumably the eval'ed strings produced by $body refer to them -- confirm
# against the body_* subs before renaming or removing them.
982238405Sjkimsub Xtail_avx()
983238405Sjkim{ use integer;
984238405Sjkim  my $body = shift;
985238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
986238405Sjkim  my ($a,$b,$c,$d,$e);
987238405Sjkim
988238405Sjkim	foreach (@insns) { eval; }
989238405Sjkim}
990238405Sjkim
# Main AVX loop.  Every helper call below consumes four invocations of the
# given body_* generator (presumably one SHA-1 round per invocation --
# confirm against the body_* subs).  The 17 Xupdate/Xuplast calls plus the
# 3 Xloop (or Xtail) calls therefore make 80 invocations: 20 of body_00_19,
# 20 of body_20_39, 20 of body_40_59 and a final 20 of body_20_39.
991238405Sjkim$code.=<<___;
992238405Sjkim.align	16
993238405Sjkim.Loop_avx:
994238405Sjkim___
995238405Sjkim	&Xupdate_avx_16_31(\&body_00_19);
996238405Sjkim	&Xupdate_avx_16_31(\&body_00_19);
997238405Sjkim	&Xupdate_avx_16_31(\&body_00_19);
998238405Sjkim	&Xupdate_avx_16_31(\&body_00_19);
999238405Sjkim	&Xupdate_avx_32_79(\&body_00_19);
1000238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1001238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1002238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1003238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1004238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1005238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1006238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1007238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1008238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1009238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1010238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1011238405Sjkim	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
1012238405Sjkim
				# Snapshot the generator state here: the code after
				# .Ldone_avx is generated from this same point (the
				# state is restored before the Xtail_avx calls).
				# NOTE(review): $j/@V are presumably the round
				# counter and register rotation used by the body_*
				# subs -- confirm.
1013238405Sjkim				$saved_j=$j; @saved_V=@V;
1014238405Sjkim
	# Loop-continuation path: last 12 body invocations, interleaved with
	# preparation of the next block's input.
1015238405Sjkim	&Xloop_avx(\&body_20_39);
1016238405Sjkim	&Xloop_avx(\&body_20_39);
1017238405Sjkim	&Xloop_avx(\&body_20_39);
1018238405Sjkim
# Add the working variables into the context, then go round again.
1019238405Sjkim$code.=<<___;
1020238405Sjkim	add	0($ctx),$A			# update context
1021238405Sjkim	add	4($ctx),@T[0]
1022238405Sjkim	add	8($ctx),$C
1023238405Sjkim	add	12($ctx),$D
1024238405Sjkim	mov	$A,0($ctx)
1025238405Sjkim	add	16($ctx),$E
1026238405Sjkim	mov	@T[0],4($ctx)
1027238405Sjkim	mov	@T[0],$B			# magic seed
1028238405Sjkim	mov	$C,8($ctx)
1029238405Sjkim	mov	$D,12($ctx)
1030238405Sjkim	mov	$E,16($ctx)
1031238405Sjkim	jmp	.Loop_avx
1032238405Sjkim
1033238405Sjkim.align	16
1034238405Sjkim.Ldone_avx:
1035238405Sjkim___
				# Rewind the generator state to the snapshot taken
				# after Xuplast_avx_80.
1036238405Sjkim				$j=$saved_j; @V=@saved_V;
1037238405Sjkim
	# Exit path: the same last 12 body invocations, without vector work.
1038238405Sjkim	&Xtail_avx(\&body_20_39);
1039238405Sjkim	&Xtail_avx(\&body_20_39);
1040238405Sjkim	&Xtail_avx(\&body_20_39);
1041238405Sjkim
# Final context update for the exit path.
1042238405Sjkim$code.=<<___;
1043264331Sjkim	vzeroupper
1044238405Sjkim
1045238405Sjkim	add	0($ctx),$A			# update context
1046238405Sjkim	add	4($ctx),@T[0]
1047238405Sjkim	add	8($ctx),$C
1048238405Sjkim	mov	$A,0($ctx)
1049238405Sjkim	add	12($ctx),$D
1050238405Sjkim	mov	@T[0],4($ctx)
1051238405Sjkim	add	16($ctx),$E
1052238405Sjkim	mov	$C,8($ctx)
1053238405Sjkim	mov	$D,12($ctx)
1054238405Sjkim	mov	$E,16($ctx)
1055238405Sjkim___
# Win64 only: reload xmm6-xmm10 from the stack area starting at 64(%rsp).
1056238405Sjkim$code.=<<___ if ($win64);
1057238405Sjkim	movaps	64+0(%rsp),%xmm6
1058238405Sjkim	movaps	64+16(%rsp),%xmm7
1059238405Sjkim	movaps	64+32(%rsp),%xmm8
1060238405Sjkim	movaps	64+48(%rsp),%xmm9
1061238405Sjkim	movaps	64+64(%rsp),%xmm10
1062238405Sjkim___
# Reload %r12, %rbp and %rbx from above the 64-byte transfer area (and the
# 5*16-byte XMM save area on Win64), release the frame and return.
1063238405Sjkim$code.=<<___;
1064238405Sjkim	lea	`64+($win64?5*16:0)`(%rsp),%rsi
1065238405Sjkim	mov	0(%rsi),%r12
1066238405Sjkim	mov	8(%rsi),%rbp
1067238405Sjkim	mov	16(%rsi),%rbx
1068238405Sjkim	lea	24(%rsi),%rsp
1069238405Sjkim.Lepilogue_avx:
1070238405Sjkim	ret
1071238405Sjkim.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
1072238405Sjkim___
1073238405Sjkim}
# Constant table addressed through $K_XX_XX by the vector code: four round
# constants, each replicated across four 32-bit lanes (16 bytes per row, at
# offsets 0, 16, 32 and 48), followed at offset 64 by the byte-swap shuffle
# mask used with pshufb/vpshufb.
1074238405Sjkim$code.=<<___;
1075238405Sjkim.align	64
1076238405SjkimK_XX_XX:
1077238405Sjkim.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1078238405Sjkim.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1079238405Sjkim.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1080238405Sjkim.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1081238405Sjkim.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1082238405Sjkim___
1083238405Sjkim}}}
# Identification string placed in the generated module, followed by a
# 64-byte alignment directive.
1084238405Sjkim$code.=<<___;
1085183234Ssimon.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1086238405Sjkim.align	64
1087183234Ssimon___
1088183234Ssimon
# Win64 structured-exception-handling support, emitted only when $win64 is
# set.  Two exception handlers are generated:
#   se_handler     - bounds are the fixed labels .Lprologue/.Lepilogue; it
#                    restores Rbx, Rbp, R12 and R13 into the CONTEXT record;
#   ssse3_handler  - bounds come from HandlerData[0..1] in the .xdata
#                    record; it copies ten quadwords from 64(Rsp) to
#                    &context.Xmm6 and restores Rbx, Rbp and R12.
# Both finish in .Lcommon_seh_tail, which restores Rsp/Rsi/Rdi in the
# CONTEXT record, copies it to disp->ContextRecord, calls RtlVirtualUnwind
# and returns ExceptionContinueSearch.  The .pdata/.xdata records that
# follow bind the handlers to the sha1_block_data_order* entry points.
#
1089238405Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1090238405Sjkim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1091238405Sjkimif ($win64) {
# Registers in which the four handler arguments listed above arrive.
1092238405Sjkim$rec="%rcx";
1093238405Sjkim$frame="%rdx";
1094238405Sjkim$context="%r8";
1095238405Sjkim$disp="%r9";
1096238405Sjkim
1097238405Sjkim$code.=<<___;
1098238405Sjkim.extern	__imp_RtlVirtualUnwind
1099238405Sjkim.type	se_handler,\@abi-omnipotent
1100238405Sjkim.align	16
1101238405Sjkimse_handler:
1102238405Sjkim	push	%rsi
1103238405Sjkim	push	%rdi
1104238405Sjkim	push	%rbx
1105238405Sjkim	push	%rbp
1106238405Sjkim	push	%r12
1107238405Sjkim	push	%r13
1108238405Sjkim	push	%r14
1109238405Sjkim	push	%r15
1110238405Sjkim	pushfq
1111238405Sjkim	sub	\$64,%rsp
1112238405Sjkim
1113238405Sjkim	mov	120($context),%rax	# pull context->Rax
1114238405Sjkim	mov	248($context),%rbx	# pull context->Rip
1115238405Sjkim
1116238405Sjkim	lea	.Lprologue(%rip),%r10
1117238405Sjkim	cmp	%r10,%rbx		# context->Rip<.Lprologue
1118238405Sjkim	jb	.Lcommon_seh_tail
1119238405Sjkim
1120238405Sjkim	mov	152($context),%rax	# pull context->Rsp
1121238405Sjkim
1122238405Sjkim	lea	.Lepilogue(%rip),%r10
1123238405Sjkim	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1124238405Sjkim	jae	.Lcommon_seh_tail
1125238405Sjkim
1126238405Sjkim	mov	`16*4`(%rax),%rax	# pull saved stack pointer
1127238405Sjkim	lea	32(%rax),%rax
1128238405Sjkim
1129238405Sjkim	mov	-8(%rax),%rbx
1130238405Sjkim	mov	-16(%rax),%rbp
1131238405Sjkim	mov	-24(%rax),%r12
1132238405Sjkim	mov	-32(%rax),%r13
1133238405Sjkim	mov	%rbx,144($context)	# restore context->Rbx
1134238405Sjkim	mov	%rbp,160($context)	# restore context->Rbp
1135238405Sjkim	mov	%r12,216($context)	# restore context->R12
1136238405Sjkim	mov	%r13,224($context)	# restore context->R13
1137238405Sjkim
1138238405Sjkim	jmp	.Lcommon_seh_tail
1139238405Sjkim.size	se_handler,.-se_handler
1140238405Sjkim
1141238405Sjkim.type	ssse3_handler,\@abi-omnipotent
1142238405Sjkim.align	16
1143238405Sjkimssse3_handler:
1144238405Sjkim	push	%rsi
1145238405Sjkim	push	%rdi
1146238405Sjkim	push	%rbx
1147238405Sjkim	push	%rbp
1148238405Sjkim	push	%r12
1149238405Sjkim	push	%r13
1150238405Sjkim	push	%r14
1151238405Sjkim	push	%r15
1152238405Sjkim	pushfq
1153238405Sjkim	sub	\$64,%rsp
1154238405Sjkim
1155238405Sjkim	mov	120($context),%rax	# pull context->Rax
1156238405Sjkim	mov	248($context),%rbx	# pull context->Rip
1157238405Sjkim
1158238405Sjkim	mov	8($disp),%rsi		# disp->ImageBase
1159238405Sjkim	mov	56($disp),%r11		# disp->HandlerData
1160238405Sjkim
1161238405Sjkim	mov	0(%r11),%r10d		# HandlerData[0]
1162238405Sjkim	lea	(%rsi,%r10),%r10	# prologue label
1163238405Sjkim	cmp	%r10,%rbx		# context->Rip<prologue label
1164238405Sjkim	jb	.Lcommon_seh_tail
1165238405Sjkim
1166238405Sjkim	mov	152($context),%rax	# pull context->Rsp
1167238405Sjkim
1168238405Sjkim	mov	4(%r11),%r10d		# HandlerData[1]
1169238405Sjkim	lea	(%rsi,%r10),%r10	# epilogue label
1170238405Sjkim	cmp	%r10,%rbx		# context->Rip>=epilogue label
1171238405Sjkim	jae	.Lcommon_seh_tail
1172238405Sjkim
1173238405Sjkim	lea	64(%rax),%rsi
1174238405Sjkim	lea	512($context),%rdi	# &context.Xmm6
1175238405Sjkim	mov	\$10,%ecx
1176238405Sjkim	.long	0xa548f3fc		# cld; rep movsq
1177238405Sjkim	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer
1178238405Sjkim
1179238405Sjkim	mov	-8(%rax),%rbx
1180238405Sjkim	mov	-16(%rax),%rbp
1181238405Sjkim	mov	-24(%rax),%r12
1182238405Sjkim	mov	%rbx,144($context)	# restore context->Rbx
1183238405Sjkim	mov	%rbp,160($context)	# restore context->Rbp
1184238405Sjkim	mov	%r12,216($context)	# restore context->R12
1185238405Sjkim
1186238405Sjkim.Lcommon_seh_tail:
1187238405Sjkim	mov	8(%rax),%rdi
1188238405Sjkim	mov	16(%rax),%rsi
1189238405Sjkim	mov	%rax,152($context)	# restore context->Rsp
1190238405Sjkim	mov	%rsi,168($context)	# restore context->Rsi
1191238405Sjkim	mov	%rdi,176($context)	# restore context->Rdi
1192238405Sjkim
1193238405Sjkim	mov	40($disp),%rdi		# disp->ContextRecord
1194238405Sjkim	mov	$context,%rsi		# context
1195238405Sjkim	mov	\$154,%ecx		# sizeof(CONTEXT)
1196238405Sjkim	.long	0xa548f3fc		# cld; rep movsq
1197238405Sjkim
1198238405Sjkim	mov	$disp,%rsi
1199238405Sjkim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1200238405Sjkim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1201238405Sjkim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1202238405Sjkim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1203238405Sjkim	mov	40(%rsi),%r10		# disp->ContextRecord
1204238405Sjkim	lea	56(%rsi),%r11		# &disp->HandlerData
1205238405Sjkim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1206238405Sjkim	mov	%r10,32(%rsp)		# arg5
1207238405Sjkim	mov	%r11,40(%rsp)		# arg6
1208238405Sjkim	mov	%r12,48(%rsp)		# arg7
1209238405Sjkim	mov	%rcx,56(%rsp)		# arg8, (NULL)
1210238405Sjkim	call	*__imp_RtlVirtualUnwind(%rip)
1211238405Sjkim
1212238405Sjkim	mov	\$1,%eax		# ExceptionContinueSearch
1213238405Sjkim	add	\$64,%rsp
1214238405Sjkim	popfq
1215238405Sjkim	pop	%r15
1216238405Sjkim	pop	%r14
1217238405Sjkim	pop	%r13
1218238405Sjkim	pop	%r12
1219238405Sjkim	pop	%rbp
1220238405Sjkim	pop	%rbx
1221238405Sjkim	pop	%rdi
1222238405Sjkim	pop	%rsi
1223238405Sjkim	ret
1224238405Sjkim.size	ssse3_handler,.-ssse3_handler
1225238405Sjkim
1226238405Sjkim.section	.pdata
1227238405Sjkim.align	4
1228238405Sjkim	.rva	.LSEH_begin_sha1_block_data_order
1229238405Sjkim	.rva	.LSEH_end_sha1_block_data_order
1230238405Sjkim	.rva	.LSEH_info_sha1_block_data_order
1231238405Sjkim	.rva	.LSEH_begin_sha1_block_data_order_ssse3
1232238405Sjkim	.rva	.LSEH_end_sha1_block_data_order_ssse3
1233238405Sjkim	.rva	.LSEH_info_sha1_block_data_order_ssse3
1234238405Sjkim___
# The AVX entry point gets a .pdata record only when $avx is set.
1235238405Sjkim$code.=<<___ if ($avx);
1236238405Sjkim	.rva	.LSEH_begin_sha1_block_data_order_avx
1237238405Sjkim	.rva	.LSEH_end_sha1_block_data_order_avx
1238238405Sjkim	.rva	.LSEH_info_sha1_block_data_order_avx
1239238405Sjkim___
# .xdata records: handler address, plus the prologue/epilogue label pair
# that ssse3_handler reads as HandlerData[0] and HandlerData[1].
1240238405Sjkim$code.=<<___;
1241238405Sjkim.section	.xdata
1242238405Sjkim.align	8
1243238405Sjkim.LSEH_info_sha1_block_data_order:
1244238405Sjkim	.byte	9,0,0,0
1245238405Sjkim	.rva	se_handler
1246238405Sjkim.LSEH_info_sha1_block_data_order_ssse3:
1247238405Sjkim	.byte	9,0,0,0
1248238405Sjkim	.rva	ssse3_handler
1249238405Sjkim	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
1250238405Sjkim___
# The AVX function reuses ssse3_handler with its own label pair.
1251238405Sjkim$code.=<<___ if ($avx);
1252238405Sjkim.LSEH_info_sha1_block_data_order_avx:
1253238405Sjkim	.byte	9,0,0,0
1254238405Sjkim	.rva	ssse3_handler
1255238405Sjkim	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1256238405Sjkim___
1257238405Sjkim}
1258238405Sjkim
1259183234Ssimon####################################################################
1260183234Ssimon
# Expand every `...` span in the accumulated assembly text by evaluating it
# as a Perl expression, then write the result to STDOUT.
1261183234Ssimon$code =~ s/\`([^\`]*)\`/eval $1/gem;
1262183234Ssimonprint $code;
# Buffered write errors only surface at close, and if STDOUT is a pipe a
# non-zero exit of the reader is reported here too (with $! left at 0), so
# fail loudly instead of leaving truncated output behind a zero exit status.
1263183234Ssimonclose STDOUT or die "error closing STDOUT: $! (exit status $?)";
1264