sha1-x86_64.pl revision 299964
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# good as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27#		gcc 3.4		32-bit asm	cycles/byte
28# Opteron	+45%		+20%		6.8
29# Xeon P4	+65%		+0%		9.9
30# Core2		+60%		+10%		7.0
31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
39# October 2010.
40#
41# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42# is to offload message schedule denoted by Wt in NIST specification,
43# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44# for background and implementation details. The only difference from
45# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46# to free temporary registers.
47
48# April 2011.
49#
50# Add AVX code path. See sha1-586.pl for further information.
51
52######################################################################
53# Current performance is summarized in following table. Numbers are
54# CPU clock cycles spent to process single byte (less is better).
55#
56#		x86_64		SSSE3		AVX
57# P4		9.8		-
58# Opteron	6.6		-
59# Core2		6.7		6.1/+10%	-
60# Atom		11.0		9.7/+13%	-
61# Westmere	7.1		5.6/+27%	-
62# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
63
# Command-line processing: perlasm "flavour" (elf, macosx, mingw64, nasm,
# masm, ...) and optional output file name.  A single dotted argument is
# taken to be the output file itself.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows targets are recognized either by flavour or by .asm extension.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in the
# canonical OpenSSL perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the tool-chain for AVX support: GNU as >= 2.19, nasm >= 2.09,
# ml64 >= 10, or a clang/LLVM-based compiler >= 3.0.  $avx stays undef
# (false) otherwise and the AVX code path is simply not emitted.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ &&
	   $2>=3.0);

# Pipe all generated code through the translator; everything printed to
# STDOUT from here on ends up (post-translation) in $output.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
89
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers and working variables for the integer-only code path.
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");	# two-stage message-word pipeline, rotated each round
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);	# SHA-1 working state, "rotated" after every round
110
# Emit one SHA-1 round for rounds 0..19: F(b,c,d) = Ch, computed as
# ((c^d)&b)^d, with K = 0x5a827999.  For $i<15 the next message word is
# loaded from $inp, byte-swapped and spilled to the 16-dword stack
# buffer; for $i>=15 the schedule recurrence
#   W[j] = rol(W[j-3]^W[j-8]^W[j-14]^W[j-16], 1)
# is computed in place on that buffer.  @xi is a two-stage pipeline:
# the word consumed in round $i was fetched during round $i-1.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);	# prime the pipeline with W[0]
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);	# W[j] still comes straight from the input
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);	# W[j] from the schedule recurrence
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
unshift(@xi,pop(@xi));	# rotate the two-stage message-word pipeline
}
154
# Emit one SHA-1 round for rounds 20..39 and 60..79: F(b,c,d) = parity
# (b^c^d); the constant is selected by round number ($i<40 picks
# K_20_39, otherwise K_60_79).  The schedule update is skipped for the
# last three rounds' store ($i>=76) because those words are never read
# again, and round 79 omits the schedule computation entirely.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);	# no point storing words nobody will read
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);	# final round, no schedule work at all
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
unshift(@xi,pop(@xi));	# rotate the two-stage message-word pipeline
}
191
# Emit one SHA-1 round for rounds 40..59: F(b,c,d) = Maj, computed as
# the sum (c&d) + (b&(c^d)) split across $t0/$t1 so both halves can be
# added to $e independently; K = 0x8f1bbcdc.  Schedule recurrence as in
# BODY_00_19 ($i>=15 case).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
unshift(@xi,pop(@xi));	# rotate the two-stage message-word pipeline
}
217
# Build the exported entry point.  It dispatches at run time on CPUID
# feature bits cached in OPENSSL_ia32cap_P: SSSE3 when available, AVX
# when both the AVX bit and the "Intel CPU" bit are set (and the
# tool-chain could assemble it, $avx), otherwise the plain integer path
# below (.Lialu) with 80 fully unrolled rounds.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# 80 fully unrolled rounds; rotating @V means state never moves between
# registers, only the role of each register changes from round to round.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add this block's result into the context, advance input, loop, then
# restore the callee-saved registers from the saved stack pointer.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
{{{
# SIMD code paths (SSSE3 here, AVX below).  The message schedule is
# computed on the SIMD unit four dwords at a time while the scalar ALU
# runs the round function; X[]+K sums are staged on the stack for it.
my $Xi=4;				# index of next schedule quad to compute
my @X=map("%xmm$_",(4..7,0..3));	# sliding window over the schedule
my @Tx=map("%xmm$_",(8..10));		# temporaries / current K_XX_XX constant
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;				# round counter used by the body_*_* closures
my $K_XX_XX="%r11";			# pointer to the round-constant table

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___
364
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{
    # Any call to an undefined sub, e.g. &pxor("%xmm0","%xmm1"), lands
    # here and is turned into one line of assembly appended to $code.
    # The last Perl argument becomes the *first* assembly operand
    # (AT&T order), and a bare number is prefixed with '$' so it is
    # emitted as an immediate.
    my $mnemonic = $AUTOLOAD;
    $mnemonic =~ s/.*:://;			# strip package qualifier
    my $first = pop;
    $first = "\$$first" if ($first*1 eq $first);	# numeric => immediate
    my @operands = ($first, reverse @_);
    $code .= "\t$mnemonic\t" . join(',', @operands) . "\n";
}
371
# Compute one quad of the message schedule for rounds 16..31,
# W[i] = rol(W[i-3]^W[i-8]^W[i-14]^W[i-16], 1) four dwords at a time,
# interleaved with four scalar rounds generated by $body (the eval'ed
# strings bind to the lexical $a..$e below).  The W[i-3] lane of the
# newest quad is not available yet, so the fourth dword is produced via
# the "<<96, rol 2" fix-up.  Result+K is stored to the stack for the
# integer ALU.  Rotates @X/@Tx and bumps $Xi on exit.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
452
# Compute one quad of the message schedule for rounds 32..79 using the
# equivalent recurrence W[i] = rol(W[i-6]^W[i-16]^W[i-28]^W[i-32], 2),
# which (unlike the 16..31 form) has all four source lanes available at
# once.  Interleaved with four scalar rounds from $body; the K constant
# is reloaded from the table every fifth quad.  Rotates @X/@Tx and
# bumps $Xi on exit.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
519
# Last schedule step of a block: emit the final X[]+K stack transfer
# interleaved with four scalar rounds, then either branch the emitted
# code to .Ldone_ssse3 (when $inp has reached $num, i.e. no more data)
# or load and byte-swap the next 64-byte input block.  Resets $Xi to 0
# for the following Xloop_ssse3 passes.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}
553
# One of the first four quads of the *next* block: byte-swap the
# freshly loaded input quad, add K_00_19 and stage the sum on the stack
# for the ALU, then subtract K again to restore raw X[] -- all
# interleaved with four scalar rounds finishing the current block.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}
578
sub Xtail_ssse3()
{ use integer;
  my $body = shift;			# generator for one scalar round
  my ($a,$b,$c,$d,$e);			# lexicals the eval'ed code binds to

	# No message-schedule work remains for the trailing rounds of the
	# last block: just emit four rounds' worth (32 instructions) of
	# plain integer-ALU code.
	foreach my $insn (&$body, &$body, &$body, &$body) {
		eval $insn;
	}
}
587
# Return a list of Perl-code strings, each of which (when eval'ed by an
# Xupdate/Xloop/Xtail sub) emits one instruction of a SIMD-path round
# 0..19: F = Ch computed as ((c^d)&b)^d, with @T[0] holding $b from the
# previous round.  The final string also advances $j and rotates
# @V/@T so consecutive evals generate consecutive rounds.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
603
# Like body_00_19, but for SIMD-path rounds 20..39/60..79:
# F = parity ($b^$c^$d), accumulated in @T[0] across rounds.
sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
617
# Like body_00_19, but for SIMD-path rounds 40..59:
# F = Maj, computed as the sum ($c&$d) + ($b&($c^$d)).
sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
# Emit the SSSE3 main loop: 16 schedule updates interleaved with the 80
# rounds, with the schedule for the *next* block (Xloop passes) folded
# into the tail rounds of the current one.
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				# remember generator state so the "done" tail
				# below re-generates the same final rounds
				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
715
# AVX code path: same structure as SSSE3, but three-operand VEX
# instructions remove most register-copy moves, and rol/ror become
# shld/shrd.  Only emitted when the tool-chain supports AVX ($avx).
if ($avx) {
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

# rotate via double-shift with the register as both sources
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument
	vzeroupper

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___
782
# AVX twin of Xupdate_ssse3_16_31: one schedule quad for rounds 16..31,
# W[i] = rol(W[i-3]^W[i-8]^W[i-14]^W[i-16], 1), interleaved with four
# scalar rounds from $body; three-operand VEX forms eliminate the
# register-copy moves of the SSSE3 version.  Rotates @X/@Tx, bumps $Xi.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
858
# AVX twin of Xupdate_ssse3_32_79: one schedule quad for rounds 32..79
# via W[i] = rol(W[i-6]^W[i-16]^W[i-28]^W[i-32], 2), interleaved with
# four scalar rounds from $body.  K constant reloaded every fifth quad.
# Rotates @X/@Tx, bumps $Xi.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
923
# Last schedule step of a block in the AVX path: emit the final X[]+K
# stack transfer interleaved with four scalar rounds, then either
# branch the emitted code to .Ldone_avx (when $inp has reached $num) or
# load and byte-swap the next 64-byte input block.  Resets $Xi to 0 for
# the following Xloop_avx passes.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  # use the VEX-encoded store here: everything else in the AVX path
	  # is VEX-encoded, and a lone legacy-SSE movdqa would risk
	  # AVX<->SSE transition penalties (the store itself is identical)
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
957
# AVX twin of Xloop_ssse3: byte-swap one freshly loaded input quad of
# the next block and stage its X[]+K sum on the stack, interleaved with
# four scalar rounds finishing the current block.  Three-operand vpaddd
# writes the sum to a separate register, so no psubd restore is needed.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}
981
sub Xtail_avx()
{ use integer;
  my $body = shift;			# generator for one scalar round
  my ($a,$b,$c,$d,$e);			# lexicals the eval'ed code binds to

	# No message-schedule work remains for the trailing rounds of the
	# last block: just emit four rounds' worth (32 instructions) of
	# plain integer-ALU code.
	foreach my $insn (&$body, &$body, &$body, &$body) {
		eval $insn;
	}
}
990
# Emit the AVX main loop (same round/schedule interleave as the SSSE3
# loop), the .Ldone_avx tail, the Win64 xmm restores, the epilogue, and
# finally the shared constant table K_XX_XX.
$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				# remember generator state so the "done" tail
				# below re-generates the same final rounds
				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_avx

.align	16
.Ldone_avx:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
}
# Round constants (four copies each, for paddd/vpaddd) plus the pshufb
# byte-swap mask; shared by the SSSE3 and AVX paths.
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
___
}}}
# Identification string; keeps implementation provenance visible in the
# object file.
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
1088
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception-handling support: hand-written unwind
# handlers that recover the callee-saved registers (and, for the SIMD
# paths, the spilled xmm registers) before calling RtlVirtualUnwind,
# plus the .pdata/.xdata tables that register them.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	64(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}
1258
####################################################################

# Resolve all `...` constant expressions embedded in the assembly text,
# then emit it through the perlasm translator (STDOUT is the pipe
# opened above; closing it flushes and waits for the translator).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
1264