# aesni-sha1-x86_64.pl revision 299964
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# June 2011
11#
12# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
13# in http://download.intel.com/design/intarch/papers/323686.pdf, is
14# that since AESNI-CBC encrypt exhibit *very* low instruction-level
15# parallelism, interleaving it with another algorithm would allow to
16# utilize processor resources better and achieve better performance.
17# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
18# AESNI code is weaved into it. Below are performance numbers in
19# cycles per processed byte, less is better, for standalone AESNI-CBC
20# encrypt, sum of the latter and standalone SHA1, and "stitched"
21# subroutine:
22#
23#		AES-128-CBC	+SHA1		stitch      gain
24# Westmere	3.77[+5.6]	9.37		6.65	    +41%
25# Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
26#
27#		AES-192-CBC
28# Westmere	4.51		10.11		6.97	    +45%
29# Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
30#
31#		AES-256-CBC
32# Westmere	5.25		10.85		7.25	    +50%
33# Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
34#
35# (*)	There are two code paths: SSSE3 and AVX. See sha1-568.pl for
36#	background information. Above numbers in parentheses are SSSE3
37#	results collected on AVX-capable CPU, i.e. apply on OSes that
38#	don't support AVX.
39#
40# Needless to mention that it makes no sense to implement "stitched"
41# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
42# fully utilize parallelism, so stitching would not give any gain
43# anyway. Well, there might be some, e.g. because of better cache
44# locality... For reference, here are performance results for
45# standalone AESNI-CBC decrypt:
46#
47#		AES-128-CBC	AES-192-CBC	AES-256-CBC
48# Westmere	1.31		1.55		1.80
49# Sandy Bridge	0.93		1.06		1.22
50
# Command line: either "flavour output" or just "output" (flavour deduced
# by the xlate script).  $flavour selects the assembler dialect.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Enable the AVX code path only when the assembler is new enough to know
# the AVX mnemonics: GNU as >= 2.19, nasm >= 2.09, ml64 >= 10, or clang/LLVM.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);

# Pipe all generated code through the translator; redirect STDOUT into it.
# The open was previously unchecked: a missing/broken translator would be
# silently ignored and the output lost.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
75
# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);

# Emit the public entry point.  It reads the OPENSSL_ia32cap_P capability
# vector and jumps to the AVX implementation only when both the AVX bit
# and the "Intel CPU" bit are set; otherwise it falls through to SSSE3.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	16
aesni_cbc_sha1_enc:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
___
# AVX dispatch is compiled in only when the assembler supports AVX.
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___
108
# Register assignment for the SSSE3 path: the seven C arguments in their
# System V ABI registers (7th argument comes off the stack).
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;				# message-schedule index, starts at 4
my @X=map("%xmm$_",(4..7,0..3));	# rotating W[] window
my @Tx=map("%xmm$_",(8..10));		# scratch / K constant registers
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");			# rotating temporaries for F()
my $j=0; my $jj=0; my $r=0; my $sn=0;	# round / body / aesenc / label counters
my $K_XX_XX="%r11";			# pointer to the K constant table
my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));	# CBC state, input block, key[0]
my @rndkey=("%xmm14","%xmm15");		# rotating pair of AES round keys

# Any call to an undefined &mnemonic(args) lands here and is appended to
# $code as "\tmnemonic\targs" with the argument order reversed (perlasm
# 32-bit style); a purely numeric last argument is turned into a "$imm".
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Rotate thunks; the AVX path later rebinds these to shld/shrd.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
130
# SSSE3 function prologue: fetch the 7th (stack) argument, save the six
# callee-saved GPRs, and reserve 104 bytes of scratch (64-byte X[]+K
# transfer area plus spill slots) — plus xmm6-15 save area on Win64.
$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	16
aesni_cbc_sha1_enc_ssse3:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
# Win64 ABI: xmm6-xmm15 are callee-saved and must be preserved.
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
# Move arguments out of registers the round code will clobber.
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	mov	$key,%r15
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
# From here on the argument names refer to the saved copies in r12-r15.
my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
# Load the SHA1 state, byte-swap the first 64-byte block and pre-compute
# X[i]+K_00_19 into the stack transfer area for rounds 0..15.
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___
209
# Emit one step of the interleaved AES-CBC encryption.  $r counts calls:
# $n = $r/10 is the 16-byte block being encrypted, $k = $r%10 the AES
# round within it (10 steps per block are woven into the SHA1 stream).
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {			# first step: load/xor input, start CBC
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	aesenc		$rndkey[0],$iv
	movups		`32+16*$k`($key),$rndkey[1]
___
    } elsif ($k==9) {			# last step: handle 128/192/256-bit keys
      $sn++;				# unique label per block
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16($key),$rndkey[1]		# forward reference
___
    } else {				# middle rounds: one aesenc + key prefetch
      $code.=<<___;
	aesenc		$rndkey[0],$iv
	movups		`32+16*$k`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));	# rotate the round-key pair
};
252
# Message schedule for rounds 16..31, interleaved with four SHA1 round
# bodies (passed as $body).  Computes W[t] = (W[t-3]^W[t-8]^W[t-14]^
# W[t-16])<<<1 four lanes at a time, including the fix-up for the lane
# that depends on the freshly produced W (the pslldq/psrld dance below),
# and stores W[]+K into the stack transfer area for the IALU rounds.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
333
# Message schedule for rounds 32..79: W[t] = (W[t-6]^W[t-16]^W[t-28]^
# W[t-32])<<<2, which works on whole 4-lane vectors (no per-lane fix-up
# needed).  Interleaves the SIMD ops with four round bodies and either
# reuses or loads the next K constant every fifth schedule step.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
400
# Final schedule step of a block (round 80): store the last W[]+K, then
# test for end of input.  If more data remains, load and byte-swap the
# next 64-byte block and reset $Xi so the following Xloop calls process
# its first 16 rounds; otherwise jump to .Ldone_ssse3.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}
434
# Rounds 0..15 of the *next* block, overlapped with the tail rounds of
# the current one: byte-swap one input vector, add K_00_19, spill W[]+K
# to the stack and restore the raw W[] value — four rounds per call.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}
459
# Tail rounds after the last block: emit four plain SHA1 round bodies
# with no SIMD message-schedule work left to interleave.
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
468
# One SHA1 round for t=0..19, F = (b&(c^d))^d ("Ch"), returned as a list
# of perl-code strings (roughly one instruction each) that the Xupdate
# routines eval interleaved with SIMD work.  An '&$aesenc()' call is
# appended to the element chosen by the $k formula so that 12 AES round
# steps are spread evenly over these 20 SHA1 rounds; $jj counts calls.
sub body_00_19 () {
  use integer;
  my ($k,$n);
  my @r=(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
	$n = scalar(@r);
	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;
    return @r;
}
491
# One SHA1 round for t=20..39 and 60..79, F = b^c^d ("Parity"), in the
# same string-list form as body_00_19.  8 AES round steps are spread
# over these 20 SHA1 rounds via the $k formula.
sub body_20_39 () {
  use integer;
  my ($k,$n);
  my @r=(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
	$n = scalar(@r);
	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;
    return @r;
}
512
# One SHA1 round for t=40..59, F = (b&c)|(b&d)|(c&d) ("Maj"), computed
# here as (b&(c^d)) + (c&d) using the two rotating temporaries.  12 AES
# round steps are spread over these 20 SHA1 rounds.
sub body_40_59 () {
  use integer;
  my ($k,$n);
  my @r=(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
	$n = scalar(@r);
	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;
    return @r;
}
# Generate the SSSE3 main loop: 80 SHA1 rounds per 64-byte block with
# the message schedule and AES-CBC steps interleaved, then the state
# update.  Generator state ($j,$r,@V,@rndkey) is snapshotted before the
# Xloop calls so the .Ldone_ssse3 tail can be generated from the same
# point the runtime jump leaves off at.
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

# End of one iteration: flush the last ciphertext block, advance the
# input pointer and fold the working variables back into the context.
$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
# Rewind the generator to the snapshot and emit the final-tail rounds.
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

# Final state update, IV write-back and epilogue.
$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	$iv,($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___

# Reset the generator counters before building the AVX variant.
$j=$jj=$r=$sn=0;
636
# AVX variant of the whole routine, generated only when the assembler
# supports AVX.  Same structure as the SSSE3 path with 3-operand VEX
# instructions; rotates are rebound to shld/shrd and the key pointer is
# biased by +112 so round-key offsets fit in shorter displacements.
if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_avx,\@function,6
.align	16
aesni_cbc_sha1_enc_avx:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_avx			# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
# Win64 ABI: xmm6-xmm15 are callee-saved and must be preserved.
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	mov	$key,%r15
	vmovdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
# Load SHA1 state and the first block; note the +112 key-pointer bias.
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240($key),$rounds
	add	\$112,$key		# size optimization
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey0	# $key[0]
	vmovups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_avx
___
726
# AVX counterpart of the $aesenc closure: same 10-steps-per-block
# interleaving scheme, VEX-encoded, with the -112 key-pointer bias
# matching the "add \$112,$key" in the prologue.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {			# first step: load/xor input, start CBC
      $code.=<<___;
	vmovups		`16*$n`($in0),$in		# load input
	vxorps		$rndkey0,$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vxorps		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {			# last step: handle 128/192/256-bit keys
      $sn++;				# unique label per block
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		16-112($key),$rndkey[1]		# forward reference
___
    } else {				# middle rounds: one vaesenc + key prefetch
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));	# rotate the round-key pair
};
769
# AVX message schedule for rounds 16..31 — same algorithm as the SSSE3
# version, but 3-operand VEX forms remove most of the register-copy
# moves.  Interleaved with four SHA1 round bodies.
sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
845
# AVX message schedule for rounds 32..79: W[t] = (W[t-6]^W[t-16]^
# W[t-28]^W[t-32])<<<2, whole-vector form, interleaved with four round
# bodies; the K constant is reused or reloaded every fifth step.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
910
# Final schedule step of a block (round 80), AVX path: store the last
# W[]+K, test for end of input, and either preload the next block
# (resetting $Xi) or jump to .Ldone_avx.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	# NOTE(review): plain (SSE) movdqa in the AVX path — presumably
	# benign, but vmovdqa would avoid any SSE/AVX transition concern;
	# confirm against the runtime environment before changing.
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
944
# Rounds 0..15 of the next block, AVX path: byte-swap one input vector,
# add K_00_19 into a separate register (no psubd restore needed thanks
# to 3-operand form) and spill W[]+K to the stack — four rounds per call.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}
968
# Tail rounds after the last block, AVX path: four plain SHA1 round
# bodies with no message-schedule work left to interleave.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
977
# Generate the AVX main loop — same round layout as the SSSE3 path,
# with generator state snapshotted before the Xloop calls so the
# .Ldone_avx tail resumes from the matching point.
$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

# End of one iteration: flush ciphertext, advance, update the context.
$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_avx

.align	16
.Ldone_avx:
___
# Rewind the generator to the snapshot and emit the final-tail rounds.
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

# Final state update, IV write-back (and vzeroall to scrub key material
# out of the ymm/xmm registers), then the epilogue.
$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	$iv,($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___
}
# Constant table: the four SHA1 round constants (each broadcast to all
# four dword lanes) followed by the big-endian byte-swap shuffle mask.
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
1089
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support: a common unwind handler
# shared by both code paths (their stack frames have identical layout),
# plus the .pdata/.xdata tables binding it to each function's
# prologue/epilogue label pair.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}
1218
1219####################################################################
# Append a REX prefix byte to the opcode byte list (passed by reference)
# when either operand register index is >= 8, i.e. when xmm8-xmm15 need
# the REX.R (ModRM.reg) or REX.B (ModRM.rm) extension bit.
sub rex {
  my ($bytes,$dst,$src)=@_;	# $bytes is an array reference
  my $prefix=0x40;		# base REX

  $prefix|=0x04 if $dst>=8;	# REX.R
  $prefix|=0x01 if $src>=8;	# REX.B
  push @$bytes,$prefix if $prefix!=0x40;	# omit when no bit is needed
}
1229
# Translate "aesenc/aesenclast %xmmA,%xmmB" into a raw ".byte" encoding
# (66 [REX] 0F 38 DC/DD /r) so the output assembles even on assemblers
# that predate AES-NI.  Non-AES lines are returned verbatim; an AES
# mnemonic without an encoding entry yields undef.
sub aesni {
  my $line=shift;
  my %byte_of = (
	"aesenc" => 0xdc,	"aesenclast" => 0xdd
  );

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($op,$src,$dst)=($1,$2,$3);
	return undef if (!defined($byte_of{$op}));
	my @bytes=(0x66);			# mandatory prefix
	my $prefix=0x40;			# REX, if xmm8-15 involved
	$prefix|=0x04 if $dst>=8;		# REX.R for ModRM.reg
	$prefix|=0x01 if $src>=8;		# REX.B for ModRM.rm
	push @bytes,$prefix if $prefix!=0x40;
	push @bytes,0x0f,0x38,$byte_of{$op};
	push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M, reg-reg form
	return ".byte\t".join(',',@bytes);
    }
    return $line;
}
1246
# Post-process the accumulated text: expand `...` constructs (compile-time
# arithmetic left in the generated assembly) and rewrite AES-NI mnemonics
# as raw .byte sequences for assemblers that lack them.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
# STDOUT is a pipe into the xlate script; an unchecked close would drop
# buffered-write and child-exit errors silently, truncating the output.
close STDOUT or die "error closing STDOUT: $!";
1252