#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for the Intel AES-NI extension. In
# the OpenSSL context it's used with the Intel engine, but it can also
# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
# below for details].
#
# Performance.
#
# To start with, see the corresponding paragraph in aesni-x86_64.pl...
# Instead of filling in a table similar to the one found there I've
# chosen to summarize *comparison* results for raw ECB, CTR and CBC
# benchmarks. The simplified table below represents 32-bit performance
# relative to the 64-bit one at every given point. Ratios vary between
# encryption modes, hence the interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually the same: 32-bit code is less
# than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike the x86_64 version this
# module interleaves at most 6 aes[enc|dec] instructions, because there
# are not enough registers for 8x interleave [which should be optimal
# for Sandy Bridge]. Actually, the performance results for the 6x
# interleave factor presented in aesni-x86_64.pl (except for CTR) are
# for this module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

# Configuration knobs.
$PREFIX = "aesni";	# when set to "AES" the script generates a drop-in
			# replacement for crypto/aes/asm/aes-586.pl:-)
$inline = 1;		# expand _aesni_[en|de]crypt at the point of use
			# instead of calling the out-of-line routine

# Make the script's own directory and the perlasm directory (relative to
# it) visible to require, then load the x86 perlasm back-end.
($dir) = ($0 =~ m/(.*[\/\\])[^\/\\]+$/);
push(@INC, "${dir}", "${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0], $0);

# Instruction used to load round keys. The original selected it per
# $PREFIX, but both choices were movups, so a single assignment suffices.
$movekey = *movups;

# General-purpose register roles.
($len, $rounds, $key, $inp, $out) = ("eax", "ecx", "edx", "esi", "edi");
($rounds_, $key_) = ("ebx", "ebp");	# backup copies of $rounds and $key

# XMM register roles: two round-key registers followed by six data
# registers (xmm2..xmm7).
($rndkey0, $rndkey1) = ("xmm0", "xmm1");
($inout0, $inout1, $inout2, $inout3, $inout4, $inout5) =
	map { "xmm$_" } (2 .. 7);

# Alternative names for the upper three data registers.
($in1, $in0, $ivec) = ($inout3, $inout4, $inout5);	# xmm5, xmm6, xmm7
# AESNI extension
#
# aeskeygenassist(dst, src, imm)
#
# Emits the instruction as raw bytes through data_byte: the fixed
# prefix 66 0F 3A DF, a register-direct ModR/M byte built from the two
# register numbers (dst in bits 5:3, src in bits 2:0) and the immediate.
# Only xmm0..xmm7 operands can be encoded this way; anything else used to
# be dropped silently, leaving the instruction out of the generated
# code, so it is now reported as a fatal error.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'";	}
}
# aescommon(opcodelet, dst, src)
#
# Shared encoder for the two-operand AES instructions. Emits, through
# data_byte, the bytes 66 0F 38 <opcodelet> followed by a register-direct
# ModR/M byte (dst register number in bits 5:3, src in bits 2:0).
# Only xmm0..xmm7 operands can be encoded this way; anything else used to
# be dropped silently, leaving the instruction out of the generated
# code, so it is now reported as a fatal error.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
    else
    {	die sprintf("aescommon(0x%02x): unsupported operands '%s','%s'",
		    $opcodelet,$dst,$src);	}
}
# One emitter per AES instruction. Each forwards its (dst, src) operands
# to aescommon together with the opcode byte that selects the
# instruction.
sub aesimc	{ my @operands=@_; return aescommon(0xdb,@operands); }
sub aesenc	{ my @operands=@_; return aescommon(0xdc,@operands); }
sub aesenclast	{ my @operands=@_; return aescommon(0xdd,@operands); }
sub aesdec	{ my @operands=@_; return aescommon(0xde,@operands); }
sub aesdeclast	{ my @operands=@_; return aescommon(0xdf,@operands); }

# Inline version of internal aesni_[en|de]crypt1
#
# aesni_inline_generate1(p [,inout [,ivec]])
#
#   p      "enc" or "dec"; selects aes${p} / aes${p}last.
#   inout  register holding the block to process (default $inout0).
#   ivec   optional register; when given, the first round key is xor-ed
#          into it and the result is xor-ed into inout, so the register
#          passed as ivec is modified by the emitted code.
#
# The emitted code advances $key and counts $rounds down to zero, so
# callers that need them afterwards have to restore both. Every expansion
# gets its own loop label through the $sn counter.
#
# The aes${p} emitters are looked up once as code references. The former
# string-eval dispatch was re-parsed on every call and swallowed any
# error raised while emitting the instruction.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	&$aes		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));
    &$aeslast		($inout,$rndkey1);
}}

# aesni_generate1(p [,inout])
#
# Emits the out-of-line single-block routine _aesni_${p}rypt1 ($p is
# "enc" or "dec"), operating on register inout (default $inout0) with the
# key schedule at $key. The rounds are fully unrolled; $rounds is
# compared against 11 to choose the entry point: below jumps to
# "${p}128", equal jumps to "${p}192", above falls through. Judging by
# the label names these correspond to the three AES key sizes --
# NOTE(review): the stored round-count convention is set by key-setup
# code that is not visible here.
#
# The aes${p} emitters are looked up once as code references. The former
# string-eval dispatch was re-parsed on every call and swallowed any
# error raised while emitting the instruction.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));	# lea leaves the flags intact
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x10,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x30,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x50,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x60,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x70,$key));
	&$aes		($inout,$rndkey1);
    &$aeslast		($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. The emitted code loads 16 bytes from inp,
# fetches the round count from offset 240 of key, runs the one-block
# routine (expanded in place when $inline is set, otherwise called after
# being generated just ahead of the function) and stores 16 bytes to
# out. Both functions share one template; only the direction differs.
foreach my $p ("enc","dec") {
    my $fn="${PREFIX}_${p}rypt";

    &aesni_generate1($p) if (!$inline);
    &function_begin_B($fn);
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)	{ &aesni_inline_generate1($p);	}
	else		{ &call("_aesni_${p}rypt1");	}
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
    &function_end_B($fn);
}

# _aesni_[en|de]cryptN are private interfaces, N denotes the interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when the subroutine's throughput is virtually the
# same as that of the non-interleaved subroutine [for a number of input
# blocks up to 3]. This is why it makes no sense to implement a 2x
# subroutine. aes[enc|dec] latency in the next processor generation is
# 8, but the instructions can be scheduled every cycle. Optimal
# interleave for the new processor is therefore 8x, but it's unfeasible
# to accommodate it in the XMM registers addressable in 32-bit mode and
# therefore 6x is used instead...

# aesni_generate3(p)
#
# Emits _aesni_${p}rypt3 ($p is "enc" or "dec"), which processes
# $inout0..$inout2 in parallel with the key schedule at $key. $rounds is
# halved on entry and the loop applies two round keys per iteration,
# alternating between $rndkey1 and $rndkey0; the last two round keys are
# applied after the loop. $key and $rounds are consumed by the emitted
# code.
#
# The aes${p} emitters are looked up once as code references. The former
# string-eval dispatch was re-parsed on every call and swallowed any
# error raised while emitting the instruction.
sub aesni_generate3
{ my $p=shift;
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label("${p}3_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&dec		($rounds);
	&$aes		($inout2,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&$aes		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("${p}3_loop"));
    &$aes		($inout0,$rndkey1);
    &$aes		($inout1,$rndkey1);
    &$aes		($inout2,$rndkey1);
    &$aeslast		($inout0,$rndkey0);
    &$aeslast		($inout1,$rndkey0);
    &$aeslast		($inout2,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4(p)
#
# Emits _aesni_${p}rypt4 ($p is "enc" or "dec"), which processes
# $inout0..$inout3 in parallel with the key schedule at $key. $rounds is
# halved on entry and the loop applies two round keys per iteration; the
# last two round keys are applied after the loop. $key and $rounds are
# consumed by the emitted code.
#
# The aes${p} emitters are looked up once as code references. The former
# string-eval dispatch was re-parsed on every call and swallowed any
# error raised while emitting the instruction.
sub aesni_generate4
{ my $p=shift;
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shr		($rounds,1);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label("${p}4_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&dec		($rounds);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
    &jnz		(&label("${p}4_loop"));

    &$aes		($inout0,$rndkey1);
    &$aes		($inout1,$rndkey1);
    &$aes		($inout2,$rndkey1);
    &$aes		($inout3,$rndkey1);
    &$aeslast		($inout0,$rndkey0);
    &$aeslast		($inout1,$rndkey0);
    &$aeslast		($inout2,$rndkey0);
    &$aeslast		($inout3,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# aesni_generate6(p)
#
# Emits _aesni_${p}rypt6 ($p is "enc" or "dec"), which processes
# $inout0..$inout5 in parallel with the key schedule at $key. The first
# round is interleaved with the initial round-key xor, after which
# control jumps into the middle of the two-rounds-per-iteration loop at
# the label "_aesni_${p}rypt6_enter". That label is also declared via
# static_label -- presumably so that code outside this routine can
# refer to it; confirm against the perlasm back-end. $rounds is halved
# on entry; $key and $rounds are consumed by the emitted code.
#
# The aes${p} emitters are looked up once as code references. The former
# string-eval dispatch was re-parsed on every call and swallowed any
# error raised while emitting the instruction.
sub aesni_generate6
{ my $p=shift;
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&$aes		($inout0,$rndkey1);
	&pxor		($inout2,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&dec		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout3,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$aes		($inout4,$rndkey1);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$aes		($inout5,$rndkey1);
	&jmp		(&label("_aesni_${p}rypt6_enter"));

    &set_label("${p}6_loop",16);
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&dec		($rounds);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$aes		($inout4,$rndkey1);
	&$aes		($inout5,$rndkey1);
    &set_label("_aesni_${p}rypt6_enter",16);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$aes		($inout4,$rndkey0);
	&$aes		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
    &jnz		(&label("${p}6_loop"));

    &$aes		($inout0,$rndkey1);
    &$aes		($inout1,$rndkey1);
    &$aes		($inout2,$rndkey1);
    &$aes		($inout3,$rndkey1);
    &$aes		($inout4,$rndkey1);
    &$aes		($inout5,$rndkey1);
    &$aeslast		($inout0,$rndkey0);
    &$aeslast		($inout1,$rndkey0);
    &$aeslast		($inout2,$rndkey0);
    &$aeslast		($inout3,$rndkey0);
    &$aeslast		($inout4,$rndkey0);
    &$aeslast		($inout5,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Instantiate the interleaved helpers, in the order 3x, 4x, 6x. The
# decrypt flavour is always generated; the encrypt flavour only when the
# native "aesni" interface is being built.
foreach my $generate (\&aesni_generate3,\&aesni_generate4,\&aesni_generate6) {
    &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}

353238384Sjkimif ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# The emitted code rounds length down to a multiple of 16 and returns
# immediately if nothing is left. A zero enc argument selects the
# decrypt path, anything else the encrypt path. Each path processes six
# blocks per iteration while at least 0x60 bytes remain and then
# dispatches on the tail: one block uses the single-block routine, two
# and three use the 3x routine (with $inout2 cleared for two), four the
# 4x routine, and five the 6x routine with $inout5 cleared.
#
# The two paths differ only in direction, so both are produced by one
# generator; this also makes every instruction go through the same
# &-call convention (the encrypt path used to contain one bare "jmp").
{
my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

# Emit stores of the first $n data registers to consecutive 16-byte
# slots at $out.
my $store_blocks = sub {
    my $n=shift;
    for my $i (0..$n-1) { &movups(&QWP(16*$i,$out),$blk[$i]); }
};

# Emit one complete direction. $p is "enc" or "dec"; $is_last is true
# for the path that is laid out immediately before "ecb_ret" and may
# therefore fall through instead of jumping there.
my $ecb_path = sub {
    my ($p,$is_last)=@_;

	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_${p}_tail"));

	for my $i (0..5) { &movdqu($blk[$i],&QWP(16*$i,$inp)); }
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_${p}_loop6_enter"));

    &set_label("ecb_${p}_loop6",16);
	# store the previous six results while loading the next six inputs
	for my $i (0..4) {
	    &movups	(&QWP(16*$i,$out),$blk[$i]);
	    &movdqu	($blk[$i],&QWP(16*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label("ecb_${p}_loop6_enter");

	&call	("_aesni_${p}rypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_${p}_loop6"));

	&$store_blocks(6);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

    &set_label("ecb_${p}_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_${p}_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_${p}_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_${p}_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_${p}_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_${p}rypt6");
	&$store_blocks(5);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_one",16);
	if ($inline)	{ &aesni_inline_generate1($p);	}
	else		{ &call("_aesni_${p}rypt1");	}
	&$store_blocks(1);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_${p}rypt3");
	&$store_blocks(2);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_three",16);
	&call	("_aesni_${p}rypt3");
	&$store_blocks(3);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_four",16);
	&call	("_aesni_${p}rypt4");
	&$store_blocks(4);
	&jmp	(&label("ecb_ret"))	if (!$is_last);
};

&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

	&$ecb_path("enc",0);
######################################################################
&set_label("ecb_decrypt",16);
	&$ecb_path("dec",1);

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
}

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Both functions share the same prologue, the same two-rounds-per-
# iteration loop over the counter block and the CMAC block, and the
# same epilogue; these are emitted by the three helpers below.
#
{ my $cmac=$inout1;

# Load the arguments, carve an aligned scratch area out of the stack
# (the caller's esp is kept at 48(esp)), load ivec and cmac, and build
# the two constants kept on the stack: the pshufb byte-swap mask at
# 0(esp) and the counter increment vector at 16(esp).
my $ccm64_prologue = sub {
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	my @bswap_mask=(0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203);
	for my $i (0..3) { &mov(&DWP(4*$i,"esp"),$bswap_mask[$i]); }

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	for my $offset (20,24,28) { &mov(&DWP($offset,"esp"),$key_); }
};

# Emit the loop named $label, which applies two round keys per iteration
# to both $inout0 and $cmac.
my $ccm64_round_pairs = sub {
    my $label=shift;

    &set_label($label);
	&aesenc		($inout0,$rndkey1);
	&dec		($rounds);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&aesenc		($inout0,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label($label));
};

# Restore the caller's stack pointer and store $cmac through the sixth
# argument.
my $ccm64_epilogue = sub {
	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);
};

&function_begin("aesni_ccm64_encrypt_blocks");
	&$ccm64_prologue();

	&shr	($rounds,1);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);
	&mov	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&lea		($key,&DWP(32,$key_));
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(0,$key));

	&$ccm64_round_pairs("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&dec	($len);
	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);
	&jnz	(&label("ccm64_enc_outer"));

	&$ccm64_epilogue();
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&$ccm64_prologue();

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	if ($inline)	{ &aesni_inline_generate1("enc");	}
	else		{ &call("_aesni_encrypt1");		}
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&mov	($rounds,$rounds_);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&lea		($key,&DWP(32,$key_));
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(0,$key));

	&$ccm64_round_pairs("ccm64_dec2_loop");
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&lea		($inp,&QWP(16,$inp));
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($key,$key_);
	# NOTE(review): the inline expansion folds $in0 into $cmac and
	# encrypts $cmac. The out-of-line alternative hands $cmac to call as
	# an extra argument, while _aesni_encrypt1 is generated for $inout0
	# -- verify this branch before building with $inline=0.
	if ($inline)	{ &aesni_inline_generate1("enc",$cmac,$in0);	}
	else		{ &call("_aesni_encrypt1",$cmac);		}

	&$ccm64_epilogue();
&function_end("aesni_ccm64_decrypt_blocks");
}

760238384Sjkim######################################################################
761238384Sjkim# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
762238384Sjkim#                         size_t blocks, const AES_KEY *key,
763238384Sjkim#                         const char *ivec);
764238384Sjkim#
765238384Sjkim# Handles only complete blocks, operates on 32-bit counter and
766238384Sjkim# does not update *ivec! (see engine/eng_aesni.c for details)
767238384Sjkim#
768238384Sjkim# stack layout:
769238384Sjkim#	0	pshufb mask
770238384Sjkim#	16	vector addend: 0,6,6,6
771238384Sjkim# 	32	counter-less ivec
772238384Sjkim#	48	1st triplet of counter vector
773238384Sjkim#	64	2nd triplet of counter vector
774238384Sjkim#	80	saved %esp
775238384Sjkim
# Emits aesni_ctr32_encrypt_blocks: encrypts successive counter blocks and
# XORs them into whole 16-byte input blocks.  wparam(0..4) are read into
# $inp, $out, $len (a block count), $key and, temporarily, $rounds_ (ivec
# pointer).
#
# Flow of the generated code:
#   - $len == 1 takes ctr32_one_shortcut: the ivec is encrypted as loaded
#     and XORed with the single input block; no stack tables are built.
#   - otherwise the pshufb mask, the per-pass addend and two triplets of
#     consecutive 32-bit counter values are composed in the aligned stack
#     frame; ctr32_loop6 handles six blocks per pass and ctr32_tail
#     dispatches the remaining 1..5 blocks.
# The caller's %esp is parked at 80(%esp) and reloaded at ctr32_ret.
# NOTE(review): function_begin/function_end and the _aesni_encrypt*
# routines are defined outside this chunk; their register contracts are
# assumed, not shown here.
776238384Sjkim&function_begin("aesni_ctr32_encrypt_blocks");
777238384Sjkim	&mov	($inp,&wparam(0));
778238384Sjkim	&mov	($out,&wparam(1));
779238384Sjkim	&mov	($len,&wparam(2));
780238384Sjkim	&mov	($key,&wparam(3));
781238384Sjkim	&mov	($rounds_,&wparam(4));
	# $key_ carries the caller's %esp just long enough to store it in the
	# newly aligned frame
782238384Sjkim	&mov	($key_,"esp");
783238384Sjkim	&sub	("esp",88);
784238384Sjkim	&and	("esp",-16);			# align stack
785238384Sjkim	&mov	(&DWP(80,"esp"),$key_);
786238384Sjkim
	# single-block request: skip all counter-vector setup
787238384Sjkim	&cmp	($len,1);
788238384Sjkim	&je	(&label("ctr32_one_shortcut"));
789238384Sjkim
790238384Sjkim	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
791238384Sjkim
792238384Sjkim	# compose byte-swap control mask for pshufb on stack
	# (the four dwords select source bytes 15..0, i.e. a full 16-byte
	# reversal)
793238384Sjkim	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
794238384Sjkim	&mov	(&DWP(4,"esp"),0x08090a0b);
795238384Sjkim	&mov	(&DWP(8,"esp"),0x04050607);
796238384Sjkim	&mov	(&DWP(12,"esp"),0x00010203);
797238384Sjkim
798238384Sjkim	# compose counter increment vector on stack
	# (dwords 0..2 are 6, dword 3 is 0: each counter lane advances by six
	# per pass of ctr32_loop6)
799238384Sjkim	&mov	($rounds,6);
800238384Sjkim	&xor	($key_,$key_);
801238384Sjkim	&mov	(&DWP(16,"esp"),$rounds);
802238384Sjkim	&mov	(&DWP(20,"esp"),$rounds);
803238384Sjkim	&mov	(&DWP(24,"esp"),$rounds);
804238384Sjkim	&mov	(&DWP(28,"esp"),$key_);
805238384Sjkim
	# $key_ is still zero here, so pinsrd clears dword 3 of the ivec copy
806238384Sjkim	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
807238384Sjkim	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
808238384Sjkim
809238384Sjkim	&mov	($rounds,&DWP(240,$key));	# key->rounds
810238384Sjkim
811238384Sjkim	# compose 2 vectors of 3x32-bit counters
	# bswap turns the extracted counter into a native integer; $rndkey1
	# receives counter+0..+2 and $rndkey0 counter+3..+5 in dwords 0..2,
	# while dword 3 of both stays zero from the pxor
812238384Sjkim	&bswap	($rounds_);
813238384Sjkim	&pxor	($rndkey1,$rndkey1);
814238384Sjkim	&pxor	($rndkey0,$rndkey0);
815238384Sjkim	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
816238384Sjkim	&pinsrd	($rndkey1,$rounds_,0);
817238384Sjkim	&lea	($key_,&DWP(3,$rounds_));
818238384Sjkim	&pinsrd	($rndkey0,$key_,0);
819238384Sjkim	&inc	($rounds_);
820238384Sjkim	&pinsrd	($rndkey1,$rounds_,1);
821238384Sjkim	&inc	($key_);
822238384Sjkim	&pinsrd	($rndkey0,$key_,1);
823238384Sjkim	&inc	($rounds_);
824238384Sjkim	&pinsrd	($rndkey1,$rounds_,2);
825238384Sjkim	&inc	($key_);
826238384Sjkim	&pinsrd	($rndkey0,$key_,2);
	# the stack copies stay in native order so paddd can advance them;
	# the register copies are byte-reversed for use as block contents
827238384Sjkim	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
828238384Sjkim	&pshufb	($rndkey1,$inout0);		# byte swap
829238384Sjkim	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
830238384Sjkim	&pshufb	($rndkey0,$inout0);		# byte swap
831238384Sjkim
	# pshufd with n<<6 copies source dword n into the top dword and source
	# dword 0 (zero after the reversal) into the other three
832238384Sjkim	&pshufd	($inout0,$rndkey1,3<<6);	# place counter to upper dword
833238384Sjkim	&pshufd	($inout1,$rndkey1,2<<6);
834238384Sjkim	&cmp	($len,6);
835238384Sjkim	&jb	(&label("ctr32_tail"));
836238384Sjkim	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec
	# the six-block loop runs with the round count halved
837238384Sjkim	&shr	($rounds,1);
838238384Sjkim	&mov	($key_,$key);			# backup $key
839238384Sjkim	&mov	($rounds_,$rounds);		# backup $rounds
840238384Sjkim	&sub	($len,6);
841238384Sjkim	&jmp	(&label("ctr32_loop6"));
842238384Sjkim
843238384Sjkim&set_label("ctr32_loop6",16);
	# expand the two triplets into six counter blocks ($inout0/$inout1
	# were prepared before entering or at the end of the previous pass)
	# and OR in the ivec bytes that do not change
844238384Sjkim	&pshufd	($inout2,$rndkey1,1<<6);
845238384Sjkim	&movdqa	($rndkey1,&QWP(32,"esp"));	# pull counter-less ivec
846238384Sjkim	&pshufd	($inout3,$rndkey0,3<<6);
847238384Sjkim	&por	($inout0,$rndkey1);		# merge counter-less ivec
848238384Sjkim	&pshufd	($inout4,$rndkey0,2<<6);
849238384Sjkim	&por	($inout1,$rndkey1);
850238384Sjkim	&pshufd	($inout5,$rndkey0,1<<6);
851238384Sjkim	&por	($inout2,$rndkey1);
852238384Sjkim	&por	($inout3,$rndkey1);
853238384Sjkim	&por	($inout4,$rndkey1);
854238384Sjkim	&por	($inout5,$rndkey1);
855238384Sjkim
856238384Sjkim	# inlining _aesni_encrypt6's prologue gives ~4% improvement...
857238384Sjkim	&$movekey	($rndkey0,&QWP(0,$key_));
858238384Sjkim	&$movekey	($rndkey1,&QWP(16,$key_));
859238384Sjkim	&lea		($key,&DWP(32,$key_));
860238384Sjkim	&dec		($rounds);
861238384Sjkim	&pxor		($inout0,$rndkey0);
862238384Sjkim	&pxor		($inout1,$rndkey0);
863238384Sjkim	&aesenc		($inout0,$rndkey1);
864238384Sjkim	&pxor		($inout2,$rndkey0);
865238384Sjkim	&aesenc		($inout1,$rndkey1);
866238384Sjkim	&pxor		($inout3,$rndkey0);
867238384Sjkim	&aesenc		($inout2,$rndkey1);
868238384Sjkim	&pxor		($inout4,$rndkey0);
869238384Sjkim	&aesenc		($inout3,$rndkey1);
870238384Sjkim	&pxor		($inout5,$rndkey0);
871238384Sjkim	&aesenc		($inout4,$rndkey1);
872238384Sjkim	&$movekey	($rndkey0,&QWP(0,$key));
873238384Sjkim	&aesenc		($inout5,$rndkey1);
874238384Sjkim
	# continue in the shared routine past the prologue inlined above
	# (the label lives outside this chunk)
875238384Sjkim	&call		(&label("_aesni_encrypt6_enter"));
876238384Sjkim
	# XOR the six encrypted counter blocks with 0x60 bytes of input,
	# interleaved with preparing the counters for the next pass
877238384Sjkim	&movups	($rndkey1,&QWP(0,$inp));
878238384Sjkim	&movups	($rndkey0,&QWP(0x10,$inp));
879238384Sjkim	&xorps	($inout0,$rndkey1);
880238384Sjkim	&movups	($rndkey1,&QWP(0x20,$inp));
881238384Sjkim	&xorps	($inout1,$rndkey0);
882238384Sjkim	&movups	(&QWP(0,$out),$inout0);
883238384Sjkim	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
884238384Sjkim	&xorps	($inout2,$rndkey1);
885238384Sjkim	&movdqa	($rndkey1,&QWP(48,"esp"));	# load 1st triplet
886238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
887238384Sjkim	&movups	(&QWP(0x20,$out),$inout2);
888238384Sjkim
889238384Sjkim	&paddd	($rndkey1,$rndkey0);		# 1st triplet increment
890238384Sjkim	&paddd	($rndkey0,&QWP(64,"esp"));	# 2nd triplet increment
891238384Sjkim	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
892238384Sjkim
	# $inout0..$inout2 are already written out, so $inout1/$inout2 serve
	# as scratch for input blocks 3..5
893238384Sjkim	&movups	($inout1,&QWP(0x30,$inp));
894238384Sjkim	&movups	($inout2,&QWP(0x40,$inp));
895238384Sjkim	&xorps	($inout3,$inout1);
896238384Sjkim	&movups	($inout1,&QWP(0x50,$inp));
897238384Sjkim	&lea	($inp,&DWP(0x60,$inp));
898238384Sjkim	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
899238384Sjkim	&pshufb	($rndkey1,$inout0);		# byte swap
900238384Sjkim	&xorps	($inout4,$inout2);
901238384Sjkim	&movups	(&QWP(0x30,$out),$inout3);
902238384Sjkim	&xorps	($inout5,$inout1);
903238384Sjkim	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
904238384Sjkim	&pshufb	($rndkey0,$inout0);		# byte swap
905238384Sjkim	&movups	(&QWP(0x40,$out),$inout4);
906238384Sjkim	&pshufd	($inout0,$rndkey1,3<<6);
907238384Sjkim	&movups	(&QWP(0x50,$out),$inout5);
908238384Sjkim	&lea	($out,&DWP(0x60,$out));
909238384Sjkim
	# reload the halved round count and loop while at least six blocks
	# remain ($len was biased by -6 before the loop was entered)
910238384Sjkim	&mov	($rounds,$rounds_);
911238384Sjkim	&pshufd	($inout1,$rndkey1,2<<6);
912238384Sjkim	&sub	($len,6);
913238384Sjkim	&jnc	(&label("ctr32_loop6"));
914238384Sjkim
	# undo the bias; zero means the input was a multiple of six blocks
915238384Sjkim	&add	($len,6);
916238384Sjkim	&jz	(&label("ctr32_ret"));
917238384Sjkim	&mov	($key,$key_);
	# 2*$rounds+1 reverses the earlier shr only if the stored round count
	# was odd -- NOTE(review): confirm against the key-setup code
918238384Sjkim	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
919238384Sjkim	&movdqa	($inout5,&QWP(32,"esp"));	# pull counter-less ivec
920238384Sjkim
	# 1..5 blocks left.  The SSE instructions between each cmp and the
	# following je leave EFLAGS alone, so one cmp feeds both the jb and
	# the je:  <2 -> one, ==2 -> two, <4 -> three, ==4 -> four, else five
921238384Sjkim&set_label("ctr32_tail");
922238384Sjkim	&por	($inout0,$inout5);
923238384Sjkim	&cmp	($len,2);
924238384Sjkim	&jb	(&label("ctr32_one"));
925238384Sjkim
926238384Sjkim	&pshufd	($inout2,$rndkey1,1<<6);
927238384Sjkim	&por	($inout1,$inout5);
928238384Sjkim	&je	(&label("ctr32_two"));
929238384Sjkim
930238384Sjkim	&pshufd	($inout3,$rndkey0,3<<6);
931238384Sjkim	&por	($inout2,$inout5);
932238384Sjkim	&cmp	($len,4);
933238384Sjkim	&jb	(&label("ctr32_three"));
934238384Sjkim
935238384Sjkim	&pshufd	($inout4,$rndkey0,2<<6);
936238384Sjkim	&por	($inout3,$inout5);
937238384Sjkim	&je	(&label("ctr32_four"));
938238384Sjkim
	# five blocks: the six-way routine is used and only $inout0..$inout4
	# are written out below
939238384Sjkim	&por	($inout4,$inout5);
940238384Sjkim	&call	("_aesni_encrypt6");
941238384Sjkim	&movups	($rndkey1,&QWP(0,$inp));
942238384Sjkim	&movups	($rndkey0,&QWP(0x10,$inp));
943238384Sjkim	&xorps	($inout0,$rndkey1);
944238384Sjkim	&movups	($rndkey1,&QWP(0x20,$inp));
945238384Sjkim	&xorps	($inout1,$rndkey0);
946238384Sjkim	&movups	($rndkey0,&QWP(0x30,$inp));
947238384Sjkim	&xorps	($inout2,$rndkey1);
948238384Sjkim	&movups	($rndkey1,&QWP(0x40,$inp));
949238384Sjkim	&xorps	($inout3,$rndkey0);
950238384Sjkim	&movups	(&QWP(0,$out),$inout0);
951238384Sjkim	&xorps	($inout4,$rndkey1);
952238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
953238384Sjkim	&movups	(&QWP(0x20,$out),$inout2);
954238384Sjkim	&movups	(&QWP(0x30,$out),$inout3);
955238384Sjkim	&movups	(&QWP(0x40,$out),$inout4);
956238384Sjkim	&jmp	(&label("ctr32_ret"));
957238384Sjkim
	# reached only when $len == 1 on entry: $rounds_ still holds the ivec
	# pointer and $rounds has not been loaded yet
958238384Sjkim&set_label("ctr32_one_shortcut",16);
959238384Sjkim	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
960238384Sjkim	&mov	($rounds,&DWP(240,$key));
961238384Sjkim
	# one block: the single-block encryption is either inlined or called,
	# depending on $inline at generation time
962238384Sjkim&set_label("ctr32_one");
963238384Sjkim	if ($inline)
964238384Sjkim	{   &aesni_inline_generate1("enc");	}
965238384Sjkim	else
966238384Sjkim	{   &call	("_aesni_encrypt1");	}
967238384Sjkim	&movups	($in0,&QWP(0,$inp));
968238384Sjkim	&xorps	($in0,$inout0);
969238384Sjkim	&movups	(&QWP(0,$out),$in0);
970238384Sjkim	&jmp	(&label("ctr32_ret"));
971238384Sjkim
	# two blocks go through the three-way routine; the third result is
	# never stored
972238384Sjkim&set_label("ctr32_two",16);
973238384Sjkim	&call	("_aesni_encrypt3");
974238384Sjkim	&movups	($inout3,&QWP(0,$inp));
975238384Sjkim	&movups	($inout4,&QWP(0x10,$inp));
976238384Sjkim	&xorps	($inout0,$inout3);
977238384Sjkim	&xorps	($inout1,$inout4);
978238384Sjkim	&movups	(&QWP(0,$out),$inout0);
979238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
980238384Sjkim	&jmp	(&label("ctr32_ret"));
981238384Sjkim
	# three blocks; $inout3..$inout5 are free and hold the input
982238384Sjkim&set_label("ctr32_three",16);
983238384Sjkim	&call	("_aesni_encrypt3");
984238384Sjkim	&movups	($inout3,&QWP(0,$inp));
985238384Sjkim	&movups	($inout4,&QWP(0x10,$inp));
986238384Sjkim	&xorps	($inout0,$inout3);
987238384Sjkim	&movups	($inout5,&QWP(0x20,$inp));
988238384Sjkim	&xorps	($inout1,$inout4);
989238384Sjkim	&movups	(&QWP(0,$out),$inout0);
990238384Sjkim	&xorps	($inout2,$inout5);
991238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
992238384Sjkim	&movups	(&QWP(0x20,$out),$inout2);
993238384Sjkim	&jmp	(&label("ctr32_ret"));
994238384Sjkim
	# four blocks; input is staged in $inout4, $inout5, $rndkey1, $rndkey0
	# and the code falls through into ctr32_ret
995238384Sjkim&set_label("ctr32_four",16);
996238384Sjkim	&call	("_aesni_encrypt4");
997238384Sjkim	&movups	($inout4,&QWP(0,$inp));
998238384Sjkim	&movups	($inout5,&QWP(0x10,$inp));
999238384Sjkim	&movups	($rndkey1,&QWP(0x20,$inp));
1000238384Sjkim	&xorps	($inout0,$inout4);
1001238384Sjkim	&movups	($rndkey0,&QWP(0x30,$inp));
1002238384Sjkim	&xorps	($inout1,$inout5);
1003238384Sjkim	&movups	(&QWP(0,$out),$inout0);
1004238384Sjkim	&xorps	($inout2,$rndkey1);
1005238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
1006238384Sjkim	&xorps	($inout3,$rndkey0);
1007238384Sjkim	&movups	(&QWP(0x20,$out),$inout2);
1008238384Sjkim	&movups	(&QWP(0x30,$out),$inout3);
1009238384Sjkim
	# common exit: drop the aligned frame by reloading the saved %esp
1010238384Sjkim&set_label("ctr32_ret");
1011238384Sjkim	&mov	("esp",&DWP(80,"esp"));
1012238384Sjkim&function_end("aesni_ctr32_encrypt_blocks");
1013238384Sjkim
1014238384Sjkim######################################################################
1015238384Sjkim# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1016238384Sjkim#	const AES_KEY *key1, const AES_KEY *key2,
1017238384Sjkim#	const unsigned char iv[16]);
1018238384Sjkim#
1019238384Sjkim{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1020238384Sjkim
# Emits aesni_xts_encrypt.  wparam(0..5) are read as input pointer, output
# pointer, byte length, key1, key2 and the 16-byte tweak input.
#
# Flow of the generated code:
#   - the tweak input is encrypted once under key2; the result becomes the
#     running $tweak
#   - xts_enc_loop6 processes 6*16 bytes per pass; tweaks are parked at
#     16*0..16*5(%esp) so they can be XORed in before and after encryption
#   - xts_enc_short handles 1..5 leftover whole blocks
#   - when the length is not a multiple of 16, xts_enc_steal swaps the
#     trailing bytes with the last output block and encrypts that block
#     once more under the next tweak
# Frame after alignment: 16*0..16*5 tweak scratch, 16*6 the constant used
# when doubling the tweak, 16*7+0 saved $len, 16*7+4 the caller's %esp.
# $tweak/$twtmp/$twres/$twmask alias $rndkey1/$rndkey0/$inout0/$inout1, so
# they are only valid between uses of those registers as data.
# NOTE(review): function_begin/function_end and the _aesni_encrypt*
# routines are defined outside this chunk; their register contracts are
# assumed, not shown here.
1021238384Sjkim&function_begin("aesni_xts_encrypt");
1022238384Sjkim	&mov	($key,&wparam(4));		# key2
1023238384Sjkim	&mov	($inp,&wparam(5));		# clear-text tweak
1024238384Sjkim
	# encrypt the tweak input with key2; $inout0 then holds the first tweak
1025238384Sjkim	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1026238384Sjkim	&movups	($inout0,&QWP(0,$inp));
1027238384Sjkim	if ($inline)
1028238384Sjkim	{   &aesni_inline_generate1("enc");	}
1029238384Sjkim	else
1030238384Sjkim	{   &call	("_aesni_encrypt1");	}
1031238384Sjkim
1032238384Sjkim	&mov	($inp,&wparam(0));
1033238384Sjkim	&mov	($out,&wparam(1));
1034238384Sjkim	&mov	($len,&wparam(2));
1035238384Sjkim	&mov	($key,&wparam(3));		# key1
1036238384Sjkim
	# $key_ carries the caller's %esp until it is stored in the frame
1037238384Sjkim	&mov	($key_,"esp");
1038238384Sjkim	&sub	("esp",16*7+8);
1039238384Sjkim	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1040238384Sjkim	&and	("esp",-16);			# align stack
1041238384Sjkim
	# dword 0 = 0x87 is folded into the low end when the tweak's top bit
	# is shifted out; dword 2 = 1 carries bit 63 into bit 64
1042238384Sjkim	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1043238384Sjkim	&mov	(&DWP(16*6+4,"esp"),0);
1044238384Sjkim	&mov	(&DWP(16*6+8,"esp"),1);
1045238384Sjkim	&mov	(&DWP(16*6+12,"esp"),0);
1046238384Sjkim	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1047238384Sjkim	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1048238384Sjkim
	# pcmpgtd against zero leaves all-ones in every dword of $twtmp whose
	# counterpart in $tweak has its top bit set
1049238384Sjkim	&movdqa	($tweak,$inout0);
1050238384Sjkim	&pxor	($twtmp,$twtmp);
1051238384Sjkim	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1052238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1053238384Sjkim
	# work on whole blocks only; fewer than six of them goes straight to
	# the short path (with $len biased by -16*6)
1054238384Sjkim	&and	($len,-16);
1055238384Sjkim	&mov	($key_,$key);			# backup $key
1056238384Sjkim	&mov	($rounds_,$rounds);		# backup $rounds
1057238384Sjkim	&sub	($len,16*6);
1058238384Sjkim	&jc	(&label("xts_enc_short"));
1059238384Sjkim
	# the six-block loop runs with the round count halved
1060238384Sjkim	&shr	($rounds,1);
1061238384Sjkim	&mov	($rounds_,$rounds);
1062238384Sjkim	&jmp	(&label("xts_enc_loop6"));
1063238384Sjkim
1064238384Sjkim&set_label("xts_enc_loop6",16);
	# Perl-level loop: the seven instructions below are emitted four times
	# with $i = 0..3 selecting the stack slot.  Each copy stores the
	# current tweak and doubles it: paddq shifts both 64-bit halves left,
	# and the sign masks shuffled by pshufd 0x13 and ANDed with $twmask
	# supply the inter-half carry and the 0x87 feedback.
1065238384Sjkim	for ($i=0;$i<4;$i++) {
1066238384Sjkim	    &pshufd	($twres,$twtmp,0x13);
1067238384Sjkim	    &pxor	($twtmp,$twtmp);
1068238384Sjkim	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1069238384Sjkim	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1070238384Sjkim	    &pand	($twres,$twmask);	# isolate carry and residue
1071238384Sjkim	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1072238384Sjkim	    &pxor	($tweak,$twres);
1073238384Sjkim	}
	# fifth tweak goes to slot 16*4 ($i++ leaves $i == 5); the sixth is
	# assembled in $inout5 and stored at 16*5 further down
1074238384Sjkim	&pshufd	($inout5,$twtmp,0x13);
1075238384Sjkim	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1076238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1077238384Sjkim	 &$movekey	($rndkey0,&QWP(0,$key_));
1078238384Sjkim	&pand	($inout5,$twmask);		# isolate carry and residue
1079238384Sjkim	 &movups	($inout0,&QWP(0,$inp));	# load input
1080238384Sjkim	&pxor	($inout5,$tweak);
1081238384Sjkim
1082238384Sjkim	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	# (from here on $inout0/$inout1/$rndkey1 hold data, so $twres, $twmask
	# and $tweak are no longer valid until recomputed after the call)
1083238384Sjkim	&movdqu	($inout1,&QWP(16*1,$inp));
1084238384Sjkim	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1085238384Sjkim	&movdqu	($inout2,&QWP(16*2,$inp));
1086238384Sjkim	 &pxor		($inout1,$rndkey0);
1087238384Sjkim	&movdqu	($inout3,&QWP(16*3,$inp));
1088238384Sjkim	 &pxor		($inout2,$rndkey0);
1089238384Sjkim	&movdqu	($inout4,&QWP(16*4,$inp));
1090238384Sjkim	 &pxor		($inout3,$rndkey0);
1091238384Sjkim	&movdqu	($rndkey1,&QWP(16*5,$inp));
1092238384Sjkim	 &pxor		($inout4,$rndkey0);
1093238384Sjkim	&lea	($inp,&DWP(16*6,$inp));
1094238384Sjkim	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1095238384Sjkim	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1096238384Sjkim	&pxor	($inout5,$rndkey1);
1097238384Sjkim
1098238384Sjkim	 &$movekey	($rndkey1,&QWP(16,$key_));
1099238384Sjkim	 &lea		($key,&DWP(32,$key_));
1100238384Sjkim	&pxor	($inout1,&QWP(16*1,"esp"));
1101238384Sjkim	 &aesenc	($inout0,$rndkey1);
1102238384Sjkim	&pxor	($inout2,&QWP(16*2,"esp"));
1103238384Sjkim	 &aesenc	($inout1,$rndkey1);
1104238384Sjkim	&pxor	($inout3,&QWP(16*3,"esp"));
1105238384Sjkim	 &dec		($rounds);
1106238384Sjkim	 &aesenc	($inout2,$rndkey1);
1107238384Sjkim	&pxor	($inout4,&QWP(16*4,"esp"));
1108238384Sjkim	 &aesenc	($inout3,$rndkey1);
1109238384Sjkim	&pxor		($inout5,$rndkey0);
1110238384Sjkim	 &aesenc	($inout4,$rndkey1);
1111238384Sjkim	 &$movekey	($rndkey0,&QWP(0,$key));
1112238384Sjkim	 &aesenc	($inout5,$rndkey1);
	# continue in the shared routine past the prologue inlined above
	# (the label lives outside this chunk)
1113238384Sjkim	&call		(&label("_aesni_encrypt6_enter"));
1114238384Sjkim
	# XOR the results with the parked tweaks and store them, interleaved
	# with starting the tweak for the next pass from the sixth one
1115238384Sjkim	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1116238384Sjkim       &pxor	($twtmp,$twtmp);
1117238384Sjkim	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1118238384Sjkim       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1119238384Sjkim	&xorps	($inout1,&QWP(16*1,"esp"));
1120238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1121238384Sjkim	&xorps	($inout2,&QWP(16*2,"esp"));
1122238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1123238384Sjkim	&xorps	($inout3,&QWP(16*3,"esp"));
1124238384Sjkim	&movups	(&QWP(16*2,$out),$inout2);
1125238384Sjkim	&xorps	($inout4,&QWP(16*4,"esp"));
1126238384Sjkim	&movups	(&QWP(16*3,$out),$inout3);
1127238384Sjkim	&xorps	($inout5,$tweak);
1128238384Sjkim	&movups	(&QWP(16*4,$out),$inout4);
1129238384Sjkim       &pshufd	($twres,$twtmp,0x13);
1130238384Sjkim	&movups	(&QWP(16*5,$out),$inout5);
1131238384Sjkim	&lea	($out,&DWP(16*6,$out));
1132238384Sjkim       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1133238384Sjkim
	# finish doubling, so $tweak is already the tweak of the next block
1134238384Sjkim	&pxor	($twtmp,$twtmp);
1135238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1136238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1137238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1138238384Sjkim	&mov	($rounds,$rounds_);		# restore $rounds
1139238384Sjkim	&pxor	($tweak,$twres);
1140238384Sjkim
1141238384Sjkim	&sub	($len,16*6);
1142238384Sjkim	&jnc	(&label("xts_enc_loop6"));
1143238384Sjkim
	# 2*$rounds+1 reverses the earlier shr only if the stored round count
	# was odd -- NOTE(review): confirm against the key-setup code
1144238384Sjkim	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
1145238384Sjkim	&mov	($key,$key_);			# restore $key
1146238384Sjkim	&mov	($rounds_,$rounds);
1147238384Sjkim
	# undo the -16*6 bias: $len is now the number of bytes in the 0..5
	# whole blocks still to do; zero means only stealing may remain
1148238384Sjkim&set_label("xts_enc_short");
1149238384Sjkim	&add	($len,16*6);
1150238384Sjkim	&jz	(&label("xts_enc_done6x"));
1151238384Sjkim
	# Dispatch on the block count while generating further tweaks.  The
	# SSE instructions between each cmp and the following je leave EFLAGS
	# alone, so one cmp feeds both the jb and the je.
1152238384Sjkim	&movdqa	($inout3,$tweak);		# put aside previous tweak
1153238384Sjkim	&cmp	($len,0x20);
1154238384Sjkim	&jb	(&label("xts_enc_one"));
1155238384Sjkim
1156238384Sjkim	&pshufd	($twres,$twtmp,0x13);
1157238384Sjkim	&pxor	($twtmp,$twtmp);
1158238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1159238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1160238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1161238384Sjkim	&pxor	($tweak,$twres);
1162238384Sjkim	&je	(&label("xts_enc_two"));
1163238384Sjkim
1164238384Sjkim	&pshufd	($twres,$twtmp,0x13);
1165238384Sjkim	&pxor	($twtmp,$twtmp);
1166238384Sjkim	&movdqa	($inout4,$tweak);		# put aside previous tweak
1167238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1168238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1169238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1170238384Sjkim	&pxor	($tweak,$twres);
1171238384Sjkim	&cmp	($len,0x40);
1172238384Sjkim	&jb	(&label("xts_enc_three"));
1173238384Sjkim
1174238384Sjkim	&pshufd	($twres,$twtmp,0x13);
1175238384Sjkim	&pxor	($twtmp,$twtmp);
1176238384Sjkim	&movdqa	($inout5,$tweak);		# put aside previous tweak
1177238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1178238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1179238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1180238384Sjkim	&pxor	($tweak,$twres);
	# the first two tweaks move to the stack because $inout3/$inout4 are
	# needed for data on the four- and five-block paths
1181238384Sjkim	&movdqa	(&QWP(16*0,"esp"),$inout3);
1182238384Sjkim	&movdqa	(&QWP(16*1,"esp"),$inout4);
1183238384Sjkim	&je	(&label("xts_enc_four"));
1184238384Sjkim
	# five blocks: tweaks 0..3 live at 16*0..16*3, the fifth is built in
	# $inout5 and parked at 16*4
1185238384Sjkim	&movdqa	(&QWP(16*2,"esp"),$inout5);
1186238384Sjkim	&pshufd	($inout5,$twtmp,0x13);
1187238384Sjkim	&movdqa	(&QWP(16*3,"esp"),$tweak);
1188238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1189238384Sjkim	&pand	($inout5,$twmask);		# isolate carry and residue
1190238384Sjkim	&pxor	($inout5,$tweak);
1191238384Sjkim
1192238384Sjkim	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1193238384Sjkim	&movdqu	($inout1,&QWP(16*1,$inp));
1194238384Sjkim	&movdqu	($inout2,&QWP(16*2,$inp));
1195238384Sjkim	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1196238384Sjkim	&movdqu	($inout3,&QWP(16*3,$inp));
1197238384Sjkim	&pxor	($inout1,&QWP(16*1,"esp"));
1198238384Sjkim	&movdqu	($inout4,&QWP(16*4,$inp));
1199238384Sjkim	&pxor	($inout2,&QWP(16*2,"esp"));
1200238384Sjkim	&lea	($inp,&DWP(16*5,$inp));
1201238384Sjkim	&pxor	($inout3,&QWP(16*3,"esp"));
1202238384Sjkim	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1203238384Sjkim	&pxor	($inout4,$inout5);
1204238384Sjkim
	# the six-way routine is used; only $inout0..$inout4 are stored below
1205238384Sjkim	&call	("_aesni_encrypt6");
1206238384Sjkim
1207238384Sjkim	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1208238384Sjkim	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1209238384Sjkim	&xorps	($inout1,&QWP(16*1,"esp"));
1210238384Sjkim	&xorps	($inout2,&QWP(16*2,"esp"));
1211238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1212238384Sjkim	&xorps	($inout3,&QWP(16*3,"esp"));
1213238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1214238384Sjkim	&xorps	($inout4,$tweak);
1215238384Sjkim	&movups	(&QWP(16*2,$out),$inout2);
1216238384Sjkim	&movups	(&QWP(16*3,$out),$inout3);
1217238384Sjkim	&movups	(&QWP(16*4,$out),$inout4);
1218238384Sjkim	&lea	($out,&DWP(16*5,$out));
1219238384Sjkim	&jmp	(&label("xts_enc_done"));
1220238384Sjkim
	# one block: its tweak was put aside in $inout3
1221238384Sjkim&set_label("xts_enc_one",16);
1222238384Sjkim	&movups	($inout0,&QWP(16*0,$inp));	# load input
1223238384Sjkim	&lea	($inp,&DWP(16*1,$inp));
1224238384Sjkim	&xorps	($inout0,$inout3);		# input^=tweak
1225238384Sjkim	if ($inline)
1226238384Sjkim	{   &aesni_inline_generate1("enc");	}
1227238384Sjkim	else
1228238384Sjkim	{   &call	("_aesni_encrypt1");	}
1229238384Sjkim	&xorps	($inout0,$inout3);		# output^=tweak
1230238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1231238384Sjkim	&lea	($out,&DWP(16*1,$out));
1232238384Sjkim
1233238384Sjkim	&movdqa	($tweak,$inout3);		# last tweak
1234238384Sjkim	&jmp	(&label("xts_enc_done"));
1235238384Sjkim
	# two blocks: tweaks in $inout3/$inout4; the unused third lane of the
	# three-way routine is zeroed
1236238384Sjkim&set_label("xts_enc_two",16);
1237238384Sjkim	&movaps	($inout4,$tweak);		# put aside last tweak
1238238384Sjkim
1239238384Sjkim	&movups	($inout0,&QWP(16*0,$inp));	# load input
1240238384Sjkim	&movups	($inout1,&QWP(16*1,$inp));
1241238384Sjkim	&lea	($inp,&DWP(16*2,$inp));
1242238384Sjkim	&xorps	($inout0,$inout3);		# input^=tweak
1243238384Sjkim	&xorps	($inout1,$inout4);
1244238384Sjkim	&xorps	($inout2,$inout2);
1245238384Sjkim
1246238384Sjkim	&call	("_aesni_encrypt3");
1247238384Sjkim
1248238384Sjkim	&xorps	($inout0,$inout3);		# output^=tweak
1249238384Sjkim	&xorps	($inout1,$inout4);
1250238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1251238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1252238384Sjkim	&lea	($out,&DWP(16*2,$out));
1253238384Sjkim
1254238384Sjkim	&movdqa	($tweak,$inout4);		# last tweak
1255238384Sjkim	&jmp	(&label("xts_enc_done"));
1256238384Sjkim
	# three blocks: tweaks in $inout3, $inout4, $inout5
1257238384Sjkim&set_label("xts_enc_three",16);
1258238384Sjkim	&movaps	($inout5,$tweak);		# put aside last tweak
1259238384Sjkim	&movups	($inout0,&QWP(16*0,$inp));	# load input
1260238384Sjkim	&movups	($inout1,&QWP(16*1,$inp));
1261238384Sjkim	&movups	($inout2,&QWP(16*2,$inp));
1262238384Sjkim	&lea	($inp,&DWP(16*3,$inp));
1263238384Sjkim	&xorps	($inout0,$inout3);		# input^=tweak
1264238384Sjkim	&xorps	($inout1,$inout4);
1265238384Sjkim	&xorps	($inout2,$inout5);
1266238384Sjkim
1267238384Sjkim	&call	("_aesni_encrypt3");
1268238384Sjkim
1269238384Sjkim	&xorps	($inout0,$inout3);		# output^=tweak
1270238384Sjkim	&xorps	($inout1,$inout4);
1271238384Sjkim	&xorps	($inout2,$inout5);
1272238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1273238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1274238384Sjkim	&movups	(&QWP(16*2,$out),$inout2);
1275238384Sjkim	&lea	($out,&DWP(16*3,$out));
1276238384Sjkim
1277238384Sjkim	&movdqa	($tweak,$inout5);		# last tweak
1278238384Sjkim	&jmp	(&label("xts_enc_done"));
1279238384Sjkim
	# four blocks: tweaks 0 and 1 on the stack, tweak 2 in $inout5, tweak 3
	# copied from $tweak into $inout4
1280238384Sjkim&set_label("xts_enc_four",16);
1281238384Sjkim	&movaps	($inout4,$tweak);		# put aside last tweak
1282238384Sjkim
1283238384Sjkim	&movups	($inout0,&QWP(16*0,$inp));	# load input
1284238384Sjkim	&movups	($inout1,&QWP(16*1,$inp));
1285238384Sjkim	&movups	($inout2,&QWP(16*2,$inp));
1286238384Sjkim	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1287238384Sjkim	&movups	($inout3,&QWP(16*3,$inp));
1288238384Sjkim	&lea	($inp,&DWP(16*4,$inp));
1289238384Sjkim	&xorps	($inout1,&QWP(16*1,"esp"));
1290238384Sjkim	&xorps	($inout2,$inout5);
1291238384Sjkim	&xorps	($inout3,$inout4);
1292238384Sjkim
1293238384Sjkim	&call	("_aesni_encrypt4");
1294238384Sjkim
1295238384Sjkim	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1296238384Sjkim	&xorps	($inout1,&QWP(16*1,"esp"));
1297238384Sjkim	&xorps	($inout2,$inout5);
1298238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1299238384Sjkim	&xorps	($inout3,$inout4);
1300238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1301238384Sjkim	&movups	(&QWP(16*2,$out),$inout2);
1302238384Sjkim	&movups	(&QWP(16*3,$out),$inout3);
1303238384Sjkim	&lea	($out,&DWP(16*4,$out));
1304238384Sjkim
1305238384Sjkim	&movdqa	($tweak,$inout4);		# last tweak
1306238384Sjkim	&jmp	(&label("xts_enc_done"));
1307238384Sjkim
	# reached when no whole blocks were left for the short path; $tweak
	# needs no further doubling before it is used for stealing
1308238384Sjkim&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
1309238384Sjkim	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1310238384Sjkim	&and	($len,15);
1311238384Sjkim	&jz	(&label("xts_enc_ret"));
1312238384Sjkim	&movdqa	($inout3,$tweak);
1313238384Sjkim	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1314238384Sjkim	&jmp	(&label("xts_enc_steal"));
1315238384Sjkim
	# reached from the 1..5 block paths with $tweak holding the tweak of
	# the last block written; it is doubled once more into $inout3 (mask
	# taken from its stack copy) if trailing bytes remain
1316238384Sjkim&set_label("xts_enc_done",16);
1317238384Sjkim	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1318238384Sjkim	&pxor	($twtmp,$twtmp);
1319238384Sjkim	&and	($len,15);
1320238384Sjkim	&jz	(&label("xts_enc_ret"));
1321238384Sjkim
1322238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1323238384Sjkim	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1324238384Sjkim	&pshufd	($inout3,$twtmp,0x13);
1325238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1326238384Sjkim	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
1327238384Sjkim	&pxor	($inout3,$tweak);
1328238384Sjkim
	# byte loop, $len%16 iterations, with $rounds and $key as scratch:
	# each trailing input byte replaces the matching byte of the last
	# output block, and the displaced byte becomes the trailing output
1329238384Sjkim&set_label("xts_enc_steal");
1330238384Sjkim	&movz	($rounds,&BP(0,$inp));
1331238384Sjkim	&movz	($key,&BP(-16,$out));
1332238384Sjkim	&lea	($inp,&DWP(1,$inp));
1333238384Sjkim	&mov	(&BP(-16,$out),&LB($rounds));
1334238384Sjkim	&mov	(&BP(0,$out),&LB($key));
1335238384Sjkim	&lea	($out,&DWP(1,$out));
1336238384Sjkim	&sub	($len,1);
1337238384Sjkim	&jnz	(&label("xts_enc_steal"));
1338238384Sjkim
1339238384Sjkim	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1340238384Sjkim	&mov	($key,$key_);			# restore $key
1341238384Sjkim	&mov	($rounds,$rounds_);		# restore $rounds
1342238384Sjkim
	# encrypt the patched block in place under the tweak in $inout3
1343238384Sjkim	&movups	($inout0,&QWP(-16,$out));	# load input
1344238384Sjkim	&xorps	($inout0,$inout3);		# input^=tweak
1345238384Sjkim	if ($inline)
1346238384Sjkim	{   &aesni_inline_generate1("enc");	}
1347238384Sjkim	else
1348238384Sjkim	{   &call	("_aesni_encrypt1");	}
1349238384Sjkim	&xorps	($inout0,$inout3);		# output^=tweak
1350238384Sjkim	&movups	(&QWP(-16,$out),$inout0);	# write output
1351238384Sjkim
1352238384Sjkim&set_label("xts_enc_ret");
1353238384Sjkim	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1354238384Sjkim&function_end("aesni_xts_encrypt");
1355238384Sjkim
1356238384Sjkim&function_begin("aesni_xts_decrypt");
1357238384Sjkim	&mov	($key,&wparam(4));		# key2
1358238384Sjkim	&mov	($inp,&wparam(5));		# clear-text tweak
1359238384Sjkim
1360238384Sjkim	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1361238384Sjkim	&movups	($inout0,&QWP(0,$inp));
1362238384Sjkim	if ($inline)
1363238384Sjkim	{   &aesni_inline_generate1("enc");	}
1364238384Sjkim	else
1365238384Sjkim	{   &call	("_aesni_encrypt1");	}
1366238384Sjkim
1367238384Sjkim	&mov	($inp,&wparam(0));
1368238384Sjkim	&mov	($out,&wparam(1));
1369238384Sjkim	&mov	($len,&wparam(2));
1370238384Sjkim	&mov	($key,&wparam(3));		# key1
1371238384Sjkim
1372238384Sjkim	&mov	($key_,"esp");
1373238384Sjkim	&sub	("esp",16*7+8);
1374238384Sjkim	&and	("esp",-16);			# align stack
1375238384Sjkim
1376238384Sjkim	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
1377238384Sjkim	&test	($len,15);
1378238384Sjkim	&setnz	(&LB($rounds_));
1379238384Sjkim	&shl	($rounds_,4);
1380238384Sjkim	&sub	($len,$rounds_);
1381238384Sjkim
1382238384Sjkim	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1383238384Sjkim	&mov	(&DWP(16*6+4,"esp"),0);
1384238384Sjkim	&mov	(&DWP(16*6+8,"esp"),1);
1385238384Sjkim	&mov	(&DWP(16*6+12,"esp"),0);
1386238384Sjkim	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1387238384Sjkim	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1388238384Sjkim
1389238384Sjkim	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1390238384Sjkim	&mov	($key_,$key);			# backup $key
1391238384Sjkim	&mov	($rounds_,$rounds);		# backup $rounds
1392238384Sjkim
1393238384Sjkim	&movdqa	($tweak,$inout0);
1394238384Sjkim	&pxor	($twtmp,$twtmp);
1395238384Sjkim	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1396238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1397238384Sjkim
1398238384Sjkim	&and	($len,-16);
1399238384Sjkim	&sub	($len,16*6);
1400238384Sjkim	&jc	(&label("xts_dec_short"));
1401238384Sjkim
1402238384Sjkim	&shr	($rounds,1);
1403238384Sjkim	&mov	($rounds_,$rounds);
1404238384Sjkim	&jmp	(&label("xts_dec_loop6"));
1405238384Sjkim
1406238384Sjkim&set_label("xts_dec_loop6",16);
1407238384Sjkim	for ($i=0;$i<4;$i++) {
1408238384Sjkim	    &pshufd	($twres,$twtmp,0x13);
1409238384Sjkim	    &pxor	($twtmp,$twtmp);
1410238384Sjkim	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1411238384Sjkim	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1412238384Sjkim	    &pand	($twres,$twmask);	# isolate carry and residue
1413238384Sjkim	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1414238384Sjkim	    &pxor	($tweak,$twres);
1415238384Sjkim	}
1416238384Sjkim	&pshufd	($inout5,$twtmp,0x13);
1417238384Sjkim	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1418238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1419238384Sjkim	 &$movekey	($rndkey0,&QWP(0,$key_));
1420238384Sjkim	&pand	($inout5,$twmask);		# isolate carry and residue
1421238384Sjkim	 &movups	($inout0,&QWP(0,$inp));	# load input
1422238384Sjkim	&pxor	($inout5,$tweak);
1423238384Sjkim
1424238384Sjkim	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1425238384Sjkim	&movdqu	($inout1,&QWP(16*1,$inp));
1426238384Sjkim	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1427238384Sjkim	&movdqu	($inout2,&QWP(16*2,$inp));
1428238384Sjkim	 &pxor		($inout1,$rndkey0);
1429238384Sjkim	&movdqu	($inout3,&QWP(16*3,$inp));
1430238384Sjkim	 &pxor		($inout2,$rndkey0);
1431238384Sjkim	&movdqu	($inout4,&QWP(16*4,$inp));
1432238384Sjkim	 &pxor		($inout3,$rndkey0);
1433238384Sjkim	&movdqu	($rndkey1,&QWP(16*5,$inp));
1434238384Sjkim	 &pxor		($inout4,$rndkey0);
1435238384Sjkim	&lea	($inp,&DWP(16*6,$inp));
1436238384Sjkim	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1437238384Sjkim	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1438238384Sjkim	&pxor	($inout5,$rndkey1);
1439238384Sjkim
1440238384Sjkim	 &$movekey	($rndkey1,&QWP(16,$key_));
1441238384Sjkim	 &lea		($key,&DWP(32,$key_));
1442238384Sjkim	&pxor	($inout1,&QWP(16*1,"esp"));
1443238384Sjkim	 &aesdec	($inout0,$rndkey1);
1444238384Sjkim	&pxor	($inout2,&QWP(16*2,"esp"));
1445238384Sjkim	 &aesdec	($inout1,$rndkey1);
1446238384Sjkim	&pxor	($inout3,&QWP(16*3,"esp"));
1447238384Sjkim	 &dec		($rounds);
1448238384Sjkim	 &aesdec	($inout2,$rndkey1);
1449238384Sjkim	&pxor	($inout4,&QWP(16*4,"esp"));
1450238384Sjkim	 &aesdec	($inout3,$rndkey1);
1451238384Sjkim	&pxor		($inout5,$rndkey0);
1452238384Sjkim	 &aesdec	($inout4,$rndkey1);
1453238384Sjkim	 &$movekey	($rndkey0,&QWP(0,$key));
1454238384Sjkim	 &aesdec	($inout5,$rndkey1);
1455238384Sjkim	&call		(&label("_aesni_decrypt6_enter"));
1456238384Sjkim
1457238384Sjkim	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1458238384Sjkim       &pxor	($twtmp,$twtmp);
1459238384Sjkim	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1460238384Sjkim       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1461238384Sjkim	&xorps	($inout1,&QWP(16*1,"esp"));
1462238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1463238384Sjkim	&xorps	($inout2,&QWP(16*2,"esp"));
1464238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1465238384Sjkim	&xorps	($inout3,&QWP(16*3,"esp"));
1466238384Sjkim	&movups	(&QWP(16*2,$out),$inout2);
1467238384Sjkim	&xorps	($inout4,&QWP(16*4,"esp"));
1468238384Sjkim	&movups	(&QWP(16*3,$out),$inout3);
1469238384Sjkim	&xorps	($inout5,$tweak);
1470238384Sjkim	&movups	(&QWP(16*4,$out),$inout4);
1471238384Sjkim       &pshufd	($twres,$twtmp,0x13);
1472238384Sjkim	&movups	(&QWP(16*5,$out),$inout5);
1473238384Sjkim	&lea	($out,&DWP(16*6,$out));
1474238384Sjkim       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1475238384Sjkim
1476238384Sjkim	&pxor	($twtmp,$twtmp);
1477238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1478238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1479238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1480238384Sjkim	&mov	($rounds,$rounds_);		# restore $rounds
1481238384Sjkim	&pxor	($tweak,$twres);
1482238384Sjkim
1483238384Sjkim	&sub	($len,16*6);
1484238384Sjkim	&jnc	(&label("xts_dec_loop6"));
1485238384Sjkim
1486238384Sjkim	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
1487238384Sjkim	&mov	($key,$key_);			# restore $key
1488238384Sjkim	&mov	($rounds_,$rounds);
1489238384Sjkim
1490238384Sjkim&set_label("xts_dec_short");
1491238384Sjkim	&add	($len,16*6);
1492238384Sjkim	&jz	(&label("xts_dec_done6x"));
1493238384Sjkim
1494238384Sjkim	&movdqa	($inout3,$tweak);		# put aside previous tweak
1495238384Sjkim	&cmp	($len,0x20);
1496238384Sjkim	&jb	(&label("xts_dec_one"));
1497238384Sjkim
1498238384Sjkim	&pshufd	($twres,$twtmp,0x13);
1499238384Sjkim	&pxor	($twtmp,$twtmp);
1500238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1501238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1502238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1503238384Sjkim	&pxor	($tweak,$twres);
1504238384Sjkim	&je	(&label("xts_dec_two"));
1505238384Sjkim
1506238384Sjkim	&pshufd	($twres,$twtmp,0x13);
1507238384Sjkim	&pxor	($twtmp,$twtmp);
1508238384Sjkim	&movdqa	($inout4,$tweak);		# put aside previous tweak
1509238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1510238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1511238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1512238384Sjkim	&pxor	($tweak,$twres);
1513238384Sjkim	&cmp	($len,0x40);
1514238384Sjkim	&jb	(&label("xts_dec_three"));
1515238384Sjkim
1516238384Sjkim	&pshufd	($twres,$twtmp,0x13);
1517238384Sjkim	&pxor	($twtmp,$twtmp);
1518238384Sjkim	&movdqa	($inout5,$tweak);		# put aside previous tweak
1519238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1520238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1521238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1522238384Sjkim	&pxor	($tweak,$twres);
1523238384Sjkim	&movdqa	(&QWP(16*0,"esp"),$inout3);
1524238384Sjkim	&movdqa	(&QWP(16*1,"esp"),$inout4);
1525238384Sjkim	&je	(&label("xts_dec_four"));
1526238384Sjkim
1527238384Sjkim	&movdqa	(&QWP(16*2,"esp"),$inout5);
1528238384Sjkim	&pshufd	($inout5,$twtmp,0x13);
1529238384Sjkim	&movdqa	(&QWP(16*3,"esp"),$tweak);
1530238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
1531238384Sjkim	&pand	($inout5,$twmask);		# isolate carry and residue
1532238384Sjkim	&pxor	($inout5,$tweak);
1533238384Sjkim
1534238384Sjkim	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1535238384Sjkim	&movdqu	($inout1,&QWP(16*1,$inp));
1536238384Sjkim	&movdqu	($inout2,&QWP(16*2,$inp));
1537238384Sjkim	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1538238384Sjkim	&movdqu	($inout3,&QWP(16*3,$inp));
1539238384Sjkim	&pxor	($inout1,&QWP(16*1,"esp"));
1540238384Sjkim	&movdqu	($inout4,&QWP(16*4,$inp));
1541238384Sjkim	&pxor	($inout2,&QWP(16*2,"esp"));
1542238384Sjkim	&lea	($inp,&DWP(16*5,$inp));
1543238384Sjkim	&pxor	($inout3,&QWP(16*3,"esp"));
1544238384Sjkim	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1545238384Sjkim	&pxor	($inout4,$inout5);
1546238384Sjkim
1547238384Sjkim	&call	("_aesni_decrypt6");
1548238384Sjkim
1549238384Sjkim	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1550238384Sjkim	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1551238384Sjkim	&xorps	($inout1,&QWP(16*1,"esp"));
1552238384Sjkim	&xorps	($inout2,&QWP(16*2,"esp"));
1553238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1554238384Sjkim	&xorps	($inout3,&QWP(16*3,"esp"));
1555238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1556238384Sjkim	&xorps	($inout4,$tweak);
1557238384Sjkim	&movups	(&QWP(16*2,$out),$inout2);
1558238384Sjkim	&movups	(&QWP(16*3,$out),$inout3);
1559238384Sjkim	&movups	(&QWP(16*4,$out),$inout4);
1560238384Sjkim	&lea	($out,&DWP(16*5,$out));
1561238384Sjkim	&jmp	(&label("xts_dec_done"));
1562238384Sjkim
1563238384Sjkim&set_label("xts_dec_one",16);
1564238384Sjkim	&movups	($inout0,&QWP(16*0,$inp));	# load input
1565238384Sjkim	&lea	($inp,&DWP(16*1,$inp));
1566238384Sjkim	&xorps	($inout0,$inout3);		# input^=tweak
1567238384Sjkim	if ($inline)
1568238384Sjkim	{   &aesni_inline_generate1("dec");	}
1569238384Sjkim	else
1570238384Sjkim	{   &call	("_aesni_decrypt1");	}
1571238384Sjkim	&xorps	($inout0,$inout3);		# output^=tweak
1572238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1573238384Sjkim	&lea	($out,&DWP(16*1,$out));
1574238384Sjkim
1575238384Sjkim	&movdqa	($tweak,$inout3);		# last tweak
1576238384Sjkim	&jmp	(&label("xts_dec_done"));
1577238384Sjkim
1578238384Sjkim&set_label("xts_dec_two",16);
1579238384Sjkim	&movaps	($inout4,$tweak);		# put aside last tweak
1580238384Sjkim
1581238384Sjkim	&movups	($inout0,&QWP(16*0,$inp));	# load input
1582238384Sjkim	&movups	($inout1,&QWP(16*1,$inp));
1583238384Sjkim	&lea	($inp,&DWP(16*2,$inp));
1584238384Sjkim	&xorps	($inout0,$inout3);		# input^=tweak
1585238384Sjkim	&xorps	($inout1,$inout4);
1586238384Sjkim
1587238384Sjkim	&call	("_aesni_decrypt3");
1588238384Sjkim
1589238384Sjkim	&xorps	($inout0,$inout3);		# output^=tweak
1590238384Sjkim	&xorps	($inout1,$inout4);
1591238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1592238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1593238384Sjkim	&lea	($out,&DWP(16*2,$out));
1594238384Sjkim
1595238384Sjkim	&movdqa	($tweak,$inout4);		# last tweak
1596238384Sjkim	&jmp	(&label("xts_dec_done"));
1597238384Sjkim
1598238384Sjkim&set_label("xts_dec_three",16);
1599238384Sjkim	&movaps	($inout5,$tweak);		# put aside last tweak
1600238384Sjkim	&movups	($inout0,&QWP(16*0,$inp));	# load input
1601238384Sjkim	&movups	($inout1,&QWP(16*1,$inp));
1602238384Sjkim	&movups	($inout2,&QWP(16*2,$inp));
1603238384Sjkim	&lea	($inp,&DWP(16*3,$inp));
1604238384Sjkim	&xorps	($inout0,$inout3);		# input^=tweak
1605238384Sjkim	&xorps	($inout1,$inout4);
1606238384Sjkim	&xorps	($inout2,$inout5);
1607238384Sjkim
1608238384Sjkim	&call	("_aesni_decrypt3");
1609238384Sjkim
1610238384Sjkim	&xorps	($inout0,$inout3);		# output^=tweak
1611238384Sjkim	&xorps	($inout1,$inout4);
1612238384Sjkim	&xorps	($inout2,$inout5);
1613238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1614238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1615238384Sjkim	&movups	(&QWP(16*2,$out),$inout2);
1616238384Sjkim	&lea	($out,&DWP(16*3,$out));
1617238384Sjkim
1618238384Sjkim	&movdqa	($tweak,$inout5);		# last tweak
1619238384Sjkim	&jmp	(&label("xts_dec_done"));
1620238384Sjkim
1621238384Sjkim&set_label("xts_dec_four",16);
1622238384Sjkim	&movaps	($inout4,$tweak);		# put aside last tweak
1623238384Sjkim
1624238384Sjkim	&movups	($inout0,&QWP(16*0,$inp));	# load input
1625238384Sjkim	&movups	($inout1,&QWP(16*1,$inp));
1626238384Sjkim	&movups	($inout2,&QWP(16*2,$inp));
1627238384Sjkim	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1628238384Sjkim	&movups	($inout3,&QWP(16*3,$inp));
1629238384Sjkim	&lea	($inp,&DWP(16*4,$inp));
1630238384Sjkim	&xorps	($inout1,&QWP(16*1,"esp"));
1631238384Sjkim	&xorps	($inout2,$inout5);
1632238384Sjkim	&xorps	($inout3,$inout4);
1633238384Sjkim
1634238384Sjkim	&call	("_aesni_decrypt4");
1635238384Sjkim
1636238384Sjkim	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1637238384Sjkim	&xorps	($inout1,&QWP(16*1,"esp"));
1638238384Sjkim	&xorps	($inout2,$inout5);
1639238384Sjkim	&movups	(&QWP(16*0,$out),$inout0);	# write output
1640238384Sjkim	&xorps	($inout3,$inout4);
1641238384Sjkim	&movups	(&QWP(16*1,$out),$inout1);
1642238384Sjkim	&movups	(&QWP(16*2,$out),$inout2);
1643238384Sjkim	&movups	(&QWP(16*3,$out),$inout3);
1644238384Sjkim	&lea	($out,&DWP(16*4,$out));
1645238384Sjkim
1646238384Sjkim	&movdqa	($tweak,$inout4);		# last tweak
1647238384Sjkim	&jmp	(&label("xts_dec_done"));
1648238384Sjkim
1649238384Sjkim&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
1650238384Sjkim	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1651238384Sjkim	&and	($len,15);
1652238384Sjkim	&jz	(&label("xts_dec_ret"));
1653238384Sjkim	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1654238384Sjkim	&jmp	(&label("xts_dec_only_one_more"));
1655238384Sjkim
1656238384Sjkim&set_label("xts_dec_done",16);
1657238384Sjkim	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1658238384Sjkim	&pxor	($twtmp,$twtmp);
1659238384Sjkim	&and	($len,15);
1660238384Sjkim	&jz	(&label("xts_dec_ret"));
1661238384Sjkim
1662238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1663238384Sjkim	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1664238384Sjkim	&pshufd	($twres,$twtmp,0x13);
1665238384Sjkim	&pxor	($twtmp,$twtmp);
1666238384Sjkim	&movdqa	($twmask,&QWP(16*6,"esp"));
1667238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1668238384Sjkim	&pand	($twres,$twmask);		# isolate carry and residue
1669238384Sjkim	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1670238384Sjkim	&pxor	($tweak,$twres);
1671238384Sjkim
1672238384Sjkim&set_label("xts_dec_only_one_more");
1673238384Sjkim	&pshufd	($inout3,$twtmp,0x13);
1674238384Sjkim	&movdqa	($inout4,$tweak);		# put aside previous tweak
1675238384Sjkim	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1676238384Sjkim	&pand	($inout3,$twmask);		# isolate carry and residue
1677238384Sjkim	&pxor	($inout3,$tweak);
1678238384Sjkim
1679238384Sjkim	&mov	($key,$key_);			# restore $key
1680238384Sjkim	&mov	($rounds,$rounds_);		# restore $rounds
1681238384Sjkim
1682238384Sjkim	&movups	($inout0,&QWP(0,$inp));		# load input
1683238384Sjkim	&xorps	($inout0,$inout3);		# input^=tweak
1684238384Sjkim	if ($inline)
1685238384Sjkim	{   &aesni_inline_generate1("dec");	}
1686238384Sjkim	else
1687238384Sjkim	{   &call	("_aesni_decrypt1");	}
1688238384Sjkim	&xorps	($inout0,$inout3);		# output^=tweak
1689238384Sjkim	&movups	(&QWP(0,$out),$inout0);		# write output
1690238384Sjkim
1691238384Sjkim&set_label("xts_dec_steal");
1692238384Sjkim	&movz	($rounds,&BP(16,$inp));
1693238384Sjkim	&movz	($key,&BP(0,$out));
1694238384Sjkim	&lea	($inp,&DWP(1,$inp));
1695238384Sjkim	&mov	(&BP(0,$out),&LB($rounds));
1696238384Sjkim	&mov	(&BP(16,$out),&LB($key));
1697238384Sjkim	&lea	($out,&DWP(1,$out));
1698238384Sjkim	&sub	($len,1);
1699238384Sjkim	&jnz	(&label("xts_dec_steal"));
1700238384Sjkim
1701238384Sjkim	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1702238384Sjkim	&mov	($key,$key_);			# restore $key
1703238384Sjkim	&mov	($rounds,$rounds_);		# restore $rounds
1704238384Sjkim
1705238384Sjkim	&movups	($inout0,&QWP(0,$out));		# load input
1706238384Sjkim	&xorps	($inout0,$inout4);		# input^=tweak
1707238384Sjkim	if ($inline)
1708238384Sjkim	{   &aesni_inline_generate1("dec");	}
1709238384Sjkim	else
1710238384Sjkim	{   &call	("_aesni_decrypt1");	}
1711238384Sjkim	&xorps	($inout0,$inout4);		# output^=tweak
1712238384Sjkim	&movups	(&QWP(0,$out),$inout0);		# write output
1713238384Sjkim
1714238384Sjkim&set_label("xts_dec_ret");
1715238384Sjkim	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1716238384Sjkim&function_end("aesni_xts_decrypt");
1717238384Sjkim}
1718238384Sjkim}
1719238384Sjkim
1720238384Sjkim######################################################################
1721238384Sjkim# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1722238384Sjkim#                           size_t length, const AES_KEY *key,
1723238384Sjkim#                           unsigned char *ivp,const int enc);
# CBC-mode entry point. Arguments are read with &wparam(0..5) in the order
# given by the prototype above: inp, out, length, key, ivp, enc.
#  - enc == 0 selects the decrypt path, anything else the encrypt path.
#  - length == 0 returns at once (cbc_abort) without touching *ivp.
#  - otherwise the final chaining value is written back to *ivp on exit.
# Stack frame: %esp is lowered by at least 24 bytes and aligned to 16;
# 0(%esp) is 16 bytes of scratch (IV / partial block) and 16(%esp) holds
# the caller's %esp.
1724238384Sjkim&function_begin("${PREFIX}_cbc_encrypt");
1725238384Sjkim	&mov	($inp,&wparam(0));
1726238384Sjkim	&mov	($rounds_,"esp");
1727238384Sjkim	&mov	($out,&wparam(1));
1728238384Sjkim	&sub	($rounds_,24);
1729238384Sjkim	&mov	($len,&wparam(2));
1730238384Sjkim	&and	($rounds_,-16);
1731238384Sjkim	&mov	($key,&wparam(3));
1732238384Sjkim	&mov	($key_,&wparam(4));
1733238384Sjkim	&test	($len,$len);
1734238384Sjkim	&jz	(&label("cbc_abort"));
1735238384Sjkim
# The flags set by this cmp are consumed by the &je further down; the
# xchg/movups/mov instructions in between leave EFLAGS untouched.
1736238384Sjkim	&cmp	(&wparam(5),0);
1737238384Sjkim	&xchg	($rounds_,"esp");		# alloca
1738238384Sjkim	&movups	($ivec,&QWP(0,$key_));		# load IV
1739238384Sjkim	&mov	($rounds,&DWP(240,$key));
1740238384Sjkim	&mov	($key_,$key);			# backup $key
1741238384Sjkim	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
1742238384Sjkim	&mov	($rounds_,$rounds);		# backup $rounds
1743238384Sjkim	&je	(&label("cbc_decrypt"));
1744238384Sjkim
# ---- encrypt: one block per iteration, $inout0 carries the chaining value
1745238384Sjkim	&movaps	($inout0,$ivec);
1746238384Sjkim	&cmp	($len,16);
1747238384Sjkim	&jb	(&label("cbc_enc_tail"));
1748238384Sjkim	&sub	($len,16);
1749238384Sjkim	&jmp	(&label("cbc_enc_loop"));
1750238384Sjkim
1751238384Sjkim&set_label("cbc_enc_loop",16);
1752238384Sjkim	&movups	($ivec,&QWP(0,$inp));		# input actually
1753238384Sjkim	&lea	($inp,&DWP(16,$inp));
# NOTE(review): the inline variant is presumed to fold the same
# $inout0 ^= $ivec into the generated round sequence - confirm against
# aesni_inline_generate1.
1754238384Sjkim	if ($inline)
1755238384Sjkim	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
1756238384Sjkim	else
1757238384Sjkim	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
1758238384Sjkim	&mov	($rounds,$rounds_);	# restore $rounds
1759238384Sjkim	&mov	($key,$key_);		# restore $key
1760238384Sjkim	&movups	(&QWP(0,$out),$inout0);	# store output
1761238384Sjkim	&lea	($out,&DWP(16,$out));
1762238384Sjkim	&sub	($len,16);
1763238384Sjkim	&jnc	(&label("cbc_enc_loop"));
1764238384Sjkim	&add	($len,16);
1765238384Sjkim	&jnz	(&label("cbc_enc_tail"));
1766238384Sjkim	&movaps	($ivec,$inout0);
1767238384Sjkim	&jmp	(&label("cbc_ret"));
1768238384Sjkim
# Short final block: copy the $len leftover bytes to $out, zero-fill the
# rest of the 16-byte block, then rewind and run the loop once more with
# $inp == $out so the padded block is encrypted in place. $len is zero by
# then, so the loop exits after that single pass.
# NOTE(review): rep movsb/stosb imply $inp/$out are %esi/%edi - confirm.
1769238384Sjkim&set_label("cbc_enc_tail");
1770238384Sjkim	&mov	("ecx",$len);		# zaps $rounds
1771238384Sjkim	&data_word(0xA4F3F689);		# rep movsb
1772238384Sjkim	&mov	("ecx",16);		# zero tail
1773238384Sjkim	&sub	("ecx",$len);
1774238384Sjkim	&xor	("eax","eax");		# zaps $len
1775238384Sjkim	&data_word(0xAAF3F689);		# rep stosb
1776238384Sjkim	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
1777238384Sjkim	&mov	($rounds,$rounds_);	# restore $rounds
1778238384Sjkim	&mov	($inp,$out);		# $inp and $out are the same
1779238384Sjkim	&mov	($key,$key_);		# restore $key
1780238384Sjkim	&jmp	(&label("cbc_enc_loop"));
1781238384Sjkim######################################################################
# ---- decrypt: more than 0x50 bytes go through the six-block loop,
# anything shorter straight to cbc_dec_tail.
1782238384Sjkim&set_label("cbc_decrypt",16);
1783238384Sjkim	&cmp	($len,0x50);
1784238384Sjkim	&jbe	(&label("cbc_dec_tail"));
1785238384Sjkim	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
1786238384Sjkim	&sub	($len,0x50);
1787238384Sjkim	&jmp	(&label("cbc_dec_loop6_enter"));
1788238384Sjkim
# Each pass stores only five of its six results; the sixth ($inout5) and
# the next IV ($rndkey0) are written at the top of the following pass.
1789238384Sjkim&set_label("cbc_dec_loop6",16);
1790238384Sjkim	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
1791238384Sjkim	&movups	(&QWP(0,$out),$inout5);
1792238384Sjkim	&lea	($out,&DWP(0x10,$out));
1793238384Sjkim&set_label("cbc_dec_loop6_enter");
1794238384Sjkim	&movdqu	($inout0,&QWP(0,$inp));
1795238384Sjkim	&movdqu	($inout1,&QWP(0x10,$inp));
1796238384Sjkim	&movdqu	($inout2,&QWP(0x20,$inp));
1797238384Sjkim	&movdqu	($inout3,&QWP(0x30,$inp));
1798238384Sjkim	&movdqu	($inout4,&QWP(0x40,$inp));
1799238384Sjkim	&movdqu	($inout5,&QWP(0x50,$inp));
1800238384Sjkim
1801238384Sjkim	&call	("_aesni_decrypt6");
1802238384Sjkim
# Ciphertext is re-read from $inp to serve as the chaining values.
1803238384Sjkim	&movups	($rndkey1,&QWP(0,$inp));
1804238384Sjkim	&movups	($rndkey0,&QWP(0x10,$inp));
1805238384Sjkim	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
1806238384Sjkim	&xorps	($inout1,$rndkey1);
1807238384Sjkim	&movups	($rndkey1,&QWP(0x20,$inp));
1808238384Sjkim	&xorps	($inout2,$rndkey0);
1809238384Sjkim	&movups	($rndkey0,&QWP(0x30,$inp));
1810238384Sjkim	&xorps	($inout3,$rndkey1);
1811238384Sjkim	&movups	($rndkey1,&QWP(0x40,$inp));
1812238384Sjkim	&xorps	($inout4,$rndkey0);
1813238384Sjkim	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
1814238384Sjkim	&xorps	($inout5,$rndkey1);
1815238384Sjkim	&movups	(&QWP(0,$out),$inout0);
1816238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
1817238384Sjkim	&lea	($inp,&DWP(0x60,$inp));
1818238384Sjkim	&movups	(&QWP(0x20,$out),$inout2);
1819238384Sjkim	&mov	($rounds,$rounds_);		# restore $rounds
1820238384Sjkim	&movups	(&QWP(0x30,$out),$inout3);
1821238384Sjkim	&mov	($key,$key_);			# restore $key
1822238384Sjkim	&movups	(&QWP(0x40,$out),$inout4);
1823238384Sjkim	&lea	($out,&DWP(0x50,$out));
1824238384Sjkim	&sub	($len,0x60);
1825238384Sjkim	&ja	(&label("cbc_dec_loop6"));
1826238384Sjkim
# Loop done: the pending sixth block moves to $inout0. If nothing is left
# it is flushed by cbc_dec_tail_collected, otherwise it is stored here and
# the remainder is handled by cbc_dec_tail.
1827238384Sjkim	&movaps	($inout0,$inout5);
1828238384Sjkim	&movaps	($ivec,$rndkey0);
1829238384Sjkim	&add	($len,0x50);
1830238384Sjkim	&jle	(&label("cbc_dec_tail_collected"));
1831238384Sjkim	&movups	(&QWP(0,$out),$inout0);
1832238384Sjkim	&lea	($out,&DWP(0x10,$out));
# Tail: at most 0x50 bytes remain; dispatch on how many blocks that is.
# $in0/$in1 keep copies of the first two ciphertext blocks for chaining.
1833238384Sjkim&set_label("cbc_dec_tail");
1834238384Sjkim	&movups	($inout0,&QWP(0,$inp));
1835238384Sjkim	&movaps	($in0,$inout0);
1836238384Sjkim	&cmp	($len,0x10);
1837238384Sjkim	&jbe	(&label("cbc_dec_one"));
1838238384Sjkim
1839238384Sjkim	&movups	($inout1,&QWP(0x10,$inp));
1840238384Sjkim	&movaps	($in1,$inout1);
1841238384Sjkim	&cmp	($len,0x20);
1842238384Sjkim	&jbe	(&label("cbc_dec_two"));
1843238384Sjkim
1844238384Sjkim	&movups	($inout2,&QWP(0x20,$inp));
1845238384Sjkim	&cmp	($len,0x30);
1846238384Sjkim	&jbe	(&label("cbc_dec_three"));
1847238384Sjkim
1848238384Sjkim	&movups	($inout3,&QWP(0x30,$inp));
1849238384Sjkim	&cmp	($len,0x40);
1850238384Sjkim	&jbe	(&label("cbc_dec_four"));
1851238384Sjkim
# Five blocks: run the six-block primitive with a zeroed sixth lane.
1852238384Sjkim	&movups	($inout4,&QWP(0x40,$inp));
1853238384Sjkim	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
1854238384Sjkim	&movups	($inout0,&QWP(0,$inp));
1855238384Sjkim	&xorps	($inout5,$inout5);
1856238384Sjkim	&call	("_aesni_decrypt6");
1857238384Sjkim	&movups	($rndkey1,&QWP(0,$inp));
1858238384Sjkim	&movups	($rndkey0,&QWP(0x10,$inp));
1859238384Sjkim	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
1860238384Sjkim	&xorps	($inout1,$rndkey1);
1861238384Sjkim	&movups	($rndkey1,&QWP(0x20,$inp));
1862238384Sjkim	&xorps	($inout2,$rndkey0);
1863238384Sjkim	&movups	($rndkey0,&QWP(0x30,$inp));
1864238384Sjkim	&xorps	($inout3,$rndkey1);
1865238384Sjkim	&movups	($ivec,&QWP(0x40,$inp));	# IV
1866238384Sjkim	&xorps	($inout4,$rndkey0);
1867238384Sjkim	&movups	(&QWP(0,$out),$inout0);
1868238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
1869238384Sjkim	&movups	(&QWP(0x20,$out),$inout2);
1870238384Sjkim	&movups	(&QWP(0x30,$out),$inout3);
1871238384Sjkim	&lea	($out,&DWP(0x40,$out));
1872238384Sjkim	&movaps	($inout0,$inout4);
1873238384Sjkim	&sub	($len,0x50);
1874238384Sjkim	&jmp	(&label("cbc_dec_tail_collected"));
1875238384Sjkim
1876238384Sjkim&set_label("cbc_dec_one",16);
1877238384Sjkim	if ($inline)
1878238384Sjkim	{   &aesni_inline_generate1("dec");	}
1879238384Sjkim	else
1880238384Sjkim	{   &call	("_aesni_decrypt1");	}
1881238384Sjkim	&xorps	($inout0,$ivec);
1882238384Sjkim	&movaps	($ivec,$in0);
1883238384Sjkim	&sub	($len,0x10);
1884238384Sjkim	&jmp	(&label("cbc_dec_tail_collected"));
1885238384Sjkim
# Two blocks: reuse the three-block primitive with a zeroed third lane.
1886238384Sjkim&set_label("cbc_dec_two",16);
1887238384Sjkim	&xorps	($inout2,$inout2);
1888238384Sjkim	&call	("_aesni_decrypt3");
1889238384Sjkim	&xorps	($inout0,$ivec);
1890238384Sjkim	&xorps	($inout1,$in0);
1891238384Sjkim	&movups	(&QWP(0,$out),$inout0);
1892238384Sjkim	&movaps	($inout0,$inout1);
1893238384Sjkim	&lea	($out,&DWP(0x10,$out));
1894238384Sjkim	&movaps	($ivec,$in1);
1895238384Sjkim	&sub	($len,0x20);
1896238384Sjkim	&jmp	(&label("cbc_dec_tail_collected"));
1897238384Sjkim
1898238384Sjkim&set_label("cbc_dec_three",16);
1899238384Sjkim	&call	("_aesni_decrypt3");
1900238384Sjkim	&xorps	($inout0,$ivec);
1901238384Sjkim	&xorps	($inout1,$in0);
1902238384Sjkim	&xorps	($inout2,$in1);
1903238384Sjkim	&movups	(&QWP(0,$out),$inout0);
1904238384Sjkim	&movaps	($inout0,$inout2);
1905238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
1906238384Sjkim	&lea	($out,&DWP(0x20,$out));
1907238384Sjkim	&movups	($ivec,&QWP(0x20,$inp));
1908238384Sjkim	&sub	($len,0x30);
1909238384Sjkim	&jmp	(&label("cbc_dec_tail_collected"));
1910238384Sjkim
# Four blocks: ciphertext blocks 1 and 2 are re-read from $inp for
# chaining instead of using $in1.
1911238384Sjkim&set_label("cbc_dec_four",16);
1912238384Sjkim	&call	("_aesni_decrypt4");
1913238384Sjkim	&movups	($rndkey1,&QWP(0x10,$inp));
1914238384Sjkim	&movups	($rndkey0,&QWP(0x20,$inp));
1915238384Sjkim	&xorps	($inout0,$ivec);
1916238384Sjkim	&movups	($ivec,&QWP(0x30,$inp));
1917238384Sjkim	&xorps	($inout1,$in0);
1918238384Sjkim	&movups	(&QWP(0,$out),$inout0);
1919238384Sjkim	&xorps	($inout2,$rndkey1);
1920238384Sjkim	&movups	(&QWP(0x10,$out),$inout1);
1921238384Sjkim	&xorps	($inout3,$rndkey0);
1922238384Sjkim	&movups	(&QWP(0x20,$out),$inout2);
1923238384Sjkim	&lea	($out,&DWP(0x30,$out));
1924238384Sjkim	&movaps	($inout0,$inout3);
1925238384Sjkim	&sub	($len,0x40);
1926238384Sjkim
# All tail paths arrive with the last decrypted block still in $inout0.
# If $len is a multiple of 16 it is stored whole, otherwise only part of
# it is copied out via the stack scratch slot.
1927238384Sjkim&set_label("cbc_dec_tail_collected");
1928238384Sjkim	&and	($len,15);
1929238384Sjkim	&jnz	(&label("cbc_dec_tail_partial"));
1930238384Sjkim	&movups	(&QWP(0,$out),$inout0);
1931238384Sjkim	&jmp	(&label("cbc_ret"));
1932238384Sjkim
# NOTE(review): the byte count passed to rep movsb is 16-($len&15);
# confirm this is the intended count for a partial final block.
1933238384Sjkim&set_label("cbc_dec_tail_partial",16);
1934238384Sjkim	&movaps	(&QWP(0,"esp"),$inout0);
1935238384Sjkim	&mov	("ecx",16);
1936238384Sjkim	&mov	($inp,"esp");
1937238384Sjkim	&sub	("ecx",$len);
1938238384Sjkim	&data_word(0xA4F3F689);		# rep movsb
1939238384Sjkim
1940238384Sjkim&set_label("cbc_ret");
1941238384Sjkim	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
1942238384Sjkim	&mov	($key_,&wparam(4));
1943238384Sjkim	&movups	(&QWP(0,$key_),$ivec);	# output IV
1944238384Sjkim&set_label("cbc_abort");
1945238384Sjkim&function_end("${PREFIX}_cbc_encrypt");
1946238384Sjkim
1947238384Sjkim######################################################################
1948238384Sjkim# Mechanical port from aesni-x86_64.pl.
1949238384Sjkim#
1950238384Sjkim# _aesni_set_encrypt_key is private interface,
1951238384Sjkim# input:
1952238384Sjkim#	"eax"	const unsigned char *userKey
1953238384Sjkim#	$rounds	int bits
1954238384Sjkim#	$key	AES_KEY *key
1955238384Sjkim# output:
1956238384Sjkim#	"eax"	return code
1957238384Sjkim#	$rounds	number of rounds minus one (9, 11 or 13)
1958238384Sjkim
# Private key-expansion routine with register arguments: "eax" = userKey,
# $rounds = key length in bits, $key = destination schedule.
# Returns in "eax": 0 on success, -1 if either pointer is NULL, -2 if the
# bit count is not 128, 192 or 256.
# Round keys are written back to back at a 16-byte stride starting at
# $key. $rounds is set to 9, 11 or 13 and stored after the last round key,
# at byte offset 240 from the start of the schedule, which is where the
# other routines in this file read the round count.
# The key_* labels are local subroutines reached with &call; each one turns
# the aeskeygenassist result in xmm1 into the next round key. The *_cold
# entry points skip the store of the previous round key because the caller
# has already written it.
1959238384Sjkim&function_begin_B("_aesni_set_encrypt_key");
1960238384Sjkim	&test	("eax","eax");
1961238384Sjkim	&jz	(&label("bad_pointer"));
1962238384Sjkim	&test	($key,$key);
1963238384Sjkim	&jz	(&label("bad_pointer"));
1964238384Sjkim
1965238384Sjkim	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
1966238384Sjkim	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
1967238384Sjkim	&lea	($key,&DWP(16,$key));
1968238384Sjkim	&cmp	($rounds,256);
1969238384Sjkim	&je	(&label("14rounds"));
1970238384Sjkim	&cmp	($rounds,192);
1971238384Sjkim	&je	(&label("12rounds"));
1972238384Sjkim	&cmp	($rounds,128);
1973238384Sjkim	&jne	(&label("bad_keybits"));
1974238384Sjkim
# ---- 128-bit key: 11 round keys; the immediate is the round constant
1975238384Sjkim&set_label("10rounds",16);
1976238384Sjkim	&mov		($rounds,9);
1977238384Sjkim	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
1978238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
1979238384Sjkim	&call		(&label("key_128_cold"));
1980238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
1981238384Sjkim	&call		(&label("key_128"));
1982238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
1983238384Sjkim	&call		(&label("key_128"));
1984238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
1985238384Sjkim	&call		(&label("key_128"));
1986238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
1987238384Sjkim	&call		(&label("key_128"));
1988238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
1989238384Sjkim	&call		(&label("key_128"));
1990238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
1991238384Sjkim	&call		(&label("key_128"));
1992238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
1993238384Sjkim	&call		(&label("key_128"));
1994238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
1995238384Sjkim	&call		(&label("key_128"));
1996238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
1997238384Sjkim	&call		(&label("key_128"));
1998238384Sjkim	&$movekey	(&QWP(0,$key),"xmm0");
1999238384Sjkim	&mov		(&DWP(80,$key),$rounds);	# lands at schedule+240
2000238384Sjkim	&xor		("eax","eax");
2001238384Sjkim	&ret();
2002238384Sjkim
# key_128: store the current round key, advance $key, then derive the next
# round key into xmm0 (xmm4 is scratch, xmm1 dword 3 is broadcast).
2003238384Sjkim&set_label("key_128",16);
2004238384Sjkim	&$movekey	(&QWP(0,$key),"xmm0");
2005238384Sjkim	&lea		($key,&DWP(16,$key));
2006238384Sjkim&set_label("key_128_cold");
2007238384Sjkim	&shufps		("xmm4","xmm0",0b00010000);
2008238384Sjkim	&xorps		("xmm0","xmm4");
2009238384Sjkim	&shufps		("xmm4","xmm0",0b10001100);
2010238384Sjkim	&xorps		("xmm0","xmm4");
2011238384Sjkim	&shufps		("xmm1","xmm1",0b11111111);	# critical path
2012238384Sjkim	&xorps		("xmm0","xmm1");
2013238384Sjkim	&ret();
2014238384Sjkim
# ---- 192-bit key: xmm0 holds the first 128 bits, xmm2 the last 64
2015238384Sjkim&set_label("12rounds",16);
2016238384Sjkim	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
2017238384Sjkim	&mov		($rounds,11);
2018238384Sjkim	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2019238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
2020238384Sjkim	&call		(&label("key_192a_cold"));
2021238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
2022238384Sjkim	&call		(&label("key_192b"));
2023238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
2024238384Sjkim	&call		(&label("key_192a"));
2025238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
2026238384Sjkim	&call		(&label("key_192b"));
2027238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
2028238384Sjkim	&call		(&label("key_192a"));
2029238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
2030238384Sjkim	&call		(&label("key_192b"));
2031238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
2032238384Sjkim	&call		(&label("key_192a"));
2033238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
2034238384Sjkim	&call		(&label("key_192b"));
2035238384Sjkim	&$movekey	(&QWP(0,$key),"xmm0");
2036238384Sjkim	&mov		(&DWP(48,$key),$rounds);	# lands at schedule+240
2037238384Sjkim	&xor		("eax","eax");
2038238384Sjkim	&ret();
2039238384Sjkim
# key_192a: store xmm0 as one round key, advance $key by 16 and keep a copy
# of xmm2 in xmm5 for the following key_192b call.
2040238384Sjkim&set_label("key_192a",16);
2041238384Sjkim	&$movekey	(&QWP(0,$key),"xmm0");
2042238384Sjkim	&lea		($key,&DWP(16,$key));
2043238384Sjkim&set_label("key_192a_cold",16);
2044238384Sjkim	&movaps		("xmm5","xmm2");
2045238384Sjkim&set_label("key_192b_warm");
2046238384Sjkim	&shufps		("xmm4","xmm0",0b00010000);
2047238384Sjkim	&movdqa		("xmm3","xmm2");
2048238384Sjkim	&xorps		("xmm0","xmm4");
2049238384Sjkim	&shufps		("xmm4","xmm0",0b10001100);
2050238384Sjkim	&pslldq		("xmm3",4);
2051238384Sjkim	&xorps		("xmm0","xmm4");
2052238384Sjkim	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
2053238384Sjkim	&pxor		("xmm2","xmm3");
2054238384Sjkim	&pxor		("xmm0","xmm1");
2055238384Sjkim	&pshufd		("xmm3","xmm0",0b11111111);
2056238384Sjkim	&pxor		("xmm2","xmm3");
2057238384Sjkim	&ret();
2058238384Sjkim
# key_192b: assemble and store two round keys from xmm5, xmm0 and xmm2,
# advance $key by 32, then share the update code at key_192b_warm.
2059238384Sjkim&set_label("key_192b",16);
2060238384Sjkim	&movaps		("xmm3","xmm0");
2061238384Sjkim	&shufps		("xmm5","xmm0",0b01000100);
2062238384Sjkim	&$movekey	(&QWP(0,$key),"xmm5");
2063238384Sjkim	&shufps		("xmm3","xmm2",0b01001110);
2064238384Sjkim	&$movekey	(&QWP(16,$key),"xmm3");
2065238384Sjkim	&lea		($key,&DWP(32,$key));
2066238384Sjkim	&jmp		(&label("key_192b_warm"));
2067238384Sjkim
# ---- 256-bit key: xmm0/xmm2 hold the two halves and are updated in turn
2068238384Sjkim&set_label("14rounds",16);
2069238384Sjkim	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
2070238384Sjkim	&mov		($rounds,13);
2071238384Sjkim	&lea		($key,&DWP(16,$key));
2072238384Sjkim	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
2073238384Sjkim	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
2074238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
2075238384Sjkim	&call		(&label("key_256a_cold"));
2076238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
2077238384Sjkim	&call		(&label("key_256b"));
2078238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
2079238384Sjkim	&call		(&label("key_256a"));
2080238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
2081238384Sjkim	&call		(&label("key_256b"));
2082238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
2083238384Sjkim	&call		(&label("key_256a"));
2084238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
2085238384Sjkim	&call		(&label("key_256b"));
2086238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
2087238384Sjkim	&call		(&label("key_256a"));
2088238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
2089238384Sjkim	&call		(&label("key_256b"));
2090238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
2091238384Sjkim	&call		(&label("key_256a"));
2092238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
2093238384Sjkim	&call		(&label("key_256b"));
2094238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
2095238384Sjkim	&call		(&label("key_256a"));
2096238384Sjkim	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
2097238384Sjkim	&call		(&label("key_256b"));
2098238384Sjkim	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
2099238384Sjkim	&call		(&label("key_256a"));
2100238384Sjkim	&$movekey	(&QWP(0,$key),"xmm0");
2101238384Sjkim	&mov		(&DWP(16,$key),$rounds);	# lands at schedule+240
2102238384Sjkim	&xor		("eax","eax");
2103238384Sjkim	&ret();
2104238384Sjkim
# key_256a: store xmm2, advance $key, then update xmm0 (xmm1 dword 3).
2105238384Sjkim&set_label("key_256a",16);
2106238384Sjkim	&$movekey	(&QWP(0,$key),"xmm2");
2107238384Sjkim	&lea		($key,&DWP(16,$key));
2108238384Sjkim&set_label("key_256a_cold");
2109238384Sjkim	&shufps		("xmm4","xmm0",0b00010000);
2110238384Sjkim	&xorps		("xmm0","xmm4");
2111238384Sjkim	&shufps		("xmm4","xmm0",0b10001100);
2112238384Sjkim	&xorps		("xmm0","xmm4");
2113238384Sjkim	&shufps		("xmm1","xmm1",0b11111111);	# critical path
2114238384Sjkim	&xorps		("xmm0","xmm1");
2115238384Sjkim	&ret();
2116238384Sjkim
# key_256b: store xmm0, advance $key, then update xmm2 (xmm1 dword 2).
2117238384Sjkim&set_label("key_256b",16);
2118238384Sjkim	&$movekey	(&QWP(0,$key),"xmm0");
2119238384Sjkim	&lea		($key,&DWP(16,$key));
2120238384Sjkim
2121238384Sjkim	&shufps		("xmm4","xmm2",0b00010000);
2122238384Sjkim	&xorps		("xmm2","xmm4");
2123238384Sjkim	&shufps		("xmm4","xmm2",0b10001100);
2124238384Sjkim	&xorps		("xmm2","xmm4");
2125238384Sjkim	&shufps		("xmm1","xmm1",0b10101010);	# critical path
2126238384Sjkim	&xorps		("xmm2","xmm1");
2127238384Sjkim	&ret();
2128238384Sjkim
# ---- error exits
2129238384Sjkim&set_label("bad_pointer",4);
2130238384Sjkim	&mov	("eax",-1);
2131238384Sjkim	&ret	();
2132238384Sjkim&set_label("bad_keybits",4);
2133238384Sjkim	&mov	("eax",-2);
2134238384Sjkim	&ret	();
2135238384Sjkim&function_end_B("_aesni_set_encrypt_key");
2136238384Sjkim
2137238384Sjkim# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2138238384Sjkim#                              AES_KEY *key)
2139238384Sjkim&function_begin_B("${PREFIX}_set_encrypt_key");	# thin wrapper: arguments -> registers, then the shared worker
2140238384Sjkim	&mov	("eax",&wparam(0));		# userKey
2141238384Sjkim	&mov	($rounds,&wparam(1));		# bits, passed in the $rounds register
2142238384Sjkim	&mov	($key,&wparam(2));		# key, the schedule the worker stores into
2143238384Sjkim	&call	("_aesni_set_encrypt_key");	# status in eax: 0 on the success path, -1 (bad_pointer), -2 (bad_keybits)
2144238384Sjkim	&ret	();				# eax is returned as the worker left it
2145238384Sjkim&function_end_B("${PREFIX}_set_encrypt_key");
2146238384Sjkim
2147238384Sjkim# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2148238384Sjkim#                              AES_KEY *key)
2149238384Sjkim&function_begin_B("${PREFIX}_set_decrypt_key");	# build the encryption schedule, then reverse and invert it in place
2150238384Sjkim	&mov	("eax",&wparam(0));		# userKey
2151238384Sjkim	&mov	($rounds,&wparam(1));		# bits, passed in the $rounds register
2152238384Sjkim	&mov	($key,&wparam(2));		# key
2153238384Sjkim	&call	("_aesni_set_encrypt_key");	# status comes back in eax
2154238384Sjkim	&mov	($key,&wparam(2));		# reload: the worker advances $key while storing round keys
2155238384Sjkim	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key; x16 turns it into a byte offset
2156238384Sjkim	&test	("eax","eax");
2157238384Sjkim	&jnz	(&label("dec_key_ret"));	# worker failed: hand its status back unchanged
2158238384Sjkim	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
2159238384Sjkim
2160238384Sjkim	&$movekey	("xmm0",&QWP(0,$key));	# just swap
2161238384Sjkim	&$movekey	("xmm1",&QWP(0,"eax"));
2162238384Sjkim	&$movekey	(&QWP(0,"eax"),"xmm0");
2163238384Sjkim	&$movekey	(&QWP(0,$key),"xmm1");
2164238384Sjkim	&lea		($key,&DWP(16,$key));	# lower cursor up one 16-byte slot
2165238384Sjkim	&lea		("eax",&DWP(-16,"eax"));	# upper cursor down one 16-byte slot
2166238384Sjkim
2167238384Sjkim&set_label("dec_key_inverse");
2168238384Sjkim	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
2169238384Sjkim	&$movekey	("xmm1",&QWP(0,"eax"));
2170238384Sjkim	&aesimc		("xmm0","xmm0");
2171238384Sjkim	&aesimc		("xmm1","xmm1");
2172238384Sjkim	&lea		($key,&DWP(16,$key));
2173238384Sjkim	&lea		("eax",&DWP(-16,"eax"));
2174238384Sjkim	&$movekey	(&QWP(16,"eax"),"xmm0");	# cursors already stepped, hence the +16 ...
2175238384Sjkim	&$movekey	(&QWP(-16,$key),"xmm1");	# ... and -16 displacements
2176238384Sjkim	&cmp		("eax",$key);
2177238384Sjkim	&ja		(&label("dec_key_inverse"));	# repeat while the upper cursor is still above the lower one
2178238384Sjkim
2179238384Sjkim	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
2180238384Sjkim	&aesimc		("xmm0","xmm0");
2181238384Sjkim	&$movekey	(&QWP(0,$key),"xmm0");
2182238384Sjkim
2183238384Sjkim	&xor		("eax","eax");		# return success
2184238384Sjkim&set_label("dec_key_ret");
2185238384Sjkim	&ret	();
2186238384Sjkim&function_end_B("${PREFIX}_set_decrypt_key");
2187238384Sjkim&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");	# credit string; presumably emitted NUL-terminated into the output -- confirm in the perlasm helper
2188238384Sjkim
2189238384Sjkim&asm_finish();	# last statement in this file; presumably finalizes/writes the generated assembly -- confirm in the perlasm helper
2190