#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details.
# Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# Round keys are loaded with movups regardless of $PREFIX; $movekey is
# kept as an indirection so that the load instruction can be switched
# in one place.
$movekey=*movups;

$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";

# AESNI extension
#
# The instructions are emitted as raw opcode bytes through data_byte.
# Only register-to-register forms with xmm0-xmm7 operands are handled;
# any other operand combination emits nothing.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
}
# Common encoder for the 0x66,0x0f,0x38-prefixed instructions below;
# $opcodelet selects the instruction, the last byte is the ModR/M byte
# built from the two register numbers.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
# Arguments: $p is "enc" or "dec"; $inout is the block register and
# defaults to $inout0; optional $ivec is a register that is xor-ed
# with round key 0 and then into $inout [clobbering $ivec] in place of
# the plain "$inout ^= round key 0" whitening. The emitted code
# advances $key and counts $rounds down to zero. $sn is bumped on
# every expansion so that each copy gets its own loop label.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	eval"&aes${p} ($inout,$rndkey1)";
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));
    eval"&aes${p}last ($inout,$rndkey1)";
}}

# Emits the out-of-line _aesni_${p}rypt1 subroutine. It is generated
# only when $inline is false. $rounds is compared with 11 once: below
# branches to the "128" label, equal to the "192" label, above falls
# through the extra leading rounds.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));	# lea leaves flags from cmp intact
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	eval"&aes${p} ($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	eval"&aes${p} ($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	eval"&aes${p} ($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	eval"&aes${p} ($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	eval"&aes${p} ($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key));
	eval"&aes${p} ($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x10,$key));
	eval"&aes${p} ($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x20,$key));
	eval"&aes${p} ($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x30,$key));
	eval"&aes${p} ($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x40,$key));
	eval"&aes${p} ($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x50,$key));
	eval"&aes${p} ($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x60,$key));
	eval"&aes${p} ($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x70,$key));
	eval"&aes${p} ($inout,$rndkey1)";
    eval"&aes${p}last ($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# One block is loaded from inp, the round count is read from offset
# 240 of key, and the result is stored to out.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Mirror image of $PREFIX_encrypt using the "dec" instruction flavour.
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x, but it's unfeasible to accommodate it
# in XMM registers addressable in 32-bit mode and therefore 6x is
# used instead...

# Emits the private _aesni_${p}rypt3 subroutine ($p is "enc" or "dec"),
# which processes $inout0, $inout1 and $inout2 in an interleaved
# fashion. $rounds is halved up front because every loop iteration
# applies two round keys ($rndkey1, then $rndkey0). The emitted code
# modifies both $key and $rounds.
sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label("${p}3_loop");
	eval"&aes${p} ($inout0,$rndkey1)";
	eval"&aes${p} ($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p} ($inout2,$rndkey1)";
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p} ($inout0,$rndkey0)";
	eval"&aes${p} ($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p} ($inout2,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("${p}3_loop"));
    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
# Emits the private _aesni_${p}rypt4 subroutine ($p is "enc" or "dec"),
# which processes $inout0..$inout3 in an interleaved fashion, two round
# keys per loop iteration. The emitted code modifies $key and $rounds.
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shr		($rounds,1);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label("${p}4_loop");
	eval"&aes${p} ($inout0,$rndkey1)";
	eval"&aes${p} ($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p} ($inout2,$rndkey1)";
	eval"&aes${p} ($inout3,$rndkey1)";
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p} ($inout0,$rndkey0)";
	eval"&aes${p} ($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p} ($inout2,$rndkey0)";
	eval"&aes${p} ($inout3,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
    &jnz		(&label("${p}4_loop"));

    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p} ($inout3,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    eval"&aes${p}last ($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# Emits the private _aesni_${p}rypt6 subroutine ($p is "enc" or "dec"),
# which processes $inout0..$inout5. The first $rndkey1 round is peeled
# off and interleaved with the initial whitening pxor's, after which
# control jumps into the middle of the loop at _aesni_${p}rypt6_enter.
# The emitted code modifies $key and $rounds.
sub aesni_generate6
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	eval"&aes${p} ($inout0,$rndkey1)";
	&pxor		($inout2,$rndkey0);
	eval"&aes${p} ($inout1,$rndkey1)";
	&pxor		($inout3,$rndkey0);
	&dec		($rounds);
	eval"&aes${p} ($inout2,$rndkey1)";
	&pxor		($inout4,$rndkey0);
	eval"&aes${p} ($inout3,$rndkey1)";
	&pxor		($inout5,$rndkey0);
	eval"&aes${p} ($inout4,$rndkey1)";
	&$movekey	($rndkey0,&QWP(0,$key));
	eval"&aes${p} ($inout5,$rndkey1)";
	&jmp		(&label("_aesni_${p}rypt6_enter"));

    &set_label("${p}6_loop",16);
	eval"&aes${p} ($inout0,$rndkey1)";
	eval"&aes${p} ($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p} ($inout2,$rndkey1)";
	eval"&aes${p} ($inout3,$rndkey1)";
	eval"&aes${p} ($inout4,$rndkey1)";
	eval"&aes${p} ($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter",16);
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p} ($inout0,$rndkey0)";
	eval"&aes${p} ($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p} ($inout2,$rndkey0)";
	eval"&aes${p} ($inout3,$rndkey0)";
	eval"&aes${p} ($inout4,$rndkey0)";
	eval"&aes${p} ($inout5,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
    &jnz		(&label("${p}6_loop"));

    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p} ($inout3,$rndkey1)";
    eval"&aes${p} ($inout4,$rndkey1)";
    eval"&aes${p} ($inout5,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    eval"&aes${p}last ($inout3,$rndkey0)";
    eval"&aes${p}last ($inout4,$rndkey0)";
    eval"&aes${p}last ($inout5,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# The "dec" flavours are always generated; the "enc" flavours only
# when $PREFIX is "aesni".
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");

if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16. Input is consumed 6
# blocks at a time for as long as possible; the remaining 1..5 blocks
# are dispatched to the 1x/3x/4x/6x subroutines, with unused lanes
# zeroed beforehand.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_enc_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_enc_loop6_enter"));

&set_label("ecb_enc_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

	&call	("_aesni_encrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_enc_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_enc_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# 5 blocks left, zero the 6th lane
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&xorps	($inout2,$inout2);	# 2 blocks left, zero the 3rd lane
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_four",16);
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_dec_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

	&call	("_aesni_decrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_dec_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_dec_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# 5 blocks left, zero the 6th lane
	&call	("_aesni_decrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&xorps	($inout2,$inout2);	# 2 blocks left, zero the 3rd lane
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# stack layout [relative to the 16-byte aligned %esp]:
#	0	pshufb mask
#	16	counter increment vector: 1,0,0,0
#	48	saved %esp
#
{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov		($inp,&wparam(0));
	&mov		($out,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		($key,&wparam(3));
	&mov		($rounds_,&wparam(4));
	&mov		($rounds,&wparam(5));
	&mov		($key_,"esp");
	&sub		("esp",60);
	&and		("esp",-16);			# align stack
	&mov		(&DWP(48,"esp"),$key_);

	&movdqu		($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu		($cmac,&QWP(0,$rounds));	# load cmac
	&mov		($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov		(&DWP(0,"esp"),0x0c0d0e0f);
	&mov		(&DWP(4,"esp"),0x08090a0b);
	&mov		(&DWP(8,"esp"),0x04050607);
	&mov		(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov		($rounds_,1);
	&xor		($key_,$key_);
	&mov		(&DWP(16,"esp"),$rounds_);
	&mov		(&DWP(20,"esp"),$key_);
	&mov		(&DWP(24,"esp"),$key_);
	&mov		(&DWP(28,"esp"),$key_);

	&shr		($rounds,1);
	&lea		($key_,&DWP(0,$key));
	&movdqa		($inout3,&QWP(0,"esp"));
	&movdqa		($inout0,$ivec);
	&mov		($rounds_,$rounds);
	&pshufb		($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&lea		($key,&DWP(32,$key_));
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(0,$key));

&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&dec		($rounds);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&aesenc		($inout0,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&dec		($len);
	&lea		($inp,&DWP(16,$inp));
	&xorps		($in0,$inout0);			# inp^=E(ivec)
	&movdqa		($inout0,$ivec);
	&movups		(&QWP(0,$out),$in0);		# save output
	&lea		($out,&DWP(16,$out));
	&pshufb		($inout0,$inout3);
	&jnz		(&label("ccm64_enc_outer"));

	&mov		("esp",&DWP(48,"esp"));
	&mov		($out,&wparam(5));
	&movups		(&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&mov		($inp,&wparam(0));
	&mov		($out,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		($key,&wparam(3));
	&mov		($rounds_,&wparam(4));
	&mov		($rounds,&wparam(5));
	&mov		($key_,"esp");
	&sub		("esp",60);
	&and		("esp",-16);			# align stack
	&mov		(&DWP(48,"esp"),$key_);

	&movdqu		($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu		($cmac,&QWP(0,$rounds));	# load cmac
	&mov		($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov		(&DWP(0,"esp"),0x0c0d0e0f);
	&mov		(&DWP(4,"esp"),0x08090a0b);
	&mov		(&DWP(8,"esp"),0x04050607);
	&mov		(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov		($rounds_,1);
	&xor		($key_,$key_);
	&mov		(&DWP(16,"esp"),$rounds_);
	&mov		(&DWP(20,"esp"),$key_);
	&mov		(&DWP(24,"esp"),$key_);
	&mov		(&DWP(28,"esp"),$key_);

	&movdqa		($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa		($inout0,$ivec);

	&mov		($key_,$key);
	&mov		($rounds_,$rounds);

	&pshufb		($ivec,$inout3);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups		($in0,&QWP(0,$inp));		# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&lea		($inp,&QWP(16,$inp));
	&jmp		(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps		($in0,$inout0);			# inp ^= E(ivec)
	&movdqa		($inout0,$ivec);
	&mov		($rounds,$rounds_);
	&movups		(&QWP(0,$out),$in0);		# save output
	&lea		($out,&DWP(16,$out));
	&pshufb		($inout0,$inout3);

	&sub		($len,1);
	&jz		(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&lea		($key,&DWP(32,$key_));
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);			# cmac^=out
	&$movekey	($rndkey0,&QWP(0,$key));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&dec		($rounds);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&aesenc		($inout0,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));		# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&lea		($inp,&QWP(16,$inp));
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&jmp		(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov		($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   &call	("_aesni_encrypt1",$cmac);	}

	&mov		("esp",&DWP(48,"esp"));
	&mov		($out,&wparam(5));
	&movups		(&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_decrypt_blocks");
}

######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
# stack layout:
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
# 	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp

&function_begin("aesni_ctr32_encrypt_blocks");
	&mov		($inp,&wparam(0));
	&mov		($out,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		($key,&wparam(3));
	&mov		($rounds_,&wparam(4));
	&mov		($key_,"esp");
	&sub		("esp",88);
	&and		("esp",-16);			# align stack
	&mov		(&DWP(80,"esp"),$key_);

	&cmp		($len,1);
	&je		(&label("ctr32_one_shortcut"));

	&movdqu		($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov		(&DWP(0,"esp"),0x0c0d0e0f);
	&mov		(&DWP(4,"esp"),0x08090a0b);
	&mov		(&DWP(8,"esp"),0x04050607);
	&mov		(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov		($rounds,6);
	&xor		($key_,$key_);
	&mov		(&DWP(16,"esp"),$rounds);
	&mov		(&DWP(20,"esp"),$rounds);
	&mov		(&DWP(24,"esp"),$rounds);
	&mov		(&DWP(28,"esp"),$key_);
805238384Sjkim 806238384Sjkim &pextrd ($rounds_,$inout5,3); # pull 32-bit counter 807238384Sjkim &pinsrd ($inout5,$key_,3); # wipe 32-bit counter 808238384Sjkim 809238384Sjkim &mov ($rounds,&DWP(240,$key)); # key->rounds 810238384Sjkim 811238384Sjkim # compose 2 vectors of 3x32-bit counters 812238384Sjkim &bswap ($rounds_); 813238384Sjkim &pxor ($rndkey1,$rndkey1); 814238384Sjkim &pxor ($rndkey0,$rndkey0); 815238384Sjkim &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask 816238384Sjkim &pinsrd ($rndkey1,$rounds_,0); 817238384Sjkim &lea ($key_,&DWP(3,$rounds_)); 818238384Sjkim &pinsrd ($rndkey0,$key_,0); 819238384Sjkim &inc ($rounds_); 820238384Sjkim &pinsrd ($rndkey1,$rounds_,1); 821238384Sjkim &inc ($key_); 822238384Sjkim &pinsrd ($rndkey0,$key_,1); 823238384Sjkim &inc ($rounds_); 824238384Sjkim &pinsrd ($rndkey1,$rounds_,2); 825238384Sjkim &inc ($key_); 826238384Sjkim &pinsrd ($rndkey0,$key_,2); 827238384Sjkim &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet 828238384Sjkim &pshufb ($rndkey1,$inout0); # byte swap 829238384Sjkim &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet 830238384Sjkim &pshufb ($rndkey0,$inout0); # byte swap 831238384Sjkim 832238384Sjkim &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword 833238384Sjkim &pshufd ($inout1,$rndkey1,2<<6); 834238384Sjkim &cmp ($len,6); 835238384Sjkim &jb (&label("ctr32_tail")); 836238384Sjkim &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec 837238384Sjkim &shr ($rounds,1); 838238384Sjkim &mov ($key_,$key); # backup $key 839238384Sjkim &mov ($rounds_,$rounds); # backup $rounds 840238384Sjkim &sub ($len,6); 841238384Sjkim &jmp (&label("ctr32_loop6")); 842238384Sjkim 843238384Sjkim&set_label("ctr32_loop6",16); 844238384Sjkim &pshufd ($inout2,$rndkey1,1<<6); 845238384Sjkim &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec 846238384Sjkim &pshufd ($inout3,$rndkey0,3<<6); 847238384Sjkim &por ($inout0,$rndkey1); # merge counter-less ivec 848238384Sjkim &pshufd 
($inout4,$rndkey0,2<<6); 849238384Sjkim &por ($inout1,$rndkey1); 850238384Sjkim &pshufd ($inout5,$rndkey0,1<<6); 851238384Sjkim &por ($inout2,$rndkey1); 852238384Sjkim &por ($inout3,$rndkey1); 853238384Sjkim &por ($inout4,$rndkey1); 854238384Sjkim &por ($inout5,$rndkey1); 855238384Sjkim 856238384Sjkim # inlining _aesni_encrypt6's prologue gives ~4% improvement... 857238384Sjkim &$movekey ($rndkey0,&QWP(0,$key_)); 858238384Sjkim &$movekey ($rndkey1,&QWP(16,$key_)); 859238384Sjkim &lea ($key,&DWP(32,$key_)); 860238384Sjkim &dec ($rounds); 861238384Sjkim &pxor ($inout0,$rndkey0); 862238384Sjkim &pxor ($inout1,$rndkey0); 863238384Sjkim &aesenc ($inout0,$rndkey1); 864238384Sjkim &pxor ($inout2,$rndkey0); 865238384Sjkim &aesenc ($inout1,$rndkey1); 866238384Sjkim &pxor ($inout3,$rndkey0); 867238384Sjkim &aesenc ($inout2,$rndkey1); 868238384Sjkim &pxor ($inout4,$rndkey0); 869238384Sjkim &aesenc ($inout3,$rndkey1); 870238384Sjkim &pxor ($inout5,$rndkey0); 871238384Sjkim &aesenc ($inout4,$rndkey1); 872238384Sjkim &$movekey ($rndkey0,&QWP(0,$key)); 873238384Sjkim &aesenc ($inout5,$rndkey1); 874238384Sjkim 875238384Sjkim &call (&label("_aesni_encrypt6_enter")); 876238384Sjkim 877238384Sjkim &movups ($rndkey1,&QWP(0,$inp)); 878238384Sjkim &movups ($rndkey0,&QWP(0x10,$inp)); 879238384Sjkim &xorps ($inout0,$rndkey1); 880238384Sjkim &movups ($rndkey1,&QWP(0x20,$inp)); 881238384Sjkim &xorps ($inout1,$rndkey0); 882238384Sjkim &movups (&QWP(0,$out),$inout0); 883238384Sjkim &movdqa ($rndkey0,&QWP(16,"esp")); # load increment 884238384Sjkim &xorps ($inout2,$rndkey1); 885238384Sjkim &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet 886238384Sjkim &movups (&QWP(0x10,$out),$inout1); 887238384Sjkim &movups (&QWP(0x20,$out),$inout2); 888238384Sjkim 889238384Sjkim &paddd ($rndkey1,$rndkey0); # 1st triplet increment 890238384Sjkim &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment 891238384Sjkim &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask 892238384Sjkim 893238384Sjkim 
&movups ($inout1,&QWP(0x30,$inp)); 894238384Sjkim &movups ($inout2,&QWP(0x40,$inp)); 895238384Sjkim &xorps ($inout3,$inout1); 896238384Sjkim &movups ($inout1,&QWP(0x50,$inp)); 897238384Sjkim &lea ($inp,&DWP(0x60,$inp)); 898238384Sjkim &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet 899238384Sjkim &pshufb ($rndkey1,$inout0); # byte swap 900238384Sjkim &xorps ($inout4,$inout2); 901238384Sjkim &movups (&QWP(0x30,$out),$inout3); 902238384Sjkim &xorps ($inout5,$inout1); 903238384Sjkim &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet 904238384Sjkim &pshufb ($rndkey0,$inout0); # byte swap 905238384Sjkim &movups (&QWP(0x40,$out),$inout4); 906238384Sjkim &pshufd ($inout0,$rndkey1,3<<6); 907238384Sjkim &movups (&QWP(0x50,$out),$inout5); 908238384Sjkim &lea ($out,&DWP(0x60,$out)); 909238384Sjkim 910238384Sjkim &mov ($rounds,$rounds_); 911238384Sjkim &pshufd ($inout1,$rndkey1,2<<6); 912238384Sjkim &sub ($len,6); 913238384Sjkim &jnc (&label("ctr32_loop6")); 914238384Sjkim 915238384Sjkim &add ($len,6); 916238384Sjkim &jz (&label("ctr32_ret")); 917238384Sjkim &mov ($key,$key_); 918238384Sjkim &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 919238384Sjkim &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec 920238384Sjkim 921238384Sjkim&set_label("ctr32_tail"); 922238384Sjkim &por ($inout0,$inout5); 923238384Sjkim &cmp ($len,2); 924238384Sjkim &jb (&label("ctr32_one")); 925238384Sjkim 926238384Sjkim &pshufd ($inout2,$rndkey1,1<<6); 927238384Sjkim &por ($inout1,$inout5); 928238384Sjkim &je (&label("ctr32_two")); 929238384Sjkim 930238384Sjkim &pshufd ($inout3,$rndkey0,3<<6); 931238384Sjkim &por ($inout2,$inout5); 932238384Sjkim &cmp ($len,4); 933238384Sjkim &jb (&label("ctr32_three")); 934238384Sjkim 935238384Sjkim &pshufd ($inout4,$rndkey0,2<<6); 936238384Sjkim &por ($inout3,$inout5); 937238384Sjkim &je (&label("ctr32_four")); 938238384Sjkim 939238384Sjkim &por ($inout4,$inout5); 940238384Sjkim &call ("_aesni_encrypt6"); 941238384Sjkim &movups 
($rndkey1,&QWP(0,$inp)); 942238384Sjkim &movups ($rndkey0,&QWP(0x10,$inp)); 943238384Sjkim &xorps ($inout0,$rndkey1); 944238384Sjkim &movups ($rndkey1,&QWP(0x20,$inp)); 945238384Sjkim &xorps ($inout1,$rndkey0); 946238384Sjkim &movups ($rndkey0,&QWP(0x30,$inp)); 947238384Sjkim &xorps ($inout2,$rndkey1); 948238384Sjkim &movups ($rndkey1,&QWP(0x40,$inp)); 949238384Sjkim &xorps ($inout3,$rndkey0); 950238384Sjkim &movups (&QWP(0,$out),$inout0); 951238384Sjkim &xorps ($inout4,$rndkey1); 952238384Sjkim &movups (&QWP(0x10,$out),$inout1); 953238384Sjkim &movups (&QWP(0x20,$out),$inout2); 954238384Sjkim &movups (&QWP(0x30,$out),$inout3); 955238384Sjkim &movups (&QWP(0x40,$out),$inout4); 956238384Sjkim &jmp (&label("ctr32_ret")); 957238384Sjkim 958238384Sjkim&set_label("ctr32_one_shortcut",16); 959238384Sjkim &movups ($inout0,&QWP(0,$rounds_)); # load ivec 960238384Sjkim &mov ($rounds,&DWP(240,$key)); 961238384Sjkim 962238384Sjkim&set_label("ctr32_one"); 963238384Sjkim if ($inline) 964238384Sjkim { &aesni_inline_generate1("enc"); } 965238384Sjkim else 966238384Sjkim { &call ("_aesni_encrypt1"); } 967238384Sjkim &movups ($in0,&QWP(0,$inp)); 968238384Sjkim &xorps ($in0,$inout0); 969238384Sjkim &movups (&QWP(0,$out),$in0); 970238384Sjkim &jmp (&label("ctr32_ret")); 971238384Sjkim 972238384Sjkim&set_label("ctr32_two",16); 973238384Sjkim &call ("_aesni_encrypt3"); 974238384Sjkim &movups ($inout3,&QWP(0,$inp)); 975238384Sjkim &movups ($inout4,&QWP(0x10,$inp)); 976238384Sjkim &xorps ($inout0,$inout3); 977238384Sjkim &xorps ($inout1,$inout4); 978238384Sjkim &movups (&QWP(0,$out),$inout0); 979238384Sjkim &movups (&QWP(0x10,$out),$inout1); 980238384Sjkim &jmp (&label("ctr32_ret")); 981238384Sjkim 982238384Sjkim&set_label("ctr32_three",16); 983238384Sjkim &call ("_aesni_encrypt3"); 984238384Sjkim &movups ($inout3,&QWP(0,$inp)); 985238384Sjkim &movups ($inout4,&QWP(0x10,$inp)); 986238384Sjkim &xorps ($inout0,$inout3); 987238384Sjkim &movups ($inout5,&QWP(0x20,$inp)); 988238384Sjkim 
&xorps ($inout1,$inout4); 989238384Sjkim &movups (&QWP(0,$out),$inout0); 990238384Sjkim &xorps ($inout2,$inout5); 991238384Sjkim &movups (&QWP(0x10,$out),$inout1); 992238384Sjkim &movups (&QWP(0x20,$out),$inout2); 993238384Sjkim &jmp (&label("ctr32_ret")); 994238384Sjkim 995238384Sjkim&set_label("ctr32_four",16); 996238384Sjkim &call ("_aesni_encrypt4"); 997238384Sjkim &movups ($inout4,&QWP(0,$inp)); 998238384Sjkim &movups ($inout5,&QWP(0x10,$inp)); 999238384Sjkim &movups ($rndkey1,&QWP(0x20,$inp)); 1000238384Sjkim &xorps ($inout0,$inout4); 1001238384Sjkim &movups ($rndkey0,&QWP(0x30,$inp)); 1002238384Sjkim &xorps ($inout1,$inout5); 1003238384Sjkim &movups (&QWP(0,$out),$inout0); 1004238384Sjkim &xorps ($inout2,$rndkey1); 1005238384Sjkim &movups (&QWP(0x10,$out),$inout1); 1006238384Sjkim &xorps ($inout3,$rndkey0); 1007238384Sjkim &movups (&QWP(0x20,$out),$inout2); 1008238384Sjkim &movups (&QWP(0x30,$out),$inout3); 1009238384Sjkim 1010238384Sjkim&set_label("ctr32_ret"); 1011238384Sjkim &mov ("esp",&DWP(80,"esp")); 1012238384Sjkim&function_end("aesni_ctr32_encrypt_blocks"); 1013238384Sjkim 1014238384Sjkim###################################################################### 1015238384Sjkim# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1016238384Sjkim# const AES_KEY *key1, const AES_KEY *key2 1017238384Sjkim# const unsigned char iv[16]); 1018238384Sjkim# 1019238384Sjkim{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1020238384Sjkim 1021238384Sjkim&function_begin("aesni_xts_encrypt"); 1022238384Sjkim &mov ($key,&wparam(4)); # key2 1023238384Sjkim &mov ($inp,&wparam(5)); # clear-text tweak 1024238384Sjkim 1025238384Sjkim &mov ($rounds,&DWP(240,$key)); # key2->rounds 1026238384Sjkim &movups ($inout0,&QWP(0,$inp)); 1027238384Sjkim if ($inline) 1028238384Sjkim { &aesni_inline_generate1("enc"); } 1029238384Sjkim else 1030238384Sjkim { &call ("_aesni_encrypt1"); } 1031238384Sjkim 1032238384Sjkim &mov ($inp,&wparam(0)); 
1033238384Sjkim &mov ($out,&wparam(1)); 1034238384Sjkim &mov ($len,&wparam(2)); 1035238384Sjkim &mov ($key,&wparam(3)); # key1 1036238384Sjkim 1037238384Sjkim &mov ($key_,"esp"); 1038238384Sjkim &sub ("esp",16*7+8); 1039238384Sjkim &mov ($rounds,&DWP(240,$key)); # key1->rounds 1040238384Sjkim &and ("esp",-16); # align stack 1041238384Sjkim 1042238384Sjkim &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1043238384Sjkim &mov (&DWP(16*6+4,"esp"),0); 1044238384Sjkim &mov (&DWP(16*6+8,"esp"),1); 1045238384Sjkim &mov (&DWP(16*6+12,"esp"),0); 1046238384Sjkim &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1047238384Sjkim &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1048238384Sjkim 1049238384Sjkim &movdqa ($tweak,$inout0); 1050238384Sjkim &pxor ($twtmp,$twtmp); 1051238384Sjkim &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1052238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1053238384Sjkim 1054238384Sjkim &and ($len,-16); 1055238384Sjkim &mov ($key_,$key); # backup $key 1056238384Sjkim &mov ($rounds_,$rounds); # backup $rounds 1057238384Sjkim &sub ($len,16*6); 1058238384Sjkim &jc (&label("xts_enc_short")); 1059238384Sjkim 1060238384Sjkim &shr ($rounds,1); 1061238384Sjkim &mov ($rounds_,$rounds); 1062238384Sjkim &jmp (&label("xts_enc_loop6")); 1063238384Sjkim 1064238384Sjkim&set_label("xts_enc_loop6",16); 1065238384Sjkim for ($i=0;$i<4;$i++) { 1066238384Sjkim &pshufd ($twres,$twtmp,0x13); 1067238384Sjkim &pxor ($twtmp,$twtmp); 1068238384Sjkim &movdqa (&QWP(16*$i,"esp"),$tweak); 1069238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1070238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1071238384Sjkim &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1072238384Sjkim &pxor ($tweak,$twres); 1073238384Sjkim } 1074238384Sjkim &pshufd ($inout5,$twtmp,0x13); 1075238384Sjkim &movdqa (&QWP(16*$i++,"esp"),$tweak); 1076238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1077238384Sjkim &$movekey 
($rndkey0,&QWP(0,$key_)); 1078238384Sjkim &pand ($inout5,$twmask); # isolate carry and residue 1079238384Sjkim &movups ($inout0,&QWP(0,$inp)); # load input 1080238384Sjkim &pxor ($inout5,$tweak); 1081238384Sjkim 1082238384Sjkim # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1083238384Sjkim &movdqu ($inout1,&QWP(16*1,$inp)); 1084238384Sjkim &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1085238384Sjkim &movdqu ($inout2,&QWP(16*2,$inp)); 1086238384Sjkim &pxor ($inout1,$rndkey0); 1087238384Sjkim &movdqu ($inout3,&QWP(16*3,$inp)); 1088238384Sjkim &pxor ($inout2,$rndkey0); 1089238384Sjkim &movdqu ($inout4,&QWP(16*4,$inp)); 1090238384Sjkim &pxor ($inout3,$rndkey0); 1091238384Sjkim &movdqu ($rndkey1,&QWP(16*5,$inp)); 1092238384Sjkim &pxor ($inout4,$rndkey0); 1093238384Sjkim &lea ($inp,&DWP(16*6,$inp)); 1094238384Sjkim &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1095238384Sjkim &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1096238384Sjkim &pxor ($inout5,$rndkey1); 1097238384Sjkim 1098238384Sjkim &$movekey ($rndkey1,&QWP(16,$key_)); 1099238384Sjkim &lea ($key,&DWP(32,$key_)); 1100238384Sjkim &pxor ($inout1,&QWP(16*1,"esp")); 1101238384Sjkim &aesenc ($inout0,$rndkey1); 1102238384Sjkim &pxor ($inout2,&QWP(16*2,"esp")); 1103238384Sjkim &aesenc ($inout1,$rndkey1); 1104238384Sjkim &pxor ($inout3,&QWP(16*3,"esp")); 1105238384Sjkim &dec ($rounds); 1106238384Sjkim &aesenc ($inout2,$rndkey1); 1107238384Sjkim &pxor ($inout4,&QWP(16*4,"esp")); 1108238384Sjkim &aesenc ($inout3,$rndkey1); 1109238384Sjkim &pxor ($inout5,$rndkey0); 1110238384Sjkim &aesenc ($inout4,$rndkey1); 1111238384Sjkim &$movekey ($rndkey0,&QWP(0,$key)); 1112238384Sjkim &aesenc ($inout5,$rndkey1); 1113238384Sjkim &call (&label("_aesni_encrypt6_enter")); 1114238384Sjkim 1115238384Sjkim &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1116238384Sjkim &pxor ($twtmp,$twtmp); 1117238384Sjkim &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1118238384Sjkim &pcmpgtd ($twtmp,$tweak); # 
broadcast upper bits 1119238384Sjkim &xorps ($inout1,&QWP(16*1,"esp")); 1120238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1121238384Sjkim &xorps ($inout2,&QWP(16*2,"esp")); 1122238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1123238384Sjkim &xorps ($inout3,&QWP(16*3,"esp")); 1124238384Sjkim &movups (&QWP(16*2,$out),$inout2); 1125238384Sjkim &xorps ($inout4,&QWP(16*4,"esp")); 1126238384Sjkim &movups (&QWP(16*3,$out),$inout3); 1127238384Sjkim &xorps ($inout5,$tweak); 1128238384Sjkim &movups (&QWP(16*4,$out),$inout4); 1129238384Sjkim &pshufd ($twres,$twtmp,0x13); 1130238384Sjkim &movups (&QWP(16*5,$out),$inout5); 1131238384Sjkim &lea ($out,&DWP(16*6,$out)); 1132238384Sjkim &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1133238384Sjkim 1134238384Sjkim &pxor ($twtmp,$twtmp); 1135238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1136238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1137238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1138238384Sjkim &mov ($rounds,$rounds_); # restore $rounds 1139238384Sjkim &pxor ($tweak,$twres); 1140238384Sjkim 1141238384Sjkim &sub ($len,16*6); 1142238384Sjkim &jnc (&label("xts_enc_loop6")); 1143238384Sjkim 1144238384Sjkim &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 1145238384Sjkim &mov ($key,$key_); # restore $key 1146238384Sjkim &mov ($rounds_,$rounds); 1147238384Sjkim 1148238384Sjkim&set_label("xts_enc_short"); 1149238384Sjkim &add ($len,16*6); 1150238384Sjkim &jz (&label("xts_enc_done6x")); 1151238384Sjkim 1152238384Sjkim &movdqa ($inout3,$tweak); # put aside previous tweak 1153238384Sjkim &cmp ($len,0x20); 1154238384Sjkim &jb (&label("xts_enc_one")); 1155238384Sjkim 1156238384Sjkim &pshufd ($twres,$twtmp,0x13); 1157238384Sjkim &pxor ($twtmp,$twtmp); 1158238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1159238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1160238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1161238384Sjkim &pxor 
($tweak,$twres); 1162238384Sjkim &je (&label("xts_enc_two")); 1163238384Sjkim 1164238384Sjkim &pshufd ($twres,$twtmp,0x13); 1165238384Sjkim &pxor ($twtmp,$twtmp); 1166238384Sjkim &movdqa ($inout4,$tweak); # put aside previous tweak 1167238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1168238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1169238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1170238384Sjkim &pxor ($tweak,$twres); 1171238384Sjkim &cmp ($len,0x40); 1172238384Sjkim &jb (&label("xts_enc_three")); 1173238384Sjkim 1174238384Sjkim &pshufd ($twres,$twtmp,0x13); 1175238384Sjkim &pxor ($twtmp,$twtmp); 1176238384Sjkim &movdqa ($inout5,$tweak); # put aside previous tweak 1177238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1178238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1179238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1180238384Sjkim &pxor ($tweak,$twres); 1181238384Sjkim &movdqa (&QWP(16*0,"esp"),$inout3); 1182238384Sjkim &movdqa (&QWP(16*1,"esp"),$inout4); 1183238384Sjkim &je (&label("xts_enc_four")); 1184238384Sjkim 1185238384Sjkim &movdqa (&QWP(16*2,"esp"),$inout5); 1186238384Sjkim &pshufd ($inout5,$twtmp,0x13); 1187238384Sjkim &movdqa (&QWP(16*3,"esp"),$tweak); 1188238384Sjkim &paddq ($tweak,$tweak); # &psllq($inout0,1); 1189238384Sjkim &pand ($inout5,$twmask); # isolate carry and residue 1190238384Sjkim &pxor ($inout5,$tweak); 1191238384Sjkim 1192238384Sjkim &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1193238384Sjkim &movdqu ($inout1,&QWP(16*1,$inp)); 1194238384Sjkim &movdqu ($inout2,&QWP(16*2,$inp)); 1195238384Sjkim &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1196238384Sjkim &movdqu ($inout3,&QWP(16*3,$inp)); 1197238384Sjkim &pxor ($inout1,&QWP(16*1,"esp")); 1198238384Sjkim &movdqu ($inout4,&QWP(16*4,$inp)); 1199238384Sjkim &pxor ($inout2,&QWP(16*2,"esp")); 1200238384Sjkim &lea ($inp,&DWP(16*5,$inp)); 1201238384Sjkim &pxor ($inout3,&QWP(16*3,"esp")); 1202238384Sjkim &movdqa 
(&QWP(16*4,"esp"),$inout5); # save last tweak 1203238384Sjkim &pxor ($inout4,$inout5); 1204238384Sjkim 1205238384Sjkim &call ("_aesni_encrypt6"); 1206238384Sjkim 1207238384Sjkim &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1208238384Sjkim &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1209238384Sjkim &xorps ($inout1,&QWP(16*1,"esp")); 1210238384Sjkim &xorps ($inout2,&QWP(16*2,"esp")); 1211238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1212238384Sjkim &xorps ($inout3,&QWP(16*3,"esp")); 1213238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1214238384Sjkim &xorps ($inout4,$tweak); 1215238384Sjkim &movups (&QWP(16*2,$out),$inout2); 1216238384Sjkim &movups (&QWP(16*3,$out),$inout3); 1217238384Sjkim &movups (&QWP(16*4,$out),$inout4); 1218238384Sjkim &lea ($out,&DWP(16*5,$out)); 1219238384Sjkim &jmp (&label("xts_enc_done")); 1220238384Sjkim 1221238384Sjkim&set_label("xts_enc_one",16); 1222238384Sjkim &movups ($inout0,&QWP(16*0,$inp)); # load input 1223238384Sjkim &lea ($inp,&DWP(16*1,$inp)); 1224238384Sjkim &xorps ($inout0,$inout3); # input^=tweak 1225238384Sjkim if ($inline) 1226238384Sjkim { &aesni_inline_generate1("enc"); } 1227238384Sjkim else 1228238384Sjkim { &call ("_aesni_encrypt1"); } 1229238384Sjkim &xorps ($inout0,$inout3); # output^=tweak 1230238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1231238384Sjkim &lea ($out,&DWP(16*1,$out)); 1232238384Sjkim 1233238384Sjkim &movdqa ($tweak,$inout3); # last tweak 1234238384Sjkim &jmp (&label("xts_enc_done")); 1235238384Sjkim 1236238384Sjkim&set_label("xts_enc_two",16); 1237238384Sjkim &movaps ($inout4,$tweak); # put aside last tweak 1238238384Sjkim 1239238384Sjkim &movups ($inout0,&QWP(16*0,$inp)); # load input 1240238384Sjkim &movups ($inout1,&QWP(16*1,$inp)); 1241238384Sjkim &lea ($inp,&DWP(16*2,$inp)); 1242238384Sjkim &xorps ($inout0,$inout3); # input^=tweak 1243238384Sjkim &xorps ($inout1,$inout4); 1244238384Sjkim &xorps ($inout2,$inout2); 1245238384Sjkim 1246238384Sjkim &call 
("_aesni_encrypt3"); 1247238384Sjkim 1248238384Sjkim &xorps ($inout0,$inout3); # output^=tweak 1249238384Sjkim &xorps ($inout1,$inout4); 1250238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1251238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1252238384Sjkim &lea ($out,&DWP(16*2,$out)); 1253238384Sjkim 1254238384Sjkim &movdqa ($tweak,$inout4); # last tweak 1255238384Sjkim &jmp (&label("xts_enc_done")); 1256238384Sjkim 1257238384Sjkim&set_label("xts_enc_three",16); 1258238384Sjkim &movaps ($inout5,$tweak); # put aside last tweak 1259238384Sjkim &movups ($inout0,&QWP(16*0,$inp)); # load input 1260238384Sjkim &movups ($inout1,&QWP(16*1,$inp)); 1261238384Sjkim &movups ($inout2,&QWP(16*2,$inp)); 1262238384Sjkim &lea ($inp,&DWP(16*3,$inp)); 1263238384Sjkim &xorps ($inout0,$inout3); # input^=tweak 1264238384Sjkim &xorps ($inout1,$inout4); 1265238384Sjkim &xorps ($inout2,$inout5); 1266238384Sjkim 1267238384Sjkim &call ("_aesni_encrypt3"); 1268238384Sjkim 1269238384Sjkim &xorps ($inout0,$inout3); # output^=tweak 1270238384Sjkim &xorps ($inout1,$inout4); 1271238384Sjkim &xorps ($inout2,$inout5); 1272238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1273238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1274238384Sjkim &movups (&QWP(16*2,$out),$inout2); 1275238384Sjkim &lea ($out,&DWP(16*3,$out)); 1276238384Sjkim 1277238384Sjkim &movdqa ($tweak,$inout5); # last tweak 1278238384Sjkim &jmp (&label("xts_enc_done")); 1279238384Sjkim 1280238384Sjkim&set_label("xts_enc_four",16); 1281238384Sjkim &movaps ($inout4,$tweak); # put aside last tweak 1282238384Sjkim 1283238384Sjkim &movups ($inout0,&QWP(16*0,$inp)); # load input 1284238384Sjkim &movups ($inout1,&QWP(16*1,$inp)); 1285238384Sjkim &movups ($inout2,&QWP(16*2,$inp)); 1286238384Sjkim &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1287238384Sjkim &movups ($inout3,&QWP(16*3,$inp)); 1288238384Sjkim &lea ($inp,&DWP(16*4,$inp)); 1289238384Sjkim &xorps ($inout1,&QWP(16*1,"esp")); 1290238384Sjkim &xorps 
($inout2,$inout5); 1291238384Sjkim &xorps ($inout3,$inout4); 1292238384Sjkim 1293238384Sjkim &call ("_aesni_encrypt4"); 1294238384Sjkim 1295238384Sjkim &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1296238384Sjkim &xorps ($inout1,&QWP(16*1,"esp")); 1297238384Sjkim &xorps ($inout2,$inout5); 1298238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1299238384Sjkim &xorps ($inout3,$inout4); 1300238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1301238384Sjkim &movups (&QWP(16*2,$out),$inout2); 1302238384Sjkim &movups (&QWP(16*3,$out),$inout3); 1303238384Sjkim &lea ($out,&DWP(16*4,$out)); 1304238384Sjkim 1305238384Sjkim &movdqa ($tweak,$inout4); # last tweak 1306238384Sjkim &jmp (&label("xts_enc_done")); 1307238384Sjkim 1308238384Sjkim&set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1309238384Sjkim &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1310238384Sjkim &and ($len,15); 1311238384Sjkim &jz (&label("xts_enc_ret")); 1312238384Sjkim &movdqa ($inout3,$tweak); 1313238384Sjkim &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1314238384Sjkim &jmp (&label("xts_enc_steal")); 1315238384Sjkim 1316238384Sjkim&set_label("xts_enc_done",16); 1317238384Sjkim &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1318238384Sjkim &pxor ($twtmp,$twtmp); 1319238384Sjkim &and ($len,15); 1320238384Sjkim &jz (&label("xts_enc_ret")); 1321238384Sjkim 1322238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1323238384Sjkim &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1324238384Sjkim &pshufd ($inout3,$twtmp,0x13); 1325238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1326238384Sjkim &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1327238384Sjkim &pxor ($inout3,$tweak); 1328238384Sjkim 1329238384Sjkim&set_label("xts_enc_steal"); 1330238384Sjkim &movz ($rounds,&BP(0,$inp)); 1331238384Sjkim &movz ($key,&BP(-16,$out)); 1332238384Sjkim &lea ($inp,&DWP(1,$inp)); 1333238384Sjkim &mov (&BP(-16,$out),&LB($rounds)); 1334238384Sjkim &mov 
(&BP(0,$out),&LB($key)); 1335238384Sjkim &lea ($out,&DWP(1,$out)); 1336238384Sjkim &sub ($len,1); 1337238384Sjkim &jnz (&label("xts_enc_steal")); 1338238384Sjkim 1339238384Sjkim &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1340238384Sjkim &mov ($key,$key_); # restore $key 1341238384Sjkim &mov ($rounds,$rounds_); # restore $rounds 1342238384Sjkim 1343238384Sjkim &movups ($inout0,&QWP(-16,$out)); # load input 1344238384Sjkim &xorps ($inout0,$inout3); # input^=tweak 1345238384Sjkim if ($inline) 1346238384Sjkim { &aesni_inline_generate1("enc"); } 1347238384Sjkim else 1348238384Sjkim { &call ("_aesni_encrypt1"); } 1349238384Sjkim &xorps ($inout0,$inout3); # output^=tweak 1350238384Sjkim &movups (&QWP(-16,$out),$inout0); # write output 1351238384Sjkim 1352238384Sjkim&set_label("xts_enc_ret"); 1353238384Sjkim &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1354238384Sjkim&function_end("aesni_xts_encrypt"); 1355238384Sjkim 1356238384Sjkim&function_begin("aesni_xts_decrypt"); 1357238384Sjkim &mov ($key,&wparam(4)); # key2 1358238384Sjkim &mov ($inp,&wparam(5)); # clear-text tweak 1359238384Sjkim 1360238384Sjkim &mov ($rounds,&DWP(240,$key)); # key2->rounds 1361238384Sjkim &movups ($inout0,&QWP(0,$inp)); 1362238384Sjkim if ($inline) 1363238384Sjkim { &aesni_inline_generate1("enc"); } 1364238384Sjkim else 1365238384Sjkim { &call ("_aesni_encrypt1"); } 1366238384Sjkim 1367238384Sjkim &mov ($inp,&wparam(0)); 1368238384Sjkim &mov ($out,&wparam(1)); 1369238384Sjkim &mov ($len,&wparam(2)); 1370238384Sjkim &mov ($key,&wparam(3)); # key1 1371238384Sjkim 1372238384Sjkim &mov ($key_,"esp"); 1373238384Sjkim &sub ("esp",16*7+8); 1374238384Sjkim &and ("esp",-16); # align stack 1375238384Sjkim 1376238384Sjkim &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1377238384Sjkim &test ($len,15); 1378238384Sjkim &setnz (&LB($rounds_)); 1379238384Sjkim &shl ($rounds_,4); 1380238384Sjkim &sub ($len,$rounds_); 1381238384Sjkim 1382238384Sjkim &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic 
constant 1383238384Sjkim &mov (&DWP(16*6+4,"esp"),0); 1384238384Sjkim &mov (&DWP(16*6+8,"esp"),1); 1385238384Sjkim &mov (&DWP(16*6+12,"esp"),0); 1386238384Sjkim &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1387238384Sjkim &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1388238384Sjkim 1389238384Sjkim &mov ($rounds,&DWP(240,$key)); # key1->rounds 1390238384Sjkim &mov ($key_,$key); # backup $key 1391238384Sjkim &mov ($rounds_,$rounds); # backup $rounds 1392238384Sjkim 1393238384Sjkim &movdqa ($tweak,$inout0); 1394238384Sjkim &pxor ($twtmp,$twtmp); 1395238384Sjkim &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1396238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1397238384Sjkim 1398238384Sjkim &and ($len,-16); 1399238384Sjkim &sub ($len,16*6); 1400238384Sjkim &jc (&label("xts_dec_short")); 1401238384Sjkim 1402238384Sjkim &shr ($rounds,1); 1403238384Sjkim &mov ($rounds_,$rounds); 1404238384Sjkim &jmp (&label("xts_dec_loop6")); 1405238384Sjkim 1406238384Sjkim&set_label("xts_dec_loop6",16); 1407238384Sjkim for ($i=0;$i<4;$i++) { 1408238384Sjkim &pshufd ($twres,$twtmp,0x13); 1409238384Sjkim &pxor ($twtmp,$twtmp); 1410238384Sjkim &movdqa (&QWP(16*$i,"esp"),$tweak); 1411238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1412238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1413238384Sjkim &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1414238384Sjkim &pxor ($tweak,$twres); 1415238384Sjkim } 1416238384Sjkim &pshufd ($inout5,$twtmp,0x13); 1417238384Sjkim &movdqa (&QWP(16*$i++,"esp"),$tweak); 1418238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1419238384Sjkim &$movekey ($rndkey0,&QWP(0,$key_)); 1420238384Sjkim &pand ($inout5,$twmask); # isolate carry and residue 1421238384Sjkim &movups ($inout0,&QWP(0,$inp)); # load input 1422238384Sjkim &pxor ($inout5,$tweak); 1423238384Sjkim 1424238384Sjkim # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1425238384Sjkim &movdqu ($inout1,&QWP(16*1,$inp)); 
1426238384Sjkim &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1427238384Sjkim &movdqu ($inout2,&QWP(16*2,$inp)); 1428238384Sjkim &pxor ($inout1,$rndkey0); 1429238384Sjkim &movdqu ($inout3,&QWP(16*3,$inp)); 1430238384Sjkim &pxor ($inout2,$rndkey0); 1431238384Sjkim &movdqu ($inout4,&QWP(16*4,$inp)); 1432238384Sjkim &pxor ($inout3,$rndkey0); 1433238384Sjkim &movdqu ($rndkey1,&QWP(16*5,$inp)); 1434238384Sjkim &pxor ($inout4,$rndkey0); 1435238384Sjkim &lea ($inp,&DWP(16*6,$inp)); 1436238384Sjkim &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1437238384Sjkim &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1438238384Sjkim &pxor ($inout5,$rndkey1); 1439238384Sjkim 1440238384Sjkim &$movekey ($rndkey1,&QWP(16,$key_)); 1441238384Sjkim &lea ($key,&DWP(32,$key_)); 1442238384Sjkim &pxor ($inout1,&QWP(16*1,"esp")); 1443238384Sjkim &aesdec ($inout0,$rndkey1); 1444238384Sjkim &pxor ($inout2,&QWP(16*2,"esp")); 1445238384Sjkim &aesdec ($inout1,$rndkey1); 1446238384Sjkim &pxor ($inout3,&QWP(16*3,"esp")); 1447238384Sjkim &dec ($rounds); 1448238384Sjkim &aesdec ($inout2,$rndkey1); 1449238384Sjkim &pxor ($inout4,&QWP(16*4,"esp")); 1450238384Sjkim &aesdec ($inout3,$rndkey1); 1451238384Sjkim &pxor ($inout5,$rndkey0); 1452238384Sjkim &aesdec ($inout4,$rndkey1); 1453238384Sjkim &$movekey ($rndkey0,&QWP(0,$key)); 1454238384Sjkim &aesdec ($inout5,$rndkey1); 1455238384Sjkim &call (&label("_aesni_decrypt6_enter")); 1456238384Sjkim 1457238384Sjkim &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1458238384Sjkim &pxor ($twtmp,$twtmp); 1459238384Sjkim &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1460238384Sjkim &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1461238384Sjkim &xorps ($inout1,&QWP(16*1,"esp")); 1462238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1463238384Sjkim &xorps ($inout2,&QWP(16*2,"esp")); 1464238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1465238384Sjkim &xorps ($inout3,&QWP(16*3,"esp")); 1466238384Sjkim &movups (&QWP(16*2,$out),$inout2); 
1467238384Sjkim &xorps ($inout4,&QWP(16*4,"esp")); 1468238384Sjkim &movups (&QWP(16*3,$out),$inout3); 1469238384Sjkim &xorps ($inout5,$tweak); 1470238384Sjkim &movups (&QWP(16*4,$out),$inout4); 1471238384Sjkim &pshufd ($twres,$twtmp,0x13); 1472238384Sjkim &movups (&QWP(16*5,$out),$inout5); 1473238384Sjkim &lea ($out,&DWP(16*6,$out)); 1474238384Sjkim &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1475238384Sjkim 1476238384Sjkim &pxor ($twtmp,$twtmp); 1477238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1478238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1479238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1480238384Sjkim &mov ($rounds,$rounds_); # restore $rounds 1481238384Sjkim &pxor ($tweak,$twres); 1482238384Sjkim 1483238384Sjkim &sub ($len,16*6); 1484238384Sjkim &jnc (&label("xts_dec_loop6")); 1485238384Sjkim 1486238384Sjkim &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 1487238384Sjkim &mov ($key,$key_); # restore $key 1488238384Sjkim &mov ($rounds_,$rounds); 1489238384Sjkim 1490238384Sjkim&set_label("xts_dec_short"); 1491238384Sjkim &add ($len,16*6); 1492238384Sjkim &jz (&label("xts_dec_done6x")); 1493238384Sjkim 1494238384Sjkim &movdqa ($inout3,$tweak); # put aside previous tweak 1495238384Sjkim &cmp ($len,0x20); 1496238384Sjkim &jb (&label("xts_dec_one")); 1497238384Sjkim 1498238384Sjkim &pshufd ($twres,$twtmp,0x13); 1499238384Sjkim &pxor ($twtmp,$twtmp); 1500238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1501238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1502238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1503238384Sjkim &pxor ($tweak,$twres); 1504238384Sjkim &je (&label("xts_dec_two")); 1505238384Sjkim 1506238384Sjkim &pshufd ($twres,$twtmp,0x13); 1507238384Sjkim &pxor ($twtmp,$twtmp); 1508238384Sjkim &movdqa ($inout4,$tweak); # put aside previous tweak 1509238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1510238384Sjkim &pand ($twres,$twmask); # isolate carry and 
residue 1511238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1512238384Sjkim &pxor ($tweak,$twres); 1513238384Sjkim &cmp ($len,0x40); 1514238384Sjkim &jb (&label("xts_dec_three")); 1515238384Sjkim 1516238384Sjkim &pshufd ($twres,$twtmp,0x13); 1517238384Sjkim &pxor ($twtmp,$twtmp); 1518238384Sjkim &movdqa ($inout5,$tweak); # put aside previous tweak 1519238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1520238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1521238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1522238384Sjkim &pxor ($tweak,$twres); 1523238384Sjkim &movdqa (&QWP(16*0,"esp"),$inout3); 1524238384Sjkim &movdqa (&QWP(16*1,"esp"),$inout4); 1525238384Sjkim &je (&label("xts_dec_four")); 1526238384Sjkim 1527238384Sjkim &movdqa (&QWP(16*2,"esp"),$inout5); 1528238384Sjkim &pshufd ($inout5,$twtmp,0x13); 1529238384Sjkim &movdqa (&QWP(16*3,"esp"),$tweak); 1530238384Sjkim &paddq ($tweak,$tweak); # &psllq($inout0,1); 1531238384Sjkim &pand ($inout5,$twmask); # isolate carry and residue 1532238384Sjkim &pxor ($inout5,$tweak); 1533238384Sjkim 1534238384Sjkim &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1535238384Sjkim &movdqu ($inout1,&QWP(16*1,$inp)); 1536238384Sjkim &movdqu ($inout2,&QWP(16*2,$inp)); 1537238384Sjkim &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1538238384Sjkim &movdqu ($inout3,&QWP(16*3,$inp)); 1539238384Sjkim &pxor ($inout1,&QWP(16*1,"esp")); 1540238384Sjkim &movdqu ($inout4,&QWP(16*4,$inp)); 1541238384Sjkim &pxor ($inout2,&QWP(16*2,"esp")); 1542238384Sjkim &lea ($inp,&DWP(16*5,$inp)); 1543238384Sjkim &pxor ($inout3,&QWP(16*3,"esp")); 1544238384Sjkim &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1545238384Sjkim &pxor ($inout4,$inout5); 1546238384Sjkim 1547238384Sjkim &call ("_aesni_decrypt6"); 1548238384Sjkim 1549238384Sjkim &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1550238384Sjkim &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1551238384Sjkim &xorps ($inout1,&QWP(16*1,"esp")); 
1552238384Sjkim &xorps ($inout2,&QWP(16*2,"esp")); 1553238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1554238384Sjkim &xorps ($inout3,&QWP(16*3,"esp")); 1555238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1556238384Sjkim &xorps ($inout4,$tweak); 1557238384Sjkim &movups (&QWP(16*2,$out),$inout2); 1558238384Sjkim &movups (&QWP(16*3,$out),$inout3); 1559238384Sjkim &movups (&QWP(16*4,$out),$inout4); 1560238384Sjkim &lea ($out,&DWP(16*5,$out)); 1561238384Sjkim &jmp (&label("xts_dec_done")); 1562238384Sjkim 1563238384Sjkim&set_label("xts_dec_one",16); 1564238384Sjkim &movups ($inout0,&QWP(16*0,$inp)); # load input 1565238384Sjkim &lea ($inp,&DWP(16*1,$inp)); 1566238384Sjkim &xorps ($inout0,$inout3); # input^=tweak 1567238384Sjkim if ($inline) 1568238384Sjkim { &aesni_inline_generate1("dec"); } 1569238384Sjkim else 1570238384Sjkim { &call ("_aesni_decrypt1"); } 1571238384Sjkim &xorps ($inout0,$inout3); # output^=tweak 1572238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1573238384Sjkim &lea ($out,&DWP(16*1,$out)); 1574238384Sjkim 1575238384Sjkim &movdqa ($tweak,$inout3); # last tweak 1576238384Sjkim &jmp (&label("xts_dec_done")); 1577238384Sjkim 1578238384Sjkim&set_label("xts_dec_two",16); 1579238384Sjkim &movaps ($inout4,$tweak); # put aside last tweak 1580238384Sjkim 1581238384Sjkim &movups ($inout0,&QWP(16*0,$inp)); # load input 1582238384Sjkim &movups ($inout1,&QWP(16*1,$inp)); 1583238384Sjkim &lea ($inp,&DWP(16*2,$inp)); 1584238384Sjkim &xorps ($inout0,$inout3); # input^=tweak 1585238384Sjkim &xorps ($inout1,$inout4); 1586238384Sjkim 1587238384Sjkim &call ("_aesni_decrypt3"); 1588238384Sjkim 1589238384Sjkim &xorps ($inout0,$inout3); # output^=tweak 1590238384Sjkim &xorps ($inout1,$inout4); 1591238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1592238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1593238384Sjkim &lea ($out,&DWP(16*2,$out)); 1594238384Sjkim 1595238384Sjkim &movdqa ($tweak,$inout4); # last tweak 1596238384Sjkim &jmp 
(&label("xts_dec_done")); 1597238384Sjkim 1598238384Sjkim&set_label("xts_dec_three",16); 1599238384Sjkim &movaps ($inout5,$tweak); # put aside last tweak 1600238384Sjkim &movups ($inout0,&QWP(16*0,$inp)); # load input 1601238384Sjkim &movups ($inout1,&QWP(16*1,$inp)); 1602238384Sjkim &movups ($inout2,&QWP(16*2,$inp)); 1603238384Sjkim &lea ($inp,&DWP(16*3,$inp)); 1604238384Sjkim &xorps ($inout0,$inout3); # input^=tweak 1605238384Sjkim &xorps ($inout1,$inout4); 1606238384Sjkim &xorps ($inout2,$inout5); 1607238384Sjkim 1608238384Sjkim &call ("_aesni_decrypt3"); 1609238384Sjkim 1610238384Sjkim &xorps ($inout0,$inout3); # output^=tweak 1611238384Sjkim &xorps ($inout1,$inout4); 1612238384Sjkim &xorps ($inout2,$inout5); 1613238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1614238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1615238384Sjkim &movups (&QWP(16*2,$out),$inout2); 1616238384Sjkim &lea ($out,&DWP(16*3,$out)); 1617238384Sjkim 1618238384Sjkim &movdqa ($tweak,$inout5); # last tweak 1619238384Sjkim &jmp (&label("xts_dec_done")); 1620238384Sjkim 1621238384Sjkim&set_label("xts_dec_four",16); 1622238384Sjkim &movaps ($inout4,$tweak); # put aside last tweak 1623238384Sjkim 1624238384Sjkim &movups ($inout0,&QWP(16*0,$inp)); # load input 1625238384Sjkim &movups ($inout1,&QWP(16*1,$inp)); 1626238384Sjkim &movups ($inout2,&QWP(16*2,$inp)); 1627238384Sjkim &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1628238384Sjkim &movups ($inout3,&QWP(16*3,$inp)); 1629238384Sjkim &lea ($inp,&DWP(16*4,$inp)); 1630238384Sjkim &xorps ($inout1,&QWP(16*1,"esp")); 1631238384Sjkim &xorps ($inout2,$inout5); 1632238384Sjkim &xorps ($inout3,$inout4); 1633238384Sjkim 1634238384Sjkim &call ("_aesni_decrypt4"); 1635238384Sjkim 1636238384Sjkim &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1637238384Sjkim &xorps ($inout1,&QWP(16*1,"esp")); 1638238384Sjkim &xorps ($inout2,$inout5); 1639238384Sjkim &movups (&QWP(16*0,$out),$inout0); # write output 1640238384Sjkim &xorps 
($inout3,$inout4); 1641238384Sjkim &movups (&QWP(16*1,$out),$inout1); 1642238384Sjkim &movups (&QWP(16*2,$out),$inout2); 1643238384Sjkim &movups (&QWP(16*3,$out),$inout3); 1644238384Sjkim &lea ($out,&DWP(16*4,$out)); 1645238384Sjkim 1646238384Sjkim &movdqa ($tweak,$inout4); # last tweak 1647238384Sjkim &jmp (&label("xts_dec_done")); 1648238384Sjkim 1649238384Sjkim&set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1650238384Sjkim &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1651238384Sjkim &and ($len,15); 1652238384Sjkim &jz (&label("xts_dec_ret")); 1653238384Sjkim &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1654238384Sjkim &jmp (&label("xts_dec_only_one_more")); 1655238384Sjkim 1656238384Sjkim&set_label("xts_dec_done",16); 1657238384Sjkim &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1658238384Sjkim &pxor ($twtmp,$twtmp); 1659238384Sjkim &and ($len,15); 1660238384Sjkim &jz (&label("xts_dec_ret")); 1661238384Sjkim 1662238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1663238384Sjkim &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1664238384Sjkim &pshufd ($twres,$twtmp,0x13); 1665238384Sjkim &pxor ($twtmp,$twtmp); 1666238384Sjkim &movdqa ($twmask,&QWP(16*6,"esp")); 1667238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1668238384Sjkim &pand ($twres,$twmask); # isolate carry and residue 1669238384Sjkim &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1670238384Sjkim &pxor ($tweak,$twres); 1671238384Sjkim 1672238384Sjkim&set_label("xts_dec_only_one_more"); 1673238384Sjkim &pshufd ($inout3,$twtmp,0x13); 1674238384Sjkim &movdqa ($inout4,$tweak); # put aside previous tweak 1675238384Sjkim &paddq ($tweak,$tweak); # &psllq($tweak,1); 1676238384Sjkim &pand ($inout3,$twmask); # isolate carry and residue 1677238384Sjkim &pxor ($inout3,$tweak); 1678238384Sjkim 1679238384Sjkim &mov ($key,$key_); # restore $key 1680238384Sjkim &mov ($rounds,$rounds_); # restore $rounds 1681238384Sjkim 1682238384Sjkim &movups ($inout0,&QWP(0,$inp)); # 
# load input  (trailing comment of the &movups that ends the previous line)
#
# ---- tail of aesni_xts_decrypt (code unchanged) ---------------------
# Decrypt the block just loaded, using the tweak held in $inout3.
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(0,$out),$inout0);		# write output

# Byte-wise swap loop: for each of the $len trailing bytes, the byte at
# 16($inp) goes to 0($out) and the old byte at 0($out) goes to 16($out).
# $rounds and $key are used as scratch and restored afterwards.
&set_label("xts_dec_steal");
	&movz	($rounds,&BP(16,$inp));
	&movz	($key,&BP(0,$out));
	&lea	($inp,&DWP(1,$inp));
	&mov	(&BP(0,$out),&LB($rounds));
	&mov	(&BP(16,$out),&LB($key));
	&lea	($out,&DWP(1,$out));
	&sub	($len,1);
	&jnz	(&label("xts_dec_steal"));

	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	&movups	($inout0,&QWP(0,$out));		# load input
	&xorps	($inout0,$inout4);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout4);		# output^=tweak
	&movups	(&QWP(0,$out),$inout0);		# write output

&set_label("xts_dec_ret");
	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
&function_end("aesni_xts_decrypt");
}
}

######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
#
# Arguments are read with &wparam(0..5) in the order shown above.
# A zero length returns immediately without touching the IV.
# A non-zero enc selects the encrypt path, zero the decrypt path.
# On exit the updated IV is written back through ivp.
#
# Stack frame (allocated below, 16-byte aligned):
#	 0(%esp)	16-byte scratch (saved IV / partial output block)
#	16(%esp)	caller's %esp
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($rounds_,"esp");
	&mov	($out,&wparam(1));
	&sub	($rounds_,24);
	&mov	($len,&wparam(2));
	&and	($rounds_,-16);
	&mov	($key,&wparam(3));
	&mov	($key_,&wparam(4));
	&test	($len,$len);
	&jz	(&label("cbc_abort"));

	# The flags set by this compare are consumed by the &je below; the
	# instructions in between (xchg/mov/movups) do not modify flags.
	&cmp	(&wparam(5),0);
	&xchg	($rounds_,"esp");		# alloca
	&movups	($ivec,&QWP(0,$key_));		# load IV
	&mov	($rounds,&DWP(240,$key));
	&mov	($key_,$key);			# backup $key
	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
	&mov	($rounds_,$rounds);		# backup $rounds
	&je	(&label("cbc_decrypt"));

	# ---- encrypt: one block at a time, $inout0 carries the chain value
	&movaps	($inout0,$ivec);
	&cmp	($len,16);
	&jb	(&label("cbc_enc_tail"));
	&sub	($len,16);
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movups	($ivec,&QWP(0,$inp));		# input actually
	&lea	($inp,&DWP(16,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
	else
	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
	&mov	($rounds,$rounds_);		# restore $rounds
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0,$out),$inout0);		# store output
	&lea	($out,&DWP(16,$out));
	&sub	($len,16);
	&jnc	(&label("cbc_enc_loop"));
	&add	($len,16);
	&jnz	(&label("cbc_enc_tail"));
	&movaps	($ivec,$inout0);
	&jmp	(&label("cbc_ret"));

# Partial last block: copy the $len remaining bytes to the output,
# zero-fill up to 16 bytes, then run the loop once more in place
# ($inp == $out) on that padded block.  $len is 0 on re-entry, so the
# loop falls through to cbc_ret after this one extra block.
&set_label("cbc_enc_tail");
	&mov	("ecx",$len);			# zaps $rounds
	&data_word(0xA4F3F689);			# rep movsb
	&mov	("ecx",16);			# zero tail
	&sub	("ecx",$len);
	&xor	("eax","eax");			# zaps $len
	&data_word(0xAAF3F689);			# rep stosb
	&lea	($out,&DWP(-16,$out));		# rewind $out by 1 block
	&mov	($rounds,$rounds_);		# restore $rounds
	&mov	($inp,$out);			# $inp and $out are the same
	&mov	($key,$key_);			# restore $key
	&jmp	(&label("cbc_enc_loop"));
######################################################################
# ---- decrypt: six blocks per iteration while more than 0x50 bytes
# remain, then a 1..5 block tail.
&set_label("cbc_decrypt",16);
	&cmp	($len,0x50);
	&jbe	(&label("cbc_dec_tail"));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_loop6_enter"));

# The sixth output block of an iteration is not stored in that
# iteration; it is written here, at the top of the next one.
&set_label("cbc_dec_loop6",16);
	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
	&movups	(&QWP(0,$out),$inout5);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));

	&call	("_aesni_decrypt6");

	# Ciphertext blocks are re-read from $inp to serve as chain values;
	# $rndkey0/$rndkey1 are used as temporaries for them here.
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout4,$rndkey0);
	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
	&xorps	($inout5,$rndkey1);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&lea	($inp,&DWP(0x60,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&mov	($rounds,$rounds_);		# restore $rounds
	&movups	(&QWP(0x30,$out),$inout3);
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0x40,$out),$inout4);
	&lea	($out,&DWP(0x50,$out));
	&sub	($len,0x60);
	&ja	(&label("cbc_dec_loop6"));

	&movaps	($inout0,$inout5);
	&movaps	($ivec,$rndkey0);
	&add	($len,0x50);
	&jle	(&label("cbc_dec_tail_collected"));
	&movups	(&QWP(0,$out),$inout0);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&movaps	($in0,$inout0);
	&cmp	($len,0x10);
	&jbe	(&label("cbc_dec_one"));

	&movups	($inout1,&QWP(0x10,$inp));
	&movaps	($in1,$inout1);
	&cmp	($len,0x20);
	&jbe	(&label("cbc_dec_two"));

	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x30);
	&jbe	(&label("cbc_dec_three"));

	&movups	($inout3,&QWP(0x30,$inp));
	&cmp	($len,0x40);
	&jbe	(&label("cbc_dec_four"));

	# Five blocks: run the 6-way routine with a zeroed sixth lane.
	&movups	($inout4,&QWP(0x40,$inp));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&movups	($inout0,&QWP(0,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($ivec,&QWP(0x40,$inp));	# IV
	&xorps	($inout4,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&lea	($out,&DWP(0x40,$out));
	&movaps	($inout0,$inout4);
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$ivec);
	&movaps	($ivec,$in0);
	&sub	($len,0x10);
	&jmp	(&label("cbc_dec_tail_collected"));

# Two blocks: run the 3-way routine with a zeroed third lane.
&set_label("cbc_dec_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_decrypt3");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout1);
	&lea	($out,&DWP(0x10,$out));
	&movaps	($ivec,$in1);
	&sub	($len,0x20);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three",16);
	&call	("_aesni_decrypt3");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&xorps	($inout2,$in1);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout2);
	&movups	(&QWP(0x10,$out),$inout1);
	&lea	($out,&DWP(0x20,$out));
	&movups	($ivec,&QWP(0x20,$inp));
	&sub	($len,0x30);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	($rndkey1,&QWP(0x10,$inp));
	&movups	($rndkey0,&QWP(0x20,$inp));
	&xorps	($inout0,$ivec);
	&movups	($ivec,&QWP(0x30,$inp));
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&lea	($out,&DWP(0x30,$out));
	&movaps	($inout0,$inout3);
	&sub	($len,0x40);

# Every tail path arrives here with its last output block still held
# in $inout0 (not yet stored) and the next IV in $ivec.
&set_label("cbc_dec_tail_collected");
	&and	($len,15);
	&jnz	(&label("cbc_dec_tail_partial"));
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("cbc_ret"));

# Length was not a multiple of 16: spill the block to the stack scratch
# slot and copy part of it to the output with rep movsb.
&set_label("cbc_dec_tail_partial",16);
	&movaps	(&QWP(0,"esp"),$inout0);
	&mov	("ecx",16);
	&mov	($inp,"esp");
	&sub	("ecx",$len);
	&data_word(0xA4F3F689);			# rep movsb

&set_label("cbc_ret");
	&mov	("esp",&DWP(16,"esp"));		# pull original %esp
	&mov	($key_,&wparam(4));
	&movups	(&QWP(0,$key_),$ivec);		# output IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"		const unsigned char *userKey
#	$rounds		int bits
#	$key		AES_KEY *key
# output:
#	"eax"		return code
#	$rounds		rounds
#
# Return code is 0 on success, -1 if userKey or key is NULL, and -2 if
# bits is not 128, 192 or 256.  On success $rounds holds 9, 11 or 13
# and the same value is stored at offset 240 from the start of the key
# schedule (the &DWP(80|48|16,$key) stores below are relative to the
# already advanced $key pointer).

&function_begin_B("_aesni_set_encrypt_key");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&lea	($key,&DWP(16,$key));
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(80,$key),$rounds);
	&xor		("eax","eax");
	&ret();

# key_128 stores the previous round key and advances $key; the _cold
# entry skips the store (used for the first call, when round 0 has
# already been written).
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(48,$key),$rounds);
	&xor		("eax","eax");
	&ret();

# key_192a stores one 16-byte round key and advances $key by 16;
# key_192b stores two (32 bytes) and then joins the shared expansion
# code at key_192b_warm.
&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&mov		($rounds,13);
	&lea		($key,&DWP(16,$key));
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);
	&xor		("eax","eax");
	&ret();

# key_256a stores xmm2 and updates xmm0; key_256b stores xmm0 and
# updates xmm2.  The callers above alternate between the two.
&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&ret	();
&set_label("bad_keybits",4);
	&mov	("eax",-2);
	&ret	();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: loads the three stack arguments into the registers
# _aesni_set_encrypt_key expects and returns its result in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule first, then converts it in place:
# the round keys are reversed in order, and every key except the first
# and last is passed through aesimc.  A non-zero return code from
# _aesni_set_encrypt_key is returned unchanged.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));		# $key was advanced by the callee
	&shl	($rounds,4);			# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

# Walk inwards from both ends until the two pointers meet.
&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&xor	("eax","eax");			# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();