1238384Sjkim#!/usr/bin/env perl 2238384Sjkim# 3238384Sjkim# ==================================================================== 4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and 6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further 7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/. 8238384Sjkim# ==================================================================== 9238384Sjkim# 10238384Sjkim# This module implements support for Intel AES-NI extension. In 11238384Sjkim# OpenSSL context it's used with Intel engine, but can also be used as 12238384Sjkim# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 13238384Sjkim# details]. 14238384Sjkim# 15238384Sjkim# Performance. 16238384Sjkim# 17238384Sjkim# Given aes(enc|dec) instructions' latency asymptotic performance for 18238384Sjkim# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte 19238384Sjkim# processed with 128-bit key. And given their throughput asymptotic 20238384Sjkim# performance for parallelizable modes is 1.25 cycles per byte. Being 21238384Sjkim# asymptotic limit it's not something you commonly achieve in reality, 22238384Sjkim# but how close does one get? Below are results collected for 23238384Sjkim# different modes and block sizes. Pairs of numbers are for en-/ 24238384Sjkim# decryption. 
25238384Sjkim# 26238384Sjkim# 16-byte 64-byte 256-byte 1-KB 8-KB 27238384Sjkim# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 28238384Sjkim# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 29238384Sjkim# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 30238384Sjkim# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 31238384Sjkim# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 32238384Sjkim# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 33238384Sjkim# 34238384Sjkim# ECB, CTR, CBC and CCM results are free from EVP overhead. This means 35238384Sjkim# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni 36238384Sjkim# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. 37238384Sjkim# The results were collected with specially crafted speed.c benchmark 38238384Sjkim# in order to compare them with results reported in "Intel Advanced 39238384Sjkim# Encryption Standard (AES) New Instruction Set" White Paper Revision 40238384Sjkim# 3.0 dated May 2010. All above results are consistently better. This 41238384Sjkim# module also provides better performance for block sizes smaller than 42238384Sjkim# 128 bytes in points *not* represented in the above table. 43238384Sjkim# 44238384Sjkim# Looking at the results for 8-KB buffer. 45238384Sjkim# 46238384Sjkim# CFB and OFB results are far from the limit, because implementation 47238384Sjkim# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on 48238384Sjkim# single-block aesni_encrypt, which is not the most optimal way to go. 49238384Sjkim# CBC encrypt result is unexpectedly high and there is no documented 50238384Sjkim# explanation for it. Seemingly there is a small penalty for feeding 51238384Sjkim# the result back to AES unit the way it's done in CBC mode. There is 52238384Sjkim# nothing one can do and the result appears optimal. CCM result is 53238384Sjkim# identical to CBC, because CBC-MAC is essentially CBC encrypt without 54238384Sjkim# saving output. 
CCM CTR "stays invisible," because it's neatly 55238384Sjkim# interleaved with CBC-MAC. This provides ~30% improvement over 56238384Sjkim# "straightforward" CCM implementation with CTR and CBC-MAC performed 57238384Sjkim# disjointly. Parallelizable modes practically achieve the theoretical 58238384Sjkim# limit. 59238384Sjkim# 60238384Sjkim# Looking at how results vary with buffer size. 61238384Sjkim# 62238384Sjkim# Curves are practically saturated at 1-KB buffer size. In most cases 63238384Sjkim# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. 64238384Sjkim# CTR curve doesn't follow this pattern and is "slowest" changing one 65238384Sjkim# with "256-byte" result being 87% of "8-KB." This is because overhead 66238384Sjkim# in CTR mode is most computationally intensive. Small-block CCM 67238384Sjkim# decrypt is slower than encrypt, because first CTR and last CBC-MAC 68238384Sjkim# iterations can't be interleaved. 69238384Sjkim# 70238384Sjkim# Results for 192- and 256-bit keys. 71238384Sjkim# 72238384Sjkim# EVP-free results were observed to scale perfectly with number of 73238384Sjkim# rounds for larger block sizes, i.e. 192-bit result being 10/12 times 74238384Sjkim# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences 75238384Sjkim# are a tad smaller, because the above mentioned penalty biases all 76238384Sjkim# results by same constant value. In similar way function call 77238384Sjkim# overhead affects small-block performance, as well as OFB and CFB 78238384Sjkim# results. Differences are not large, most common coefficients are 79238384Sjkim# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one 80238384Sjkim# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... 
81238384Sjkim 82238384Sjkim# January 2011 83238384Sjkim# 84238384Sjkim# While Westmere processor features 6 cycles latency for aes[enc|dec] 85238384Sjkim# instructions, which can be scheduled every second cycle, Sandy 86238384Sjkim# Bridge spends 8 cycles per instruction, but it can schedule them 87238384Sjkim# every cycle. This means that code targeting Westmere would perform 88238384Sjkim# suboptimally on Sandy Bridge. Therefore this update. 89238384Sjkim# 90238384Sjkim# In addition, non-parallelizable CBC encrypt (as well as CCM) is 91238384Sjkim# optimized. Relative improvement might appear modest, 8% on Westmere, 92238384Sjkim# but in absolute terms it's 3.77 cycles per byte encrypted with 93238384Sjkim# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers 94238384Sjkim# should be compared to asymptotic limits of 3.75 for Westmere and 95238384Sjkim# 5.00 for Sandy Bridge. Actually, the fact that they get this close 96238384Sjkim# to asymptotic limits is quite amazing. Indeed, the limit is 97238384Sjkim# calculated as latency times number of rounds, 10 for 128-bit key, 98238384Sjkim# and divided by 16, the number of bytes in block, or in other words 99238384Sjkim# it accounts *solely* for aesenc instructions. But there are extra 100238384Sjkim# instructions, and numbers so close to the asymptotic limits mean 101238384Sjkim# that it's as if it takes as little as *one* additional cycle to 102238384Sjkim# execute all of them. How is it possible? It is possible thanks to 103238384Sjkim# out-of-order execution logic, which manages to overlap post- 104238384Sjkim# processing of previous block, things like saving the output, with 105238384Sjkim# actual encryption of current block, as well as pre-processing of 106238384Sjkim# current block, things like fetching input and xor-ing it with 107238384Sjkim# 0-round element of the key schedule, with actual encryption of 108238384Sjkim# previous block. Keep this in mind... 
109238384Sjkim# 110238384Sjkim# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher 111238384Sjkim# performance is achieved by interleaving instructions working on 112238384Sjkim# independent blocks. In which case asymptotic limit for such modes 113238384Sjkim# can be obtained by dividing above mentioned numbers by AES 114238384Sjkim# instructions' interleave factor. Westmere can execute at most 3 115238384Sjkim# instructions at a time, meaning that optimal interleave factor is 3, 116238384Sjkim# and that's where the "magic" number of 1.25 comes from. "Optimal 117238384Sjkim# interleave factor" means that increase of interleave factor does 118238384Sjkim# not improve performance. The formula has proven to reflect reality 119238384Sjkim# pretty well on Westmere... Sandy Bridge on the other hand can 120238384Sjkim# execute up to 8 AES instructions at a time, so how does varying 121238384Sjkim# interleave factor affect the performance? Here is the table for ECB 122238384Sjkim# (numbers are cycles per byte processed with 128-bit key): 123238384Sjkim# 124238384Sjkim# instruction interleave factor 3x 6x 8x 125238384Sjkim# theoretical asymptotic limit 1.67 0.83 0.625 126238384Sjkim# measured performance for 8KB block 1.05 0.86 0.84 127238384Sjkim# 128238384Sjkim# "as if" interleave factor 4.7x 5.8x 6.0x 129238384Sjkim# 130238384Sjkim# Further data for other parallelizable modes: 131238384Sjkim# 132238384Sjkim# CBC decrypt 1.16 0.93 0.93 133238384Sjkim# CTR 1.14 0.91 n/a 134238384Sjkim# 135238384Sjkim# Well, given 3x column it's probably inappropriate to call the limit 136238384Sjkim# asymptotic, if it can be surpassed, isn't it? What happens there? 137238384Sjkim# Rewind to CBC paragraph for the answer. Yes, out-of-order execution 138238384Sjkim# magic is responsible for this. Processor overlaps not only the 139238384Sjkim# additional instructions with AES ones, but even AES instructions 140238384Sjkim# processing adjacent triplets of independent blocks. 
In the 6x case 141238384Sjkim# additional instructions still claim disproportionally small amount 142238384Sjkim# of additional cycles, but in 8x case number of instructions must be 143238384Sjkim# a tad too high for out-of-order logic to cope with, and AES unit 144238384Sjkim# remains underutilized... As you can see 8x interleave is hardly 145238384Sjkim# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl 146238384Sjkim# utilizes 6x interleave because of limited register bank capacity. 147238384Sjkim# 148238384Sjkim# Higher interleave factors do have negative impact on Westmere 149238384Sjkim# performance. While for ECB mode it's negligible ~1.5%, other 150238384Sjkim# parallelizables perform ~5% worse, which is outweighed by ~25% 151238384Sjkim# improvement on Sandy Bridge. To balance regression on Westmere 152238384Sjkim# CTR mode was implemented with 6x aesenc interleave factor. 153238384Sjkim 154238384Sjkim# April 2011 155238384Sjkim# 156238384Sjkim# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing 157238384Sjkim# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like 158238384Sjkim# in CTR mode AES instruction interleave factor was chosen to be 6x. 
# ----------------------------------------------------------------------
# Generator set-up: command-line handling, the output pipe through the
# perlasm translator, and the register-name variables interpolated into
# the assembly templates that follow.
# ----------------------------------------------------------------------

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Arguments are "[flavour] [output]". A first argument that contains a
# dot is taken to be the output file name and the flavour is left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 argument order is selected either by the flavour name or by an
# output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of the script path, including the trailing separator.
# It is the empty string when $0 carries no directory component, rather
# than whatever $1 happened to hold after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# x86_64-xlate.pl is looked for next to this script first and then in
# ../../perlasm/ relative to it.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on STDOUT is a pipe into the translator, run with the same
# perl binary ($^X). The translator path is quoted so that a directory
# name containing blanks survives the shell, and a failure to start the
# pipe is fatal instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Instruction used to load round keys; it is "movups" for every $PREFIX.
$movkey = "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

# Accumulator for the generated assembly text.
$code=".text\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# $in2/$in1 name the same registers as $inout4/$inout5.
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
203238384Sjkim$in0="%xmm8"; $iv="%xmm9"; 204238384Sjkim 205238384Sjkim# Inline version of internal aesni_[en|de]crypt1. 206238384Sjkim# 207238384Sjkim# Why folded loop? Because aes[enc|dec] is slow enough to accommodate 208238384Sjkim# cycles which take care of loop variables... 209238384Sjkim{ my $sn; 210238384Sjkimsub aesni_generate1 { 211238384Sjkimmy ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 212238384Sjkim++$sn; 213238384Sjkim$code.=<<___; 214238384Sjkim $movkey ($key),$rndkey0 215238384Sjkim $movkey 16($key),$rndkey1 216238384Sjkim___ 217238384Sjkim$code.=<<___ if (defined($ivec)); 218238384Sjkim xorps $rndkey0,$ivec 219238384Sjkim lea 32($key),$key 220238384Sjkim xorps $ivec,$inout 221238384Sjkim___ 222238384Sjkim$code.=<<___ if (!defined($ivec)); 223238384Sjkim lea 32($key),$key 224238384Sjkim xorps $rndkey0,$inout 225238384Sjkim___ 226238384Sjkim$code.=<<___; 227238384Sjkim.Loop_${p}1_$sn: 228238384Sjkim aes${p} $rndkey1,$inout 229238384Sjkim dec $rounds 230238384Sjkim $movkey ($key),$rndkey1 231238384Sjkim lea 16($key),$key 232238384Sjkim jnz .Loop_${p}1_$sn # loop body is 16 bytes 233238384Sjkim aes${p}last $rndkey1,$inout 234238384Sjkim___ 235238384Sjkim}} 236238384Sjkim# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); 237238384Sjkim# 238238384Sjkim{ my ($inp,$out,$key) = @_4args; 239238384Sjkim 240238384Sjkim$code.=<<___; 241238384Sjkim.globl ${PREFIX}_encrypt 242238384Sjkim.type ${PREFIX}_encrypt,\@abi-omnipotent 243238384Sjkim.align 16 244238384Sjkim${PREFIX}_encrypt: 245238384Sjkim movups ($inp),$inout0 # load input 246238384Sjkim mov 240($key),$rounds # key->rounds 247238384Sjkim___ 248238384Sjkim &aesni_generate1("enc",$key,$rounds); 249238384Sjkim$code.=<<___; 250238384Sjkim movups $inout0,($out) # output 251238384Sjkim ret 252238384Sjkim.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt 253238384Sjkim 254238384Sjkim.globl ${PREFIX}_decrypt 255238384Sjkim.type ${PREFIX}_decrypt,\@abi-omnipotent 
256238384Sjkim.align 16 257238384Sjkim${PREFIX}_decrypt: 258238384Sjkim movups ($inp),$inout0 # load input 259238384Sjkim mov 240($key),$rounds # key->rounds 260238384Sjkim___ 261238384Sjkim &aesni_generate1("dec",$key,$rounds); 262238384Sjkim$code.=<<___; 263238384Sjkim movups $inout0,($out) # output 264238384Sjkim ret 265238384Sjkim.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt 266238384Sjkim___ 267238384Sjkim} 268238384Sjkim 269238384Sjkim# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 270238384Sjkim# factor. Why 3x subroutine were originally used in loops? Even though 271238384Sjkim# aes[enc|dec] latency was originally 6, it could be scheduled only 272238384Sjkim# every *2nd* cycle. Thus 3x interleave was the one providing optimal 273238384Sjkim# utilization, i.e. when subroutine's throughput is virtually same as 274238384Sjkim# of non-interleaved subroutine [for number of input blocks up to 3]. 275238384Sjkim# This is why it makes no sense to implement 2x subroutine. 276238384Sjkim# aes[enc|dec] latency in next processor generation is 8, but the 277238384Sjkim# instructions can be scheduled every cycle. Optimal interleave for 278238384Sjkim# new processor is therefore 8x... 279238384Sjkimsub aesni_generate3 { 280238384Sjkimmy $dir=shift; 281238384Sjkim# As already mentioned it takes in $key and $rounds, which are *not* 282238384Sjkim# preserved. $inout[0-2] is cipher/clear text... 
283238384Sjkim$code.=<<___; 284238384Sjkim.type _aesni_${dir}rypt3,\@abi-omnipotent 285238384Sjkim.align 16 286238384Sjkim_aesni_${dir}rypt3: 287238384Sjkim $movkey ($key),$rndkey0 288238384Sjkim shr \$1,$rounds 289238384Sjkim $movkey 16($key),$rndkey1 290238384Sjkim lea 32($key),$key 291238384Sjkim xorps $rndkey0,$inout0 292238384Sjkim xorps $rndkey0,$inout1 293238384Sjkim xorps $rndkey0,$inout2 294238384Sjkim $movkey ($key),$rndkey0 295238384Sjkim 296238384Sjkim.L${dir}_loop3: 297238384Sjkim aes${dir} $rndkey1,$inout0 298238384Sjkim aes${dir} $rndkey1,$inout1 299238384Sjkim dec $rounds 300238384Sjkim aes${dir} $rndkey1,$inout2 301238384Sjkim $movkey 16($key),$rndkey1 302238384Sjkim aes${dir} $rndkey0,$inout0 303238384Sjkim aes${dir} $rndkey0,$inout1 304238384Sjkim lea 32($key),$key 305238384Sjkim aes${dir} $rndkey0,$inout2 306238384Sjkim $movkey ($key),$rndkey0 307238384Sjkim jnz .L${dir}_loop3 308238384Sjkim 309238384Sjkim aes${dir} $rndkey1,$inout0 310238384Sjkim aes${dir} $rndkey1,$inout1 311238384Sjkim aes${dir} $rndkey1,$inout2 312238384Sjkim aes${dir}last $rndkey0,$inout0 313238384Sjkim aes${dir}last $rndkey0,$inout1 314238384Sjkim aes${dir}last $rndkey0,$inout2 315238384Sjkim ret 316238384Sjkim.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 317238384Sjkim___ 318238384Sjkim} 319238384Sjkim# 4x interleave is implemented to improve small block performance, 320238384Sjkim# most notably [and naturally] 4 block by ~30%. One can argue that one 321238384Sjkim# should have implemented 5x as well, but improvement would be <20%, 322238384Sjkim# so it's not worth it... 323238384Sjkimsub aesni_generate4 { 324238384Sjkimmy $dir=shift; 325238384Sjkim# As already mentioned it takes in $key and $rounds, which are *not* 326238384Sjkim# preserved. $inout[0-3] is cipher/clear text... 
327238384Sjkim$code.=<<___; 328238384Sjkim.type _aesni_${dir}rypt4,\@abi-omnipotent 329238384Sjkim.align 16 330238384Sjkim_aesni_${dir}rypt4: 331238384Sjkim $movkey ($key),$rndkey0 332238384Sjkim shr \$1,$rounds 333238384Sjkim $movkey 16($key),$rndkey1 334238384Sjkim lea 32($key),$key 335238384Sjkim xorps $rndkey0,$inout0 336238384Sjkim xorps $rndkey0,$inout1 337238384Sjkim xorps $rndkey0,$inout2 338238384Sjkim xorps $rndkey0,$inout3 339238384Sjkim $movkey ($key),$rndkey0 340238384Sjkim 341238384Sjkim.L${dir}_loop4: 342238384Sjkim aes${dir} $rndkey1,$inout0 343238384Sjkim aes${dir} $rndkey1,$inout1 344238384Sjkim dec $rounds 345238384Sjkim aes${dir} $rndkey1,$inout2 346238384Sjkim aes${dir} $rndkey1,$inout3 347238384Sjkim $movkey 16($key),$rndkey1 348238384Sjkim aes${dir} $rndkey0,$inout0 349238384Sjkim aes${dir} $rndkey0,$inout1 350238384Sjkim lea 32($key),$key 351238384Sjkim aes${dir} $rndkey0,$inout2 352238384Sjkim aes${dir} $rndkey0,$inout3 353238384Sjkim $movkey ($key),$rndkey0 354238384Sjkim jnz .L${dir}_loop4 355238384Sjkim 356238384Sjkim aes${dir} $rndkey1,$inout0 357238384Sjkim aes${dir} $rndkey1,$inout1 358238384Sjkim aes${dir} $rndkey1,$inout2 359238384Sjkim aes${dir} $rndkey1,$inout3 360238384Sjkim aes${dir}last $rndkey0,$inout0 361238384Sjkim aes${dir}last $rndkey0,$inout1 362238384Sjkim aes${dir}last $rndkey0,$inout2 363238384Sjkim aes${dir}last $rndkey0,$inout3 364238384Sjkim ret 365238384Sjkim.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 366238384Sjkim___ 367238384Sjkim} 368238384Sjkimsub aesni_generate6 { 369238384Sjkimmy $dir=shift; 370238384Sjkim# As already mentioned it takes in $key and $rounds, which are *not* 371238384Sjkim# preserved. $inout[0-5] is cipher/clear text... 
372238384Sjkim$code.=<<___; 373238384Sjkim.type _aesni_${dir}rypt6,\@abi-omnipotent 374238384Sjkim.align 16 375238384Sjkim_aesni_${dir}rypt6: 376238384Sjkim $movkey ($key),$rndkey0 377238384Sjkim shr \$1,$rounds 378238384Sjkim $movkey 16($key),$rndkey1 379238384Sjkim lea 32($key),$key 380238384Sjkim xorps $rndkey0,$inout0 381238384Sjkim pxor $rndkey0,$inout1 382238384Sjkim aes${dir} $rndkey1,$inout0 383238384Sjkim pxor $rndkey0,$inout2 384238384Sjkim aes${dir} $rndkey1,$inout1 385238384Sjkim pxor $rndkey0,$inout3 386238384Sjkim aes${dir} $rndkey1,$inout2 387238384Sjkim pxor $rndkey0,$inout4 388238384Sjkim aes${dir} $rndkey1,$inout3 389238384Sjkim pxor $rndkey0,$inout5 390238384Sjkim dec $rounds 391238384Sjkim aes${dir} $rndkey1,$inout4 392238384Sjkim $movkey ($key),$rndkey0 393238384Sjkim aes${dir} $rndkey1,$inout5 394238384Sjkim jmp .L${dir}_loop6_enter 395238384Sjkim.align 16 396238384Sjkim.L${dir}_loop6: 397238384Sjkim aes${dir} $rndkey1,$inout0 398238384Sjkim aes${dir} $rndkey1,$inout1 399238384Sjkim dec $rounds 400238384Sjkim aes${dir} $rndkey1,$inout2 401238384Sjkim aes${dir} $rndkey1,$inout3 402238384Sjkim aes${dir} $rndkey1,$inout4 403238384Sjkim aes${dir} $rndkey1,$inout5 404238384Sjkim.L${dir}_loop6_enter: # happens to be 16-byte aligned 405238384Sjkim $movkey 16($key),$rndkey1 406238384Sjkim aes${dir} $rndkey0,$inout0 407238384Sjkim aes${dir} $rndkey0,$inout1 408238384Sjkim lea 32($key),$key 409238384Sjkim aes${dir} $rndkey0,$inout2 410238384Sjkim aes${dir} $rndkey0,$inout3 411238384Sjkim aes${dir} $rndkey0,$inout4 412238384Sjkim aes${dir} $rndkey0,$inout5 413238384Sjkim $movkey ($key),$rndkey0 414238384Sjkim jnz .L${dir}_loop6 415238384Sjkim 416238384Sjkim aes${dir} $rndkey1,$inout0 417238384Sjkim aes${dir} $rndkey1,$inout1 418238384Sjkim aes${dir} $rndkey1,$inout2 419238384Sjkim aes${dir} $rndkey1,$inout3 420238384Sjkim aes${dir} $rndkey1,$inout4 421238384Sjkim aes${dir} $rndkey1,$inout5 422238384Sjkim aes${dir}last $rndkey0,$inout0 423238384Sjkim 
aes${dir}last $rndkey0,$inout1 424238384Sjkim aes${dir}last $rndkey0,$inout2 425238384Sjkim aes${dir}last $rndkey0,$inout3 426238384Sjkim aes${dir}last $rndkey0,$inout4 427238384Sjkim aes${dir}last $rndkey0,$inout5 428238384Sjkim ret 429238384Sjkim.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 430238384Sjkim___ 431238384Sjkim} 432238384Sjkimsub aesni_generate8 { 433238384Sjkimmy $dir=shift; 434238384Sjkim# As already mentioned it takes in $key and $rounds, which are *not* 435238384Sjkim# preserved. $inout[0-7] is cipher/clear text... 436238384Sjkim$code.=<<___; 437238384Sjkim.type _aesni_${dir}rypt8,\@abi-omnipotent 438238384Sjkim.align 16 439238384Sjkim_aesni_${dir}rypt8: 440238384Sjkim $movkey ($key),$rndkey0 441238384Sjkim shr \$1,$rounds 442238384Sjkim $movkey 16($key),$rndkey1 443238384Sjkim lea 32($key),$key 444238384Sjkim xorps $rndkey0,$inout0 445238384Sjkim xorps $rndkey0,$inout1 446238384Sjkim aes${dir} $rndkey1,$inout0 447238384Sjkim pxor $rndkey0,$inout2 448238384Sjkim aes${dir} $rndkey1,$inout1 449238384Sjkim pxor $rndkey0,$inout3 450238384Sjkim aes${dir} $rndkey1,$inout2 451238384Sjkim pxor $rndkey0,$inout4 452238384Sjkim aes${dir} $rndkey1,$inout3 453238384Sjkim pxor $rndkey0,$inout5 454238384Sjkim dec $rounds 455238384Sjkim aes${dir} $rndkey1,$inout4 456238384Sjkim pxor $rndkey0,$inout6 457238384Sjkim aes${dir} $rndkey1,$inout5 458238384Sjkim pxor $rndkey0,$inout7 459238384Sjkim $movkey ($key),$rndkey0 460238384Sjkim aes${dir} $rndkey1,$inout6 461238384Sjkim aes${dir} $rndkey1,$inout7 462238384Sjkim $movkey 16($key),$rndkey1 463238384Sjkim jmp .L${dir}_loop8_enter 464238384Sjkim.align 16 465238384Sjkim.L${dir}_loop8: 466238384Sjkim aes${dir} $rndkey1,$inout0 467238384Sjkim aes${dir} $rndkey1,$inout1 468238384Sjkim dec $rounds 469238384Sjkim aes${dir} $rndkey1,$inout2 470238384Sjkim aes${dir} $rndkey1,$inout3 471238384Sjkim aes${dir} $rndkey1,$inout4 472238384Sjkim aes${dir} $rndkey1,$inout5 473238384Sjkim aes${dir} $rndkey1,$inout6 474238384Sjkim 
aes${dir} $rndkey1,$inout7 475238384Sjkim $movkey 16($key),$rndkey1 476238384Sjkim.L${dir}_loop8_enter: # happens to be 16-byte aligned 477238384Sjkim aes${dir} $rndkey0,$inout0 478238384Sjkim aes${dir} $rndkey0,$inout1 479238384Sjkim lea 32($key),$key 480238384Sjkim aes${dir} $rndkey0,$inout2 481238384Sjkim aes${dir} $rndkey0,$inout3 482238384Sjkim aes${dir} $rndkey0,$inout4 483238384Sjkim aes${dir} $rndkey0,$inout5 484238384Sjkim aes${dir} $rndkey0,$inout6 485238384Sjkim aes${dir} $rndkey0,$inout7 486238384Sjkim $movkey ($key),$rndkey0 487238384Sjkim jnz .L${dir}_loop8 488238384Sjkim 489238384Sjkim aes${dir} $rndkey1,$inout0 490238384Sjkim aes${dir} $rndkey1,$inout1 491238384Sjkim aes${dir} $rndkey1,$inout2 492238384Sjkim aes${dir} $rndkey1,$inout3 493238384Sjkim aes${dir} $rndkey1,$inout4 494238384Sjkim aes${dir} $rndkey1,$inout5 495238384Sjkim aes${dir} $rndkey1,$inout6 496238384Sjkim aes${dir} $rndkey1,$inout7 497238384Sjkim aes${dir}last $rndkey0,$inout0 498238384Sjkim aes${dir}last $rndkey0,$inout1 499238384Sjkim aes${dir}last $rndkey0,$inout2 500238384Sjkim aes${dir}last $rndkey0,$inout3 501238384Sjkim aes${dir}last $rndkey0,$inout4 502238384Sjkim aes${dir}last $rndkey0,$inout5 503238384Sjkim aes${dir}last $rndkey0,$inout6 504238384Sjkim aes${dir}last $rndkey0,$inout7 505238384Sjkim ret 506238384Sjkim.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 507238384Sjkim___ 508238384Sjkim} 509238384Sjkim&aesni_generate3("enc") if ($PREFIX eq "aesni"); 510238384Sjkim&aesni_generate3("dec"); 511238384Sjkim&aesni_generate4("enc") if ($PREFIX eq "aesni"); 512238384Sjkim&aesni_generate4("dec"); 513238384Sjkim&aesni_generate6("enc") if ($PREFIX eq "aesni"); 514238384Sjkim&aesni_generate6("dec"); 515238384Sjkim&aesni_generate8("enc") if ($PREFIX eq "aesni"); 516238384Sjkim&aesni_generate8("dec"); 517238384Sjkim 518238384Sjkimif ($PREFIX eq "aesni") { 519238384Sjkim######################################################################## 520238384Sjkim# void aesni_ecb_encrypt 
(const void *in, void *out, 521238384Sjkim# size_t length, const AES_KEY *key, 522238384Sjkim# int enc); 523238384Sjkim$code.=<<___; 524238384Sjkim.globl aesni_ecb_encrypt 525238384Sjkim.type aesni_ecb_encrypt,\@function,5 526238384Sjkim.align 16 527238384Sjkimaesni_ecb_encrypt: 528273399Sdelphij___ 529273399Sdelphij$code.=<<___ if ($win64); 530273399Sdelphij lea -0x58(%rsp),%rsp 531273399Sdelphij movaps %xmm6,(%rsp) 532273399Sdelphij movaps %xmm7,0x10(%rsp) 533273399Sdelphij movaps %xmm8,0x20(%rsp) 534273399Sdelphij movaps %xmm9,0x30(%rsp) 535273399Sdelphij.Lecb_enc_body: 536273399Sdelphij___ 537273399Sdelphij$code.=<<___; 538238384Sjkim and \$-16,$len 539238384Sjkim jz .Lecb_ret 540238384Sjkim 541238384Sjkim mov 240($key),$rounds # key->rounds 542238384Sjkim $movkey ($key),$rndkey0 543238384Sjkim mov $key,$key_ # backup $key 544238384Sjkim mov $rounds,$rnds_ # backup $rounds 545238384Sjkim test %r8d,%r8d # 5th argument 546238384Sjkim jz .Lecb_decrypt 547238384Sjkim#--------------------------- ECB ENCRYPT ------------------------------# 548238384Sjkim cmp \$0x80,$len 549238384Sjkim jb .Lecb_enc_tail 550238384Sjkim 551238384Sjkim movdqu ($inp),$inout0 552238384Sjkim movdqu 0x10($inp),$inout1 553238384Sjkim movdqu 0x20($inp),$inout2 554238384Sjkim movdqu 0x30($inp),$inout3 555238384Sjkim movdqu 0x40($inp),$inout4 556238384Sjkim movdqu 0x50($inp),$inout5 557238384Sjkim movdqu 0x60($inp),$inout6 558238384Sjkim movdqu 0x70($inp),$inout7 559238384Sjkim lea 0x80($inp),$inp 560238384Sjkim sub \$0x80,$len 561238384Sjkim jmp .Lecb_enc_loop8_enter 562238384Sjkim.align 16 563238384Sjkim.Lecb_enc_loop8: 564238384Sjkim movups $inout0,($out) 565238384Sjkim mov $key_,$key # restore $key 566238384Sjkim movdqu ($inp),$inout0 567238384Sjkim mov $rnds_,$rounds # restore $rounds 568238384Sjkim movups $inout1,0x10($out) 569238384Sjkim movdqu 0x10($inp),$inout1 570238384Sjkim movups $inout2,0x20($out) 571238384Sjkim movdqu 0x20($inp),$inout2 572238384Sjkim movups $inout3,0x30($out) 
573238384Sjkim movdqu 0x30($inp),$inout3 574238384Sjkim movups $inout4,0x40($out) 575238384Sjkim movdqu 0x40($inp),$inout4 576238384Sjkim movups $inout5,0x50($out) 577238384Sjkim movdqu 0x50($inp),$inout5 578238384Sjkim movups $inout6,0x60($out) 579238384Sjkim movdqu 0x60($inp),$inout6 580238384Sjkim movups $inout7,0x70($out) 581238384Sjkim lea 0x80($out),$out 582238384Sjkim movdqu 0x70($inp),$inout7 583238384Sjkim lea 0x80($inp),$inp 584238384Sjkim.Lecb_enc_loop8_enter: 585238384Sjkim 586238384Sjkim call _aesni_encrypt8 587238384Sjkim 588238384Sjkim sub \$0x80,$len 589238384Sjkim jnc .Lecb_enc_loop8 590238384Sjkim 591238384Sjkim movups $inout0,($out) 592238384Sjkim mov $key_,$key # restore $key 593238384Sjkim movups $inout1,0x10($out) 594238384Sjkim mov $rnds_,$rounds # restore $rounds 595238384Sjkim movups $inout2,0x20($out) 596238384Sjkim movups $inout3,0x30($out) 597238384Sjkim movups $inout4,0x40($out) 598238384Sjkim movups $inout5,0x50($out) 599238384Sjkim movups $inout6,0x60($out) 600238384Sjkim movups $inout7,0x70($out) 601238384Sjkim lea 0x80($out),$out 602238384Sjkim add \$0x80,$len 603238384Sjkim jz .Lecb_ret 604238384Sjkim 605238384Sjkim.Lecb_enc_tail: 606238384Sjkim movups ($inp),$inout0 607238384Sjkim cmp \$0x20,$len 608238384Sjkim jb .Lecb_enc_one 609238384Sjkim movups 0x10($inp),$inout1 610238384Sjkim je .Lecb_enc_two 611238384Sjkim movups 0x20($inp),$inout2 612238384Sjkim cmp \$0x40,$len 613238384Sjkim jb .Lecb_enc_three 614238384Sjkim movups 0x30($inp),$inout3 615238384Sjkim je .Lecb_enc_four 616238384Sjkim movups 0x40($inp),$inout4 617238384Sjkim cmp \$0x60,$len 618238384Sjkim jb .Lecb_enc_five 619238384Sjkim movups 0x50($inp),$inout5 620238384Sjkim je .Lecb_enc_six 621238384Sjkim movdqu 0x60($inp),$inout6 622238384Sjkim call _aesni_encrypt8 623238384Sjkim movups $inout0,($out) 624238384Sjkim movups $inout1,0x10($out) 625238384Sjkim movups $inout2,0x20($out) 626238384Sjkim movups $inout3,0x30($out) 627238384Sjkim movups $inout4,0x40($out) 
628238384Sjkim movups $inout5,0x50($out) 629238384Sjkim movups $inout6,0x60($out) 630238384Sjkim jmp .Lecb_ret 631238384Sjkim.align 16 632238384Sjkim.Lecb_enc_one: 633238384Sjkim___ 634238384Sjkim &aesni_generate1("enc",$key,$rounds); 635238384Sjkim$code.=<<___; 636238384Sjkim movups $inout0,($out) 637238384Sjkim jmp .Lecb_ret 638238384Sjkim.align 16 639238384Sjkim.Lecb_enc_two: 640238384Sjkim xorps $inout2,$inout2 641238384Sjkim call _aesni_encrypt3 642238384Sjkim movups $inout0,($out) 643238384Sjkim movups $inout1,0x10($out) 644238384Sjkim jmp .Lecb_ret 645238384Sjkim.align 16 646238384Sjkim.Lecb_enc_three: 647238384Sjkim call _aesni_encrypt3 648238384Sjkim movups $inout0,($out) 649238384Sjkim movups $inout1,0x10($out) 650238384Sjkim movups $inout2,0x20($out) 651238384Sjkim jmp .Lecb_ret 652238384Sjkim.align 16 653238384Sjkim.Lecb_enc_four: 654238384Sjkim call _aesni_encrypt4 655238384Sjkim movups $inout0,($out) 656238384Sjkim movups $inout1,0x10($out) 657238384Sjkim movups $inout2,0x20($out) 658238384Sjkim movups $inout3,0x30($out) 659238384Sjkim jmp .Lecb_ret 660238384Sjkim.align 16 661238384Sjkim.Lecb_enc_five: 662238384Sjkim xorps $inout5,$inout5 663238384Sjkim call _aesni_encrypt6 664238384Sjkim movups $inout0,($out) 665238384Sjkim movups $inout1,0x10($out) 666238384Sjkim movups $inout2,0x20($out) 667238384Sjkim movups $inout3,0x30($out) 668238384Sjkim movups $inout4,0x40($out) 669238384Sjkim jmp .Lecb_ret 670238384Sjkim.align 16 671238384Sjkim.Lecb_enc_six: 672238384Sjkim call _aesni_encrypt6 673238384Sjkim movups $inout0,($out) 674238384Sjkim movups $inout1,0x10($out) 675238384Sjkim movups $inout2,0x20($out) 676238384Sjkim movups $inout3,0x30($out) 677238384Sjkim movups $inout4,0x40($out) 678238384Sjkim movups $inout5,0x50($out) 679238384Sjkim jmp .Lecb_ret 680238384Sjkim#--------------------------- ECB DECRYPT ------------------------------# 681238384Sjkim.align 16 682238384Sjkim.Lecb_decrypt: 683238384Sjkim cmp \$0x80,$len 684238384Sjkim jb 
.Lecb_dec_tail 685238384Sjkim 686238384Sjkim movdqu ($inp),$inout0 687238384Sjkim movdqu 0x10($inp),$inout1 688238384Sjkim movdqu 0x20($inp),$inout2 689238384Sjkim movdqu 0x30($inp),$inout3 690238384Sjkim movdqu 0x40($inp),$inout4 691238384Sjkim movdqu 0x50($inp),$inout5 692238384Sjkim movdqu 0x60($inp),$inout6 693238384Sjkim movdqu 0x70($inp),$inout7 694238384Sjkim lea 0x80($inp),$inp 695238384Sjkim sub \$0x80,$len 696238384Sjkim jmp .Lecb_dec_loop8_enter 697238384Sjkim.align 16 698238384Sjkim.Lecb_dec_loop8: 699238384Sjkim movups $inout0,($out) 700238384Sjkim mov $key_,$key # restore $key 701238384Sjkim movdqu ($inp),$inout0 702238384Sjkim mov $rnds_,$rounds # restore $rounds 703238384Sjkim movups $inout1,0x10($out) 704238384Sjkim movdqu 0x10($inp),$inout1 705238384Sjkim movups $inout2,0x20($out) 706238384Sjkim movdqu 0x20($inp),$inout2 707238384Sjkim movups $inout3,0x30($out) 708238384Sjkim movdqu 0x30($inp),$inout3 709238384Sjkim movups $inout4,0x40($out) 710238384Sjkim movdqu 0x40($inp),$inout4 711238384Sjkim movups $inout5,0x50($out) 712238384Sjkim movdqu 0x50($inp),$inout5 713238384Sjkim movups $inout6,0x60($out) 714238384Sjkim movdqu 0x60($inp),$inout6 715238384Sjkim movups $inout7,0x70($out) 716238384Sjkim lea 0x80($out),$out 717238384Sjkim movdqu 0x70($inp),$inout7 718238384Sjkim lea 0x80($inp),$inp 719238384Sjkim.Lecb_dec_loop8_enter: 720238384Sjkim 721238384Sjkim call _aesni_decrypt8 722238384Sjkim 723238384Sjkim $movkey ($key_),$rndkey0 724238384Sjkim sub \$0x80,$len 725238384Sjkim jnc .Lecb_dec_loop8 726238384Sjkim 727238384Sjkim movups $inout0,($out) 728238384Sjkim mov $key_,$key # restore $key 729238384Sjkim movups $inout1,0x10($out) 730238384Sjkim mov $rnds_,$rounds # restore $rounds 731238384Sjkim movups $inout2,0x20($out) 732238384Sjkim movups $inout3,0x30($out) 733238384Sjkim movups $inout4,0x40($out) 734238384Sjkim movups $inout5,0x50($out) 735238384Sjkim movups $inout6,0x60($out) 736238384Sjkim movups $inout7,0x70($out) 737238384Sjkim lea 
0x80($out),$out 738238384Sjkim add \$0x80,$len 739238384Sjkim jz .Lecb_ret 740238384Sjkim 741238384Sjkim.Lecb_dec_tail: 742238384Sjkim movups ($inp),$inout0 743238384Sjkim cmp \$0x20,$len 744238384Sjkim jb .Lecb_dec_one 745238384Sjkim movups 0x10($inp),$inout1 746238384Sjkim je .Lecb_dec_two 747238384Sjkim movups 0x20($inp),$inout2 748238384Sjkim cmp \$0x40,$len 749238384Sjkim jb .Lecb_dec_three 750238384Sjkim movups 0x30($inp),$inout3 751238384Sjkim je .Lecb_dec_four 752238384Sjkim movups 0x40($inp),$inout4 753238384Sjkim cmp \$0x60,$len 754238384Sjkim jb .Lecb_dec_five 755238384Sjkim movups 0x50($inp),$inout5 756238384Sjkim je .Lecb_dec_six 757238384Sjkim movups 0x60($inp),$inout6 758238384Sjkim $movkey ($key),$rndkey0 759238384Sjkim call _aesni_decrypt8 760238384Sjkim movups $inout0,($out) 761238384Sjkim movups $inout1,0x10($out) 762238384Sjkim movups $inout2,0x20($out) 763238384Sjkim movups $inout3,0x30($out) 764238384Sjkim movups $inout4,0x40($out) 765238384Sjkim movups $inout5,0x50($out) 766238384Sjkim movups $inout6,0x60($out) 767238384Sjkim jmp .Lecb_ret 768238384Sjkim.align 16 769238384Sjkim.Lecb_dec_one: 770238384Sjkim___ 771238384Sjkim &aesni_generate1("dec",$key,$rounds); 772238384Sjkim$code.=<<___; 773238384Sjkim movups $inout0,($out) 774238384Sjkim jmp .Lecb_ret 775238384Sjkim.align 16 776238384Sjkim.Lecb_dec_two: 777238384Sjkim xorps $inout2,$inout2 778238384Sjkim call _aesni_decrypt3 779238384Sjkim movups $inout0,($out) 780238384Sjkim movups $inout1,0x10($out) 781238384Sjkim jmp .Lecb_ret 782238384Sjkim.align 16 783238384Sjkim.Lecb_dec_three: 784238384Sjkim call _aesni_decrypt3 785238384Sjkim movups $inout0,($out) 786238384Sjkim movups $inout1,0x10($out) 787238384Sjkim movups $inout2,0x20($out) 788238384Sjkim jmp .Lecb_ret 789238384Sjkim.align 16 790238384Sjkim.Lecb_dec_four: 791238384Sjkim call _aesni_decrypt4 792238384Sjkim movups $inout0,($out) 793238384Sjkim movups $inout1,0x10($out) 794238384Sjkim movups $inout2,0x20($out) 795238384Sjkim movups 
$inout3,0x30($out) 796238384Sjkim jmp .Lecb_ret 797238384Sjkim.align 16 798238384Sjkim.Lecb_dec_five: 799238384Sjkim xorps $inout5,$inout5 800238384Sjkim call _aesni_decrypt6 801238384Sjkim movups $inout0,($out) 802238384Sjkim movups $inout1,0x10($out) 803238384Sjkim movups $inout2,0x20($out) 804238384Sjkim movups $inout3,0x30($out) 805238384Sjkim movups $inout4,0x40($out) 806238384Sjkim jmp .Lecb_ret 807238384Sjkim.align 16 808238384Sjkim.Lecb_dec_six: 809238384Sjkim call _aesni_decrypt6 810238384Sjkim movups $inout0,($out) 811238384Sjkim movups $inout1,0x10($out) 812238384Sjkim movups $inout2,0x20($out) 813238384Sjkim movups $inout3,0x30($out) 814238384Sjkim movups $inout4,0x40($out) 815238384Sjkim movups $inout5,0x50($out) 816238384Sjkim 817238384Sjkim.Lecb_ret: 818273399Sdelphij___ 819273399Sdelphij$code.=<<___ if ($win64); 820273399Sdelphij movaps (%rsp),%xmm6 821273399Sdelphij movaps 0x10(%rsp),%xmm7 822273399Sdelphij movaps 0x20(%rsp),%xmm8 823273399Sdelphij movaps 0x30(%rsp),%xmm9 824273399Sdelphij lea 0x58(%rsp),%rsp 825273399Sdelphij.Lecb_enc_ret: 826273399Sdelphij___ 827273399Sdelphij$code.=<<___; 828238384Sjkim ret 829238384Sjkim.size aesni_ecb_encrypt,.-aesni_ecb_encrypt 830238384Sjkim___ 831238384Sjkim 832238384Sjkim{ 833238384Sjkim###################################################################### 834238384Sjkim# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 835238384Sjkim# size_t blocks, const AES_KEY *key, 836238384Sjkim# const char *ivec,char *cmac); 837238384Sjkim# 838238384Sjkim# Handles only complete blocks, operates on 64-bit counter and 839238384Sjkim# does not update *ivec! 
# Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Register roles visible in the code below: $inout0 carries the counter
# block and $inout1 the running CMAC; both are pushed through the same
# round keys in one interleaved aesenc loop.  $iv is held in
# pshufb-swapped form, paddq adds .Lincrement64 to it once per block,
# and the swap is undone on the copy that is handed to AES.
# On Win64 %xmm6-%xmm9 are spilled to the stack and restored on exit.
#
{
my $cmac="%r9";	# 6th argument

my $increment="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
___
# Win64 only: allocate 0x58 bytes and save %xmm6-%xmm9.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lccm64_enc_body:
___
# Main body.  Note that the jnz closing .Lccm64_enc_outer consumes the
# flags set by "dec $len" several instructions earlier; nothing in
# between touches the flags, so statement order matters here.
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shr	\$1,$rounds
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	mov	$rounds,$rnds_
	pshufb	$bswap_mask,$iv
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	$rnds_,$rounds
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	lea	32($key_),$key
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	($key),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	dec	$rounds
	aesenc	$rndkey1,$inout1
	$movkey	16($key),$rndkey1
	aesenc	$rndkey0,$inout0
	lea	32($key),$key
	aesenc	$rndkey0,$inout1
	$movkey	0($key),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	dec	$len
	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out
	pshufb	$bswap_mask,$inout0
	jnz	.Lccm64_enc_outer

	movups	$inout1,($cmac)
___
# Win64 only: restore %xmm6-%xmm9 and release the frame.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
# Decrypt variant.  The first keystream block is produced up front by
# aesni_generate1; afterwards every pass through .Lccm64_dec_outer
# xors the keystream into the input, stores the result, and (unless it
# was the last block) encrypts the next counter block while folding the
# just-produced output into the CMAC.
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
___
# Win64 only: allocate 0x58 bytes and save %xmm6-%xmm9.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lccm64_dec_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	# keystream for the first block
	&aesni_generate1("enc",$key,$rounds);
# $rnds_ holds the unshifted round count here; it is halved with shr
# only on the path that enters the two-state loop.
$code.=<<___;
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	mov	$rnds_,$rounds
	movups	$in0,($out)			# save output
	lea	16($out),$out
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len
	jz	.Lccm64_dec_break

	$movkey	($key_),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	lea	32($key_),$key
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	($key),$rndkey0

.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	dec	$rounds
	aesenc	$rndkey1,$inout1
	$movkey	16($key),$rndkey1
	aesenc	$rndkey0,$inout0
	lea	32($key),$key
	aesenc	$rndkey0,$inout1
	$movkey	0($key),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	16($inp),$inp
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	#xorps	$in0,$inout1			# cmac^=out
___
	# Final CMAC update on $inout1.  NOTE(review): the 5th argument
	# ($in0) is presumably xor-ed into the state by the helper, which
	# would replace the commented-out xorps above -- confirm against
	# aesni_generate1.
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	movups	$inout1,($cmac)
___
# Win64 only: restore %xmm6-%xmm9 and release the frame.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
{
# $reserved is the %rsp-relative offset of a 32-byte scratch area that
# holds the two counter vectors: 0 on Win64 (inside the frame allocated
# below, whose register saves start at 0x20), -0x28 otherwise (below
# %rsp; no frame is allocated on that path).
my $reserved = $win64?0:-0x28;
my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
my $bswap_mask="%xmm15";

$code.=<<___;
.globl	aesni_ctr32_encrypt_blocks
.type	aesni_ctr32_encrypt_blocks,\@function,5
.align	16
aesni_ctr32_encrypt_blocks:
___
# Win64 only: allocate 0xc8 bytes and save %xmm6-%xmm15 at 0x20..0xbf.
$code.=<<___ if ($win64);
	lea	-0xc8(%rsp),%rsp
	movaps	%xmm6,0x20(%rsp)
	movaps	%xmm7,0x30(%rsp)
	movaps	%xmm8,0x40(%rsp)
	movaps	%xmm9,0x50(%rsp)
	movaps	%xmm10,0x60(%rsp)
	movaps	%xmm11,0x70(%rsp)
	movaps	%xmm12,0x80(%rsp)
	movaps	%xmm13,0x90(%rsp)
	movaps	%xmm14,0xa0(%rsp)
	movaps	%xmm15,0xb0(%rsp)
.Lctr32_body:
___
$code.=<<___;
	cmp	\$1,$len
1057238384Sjkim je .Lctr32_one_shortcut 1058238384Sjkim 1059238384Sjkim movdqu ($ivp),$ivec 1060238384Sjkim movdqa .Lbswap_mask(%rip),$bswap_mask 1061238384Sjkim xor $rounds,$rounds 1062238384Sjkim pextrd \$3,$ivec,$rnds_ # pull 32-bit counter 1063238384Sjkim pinsrd \$3,$rounds,$ivec # wipe 32-bit counter 1064238384Sjkim 1065238384Sjkim mov 240($key),$rounds # key->rounds 1066238384Sjkim bswap $rnds_ 1067238384Sjkim pxor $iv0,$iv0 # vector of 3 32-bit counters 1068238384Sjkim pxor $iv1,$iv1 # vector of 3 32-bit counters 1069238384Sjkim pinsrd \$0,$rnds_,$iv0 1070238384Sjkim lea 3($rnds_),$key_ 1071238384Sjkim pinsrd \$0,$key_,$iv1 1072238384Sjkim inc $rnds_ 1073238384Sjkim pinsrd \$1,$rnds_,$iv0 1074238384Sjkim inc $key_ 1075238384Sjkim pinsrd \$1,$key_,$iv1 1076238384Sjkim inc $rnds_ 1077238384Sjkim pinsrd \$2,$rnds_,$iv0 1078238384Sjkim inc $key_ 1079238384Sjkim pinsrd \$2,$key_,$iv1 1080238384Sjkim movdqa $iv0,$reserved(%rsp) 1081238384Sjkim pshufb $bswap_mask,$iv0 1082238384Sjkim movdqa $iv1,`$reserved+0x10`(%rsp) 1083238384Sjkim pshufb $bswap_mask,$iv1 1084238384Sjkim 1085238384Sjkim pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword 1086238384Sjkim pshufd \$`2<<6`,$iv0,$inout1 1087238384Sjkim pshufd \$`1<<6`,$iv0,$inout2 1088238384Sjkim cmp \$6,$len 1089238384Sjkim jb .Lctr32_tail 1090238384Sjkim shr \$1,$rounds 1091238384Sjkim mov $key,$key_ # backup $key 1092238384Sjkim mov $rounds,$rnds_ # backup $rounds 1093238384Sjkim sub \$6,$len 1094238384Sjkim jmp .Lctr32_loop6 1095238384Sjkim 1096238384Sjkim.align 16 1097238384Sjkim.Lctr32_loop6: 1098238384Sjkim pshufd \$`3<<6`,$iv1,$inout3 1099238384Sjkim por $ivec,$inout0 # merge counter-less ivec 1100238384Sjkim $movkey ($key_),$rndkey0 1101238384Sjkim pshufd \$`2<<6`,$iv1,$inout4 1102238384Sjkim por $ivec,$inout1 1103238384Sjkim $movkey 16($key_),$rndkey1 1104238384Sjkim pshufd \$`1<<6`,$iv1,$inout5 1105238384Sjkim por $ivec,$inout2 1106238384Sjkim por $ivec,$inout3 1107238384Sjkim xorps $rndkey0,$inout0 
1108238384Sjkim por $ivec,$inout4 1109238384Sjkim por $ivec,$inout5 1110238384Sjkim 1111238384Sjkim # inline _aesni_encrypt6 and interleave last rounds 1112238384Sjkim # with own code... 1113238384Sjkim 1114238384Sjkim pxor $rndkey0,$inout1 1115238384Sjkim aesenc $rndkey1,$inout0 1116238384Sjkim lea 32($key_),$key 1117238384Sjkim pxor $rndkey0,$inout2 1118238384Sjkim aesenc $rndkey1,$inout1 1119238384Sjkim movdqa .Lincrement32(%rip),$iv1 1120238384Sjkim pxor $rndkey0,$inout3 1121238384Sjkim aesenc $rndkey1,$inout2 1122238384Sjkim movdqa $reserved(%rsp),$iv0 1123238384Sjkim pxor $rndkey0,$inout4 1124238384Sjkim aesenc $rndkey1,$inout3 1125238384Sjkim pxor $rndkey0,$inout5 1126238384Sjkim $movkey ($key),$rndkey0 1127238384Sjkim dec $rounds 1128238384Sjkim aesenc $rndkey1,$inout4 1129238384Sjkim aesenc $rndkey1,$inout5 1130238384Sjkim jmp .Lctr32_enc_loop6_enter 1131238384Sjkim.align 16 1132238384Sjkim.Lctr32_enc_loop6: 1133238384Sjkim aesenc $rndkey1,$inout0 1134238384Sjkim aesenc $rndkey1,$inout1 1135238384Sjkim dec $rounds 1136238384Sjkim aesenc $rndkey1,$inout2 1137238384Sjkim aesenc $rndkey1,$inout3 1138238384Sjkim aesenc $rndkey1,$inout4 1139238384Sjkim aesenc $rndkey1,$inout5 1140238384Sjkim.Lctr32_enc_loop6_enter: 1141238384Sjkim $movkey 16($key),$rndkey1 1142238384Sjkim aesenc $rndkey0,$inout0 1143238384Sjkim aesenc $rndkey0,$inout1 1144238384Sjkim lea 32($key),$key 1145238384Sjkim aesenc $rndkey0,$inout2 1146238384Sjkim aesenc $rndkey0,$inout3 1147238384Sjkim aesenc $rndkey0,$inout4 1148238384Sjkim aesenc $rndkey0,$inout5 1149238384Sjkim $movkey ($key),$rndkey0 1150238384Sjkim jnz .Lctr32_enc_loop6 1151238384Sjkim 1152238384Sjkim aesenc $rndkey1,$inout0 1153238384Sjkim paddd $iv1,$iv0 # increment counter vector 1154238384Sjkim aesenc $rndkey1,$inout1 1155238384Sjkim paddd `$reserved+0x10`(%rsp),$iv1 1156238384Sjkim aesenc $rndkey1,$inout2 1157238384Sjkim movdqa $iv0,$reserved(%rsp) # save counter vector 1158238384Sjkim aesenc $rndkey1,$inout3 1159238384Sjkim 
movdqa $iv1,`$reserved+0x10`(%rsp) 1160238384Sjkim aesenc $rndkey1,$inout4 1161238384Sjkim pshufb $bswap_mask,$iv0 # byte swap 1162238384Sjkim aesenc $rndkey1,$inout5 1163238384Sjkim pshufb $bswap_mask,$iv1 1164238384Sjkim 1165238384Sjkim aesenclast $rndkey0,$inout0 1166238384Sjkim movups ($inp),$in0 # load input 1167238384Sjkim aesenclast $rndkey0,$inout1 1168238384Sjkim movups 0x10($inp),$in1 1169238384Sjkim aesenclast $rndkey0,$inout2 1170238384Sjkim movups 0x20($inp),$in2 1171238384Sjkim aesenclast $rndkey0,$inout3 1172238384Sjkim movups 0x30($inp),$in3 1173238384Sjkim aesenclast $rndkey0,$inout4 1174238384Sjkim movups 0x40($inp),$rndkey1 1175238384Sjkim aesenclast $rndkey0,$inout5 1176238384Sjkim movups 0x50($inp),$rndkey0 1177238384Sjkim lea 0x60($inp),$inp 1178238384Sjkim 1179238384Sjkim xorps $inout0,$in0 # xor 1180238384Sjkim pshufd \$`3<<6`,$iv0,$inout0 1181238384Sjkim xorps $inout1,$in1 1182238384Sjkim pshufd \$`2<<6`,$iv0,$inout1 1183238384Sjkim movups $in0,($out) # store output 1184238384Sjkim xorps $inout2,$in2 1185238384Sjkim pshufd \$`1<<6`,$iv0,$inout2 1186238384Sjkim movups $in1,0x10($out) 1187238384Sjkim xorps $inout3,$in3 1188238384Sjkim movups $in2,0x20($out) 1189238384Sjkim xorps $inout4,$rndkey1 1190238384Sjkim movups $in3,0x30($out) 1191238384Sjkim xorps $inout5,$rndkey0 1192238384Sjkim movups $rndkey1,0x40($out) 1193238384Sjkim movups $rndkey0,0x50($out) 1194238384Sjkim lea 0x60($out),$out 1195238384Sjkim mov $rnds_,$rounds 1196238384Sjkim sub \$6,$len 1197238384Sjkim jnc .Lctr32_loop6 1198238384Sjkim 1199238384Sjkim add \$6,$len 1200238384Sjkim jz .Lctr32_done 1201238384Sjkim mov $key_,$key # restore $key 1202238384Sjkim lea 1($rounds,$rounds),$rounds # restore original value 1203238384Sjkim 1204238384Sjkim.Lctr32_tail: 1205238384Sjkim por $ivec,$inout0 1206238384Sjkim movups ($inp),$in0 1207238384Sjkim cmp \$2,$len 1208238384Sjkim jb .Lctr32_one 1209238384Sjkim 1210238384Sjkim por $ivec,$inout1 1211238384Sjkim movups 0x10($inp),$in1 
1212238384Sjkim je .Lctr32_two 1213238384Sjkim 1214238384Sjkim pshufd \$`3<<6`,$iv1,$inout3 1215238384Sjkim por $ivec,$inout2 1216238384Sjkim movups 0x20($inp),$in2 1217238384Sjkim cmp \$4,$len 1218238384Sjkim jb .Lctr32_three 1219238384Sjkim 1220238384Sjkim pshufd \$`2<<6`,$iv1,$inout4 1221238384Sjkim por $ivec,$inout3 1222238384Sjkim movups 0x30($inp),$in3 1223238384Sjkim je .Lctr32_four 1224238384Sjkim 1225238384Sjkim por $ivec,$inout4 1226238384Sjkim xorps $inout5,$inout5 1227238384Sjkim 1228238384Sjkim call _aesni_encrypt6 1229238384Sjkim 1230238384Sjkim movups 0x40($inp),$rndkey1 1231238384Sjkim xorps $inout0,$in0 1232238384Sjkim xorps $inout1,$in1 1233238384Sjkim movups $in0,($out) 1234238384Sjkim xorps $inout2,$in2 1235238384Sjkim movups $in1,0x10($out) 1236238384Sjkim xorps $inout3,$in3 1237238384Sjkim movups $in2,0x20($out) 1238238384Sjkim xorps $inout4,$rndkey1 1239238384Sjkim movups $in3,0x30($out) 1240238384Sjkim movups $rndkey1,0x40($out) 1241238384Sjkim jmp .Lctr32_done 1242238384Sjkim 1243238384Sjkim.align 16 1244238384Sjkim.Lctr32_one_shortcut: 1245238384Sjkim movups ($ivp),$inout0 1246238384Sjkim movups ($inp),$in0 1247238384Sjkim mov 240($key),$rounds # key->rounds 1248238384Sjkim.Lctr32_one: 1249238384Sjkim___ 1250238384Sjkim &aesni_generate1("enc",$key,$rounds); 1251238384Sjkim$code.=<<___; 1252238384Sjkim xorps $inout0,$in0 1253238384Sjkim movups $in0,($out) 1254238384Sjkim jmp .Lctr32_done 1255238384Sjkim 1256238384Sjkim.align 16 1257238384Sjkim.Lctr32_two: 1258238384Sjkim xorps $inout2,$inout2 1259238384Sjkim call _aesni_encrypt3 1260238384Sjkim xorps $inout0,$in0 1261238384Sjkim xorps $inout1,$in1 1262238384Sjkim movups $in0,($out) 1263238384Sjkim movups $in1,0x10($out) 1264238384Sjkim jmp .Lctr32_done 1265238384Sjkim 1266238384Sjkim.align 16 1267238384Sjkim.Lctr32_three: 1268238384Sjkim call _aesni_encrypt3 1269238384Sjkim xorps $inout0,$in0 1270238384Sjkim xorps $inout1,$in1 1271238384Sjkim movups $in0,($out) 1272238384Sjkim xorps 
$inout2,$in2 1273238384Sjkim movups $in1,0x10($out) 1274238384Sjkim movups $in2,0x20($out) 1275238384Sjkim jmp .Lctr32_done 1276238384Sjkim 1277238384Sjkim.align 16 1278238384Sjkim.Lctr32_four: 1279238384Sjkim call _aesni_encrypt4 1280238384Sjkim xorps $inout0,$in0 1281238384Sjkim xorps $inout1,$in1 1282238384Sjkim movups $in0,($out) 1283238384Sjkim xorps $inout2,$in2 1284238384Sjkim movups $in1,0x10($out) 1285238384Sjkim xorps $inout3,$in3 1286238384Sjkim movups $in2,0x20($out) 1287238384Sjkim movups $in3,0x30($out) 1288238384Sjkim 1289238384Sjkim.Lctr32_done: 1290238384Sjkim___ 1291238384Sjkim$code.=<<___ if ($win64); 1292238384Sjkim movaps 0x20(%rsp),%xmm6 1293238384Sjkim movaps 0x30(%rsp),%xmm7 1294238384Sjkim movaps 0x40(%rsp),%xmm8 1295238384Sjkim movaps 0x50(%rsp),%xmm9 1296238384Sjkim movaps 0x60(%rsp),%xmm10 1297238384Sjkim movaps 0x70(%rsp),%xmm11 1298238384Sjkim movaps 0x80(%rsp),%xmm12 1299238384Sjkim movaps 0x90(%rsp),%xmm13 1300238384Sjkim movaps 0xa0(%rsp),%xmm14 1301238384Sjkim movaps 0xb0(%rsp),%xmm15 1302238384Sjkim lea 0xc8(%rsp),%rsp 1303238384Sjkim.Lctr32_ret: 1304238384Sjkim___ 1305238384Sjkim$code.=<<___; 1306238384Sjkim ret 1307238384Sjkim.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks 1308238384Sjkim___ 1309238384Sjkim} 1310238384Sjkim 1311238384Sjkim###################################################################### 1312238384Sjkim# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1313238384Sjkim# const AES_KEY *key1, const AES_KEY *key2 1314238384Sjkim# const unsigned char iv[16]); 1315238384Sjkim# 1316238384Sjkim{ 1317238384Sjkimmy @tweak=map("%xmm$_",(10..15)); 1318238384Sjkimmy ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); 1319238384Sjkimmy ($key2,$ivp,$len_)=("%r8","%r9","%r9"); 1320238384Sjkimmy $frame_size = 0x68 + ($win64?160:0); 1321238384Sjkim 1322238384Sjkim$code.=<<___; 1323238384Sjkim.globl aesni_xts_encrypt 1324238384Sjkim.type aesni_xts_encrypt,\@function,6 1325238384Sjkim.align 16 
1326238384Sjkimaesni_xts_encrypt: 1327238384Sjkim lea -$frame_size(%rsp),%rsp 1328238384Sjkim___ 1329238384Sjkim$code.=<<___ if ($win64); 1330238384Sjkim movaps %xmm6,0x60(%rsp) 1331238384Sjkim movaps %xmm7,0x70(%rsp) 1332238384Sjkim movaps %xmm8,0x80(%rsp) 1333238384Sjkim movaps %xmm9,0x90(%rsp) 1334238384Sjkim movaps %xmm10,0xa0(%rsp) 1335238384Sjkim movaps %xmm11,0xb0(%rsp) 1336238384Sjkim movaps %xmm12,0xc0(%rsp) 1337238384Sjkim movaps %xmm13,0xd0(%rsp) 1338238384Sjkim movaps %xmm14,0xe0(%rsp) 1339238384Sjkim movaps %xmm15,0xf0(%rsp) 1340238384Sjkim.Lxts_enc_body: 1341238384Sjkim___ 1342238384Sjkim$code.=<<___; 1343238384Sjkim movups ($ivp),@tweak[5] # load clear-text tweak 1344238384Sjkim mov 240(%r8),$rounds # key2->rounds 1345238384Sjkim mov 240($key),$rnds_ # key1->rounds 1346238384Sjkim___ 1347238384Sjkim # generate the tweak 1348238384Sjkim &aesni_generate1("enc",$key2,$rounds,@tweak[5]); 1349238384Sjkim$code.=<<___; 1350238384Sjkim mov $key,$key_ # backup $key 1351238384Sjkim mov $rnds_,$rounds # backup $rounds 1352238384Sjkim mov $len,$len_ # backup $len 1353238384Sjkim and \$-16,$len 1354238384Sjkim 1355238384Sjkim movdqa .Lxts_magic(%rip),$twmask 1356238384Sjkim pxor $twtmp,$twtmp 1357238384Sjkim pcmpgtd @tweak[5],$twtmp # broadcast upper bits 1358238384Sjkim___ 1359238384Sjkim for ($i=0;$i<4;$i++) { 1360238384Sjkim $code.=<<___; 1361238384Sjkim pshufd \$0x13,$twtmp,$twres 1362238384Sjkim pxor $twtmp,$twtmp 1363238384Sjkim movdqa @tweak[5],@tweak[$i] 1364238384Sjkim paddq @tweak[5],@tweak[5] # psllq 1,$tweak 1365238384Sjkim pand $twmask,$twres # isolate carry and residue 1366238384Sjkim pcmpgtd @tweak[5],$twtmp # broadcat upper bits 1367238384Sjkim pxor $twres,@tweak[5] 1368238384Sjkim___ 1369238384Sjkim } 1370238384Sjkim$code.=<<___; 1371238384Sjkim sub \$16*6,$len 1372238384Sjkim jc .Lxts_enc_short 1373238384Sjkim 1374238384Sjkim shr \$1,$rounds 1375238384Sjkim sub \$1,$rounds 1376238384Sjkim mov $rounds,$rnds_ 1377238384Sjkim jmp 
.Lxts_enc_grandloop 1378238384Sjkim 1379238384Sjkim.align 16 1380238384Sjkim.Lxts_enc_grandloop: 1381238384Sjkim pshufd \$0x13,$twtmp,$twres 1382238384Sjkim movdqa @tweak[5],@tweak[4] 1383238384Sjkim paddq @tweak[5],@tweak[5] # psllq 1,$tweak 1384238384Sjkim movdqu `16*0`($inp),$inout0 # load input 1385238384Sjkim pand $twmask,$twres # isolate carry and residue 1386238384Sjkim movdqu `16*1`($inp),$inout1 1387238384Sjkim pxor $twres,@tweak[5] 1388238384Sjkim 1389238384Sjkim movdqu `16*2`($inp),$inout2 1390238384Sjkim pxor @tweak[0],$inout0 # input^=tweak 1391238384Sjkim movdqu `16*3`($inp),$inout3 1392238384Sjkim pxor @tweak[1],$inout1 1393238384Sjkim movdqu `16*4`($inp),$inout4 1394238384Sjkim pxor @tweak[2],$inout2 1395238384Sjkim movdqu `16*5`($inp),$inout5 1396238384Sjkim lea `16*6`($inp),$inp 1397238384Sjkim pxor @tweak[3],$inout3 1398238384Sjkim $movkey ($key_),$rndkey0 1399238384Sjkim pxor @tweak[4],$inout4 1400238384Sjkim pxor @tweak[5],$inout5 1401238384Sjkim 1402238384Sjkim # inline _aesni_encrypt6 and interleave first and last rounds 1403238384Sjkim # with own code... 
1404238384Sjkim $movkey 16($key_),$rndkey1 1405238384Sjkim pxor $rndkey0,$inout0 1406238384Sjkim pxor $rndkey0,$inout1 1407238384Sjkim movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks 1408238384Sjkim aesenc $rndkey1,$inout0 1409238384Sjkim lea 32($key_),$key 1410238384Sjkim pxor $rndkey0,$inout2 1411238384Sjkim movdqa @tweak[1],`16*1`(%rsp) 1412238384Sjkim aesenc $rndkey1,$inout1 1413238384Sjkim pxor $rndkey0,$inout3 1414238384Sjkim movdqa @tweak[2],`16*2`(%rsp) 1415238384Sjkim aesenc $rndkey1,$inout2 1416238384Sjkim pxor $rndkey0,$inout4 1417238384Sjkim movdqa @tweak[3],`16*3`(%rsp) 1418238384Sjkim aesenc $rndkey1,$inout3 1419238384Sjkim pxor $rndkey0,$inout5 1420238384Sjkim $movkey ($key),$rndkey0 1421238384Sjkim dec $rounds 1422238384Sjkim movdqa @tweak[4],`16*4`(%rsp) 1423238384Sjkim aesenc $rndkey1,$inout4 1424238384Sjkim movdqa @tweak[5],`16*5`(%rsp) 1425238384Sjkim aesenc $rndkey1,$inout5 1426238384Sjkim pxor $twtmp,$twtmp 1427238384Sjkim pcmpgtd @tweak[5],$twtmp 1428238384Sjkim jmp .Lxts_enc_loop6_enter 1429238384Sjkim 1430238384Sjkim.align 16 1431238384Sjkim.Lxts_enc_loop6: 1432238384Sjkim aesenc $rndkey1,$inout0 1433238384Sjkim aesenc $rndkey1,$inout1 1434238384Sjkim dec $rounds 1435238384Sjkim aesenc $rndkey1,$inout2 1436238384Sjkim aesenc $rndkey1,$inout3 1437238384Sjkim aesenc $rndkey1,$inout4 1438238384Sjkim aesenc $rndkey1,$inout5 1439238384Sjkim.Lxts_enc_loop6_enter: 1440238384Sjkim $movkey 16($key),$rndkey1 1441238384Sjkim aesenc $rndkey0,$inout0 1442238384Sjkim aesenc $rndkey0,$inout1 1443238384Sjkim lea 32($key),$key 1444238384Sjkim aesenc $rndkey0,$inout2 1445238384Sjkim aesenc $rndkey0,$inout3 1446238384Sjkim aesenc $rndkey0,$inout4 1447238384Sjkim aesenc $rndkey0,$inout5 1448238384Sjkim $movkey ($key),$rndkey0 1449238384Sjkim jnz .Lxts_enc_loop6 1450238384Sjkim 1451238384Sjkim pshufd \$0x13,$twtmp,$twres 1452238384Sjkim pxor $twtmp,$twtmp 1453238384Sjkim paddq @tweak[5],@tweak[5] # psllq 1,$tweak 1454238384Sjkim aesenc $rndkey1,$inout0 
1455238384Sjkim pand $twmask,$twres # isolate carry and residue 1456238384Sjkim aesenc $rndkey1,$inout1 1457238384Sjkim pcmpgtd @tweak[5],$twtmp # broadcast upper bits 1458238384Sjkim aesenc $rndkey1,$inout2 1459238384Sjkim pxor $twres,@tweak[5] 1460238384Sjkim aesenc $rndkey1,$inout3 1461238384Sjkim aesenc $rndkey1,$inout4 1462238384Sjkim aesenc $rndkey1,$inout5 1463238384Sjkim $movkey 16($key),$rndkey1 1464238384Sjkim 1465238384Sjkim pshufd \$0x13,$twtmp,$twres 1466238384Sjkim pxor $twtmp,$twtmp 1467238384Sjkim movdqa @tweak[5],@tweak[0] 1468238384Sjkim paddq @tweak[5],@tweak[5] # psllq 1,$tweak 1469238384Sjkim aesenc $rndkey0,$inout0 1470238384Sjkim pand $twmask,$twres # isolate carry and residue 1471238384Sjkim aesenc $rndkey0,$inout1 1472238384Sjkim pcmpgtd @tweak[5],$twtmp # broadcat upper bits 1473238384Sjkim aesenc $rndkey0,$inout2 1474238384Sjkim pxor $twres,@tweak[5] 1475238384Sjkim aesenc $rndkey0,$inout3 1476238384Sjkim aesenc $rndkey0,$inout4 1477238384Sjkim aesenc $rndkey0,$inout5 1478238384Sjkim $movkey 32($key),$rndkey0 1479238384Sjkim 1480238384Sjkim pshufd \$0x13,$twtmp,$twres 1481238384Sjkim pxor $twtmp,$twtmp 1482238384Sjkim movdqa @tweak[5],@tweak[1] 1483238384Sjkim paddq @tweak[5],@tweak[5] # psllq 1,$tweak 1484238384Sjkim aesenc $rndkey1,$inout0 1485238384Sjkim pand $twmask,$twres # isolate carry and residue 1486238384Sjkim aesenc $rndkey1,$inout1 1487238384Sjkim pcmpgtd @tweak[5],$twtmp # broadcat upper bits 1488238384Sjkim aesenc $rndkey1,$inout2 1489238384Sjkim pxor $twres,@tweak[5] 1490238384Sjkim aesenc $rndkey1,$inout3 1491238384Sjkim aesenc $rndkey1,$inout4 1492238384Sjkim aesenc $rndkey1,$inout5 1493238384Sjkim 1494238384Sjkim pshufd \$0x13,$twtmp,$twres 1495238384Sjkim pxor $twtmp,$twtmp 1496238384Sjkim movdqa @tweak[5],@tweak[2] 1497238384Sjkim paddq @tweak[5],@tweak[5] # psllq 1,$tweak 1498238384Sjkim aesenclast $rndkey0,$inout0 1499238384Sjkim pand $twmask,$twres # isolate carry and residue 1500238384Sjkim aesenclast 
$rndkey0,$inout1 1501238384Sjkim pcmpgtd @tweak[5],$twtmp # broadcat upper bits 1502238384Sjkim aesenclast $rndkey0,$inout2 1503238384Sjkim pxor $twres,@tweak[5] 1504238384Sjkim aesenclast $rndkey0,$inout3 1505238384Sjkim aesenclast $rndkey0,$inout4 1506238384Sjkim aesenclast $rndkey0,$inout5 1507238384Sjkim 1508238384Sjkim pshufd \$0x13,$twtmp,$twres 1509238384Sjkim pxor $twtmp,$twtmp 1510238384Sjkim movdqa @tweak[5],@tweak[3] 1511238384Sjkim paddq @tweak[5],@tweak[5] # psllq 1,$tweak 1512238384Sjkim xorps `16*0`(%rsp),$inout0 # output^=tweak 1513238384Sjkim pand $twmask,$twres # isolate carry and residue 1514238384Sjkim xorps `16*1`(%rsp),$inout1 1515238384Sjkim pcmpgtd @tweak[5],$twtmp # broadcat upper bits 1516238384Sjkim pxor $twres,@tweak[5] 1517238384Sjkim 1518238384Sjkim xorps `16*2`(%rsp),$inout2 1519238384Sjkim movups $inout0,`16*0`($out) # write output 1520238384Sjkim xorps `16*3`(%rsp),$inout3 1521238384Sjkim movups $inout1,`16*1`($out) 1522238384Sjkim xorps `16*4`(%rsp),$inout4 1523238384Sjkim movups $inout2,`16*2`($out) 1524238384Sjkim xorps `16*5`(%rsp),$inout5 1525238384Sjkim movups $inout3,`16*3`($out) 1526238384Sjkim mov $rnds_,$rounds # restore $rounds 1527238384Sjkim movups $inout4,`16*4`($out) 1528238384Sjkim movups $inout5,`16*5`($out) 1529238384Sjkim lea `16*6`($out),$out 1530238384Sjkim sub \$16*6,$len 1531238384Sjkim jnc .Lxts_enc_grandloop 1532238384Sjkim 1533238384Sjkim lea 3($rounds,$rounds),$rounds # restore original value 1534238384Sjkim mov $key_,$key # restore $key 1535238384Sjkim mov $rounds,$rnds_ # backup $rounds 1536238384Sjkim 1537238384Sjkim.Lxts_enc_short: 1538238384Sjkim add \$16*6,$len 1539238384Sjkim jz .Lxts_enc_done 1540238384Sjkim 1541238384Sjkim cmp \$0x20,$len 1542238384Sjkim jb .Lxts_enc_one 1543238384Sjkim je .Lxts_enc_two 1544238384Sjkim 1545238384Sjkim cmp \$0x40,$len 1546238384Sjkim jb .Lxts_enc_three 1547238384Sjkim je .Lxts_enc_four 1548238384Sjkim 1549238384Sjkim pshufd \$0x13,$twtmp,$twres 1550238384Sjkim 
movdqa @tweak[5],@tweak[4] 1551238384Sjkim paddq @tweak[5],@tweak[5] # psllq 1,$tweak 1552238384Sjkim movdqu ($inp),$inout0 1553238384Sjkim pand $twmask,$twres # isolate carry and residue 1554238384Sjkim movdqu 16*1($inp),$inout1 1555238384Sjkim pxor $twres,@tweak[5] 1556238384Sjkim 1557238384Sjkim movdqu 16*2($inp),$inout2 1558238384Sjkim pxor @tweak[0],$inout0 1559238384Sjkim movdqu 16*3($inp),$inout3 1560238384Sjkim pxor @tweak[1],$inout1 1561238384Sjkim movdqu 16*4($inp),$inout4 1562238384Sjkim lea 16*5($inp),$inp 1563238384Sjkim pxor @tweak[2],$inout2 1564238384Sjkim pxor @tweak[3],$inout3 1565238384Sjkim pxor @tweak[4],$inout4 1566238384Sjkim 1567238384Sjkim call _aesni_encrypt6 1568238384Sjkim 1569238384Sjkim xorps @tweak[0],$inout0 1570238384Sjkim movdqa @tweak[5],@tweak[0] 1571238384Sjkim xorps @tweak[1],$inout1 1572238384Sjkim xorps @tweak[2],$inout2 1573238384Sjkim movdqu $inout0,($out) 1574238384Sjkim xorps @tweak[3],$inout3 1575238384Sjkim movdqu $inout1,16*1($out) 1576238384Sjkim xorps @tweak[4],$inout4 1577238384Sjkim movdqu $inout2,16*2($out) 1578238384Sjkim movdqu $inout3,16*3($out) 1579238384Sjkim movdqu $inout4,16*4($out) 1580238384Sjkim lea 16*5($out),$out 1581238384Sjkim jmp .Lxts_enc_done 1582238384Sjkim 1583238384Sjkim.align 16 1584238384Sjkim.Lxts_enc_one: 1585238384Sjkim movups ($inp),$inout0 1586238384Sjkim lea 16*1($inp),$inp 1587238384Sjkim xorps @tweak[0],$inout0 1588238384Sjkim___ 1589238384Sjkim &aesni_generate1("enc",$key,$rounds); 1590238384Sjkim$code.=<<___; 1591238384Sjkim xorps @tweak[0],$inout0 1592238384Sjkim movdqa @tweak[1],@tweak[0] 1593238384Sjkim movups $inout0,($out) 1594238384Sjkim lea 16*1($out),$out 1595238384Sjkim jmp .Lxts_enc_done 1596238384Sjkim 1597238384Sjkim.align 16 1598238384Sjkim.Lxts_enc_two: 1599238384Sjkim movups ($inp),$inout0 1600238384Sjkim movups 16($inp),$inout1 1601238384Sjkim lea 32($inp),$inp 1602238384Sjkim xorps @tweak[0],$inout0 1603238384Sjkim xorps @tweak[1],$inout1 1604238384Sjkim 
1605238384Sjkim call _aesni_encrypt3 1606238384Sjkim 1607238384Sjkim xorps @tweak[0],$inout0 1608238384Sjkim movdqa @tweak[2],@tweak[0] 1609238384Sjkim xorps @tweak[1],$inout1 1610238384Sjkim movups $inout0,($out) 1611238384Sjkim movups $inout1,16*1($out) 1612238384Sjkim lea 16*2($out),$out 1613238384Sjkim jmp .Lxts_enc_done 1614238384Sjkim 1615238384Sjkim.align 16 1616238384Sjkim.Lxts_enc_three: 1617238384Sjkim movups ($inp),$inout0 1618238384Sjkim movups 16*1($inp),$inout1 1619238384Sjkim movups 16*2($inp),$inout2 1620238384Sjkim lea 16*3($inp),$inp 1621238384Sjkim xorps @tweak[0],$inout0 1622238384Sjkim xorps @tweak[1],$inout1 1623238384Sjkim xorps @tweak[2],$inout2 1624238384Sjkim 1625238384Sjkim call _aesni_encrypt3 1626238384Sjkim 1627238384Sjkim xorps @tweak[0],$inout0 1628238384Sjkim movdqa @tweak[3],@tweak[0] 1629238384Sjkim xorps @tweak[1],$inout1 1630238384Sjkim xorps @tweak[2],$inout2 1631238384Sjkim movups $inout0,($out) 1632238384Sjkim movups $inout1,16*1($out) 1633238384Sjkim movups $inout2,16*2($out) 1634238384Sjkim lea 16*3($out),$out 1635238384Sjkim jmp .Lxts_enc_done 1636238384Sjkim 1637238384Sjkim.align 16 1638238384Sjkim.Lxts_enc_four: 1639238384Sjkim movups ($inp),$inout0 1640238384Sjkim movups 16*1($inp),$inout1 1641238384Sjkim movups 16*2($inp),$inout2 1642238384Sjkim xorps @tweak[0],$inout0 1643238384Sjkim movups 16*3($inp),$inout3 1644238384Sjkim lea 16*4($inp),$inp 1645238384Sjkim xorps @tweak[1],$inout1 1646238384Sjkim xorps @tweak[2],$inout2 1647238384Sjkim xorps @tweak[3],$inout3 1648238384Sjkim 1649238384Sjkim call _aesni_encrypt4 1650238384Sjkim 1651238384Sjkim xorps @tweak[0],$inout0 1652238384Sjkim movdqa @tweak[5],@tweak[0] 1653238384Sjkim xorps @tweak[1],$inout1 1654238384Sjkim xorps @tweak[2],$inout2 1655238384Sjkim movups $inout0,($out) 1656238384Sjkim xorps @tweak[3],$inout3 1657238384Sjkim movups $inout1,16*1($out) 1658238384Sjkim movups $inout2,16*2($out) 1659238384Sjkim movups $inout3,16*3($out) 1660238384Sjkim lea 
16*4($out),$out
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_done:
	and	\$15,$len_
	jz	.Lxts_enc_ret
	mov	$len_,$len

.Lxts_enc_steal:
	movzb	($inp),%eax		# borrow $rounds ...
	movzb	-16($out),%ecx		# ... and $key
	lea	1($inp),$inp
	mov	%al,-16($out)
	mov	%cl,0($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_enc_steal

	sub	$len_,$out		# rewind $out
	mov	$key_,$key		# restore $key
	mov	$rnds_,$rounds		# restore $rounds

	movups	-16($out),$inout0
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,-16($out)

.Lxts_enc_ret:
___
$code.=<<___ if ($win64);
	movaps	0x60(%rsp),%xmm6
	movaps	0x70(%rsp),%xmm7
	movaps	0x80(%rsp),%xmm8
	movaps	0x90(%rsp),%xmm9
	movaps	0xa0(%rsp),%xmm10
	movaps	0xb0(%rsp),%xmm11
	movaps	0xc0(%rsp),%xmm12
	movaps	0xd0(%rsp),%xmm13
	movaps	0xe0(%rsp),%xmm14
	movaps	0xf0(%rsp),%xmm15
___
$code.=<<___;
	lea	$frame_size(%rsp),%rsp
.Lxts_enc_epilogue:
	ret
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___

# aesni_xts_decrypt: XTS-mode decryption, declared below as a six-argument
# function.  The code uses $inp, $out, $len, $key, $key2 and $ivp; their
# binding to argument registers is made earlier in the file (not shown
# here -- presumably the same one aesni_xts_encrypt uses).
#
# Outline of the generated code:
#  - the clear-text tweak is loaded from ($ivp) and encrypted with $key2;
#  - when $len is not a multiple of 16, one whole block is held back so
#    that it can be processed together with the trailing partial block;
#  - whole blocks are handled six at a time in .Lxts_dec_grandloop, with
#    the following tweak values computed in between the AES rounds;
#  - one to five leftover blocks are dispatched from .Lxts_dec_short;
#  - .Lxts_dec_done2 and .Lxts_dec_steal deal with the partial block.
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
	lea	-$frame_size(%rsp),%rsp
___
# Win64 only: %xmm6-%xmm15 are saved in the frame at 0x60..0xf0(%rsp) and
# reloaded from the same slots at .Lxts_dec_ret.
$code.=<<___ if ($win64);
	movaps	%xmm6,0x60(%rsp)
	movaps	%xmm7,0x70(%rsp)
	movaps	%xmm8,0x80(%rsp)
	movaps	%xmm9,0x90(%rsp)
	movaps	%xmm10,0xa0(%rsp)
	movaps	%xmm11,0xb0(%rsp)
	movaps	%xmm12,0xc0(%rsp)
	movaps	%xmm13,0xd0(%rsp)
	movaps	%xmm14,0xe0(%rsp)
	movaps	%xmm15,0xf0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	movups	($ivp),@tweak[5]		# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	movdqa	.Lxts_magic(%rip),$twmask
	pxor	$twtmp,$twtmp
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
___
    # Pre-compute the first four tweak values into @tweak[0..3]; @tweak[5]
    # keeps the running value.  Each step doubles @tweak[5] with paddq and
    # folds in the bits selected through $twmask (.Lxts_magic is defined
    # elsewhere; presumably it carries the reduction constant and the
    # carry between the two 64-bit halves -- confirm against its definition).
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[$i]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	pand	$twmask,$twres			# isolate carry and residue
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	pxor	$twres,@tweak[5]
___
    }
$code.=<<___;
	sub	\$16*6,$len
	jc	.Lxts_dec_short

	shr	\$1,$rounds
	sub	\$1,$rounds
	mov	$rounds,$rnds_
	jmp	.Lxts_dec_grandloop

.align	16
.Lxts_dec_grandloop:
	# Six blocks per pass.  tweak[0..3] are ready on entry; tweak[4] and
	# tweak[5] are completed while the input is being loaded.
	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	movdqu	`16*0`($inp),$inout0		# load input
	pand	$twmask,$twres			# isolate carry and residue
	movdqu	`16*1`($inp),$inout1
	pxor	$twres,@tweak[5]

	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[0],$inout0		# input^=tweak
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[2],$inout2
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp
	pxor	@tweak[3],$inout3
	$movkey	($key_),$rndkey0
	pxor	@tweak[4],$inout4
	pxor	@tweak[5],$inout5

	# inline _aesni_decrypt6 and interleave first and last rounds
	# with own code...
	$movkey	16($key_),$rndkey1
	pxor	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
	aesdec	$rndkey1,$inout0
	lea	32($key_),$key
	pxor	$rndkey0,$inout2
	movdqa	@tweak[1],`16*1`(%rsp)
	aesdec	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	movdqa	@tweak[2],`16*2`(%rsp)
	aesdec	$rndkey1,$inout2
	pxor	$rndkey0,$inout4
	movdqa	@tweak[3],`16*3`(%rsp)
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,$inout5
	$movkey	($key),$rndkey0
	dec	$rounds
	movdqa	@tweak[4],`16*4`(%rsp)
	aesdec	$rndkey1,$inout4
	movdqa	@tweak[5],`16*5`(%rsp)
	aesdec	$rndkey1,$inout5
	pxor	$twtmp,$twtmp
	pcmpgtd	@tweak[5],$twtmp
	jmp	.Lxts_dec_loop6_enter

.align	16
.Lxts_dec_loop6:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	dec	$rounds
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
.Lxts_dec_loop6_enter:
	$movkey	16($key),$rndkey1
	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	lea	32($key),$key
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	($key),$rndkey0
	jnz	.Lxts_dec_loop6

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	aesdec	$rndkey1,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	aesdec	$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	aesdec	$rndkey1,$inout2
	pxor	$twres,@tweak[5]
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key),$rndkey1

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	aesdec	$rndkey0,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	aesdec	$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	aesdec	$rndkey0,$inout2
	pxor	$twres,@tweak[5]
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	32($key),$rndkey0

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[1]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	aesdec	$rndkey1,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	aesdec	$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	aesdec	$rndkey1,$inout2
	pxor	$twres,@tweak[5]
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[2]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	aesdeclast	$rndkey0,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	aesdeclast	$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	aesdeclast	$rndkey0,$inout2
	pxor	$twres,@tweak[5]
	aesdeclast	$rndkey0,$inout3
	aesdeclast	$rndkey0,$inout4
	aesdeclast	$rndkey0,$inout5

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[3]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	xorps	`16*0`(%rsp),$inout0		# output^=tweak
	pand	$twmask,$twres			# isolate carry and residue
	xorps	`16*1`(%rsp),$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	pxor	$twres,@tweak[5]

	xorps	`16*2`(%rsp),$inout2
	movups	$inout0,`16*0`($out)		# write output
	xorps	`16*3`(%rsp),$inout3
	movups	$inout1,`16*1`($out)
	xorps	`16*4`(%rsp),$inout4
	movups	$inout2,`16*2`($out)
	xorps	`16*5`(%rsp),$inout5
	movups	$inout3,`16*3`($out)
	mov	$rnds_,$rounds			# restore $rounds
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop

	lea	3($rounds,$rounds),$rounds	# restore original value
	mov	$key_,$key			# restore $key
	mov	$rounds,$rnds_			# backup $rounds

.Lxts_dec_short:
	# Undo the six-block bias; zero means no whole blocks are left.
	add	\$16*6,$len
	jz	.Lxts_dec_done

	# Dispatch on one to five remaining whole blocks.
	cmp	\$0x20,$len
	jb	.Lxts_dec_one
	je	.Lxts_dec_two

	cmp	\$0x40,$len
	jb	.Lxts_dec_three
	je	.Lxts_dec_four

	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	movdqu	($inp),$inout0
	pand	$twmask,$twres			# isolate carry and residue
	movdqu	16*1($inp),$inout1
	pxor	$twres,@tweak[5]

	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	pxor	$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out
	pshufd	\$0x13,$twtmp,@tweak[1]	# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	# A partial block follows: tweak[0] keeps the current value and
	# tweak[1] receives the next one.
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)
	movups	$inout1,16*1($out)
	lea	16*2($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	movups	($inp),$inout0
	pand	$twmask,$twres			# isolate carry and residue
	movups	16*1($inp),$inout1
	pxor	$twres,@tweak[5]

	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	xorps	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)
	xorps	@tweak[3],$inout3
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	movups	$inout3,16*3($out)
	lea	16*4($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	# Partial final block: the held-back whole block is decrypted first,
	# using tweak[1]; the block rebuilt by the byte swap below is then
	# decrypted with tweak[0].
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	# Per byte: the trailing input byte replaces the byte at the head of
	# the block just written, and the displaced byte becomes part of the
	# final partial output.
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
___
$code.=<<___ if ($win64);
	movaps	0x60(%rsp),%xmm6
	movaps	0x70(%rsp),%xmm7
	movaps	0x80(%rsp),%xmm8
	movaps	0x90(%rsp),%xmm9
	movaps	0xa0(%rsp),%xmm10
	movaps	0xb0(%rsp),%xmm11
	movaps	0xc0(%rsp),%xmm12
	movaps	0xd0(%rsp),%xmm13
	movaps	0xe0(%rsp),%xmm14
	movaps	0xf0(%rsp),%xmm15
___
$code.=<<___;
	lea	$frame_size(%rsp),%rsp
.Lxts_dec_epilogue:
	ret
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
} }}

########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
{
my $reserved = $win64?0x40:-0x18;	# used in decrypt
$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
	test	$len,$len		# check length
# A zero length is a no-op: return at once without touching the IV.
	jz	.Lcbc_ret

	mov	240($key),$rnds_	# key->rounds
	mov	$key,$key_		# backup $key
	test	%r9d,%r9d		# 6th argument
	jz	.Lcbc_decrypt
#--------------------------- CBC ENCRYPT ------------------------------#
	movups	($ivp),$inout0		# load iv as initial state
	mov	$rnds_,$rounds
	cmp	\$16,$len
	jb	.Lcbc_enc_tail
	sub	\$16,$len
	jmp	.Lcbc_enc_loop
.align	16
.Lcbc_enc_loop:
	movups	($inp),$inout1		# load input
	lea	16($inp),$inp
	#xorps	$inout1,$inout0
___
	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
$code.=<<___;
	mov	$rnds_,$rounds		# restore $rounds
	mov	$key_,$key		# restore $key
	movups	$inout0,0($out)		# store output
	lea	16($out),$out
	sub	\$16,$len
	jnc	.Lcbc_enc_loop
	add	\$16,$len
	jnz	.Lcbc_enc_tail
	movups	$inout0,($ivp)
	jmp	.Lcbc_ret

.Lcbc_enc_tail:
	# Partial final block: copy the leftover bytes to the output buffer,
	# zero-fill up to 16 bytes, then run the loop once more over that
	# block in place.
	mov	$len,%rcx	# zaps $key
	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
	.long	0x9066A4F3	# rep movsb
	mov	\$16,%ecx	# zero tail
	sub	$len,%rcx
	xor	%eax,%eax
	.long	0x9066AAF3	# rep stosb
	lea	-16(%rdi),%rdi	# rewind $out by 1 block
	mov	$rnds_,$rounds	# restore $rounds
	mov	%rdi,%rsi	# $inp and $out are the same
	mov	$key_,$key	# restore $key
	xor	$len,$len	# len=0, i.e. exactly one (padded) block left
	jmp	.Lcbc_enc_loop	# one more spin
#--------------------------- CBC DECRYPT ------------------------------#
.align	16
.Lcbc_decrypt:
___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9 there; the
# matching restore is emitted after .Lcbc_dec_ret.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lcbc_decrypt_body:
___
$code.=<<___;
	movups	($ivp),$iv
	mov	$rnds_,$rounds
	cmp	\$0x70,$len
	jbe	.Lcbc_dec_tail
	shr	\$1,$rnds_
	sub	\$0x70,$len
	mov	$rnds_,$rounds
	movaps	$iv,$reserved(%rsp)
	jmp	.Lcbc_dec_loop8_enter
.align	16
.Lcbc_dec_loop8:
	# Eight blocks per pass.  The eighth output block of a pass is only
	# stored here, at the top of the next pass (or by the tail code).
	movaps	$rndkey0,$reserved(%rsp)	# save IV
	movups	$inout7,($out)
	lea	0x10($out),$out
.Lcbc_dec_loop8_enter:
	$movkey	($key),$rndkey0
	movups	($inp),$inout0			# load input
	movups	0x10($inp),$inout1
	$movkey	16($key),$rndkey1

	lea	32($key),$key
	movdqu	0x20($inp),$inout2
	xorps	$rndkey0,$inout0
	movdqu	0x30($inp),$inout3
	xorps	$rndkey0,$inout1
	movdqu	0x40($inp),$inout4
	aesdec	$rndkey1,$inout0
	pxor	$rndkey0,$inout2
	movdqu	0x50($inp),$inout5
	aesdec	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	movdqu	0x60($inp),$inout6
	aesdec	$rndkey1,$inout2
	pxor	$rndkey0,$inout4
	movdqu	0x70($inp),$inout7
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,$inout5
	dec	$rounds
	aesdec	$rndkey1,$inout4
	pxor	$rndkey0,$inout6
	aesdec	$rndkey1,$inout5
	pxor	$rndkey0,$inout7
	$movkey	($key),$rndkey0
	aesdec	$rndkey1,$inout6
	aesdec	$rndkey1,$inout7
	$movkey	16($key),$rndkey1

	# NOTE(review): .Ldec_loop8_enter is not defined in this part of the
	# file; presumably it enters the round loop of _aesni_decrypt8 - confirm.
	call	.Ldec_loop8_enter

	movups	($inp),$rndkey1		# re-load input
	movups	0x10($inp),$rndkey0
	xorps	$reserved(%rsp),$inout0	# ^= IV
	xorps	$rndkey1,$inout1
	movups	0x20($inp),$rndkey1
	xorps	$rndkey0,$inout2
	movups	0x30($inp),$rndkey0
	xorps	$rndkey1,$inout3
	movups	0x40($inp),$rndkey1
	xorps	$rndkey0,$inout4
	movups	0x50($inp),$rndkey0
	xorps	$rndkey1,$inout5
	movups	0x60($inp),$rndkey1
	xorps	$rndkey0,$inout6
	movups	0x70($inp),$rndkey0	# IV
	xorps	$rndkey1,$inout7
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout4,0x40($out)
	mov	$key_,$key		# restore $key
	movups	$inout5,0x50($out)
	lea	0x80($inp),$inp
	movups	$inout6,0x60($out)
	lea	0x70($out),$out
	sub	\$0x80,$len
	ja	.Lcbc_dec_loop8

	movaps	$inout7,$inout0
	movaps	$rndkey0,$iv
	add	\$0x70,$len
	jle	.Lcbc_dec_tail_collected
	movups	$inout0,($out)
	lea	1($rnds_,$rnds_),$rounds
	lea	0x10($out),$out
.Lcbc_dec_tail:
	# At most 0x70 bytes are left: load blocks one at a time and branch
	# to the routine that matches how many are needed.
	movups	($inp),$inout0
	movaps	$inout0,$in0
	cmp	\$0x10,$len
	jbe	.Lcbc_dec_one

	movups	0x10($inp),$inout1
	movaps	$inout1,$in1
	cmp	\$0x20,$len
	jbe	.Lcbc_dec_two

	movups	0x20($inp),$inout2
	movaps	$inout2,$in2
	cmp	\$0x30,$len
	jbe	.Lcbc_dec_three

	movups	0x30($inp),$inout3
	cmp	\$0x40,$len
	jbe	.Lcbc_dec_four

	movups	0x40($inp),$inout4
	cmp	\$0x50,$len
	jbe	.Lcbc_dec_five

	movups	0x50($inp),$inout5
	cmp	\$0x60,$len
	jbe	.Lcbc_dec_six

	movups	0x60($inp),$inout6
	movaps	$iv,$reserved(%rsp)	# save IV
	call	_aesni_decrypt8
	movups	($inp),$rndkey1
	movups	0x10($inp),$rndkey0
	xorps	$reserved(%rsp),$inout0	# ^= IV
	xorps	$rndkey1,$inout1
	movups	0x20($inp),$rndkey1
	xorps	$rndkey0,$inout2
	movups	0x30($inp),$rndkey0
	xorps	$rndkey1,$inout3
	movups	0x40($inp),$rndkey1
	xorps	$rndkey0,$inout4
	movups	0x50($inp),$rndkey0
	xorps	$rndkey1,$inout5
	movups	0x60($inp),$iv		# IV
	xorps	$rndkey0,$inout6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	lea	0x60($out),$out
	movaps	$inout6,$inout0
	sub	\$0x70,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_one:
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	$iv,$inout0
	movaps	$in0,$iv
	sub	\$0x10,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_two:
	xorps	$inout2,$inout2
	call	_aesni_decrypt3
	xorps	$iv,$inout0
	xorps	$in0,$inout1
	movups	$inout0,($out)
	movaps	$in1,$iv
	movaps	$inout1,$inout0
	lea	0x10($out),$out
	sub	\$0x20,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_three:
	call	_aesni_decrypt3
	xorps	$iv,$inout0
	xorps	$in0,$inout1
	movups	$inout0,($out)
	xorps	$in1,$inout2
	movups	$inout1,0x10($out)
	movaps	$in2,$iv
	movaps	$inout2,$inout0
	lea	0x20($out),$out
	sub	\$0x30,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_four:
	call	_aesni_decrypt4
	xorps	$iv,$inout0
	movups	0x30($inp),$iv
	xorps	$in0,$inout1
	movups	$inout0,($out)
	xorps	$in1,$inout2
	movups	$inout1,0x10($out)
	xorps	$in2,$inout3
	movups	$inout2,0x20($out)
	movaps	$inout3,$inout0
	lea	0x30($out),$out
	sub	\$0x40,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_five:
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	movups	0x10($inp),$rndkey1
	movups	0x20($inp),$rndkey0
	xorps	$iv,$inout0
	xorps	$in0,$inout1
	xorps	$rndkey1,$inout2
	movups	0x30($inp),$rndkey1
	xorps	$rndkey0,$inout3
	movups	0x40($inp),$iv
	xorps	$rndkey1,$inout4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	lea	0x40($out),$out
	movaps	$inout4,$inout0
	sub	\$0x50,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_six:
	call	_aesni_decrypt6
	movups	0x10($inp),$rndkey1
	movups	0x20($inp),$rndkey0
	xorps	$iv,$inout0
	xorps	$in0,$inout1
	xorps	$rndkey1,$inout2
	movups	0x30($inp),$rndkey1
	xorps	$rndkey0,$inout3
	movups	0x40($inp),$rndkey0
	xorps	$rndkey1,$inout4
	movups	0x50($inp),$iv
	xorps	$rndkey0,$inout5
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	lea	0x50($out),$out
	movaps	$inout5,$inout0
	sub	\$0x60,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_tail_collected:
	# Every path arrives with the last output block still in a register
	# (not yet stored); the updated IV is written back first.
	and	\$15,$len
	movups	$iv,($ivp)
	jnz	.Lcbc_dec_tail_partial
	movups	$inout0,($out)
	jmp	.Lcbc_dec_ret
.align	16
.Lcbc_dec_tail_partial:
	# The last block is spilled to the stack and copied out byte-wise.
	# NOTE(review): the count is 16 minus the masked length - confirm that
	# this is the intended number of bytes for a partial block.
	movaps	$inout0,$reserved(%rsp)
	mov	\$16,%rcx
	mov	$out,%rdi
	sub	$len,%rcx
	lea	$reserved(%rsp),%rsi
	.long	0x9066A4F3	# rep movsb

.Lcbc_dec_ret:
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
___
2469238384Sjkim$code.=<<___; 2470238384Sjkim.Lcbc_ret: 2471238384Sjkim ret 2472238384Sjkim.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 2473238384Sjkim___ 2474238384Sjkim} 2475238384Sjkim# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, 2476238384Sjkim# int bits, AES_KEY *key) 2477238384Sjkim{ my ($inp,$bits,$key) = @_4args; 2478238384Sjkim $bits =~ s/%r/%e/; 2479238384Sjkim 2480238384Sjkim$code.=<<___; 2481238384Sjkim.globl ${PREFIX}_set_decrypt_key 2482238384Sjkim.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent 2483238384Sjkim.align 16 2484238384Sjkim${PREFIX}_set_decrypt_key: 2485238384Sjkim .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 2486238384Sjkim call __aesni_set_encrypt_key 2487238384Sjkim shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 2488238384Sjkim test %eax,%eax 2489238384Sjkim jnz .Ldec_key_ret 2490238384Sjkim lea 16($key,$bits),$inp # points at the end of key schedule 2491238384Sjkim 2492238384Sjkim $movkey ($key),%xmm0 # just swap 2493238384Sjkim $movkey ($inp),%xmm1 2494238384Sjkim $movkey %xmm0,($inp) 2495238384Sjkim $movkey %xmm1,($key) 2496238384Sjkim lea 16($key),$key 2497238384Sjkim lea -16($inp),$inp 2498238384Sjkim 2499238384Sjkim.Ldec_key_inverse: 2500238384Sjkim $movkey ($key),%xmm0 # swap and inverse 2501238384Sjkim $movkey ($inp),%xmm1 2502238384Sjkim aesimc %xmm0,%xmm0 2503238384Sjkim aesimc %xmm1,%xmm1 2504238384Sjkim lea 16($key),$key 2505238384Sjkim lea -16($inp),$inp 2506238384Sjkim $movkey %xmm0,16($inp) 2507238384Sjkim $movkey %xmm1,-16($key) 2508238384Sjkim cmp $key,$inp 2509238384Sjkim ja .Ldec_key_inverse 2510238384Sjkim 2511238384Sjkim $movkey ($key),%xmm0 # inverse middle 2512238384Sjkim aesimc %xmm0,%xmm0 2513238384Sjkim $movkey %xmm0,($inp) 2514238384Sjkim.Ldec_key_ret: 2515238384Sjkim add \$8,%rsp 2516238384Sjkim ret 2517238384Sjkim.LSEH_end_set_decrypt_key: 2518238384Sjkim.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key 2519238384Sjkim___ 2520238384Sjkim 2521238384Sjkim# This is based on 
submission by 2522238384Sjkim# 2523238384Sjkim# Huang Ying <ying.huang@intel.com> 2524238384Sjkim# Vinodh Gopal <vinodh.gopal@intel.com> 2525238384Sjkim# Kahraman Akdemir 2526238384Sjkim# 2527238384Sjkim# Aggressively optimized with respect to aeskeygenassist's critical path 2528238384Sjkim# and is contained in %xmm0-5 to meet Win64 ABI requirement. 2529238384Sjkim# 2530238384Sjkim$code.=<<___; 2531238384Sjkim.globl ${PREFIX}_set_encrypt_key 2532238384Sjkim.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 2533238384Sjkim.align 16 2534238384Sjkim${PREFIX}_set_encrypt_key: 2535238384Sjkim__aesni_set_encrypt_key: 2536238384Sjkim .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 2537238384Sjkim mov \$-1,%rax 2538238384Sjkim test $inp,$inp 2539238384Sjkim jz .Lenc_key_ret 2540238384Sjkim test $key,$key 2541238384Sjkim jz .Lenc_key_ret 2542238384Sjkim 2543238384Sjkim movups ($inp),%xmm0 # pull first 128 bits of *userKey 2544238384Sjkim xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 2545238384Sjkim lea 16($key),%rax 2546238384Sjkim cmp \$256,$bits 2547238384Sjkim je .L14rounds 2548238384Sjkim cmp \$192,$bits 2549238384Sjkim je .L12rounds 2550238384Sjkim cmp \$128,$bits 2551238384Sjkim jne .Lbad_keybits 2552238384Sjkim 2553238384Sjkim.L10rounds: 2554238384Sjkim mov \$9,$bits # 10 rounds for 128-bit key 2555238384Sjkim $movkey %xmm0,($key) # round 0 2556238384Sjkim aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 2557238384Sjkim call .Lkey_expansion_128_cold 2558238384Sjkim aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 2559238384Sjkim call .Lkey_expansion_128 2560238384Sjkim aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 2561238384Sjkim call .Lkey_expansion_128 2562238384Sjkim aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 2563238384Sjkim call .Lkey_expansion_128 2564238384Sjkim aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 2565238384Sjkim call .Lkey_expansion_128 2566238384Sjkim aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 2567238384Sjkim call .Lkey_expansion_128 2568238384Sjkim aeskeygenassist 
\$0x40,%xmm0,%xmm1 # round 7 2569238384Sjkim call .Lkey_expansion_128 2570238384Sjkim aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 2571238384Sjkim call .Lkey_expansion_128 2572238384Sjkim aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 2573238384Sjkim call .Lkey_expansion_128 2574238384Sjkim aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 2575238384Sjkim call .Lkey_expansion_128 2576238384Sjkim $movkey %xmm0,(%rax) 2577238384Sjkim mov $bits,80(%rax) # 240(%rdx) 2578238384Sjkim xor %eax,%eax 2579238384Sjkim jmp .Lenc_key_ret 2580238384Sjkim 2581238384Sjkim.align 16 2582238384Sjkim.L12rounds: 2583238384Sjkim movq 16($inp),%xmm2 # remaining 1/3 of *userKey 2584238384Sjkim mov \$11,$bits # 12 rounds for 192 2585238384Sjkim $movkey %xmm0,($key) # round 0 2586238384Sjkim aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 2587238384Sjkim call .Lkey_expansion_192a_cold 2588238384Sjkim aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 2589238384Sjkim call .Lkey_expansion_192b 2590238384Sjkim aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 2591238384Sjkim call .Lkey_expansion_192a 2592238384Sjkim aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 2593238384Sjkim call .Lkey_expansion_192b 2594238384Sjkim aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 2595238384Sjkim call .Lkey_expansion_192a 2596238384Sjkim aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 2597238384Sjkim call .Lkey_expansion_192b 2598238384Sjkim aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 2599238384Sjkim call .Lkey_expansion_192a 2600238384Sjkim aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 2601238384Sjkim call .Lkey_expansion_192b 2602238384Sjkim $movkey %xmm0,(%rax) 2603238384Sjkim mov $bits,48(%rax) # 240(%rdx) 2604238384Sjkim xor %rax, %rax 2605238384Sjkim jmp .Lenc_key_ret 2606238384Sjkim 2607238384Sjkim.align 16 2608238384Sjkim.L14rounds: 2609238384Sjkim movups 16($inp),%xmm2 # remaning half of *userKey 2610238384Sjkim mov \$13,$bits # 14 rounds for 256 2611238384Sjkim lea 16(%rax),%rax 2612238384Sjkim $movkey 
%xmm0,($key) # round 0 2613238384Sjkim $movkey %xmm2,16($key) # round 1 2614238384Sjkim aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 2615238384Sjkim call .Lkey_expansion_256a_cold 2616238384Sjkim aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 2617238384Sjkim call .Lkey_expansion_256b 2618238384Sjkim aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 2619238384Sjkim call .Lkey_expansion_256a 2620238384Sjkim aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 2621238384Sjkim call .Lkey_expansion_256b 2622238384Sjkim aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 2623238384Sjkim call .Lkey_expansion_256a 2624238384Sjkim aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 2625238384Sjkim call .Lkey_expansion_256b 2626238384Sjkim aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 2627238384Sjkim call .Lkey_expansion_256a 2628238384Sjkim aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 2629238384Sjkim call .Lkey_expansion_256b 2630238384Sjkim aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 2631238384Sjkim call .Lkey_expansion_256a 2632238384Sjkim aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 2633238384Sjkim call .Lkey_expansion_256b 2634238384Sjkim aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 2635238384Sjkim call .Lkey_expansion_256a 2636238384Sjkim aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 2637238384Sjkim call .Lkey_expansion_256b 2638238384Sjkim aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 2639238384Sjkim call .Lkey_expansion_256a 2640238384Sjkim $movkey %xmm0,(%rax) 2641238384Sjkim mov $bits,16(%rax) # 240(%rdx) 2642238384Sjkim xor %rax,%rax 2643238384Sjkim jmp .Lenc_key_ret 2644238384Sjkim 2645238384Sjkim.align 16 2646238384Sjkim.Lbad_keybits: 2647238384Sjkim mov \$-2,%rax 2648238384Sjkim.Lenc_key_ret: 2649238384Sjkim add \$8,%rsp 2650238384Sjkim ret 2651238384Sjkim.LSEH_end_set_encrypt_key: 2652238384Sjkim 2653238384Sjkim.align 16 2654238384Sjkim.Lkey_expansion_128: 2655238384Sjkim $movkey %xmm0,(%rax) 2656238384Sjkim lea 16(%rax),%rax 2657238384Sjkim.Lkey_expansion_128_cold: 2658238384Sjkim shufps 
\$0b00010000,%xmm0,%xmm4 2659238384Sjkim xorps %xmm4, %xmm0 2660238384Sjkim shufps \$0b10001100,%xmm0,%xmm4 2661238384Sjkim xorps %xmm4, %xmm0 2662238384Sjkim shufps \$0b11111111,%xmm1,%xmm1 # critical path 2663238384Sjkim xorps %xmm1,%xmm0 2664238384Sjkim ret 2665238384Sjkim 2666238384Sjkim.align 16 2667238384Sjkim.Lkey_expansion_192a: 2668238384Sjkim $movkey %xmm0,(%rax) 2669238384Sjkim lea 16(%rax),%rax 2670238384Sjkim.Lkey_expansion_192a_cold: 2671238384Sjkim movaps %xmm2, %xmm5 2672238384Sjkim.Lkey_expansion_192b_warm: 2673238384Sjkim shufps \$0b00010000,%xmm0,%xmm4 2674238384Sjkim movdqa %xmm2,%xmm3 2675238384Sjkim xorps %xmm4,%xmm0 2676238384Sjkim shufps \$0b10001100,%xmm0,%xmm4 2677238384Sjkim pslldq \$4,%xmm3 2678238384Sjkim xorps %xmm4,%xmm0 2679238384Sjkim pshufd \$0b01010101,%xmm1,%xmm1 # critical path 2680238384Sjkim pxor %xmm3,%xmm2 2681238384Sjkim pxor %xmm1,%xmm0 2682238384Sjkim pshufd \$0b11111111,%xmm0,%xmm3 2683238384Sjkim pxor %xmm3,%xmm2 2684238384Sjkim ret 2685238384Sjkim 2686238384Sjkim.align 16 2687238384Sjkim.Lkey_expansion_192b: 2688238384Sjkim movaps %xmm0,%xmm3 2689238384Sjkim shufps \$0b01000100,%xmm0,%xmm5 2690238384Sjkim $movkey %xmm5,(%rax) 2691238384Sjkim shufps \$0b01001110,%xmm2,%xmm3 2692238384Sjkim $movkey %xmm3,16(%rax) 2693238384Sjkim lea 32(%rax),%rax 2694238384Sjkim jmp .Lkey_expansion_192b_warm 2695238384Sjkim 2696238384Sjkim.align 16 2697238384Sjkim.Lkey_expansion_256a: 2698238384Sjkim $movkey %xmm2,(%rax) 2699238384Sjkim lea 16(%rax),%rax 2700238384Sjkim.Lkey_expansion_256a_cold: 2701238384Sjkim shufps \$0b00010000,%xmm0,%xmm4 2702238384Sjkim xorps %xmm4,%xmm0 2703238384Sjkim shufps \$0b10001100,%xmm0,%xmm4 2704238384Sjkim xorps %xmm4,%xmm0 2705238384Sjkim shufps \$0b11111111,%xmm1,%xmm1 # critical path 2706238384Sjkim xorps %xmm1,%xmm0 2707238384Sjkim ret 2708238384Sjkim 2709238384Sjkim.align 16 2710238384Sjkim.Lkey_expansion_256b: 2711238384Sjkim $movkey %xmm0,(%rax) 2712238384Sjkim lea 16(%rax),%rax 2713238384Sjkim 
2714238384Sjkim shufps \$0b00010000,%xmm2,%xmm4 2715238384Sjkim xorps %xmm4,%xmm2 2716238384Sjkim shufps \$0b10001100,%xmm2,%xmm4 2717238384Sjkim xorps %xmm4,%xmm2 2718238384Sjkim shufps \$0b10101010,%xmm1,%xmm1 # critical path 2719238384Sjkim xorps %xmm1,%xmm2 2720238384Sjkim ret 2721238384Sjkim.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 2722238384Sjkim.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 2723238384Sjkim___ 2724238384Sjkim} 2725238384Sjkim 2726238384Sjkim$code.=<<___; 2727238384Sjkim.align 64 2728238384Sjkim.Lbswap_mask: 2729238384Sjkim .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 2730238384Sjkim.Lincrement32: 2731238384Sjkim .long 6,6,6,0 2732238384Sjkim.Lincrement64: 2733238384Sjkim .long 1,0,0,0 2734238384Sjkim.Lxts_magic: 2735238384Sjkim .long 0x87,0,1,0 2736238384Sjkim 2737238384Sjkim.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 2738238384Sjkim.align 64 2739238384Sjkim___ 2740238384Sjkim 2741238384Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2742238384Sjkim# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2743238384Sjkimif ($win64) { 2744238384Sjkim$rec="%rcx"; 2745238384Sjkim$frame="%rdx"; 2746238384Sjkim$context="%r8"; 2747238384Sjkim$disp="%r9"; 2748238384Sjkim 2749238384Sjkim$code.=<<___; 2750238384Sjkim.extern __imp_RtlVirtualUnwind 2751238384Sjkim___ 2752238384Sjkim$code.=<<___ if ($PREFIX eq "aesni"); 2753273399Sdelphij.type ecb_ccm64_se_handler,\@abi-omnipotent 2754238384Sjkim.align 16 2755273399Sdelphijecb_ccm64_se_handler: 2756238384Sjkim push %rsi 2757238384Sjkim push %rdi 2758238384Sjkim push %rbx 2759238384Sjkim push %rbp 2760238384Sjkim push %r12 2761238384Sjkim push %r13 2762238384Sjkim push %r14 2763238384Sjkim push %r15 2764238384Sjkim pushfq 2765238384Sjkim sub \$64,%rsp 2766238384Sjkim 2767238384Sjkim mov 120($context),%rax # pull context->Rax 2768238384Sjkim mov 248($context),%rbx # pull context->Rip 2769238384Sjkim 2770238384Sjkim mov 8($disp),%rsi # 
disp->ImageBase 2771238384Sjkim mov 56($disp),%r11 # disp->HandlerData 2772238384Sjkim 2773238384Sjkim mov 0(%r11),%r10d # HandlerData[0] 2774238384Sjkim lea (%rsi,%r10),%r10 # prologue label 2775238384Sjkim cmp %r10,%rbx # context->Rip<prologue label 2776238384Sjkim jb .Lcommon_seh_tail 2777238384Sjkim 2778238384Sjkim mov 152($context),%rax # pull context->Rsp 2779238384Sjkim 2780238384Sjkim mov 4(%r11),%r10d # HandlerData[1] 2781238384Sjkim lea (%rsi,%r10),%r10 # epilogue label 2782238384Sjkim cmp %r10,%rbx # context->Rip>=epilogue label 2783238384Sjkim jae .Lcommon_seh_tail 2784238384Sjkim 2785238384Sjkim lea 0(%rax),%rsi # %xmm save area 2786238384Sjkim lea 512($context),%rdi # &context.Xmm6 2787238384Sjkim mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 2788238384Sjkim .long 0xa548f3fc # cld; rep movsq 2789238384Sjkim lea 0x58(%rax),%rax # adjust stack pointer 2790238384Sjkim 2791238384Sjkim jmp .Lcommon_seh_tail 2792273399Sdelphij.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler 2793238384Sjkim 2794238384Sjkim.type ctr32_se_handler,\@abi-omnipotent 2795238384Sjkim.align 16 2796238384Sjkimctr32_se_handler: 2797238384Sjkim push %rsi 2798238384Sjkim push %rdi 2799238384Sjkim push %rbx 2800238384Sjkim push %rbp 2801238384Sjkim push %r12 2802238384Sjkim push %r13 2803238384Sjkim push %r14 2804238384Sjkim push %r15 2805238384Sjkim pushfq 2806238384Sjkim sub \$64,%rsp 2807238384Sjkim 2808238384Sjkim mov 120($context),%rax # pull context->Rax 2809238384Sjkim mov 248($context),%rbx # pull context->Rip 2810238384Sjkim 2811238384Sjkim lea .Lctr32_body(%rip),%r10 2812238384Sjkim cmp %r10,%rbx # context->Rip<"prologue" label 2813238384Sjkim jb .Lcommon_seh_tail 2814238384Sjkim 2815238384Sjkim mov 152($context),%rax # pull context->Rsp 2816238384Sjkim 2817238384Sjkim lea .Lctr32_ret(%rip),%r10 2818238384Sjkim cmp %r10,%rbx 2819238384Sjkim jae .Lcommon_seh_tail 2820238384Sjkim 2821238384Sjkim lea 0x20(%rax),%rsi # %xmm save area 2822238384Sjkim lea 512($context),%rdi # 
&context.Xmm6 2823238384Sjkim mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 2824238384Sjkim .long 0xa548f3fc # cld; rep movsq 2825238384Sjkim lea 0xc8(%rax),%rax # adjust stack pointer 2826238384Sjkim 2827238384Sjkim jmp .Lcommon_seh_tail 2828238384Sjkim.size ctr32_se_handler,.-ctr32_se_handler 2829238384Sjkim 2830238384Sjkim.type xts_se_handler,\@abi-omnipotent 2831238384Sjkim.align 16 2832238384Sjkimxts_se_handler: 2833238384Sjkim push %rsi 2834238384Sjkim push %rdi 2835238384Sjkim push %rbx 2836238384Sjkim push %rbp 2837238384Sjkim push %r12 2838238384Sjkim push %r13 2839238384Sjkim push %r14 2840238384Sjkim push %r15 2841238384Sjkim pushfq 2842238384Sjkim sub \$64,%rsp 2843238384Sjkim 2844238384Sjkim mov 120($context),%rax # pull context->Rax 2845238384Sjkim mov 248($context),%rbx # pull context->Rip 2846238384Sjkim 2847238384Sjkim mov 8($disp),%rsi # disp->ImageBase 2848238384Sjkim mov 56($disp),%r11 # disp->HandlerData 2849238384Sjkim 2850238384Sjkim mov 0(%r11),%r10d # HandlerData[0] 2851238384Sjkim lea (%rsi,%r10),%r10 # prologue lable 2852238384Sjkim cmp %r10,%rbx # context->Rip<prologue label 2853238384Sjkim jb .Lcommon_seh_tail 2854238384Sjkim 2855238384Sjkim mov 152($context),%rax # pull context->Rsp 2856238384Sjkim 2857238384Sjkim mov 4(%r11),%r10d # HandlerData[1] 2858238384Sjkim lea (%rsi,%r10),%r10 # epilogue label 2859238384Sjkim cmp %r10,%rbx # context->Rip>=epilogue label 2860238384Sjkim jae .Lcommon_seh_tail 2861238384Sjkim 2862238384Sjkim lea 0x60(%rax),%rsi # %xmm save area 2863238384Sjkim lea 512($context),%rdi # & context.Xmm6 2864238384Sjkim mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 2865238384Sjkim .long 0xa548f3fc # cld; rep movsq 2866238384Sjkim lea 0x68+160(%rax),%rax # adjust stack pointer 2867238384Sjkim 2868238384Sjkim jmp .Lcommon_seh_tail 2869238384Sjkim.size xts_se_handler,.-xts_se_handler 2870238384Sjkim___ 2871238384Sjkim$code.=<<___; 2872238384Sjkim.type cbc_se_handler,\@abi-omnipotent 2873238384Sjkim.align 16 
2874238384Sjkimcbc_se_handler: 2875238384Sjkim push %rsi 2876238384Sjkim push %rdi 2877238384Sjkim push %rbx 2878238384Sjkim push %rbp 2879238384Sjkim push %r12 2880238384Sjkim push %r13 2881238384Sjkim push %r14 2882238384Sjkim push %r15 2883238384Sjkim pushfq 2884238384Sjkim sub \$64,%rsp 2885238384Sjkim 2886238384Sjkim mov 152($context),%rax # pull context->Rsp 2887238384Sjkim mov 248($context),%rbx # pull context->Rip 2888238384Sjkim 2889238384Sjkim lea .Lcbc_decrypt(%rip),%r10 2890238384Sjkim cmp %r10,%rbx # context->Rip<"prologue" label 2891238384Sjkim jb .Lcommon_seh_tail 2892238384Sjkim 2893238384Sjkim lea .Lcbc_decrypt_body(%rip),%r10 2894238384Sjkim cmp %r10,%rbx # context->Rip<cbc_decrypt_body 2895238384Sjkim jb .Lrestore_cbc_rax 2896238384Sjkim 2897238384Sjkim lea .Lcbc_ret(%rip),%r10 2898238384Sjkim cmp %r10,%rbx # context->Rip>="epilogue" label 2899238384Sjkim jae .Lcommon_seh_tail 2900238384Sjkim 2901238384Sjkim lea 0(%rax),%rsi # top of stack 2902238384Sjkim lea 512($context),%rdi # &context.Xmm6 2903238384Sjkim mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 2904238384Sjkim .long 0xa548f3fc # cld; rep movsq 2905238384Sjkim lea 0x58(%rax),%rax # adjust stack pointer 2906238384Sjkim jmp .Lcommon_seh_tail 2907238384Sjkim 2908238384Sjkim.Lrestore_cbc_rax: 2909238384Sjkim mov 120($context),%rax 2910238384Sjkim 2911238384Sjkim.Lcommon_seh_tail: 2912238384Sjkim mov 8(%rax),%rdi 2913238384Sjkim mov 16(%rax),%rsi 2914238384Sjkim mov %rax,152($context) # restore context->Rsp 2915238384Sjkim mov %rsi,168($context) # restore context->Rsi 2916238384Sjkim mov %rdi,176($context) # restore context->Rdi 2917238384Sjkim 2918238384Sjkim mov 40($disp),%rdi # disp->ContextRecord 2919238384Sjkim mov $context,%rsi # context 2920238384Sjkim mov \$154,%ecx # sizeof(CONTEXT) 2921238384Sjkim .long 0xa548f3fc # cld; rep movsq 2922238384Sjkim 2923238384Sjkim mov $disp,%rsi 2924238384Sjkim xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 2925238384Sjkim mov 8(%rsi),%rdx # arg2, 
disp->ImageBase 2926238384Sjkim mov 0(%rsi),%r8 # arg3, disp->ControlPc 2927238384Sjkim mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 2928238384Sjkim mov 40(%rsi),%r10 # disp->ContextRecord 2929238384Sjkim lea 56(%rsi),%r11 # &disp->HandlerData 2930238384Sjkim lea 24(%rsi),%r12 # &disp->EstablisherFrame 2931238384Sjkim mov %r10,32(%rsp) # arg5 2932238384Sjkim mov %r11,40(%rsp) # arg6 2933238384Sjkim mov %r12,48(%rsp) # arg7 2934238384Sjkim mov %rcx,56(%rsp) # arg8, (NULL) 2935238384Sjkim call *__imp_RtlVirtualUnwind(%rip) 2936238384Sjkim 2937238384Sjkim mov \$1,%eax # ExceptionContinueSearch 2938238384Sjkim add \$64,%rsp 2939238384Sjkim popfq 2940238384Sjkim pop %r15 2941238384Sjkim pop %r14 2942238384Sjkim pop %r13 2943238384Sjkim pop %r12 2944238384Sjkim pop %rbp 2945238384Sjkim pop %rbx 2946238384Sjkim pop %rdi 2947238384Sjkim pop %rsi 2948238384Sjkim ret 2949238384Sjkim.size cbc_se_handler,.-cbc_se_handler 2950238384Sjkim 2951238384Sjkim.section .pdata 2952238384Sjkim.align 4 2953238384Sjkim___ 2954238384Sjkim$code.=<<___ if ($PREFIX eq "aesni"); 2955238384Sjkim .rva .LSEH_begin_aesni_ecb_encrypt 2956238384Sjkim .rva .LSEH_end_aesni_ecb_encrypt 2957238384Sjkim .rva .LSEH_info_ecb 2958238384Sjkim 2959238384Sjkim .rva .LSEH_begin_aesni_ccm64_encrypt_blocks 2960238384Sjkim .rva .LSEH_end_aesni_ccm64_encrypt_blocks 2961238384Sjkim .rva .LSEH_info_ccm64_enc 2962238384Sjkim 2963238384Sjkim .rva .LSEH_begin_aesni_ccm64_decrypt_blocks 2964238384Sjkim .rva .LSEH_end_aesni_ccm64_decrypt_blocks 2965238384Sjkim .rva .LSEH_info_ccm64_dec 2966238384Sjkim 2967238384Sjkim .rva .LSEH_begin_aesni_ctr32_encrypt_blocks 2968238384Sjkim .rva .LSEH_end_aesni_ctr32_encrypt_blocks 2969238384Sjkim .rva .LSEH_info_ctr32 2970238384Sjkim 2971238384Sjkim .rva .LSEH_begin_aesni_xts_encrypt 2972238384Sjkim .rva .LSEH_end_aesni_xts_encrypt 2973238384Sjkim .rva .LSEH_info_xts_enc 2974238384Sjkim 2975238384Sjkim .rva .LSEH_begin_aesni_xts_decrypt 2976238384Sjkim .rva .LSEH_end_aesni_xts_decrypt 
2977238384Sjkim .rva .LSEH_info_xts_dec 2978238384Sjkim___ 2979238384Sjkim$code.=<<___; 2980238384Sjkim .rva .LSEH_begin_${PREFIX}_cbc_encrypt 2981238384Sjkim .rva .LSEH_end_${PREFIX}_cbc_encrypt 2982238384Sjkim .rva .LSEH_info_cbc 2983238384Sjkim 2984238384Sjkim .rva ${PREFIX}_set_decrypt_key 2985238384Sjkim .rva .LSEH_end_set_decrypt_key 2986238384Sjkim .rva .LSEH_info_key 2987238384Sjkim 2988238384Sjkim .rva ${PREFIX}_set_encrypt_key 2989238384Sjkim .rva .LSEH_end_set_encrypt_key 2990238384Sjkim .rva .LSEH_info_key 2991238384Sjkim.section .xdata 2992238384Sjkim.align 8 2993238384Sjkim___ 2994238384Sjkim$code.=<<___ if ($PREFIX eq "aesni"); 2995238384Sjkim.LSEH_info_ecb: 2996238384Sjkim .byte 9,0,0,0 2997273399Sdelphij .rva ecb_ccm64_se_handler 2998273399Sdelphij .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 2999238384Sjkim.LSEH_info_ccm64_enc: 3000238384Sjkim .byte 9,0,0,0 3001273399Sdelphij .rva ecb_ccm64_se_handler 3002238384Sjkim .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 3003238384Sjkim.LSEH_info_ccm64_dec: 3004238384Sjkim .byte 9,0,0,0 3005273399Sdelphij .rva ecb_ccm64_se_handler 3006238384Sjkim .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 3007238384Sjkim.LSEH_info_ctr32: 3008238384Sjkim .byte 9,0,0,0 3009238384Sjkim .rva ctr32_se_handler 3010238384Sjkim.LSEH_info_xts_enc: 3011238384Sjkim .byte 9,0,0,0 3012238384Sjkim .rva xts_se_handler 3013238384Sjkim .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3014238384Sjkim.LSEH_info_xts_dec: 3015238384Sjkim .byte 9,0,0,0 3016238384Sjkim .rva xts_se_handler 3017238384Sjkim .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3018238384Sjkim___ 3019238384Sjkim$code.=<<___; 3020238384Sjkim.LSEH_info_cbc: 3021238384Sjkim .byte 9,0,0,0 3022238384Sjkim .rva cbc_se_handler 3023238384Sjkim.LSEH_info_key: 3024238384Sjkim .byte 0x01,0x04,0x01,0x00 3025238384Sjkim .byte 0x04,0x02,0x00,0x00 # sub rsp,8 3026238384Sjkim___ 3027238384Sjkim} 3028238384Sjkim 3029238384Sjkimsub rex { 3030238384Sjkim 
# (body of sub rex) Append a REX prefix byte to the opcode list when either
# register number is 8 or above; for registers 0-7 nothing is emitted.
#	$_[0]	- reference to the array of opcode bytes being built
#	$dst	- register number placed in ModR/M bits 3-5 (adds 0x04, REX.R)
#	$src	- register number placed in ModR/M bits 0-2 (adds 0x01, REX.B)
    my $opcode=shift;		# plain array reference, no typeglob aliasing
    my ($dst,$src)=@_;
    my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,$rex|0x40	if($rex);
}

# Translate one AES-NI instruction, written in AT&T operand order (source
# first, destination last), into an equivalent ".byte" directive.
#	$line	- instruction text: mnemonic and operands, without any
#		  trailing comment
# Returns the ".byte\t..." string for
#	aeskeygenassist $imm,%xmmN,%xmmM
#	aesimc|aesenc|aesenclast|aesdec|aesdeclast %xmmN,%xmmM
# and returns $line unchanged for anything else, so that text this routine
# cannot encode is handed to the assembler instead of being dropped.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# immediates with a leading 0 (0x.., 0b.., octal) go through oct()
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: pass the text through untouched.  Returning
	# undef here would make the s///e caller replace the instruction
	# with an empty string, i.e. silently delete it from the output.
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# Post-process the accumulated template: first evaluate every `...` span
# as a Perl expression and substitute its value, then run every line that
# contains an aes* mnemonic followed by an %xmm operand through aesni().
# The second pattern also consumes the rest of the line after the last
# %xmm operand, so a trailing comment on such a line is not emitted.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Buffered write errors only surface when the handle is closed; fail loudly
# rather than leave truncated output behind with a success exit status.
close STDOUT or die "error closing STDOUT: $!";