#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and micro-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
# all CPU time is burnt in it...

# Usage: x86_64-gf2m.pl [flavour] [output]
# If the first argument contains a dot it is taken to be the output
# file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention and SEH tables are selected for nasm/masm/
# mingw64 flavours, or whenever the output file is a .asm file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator either next to this script or in the
# shared perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT is piped through the translator. The
# translator path and the output name are quoted so that directories
# containing spaces survive the shell, and a failure to start the pipe
# is reported instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register allocation for _mul_1x1. $a aliases $lo (%rax): the input
# operand arrives in the register that later holds the low result word.
($lo,$hi)=("%rax","%rdx");	$a=$lo;
($i0,$i1)=("%rsi","%rdi");
($t0,$t1)=("%rbx","%rcx");
($b,$mask)=("%rbp","%r8");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..14));	# %r9..%r14
($R,$Tx)=("%xmm0","%xmm1");

# _mul_1x1: builds a 16-entry table of multiples of a on the stack and
# consumes b four bits at a time; result is returned in $hi:$lo.
$code.=<<___;
.text

.type	_mul_1x1,\@abi-omnipotent
.align	16
_mul_1x1:
	sub	\$128+8,%rsp
	mov	\$-1,$a1
	lea	($a,$a),$i0
	shr	\$3,$a1
	lea	(,$a,4),$i1
	and	$a,$a1			# a1=a&0x1fffffffffffffff
	lea	(,$a,8),$a8
	sar	\$63,$a			# broadcast 63rd bit
	lea	($a1,$a1),$a2
	sar	\$63,$i0		# broadcast 62nd bit
	lea	(,$a1,4),$a4
	and	$b,$a
	sar	\$63,$i1		# broadcast 61st bit
	mov	$a,$hi			# $a is $lo
	shl	\$63,$lo
	and	$b,$i0
	shr	\$1,$hi
	mov	$i0,$t1
	shl	\$62,$i0
	and	$b,$i1
	shr	\$2,$t1
	xor	$i0,$lo
	mov	$i1,$t0
	shl	\$61,$i1
	xor	$t1,$hi
	shr	\$3,$t0
	xor	$i1,$lo
	xor	$t0,$hi

	mov	$a1,$a12
	movq	\$0,0(%rsp)		# tab[0]=0
	xor	$a2,$a12		# a1^a2
	mov	$a1,8(%rsp)		# tab[1]=a1
	mov	$a4,$a48
	mov	$a2,16(%rsp)		# tab[2]=a2
	xor	$a8,$a48		# a4^a8
	mov	$a12,24(%rsp)		# tab[3]=a1^a2

	xor	$a4,$a1
	mov	$a4,32(%rsp)		# tab[4]=a4
	xor	$a4,$a2
	mov	$a1,40(%rsp)		# tab[5]=a1^a4
	xor	$a4,$a12
	mov	$a2,48(%rsp)		# tab[6]=a2^a4
	xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
	xor	$a48,$a2		# a2^a4^a4^a8=a2^a8

	mov	$a8,64(%rsp)		# tab[8]=a8
	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
	mov	$a1,72(%rsp)		# tab[9]=a1^a8
	xor	$a4,$a1			# a1^a8^a4
	mov	$a2,80(%rsp)		# tab[10]=a2^a8
	xor	$a4,$a2			# a2^a8^a4
	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8

	xor	$a4,$a12		# a1^a2^a8^a4
	mov	$a48,96(%rsp)		# tab[12]=a4^a8
	mov	$mask,$i0
	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
	and	$b,$i0
	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
	shr	\$4,$b
	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
	mov	$mask,$i1
	and	$b,$i1
	shr	\$4,$b

	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
	mov	$mask,$i0
	and	$b,$i0
	shr	\$4,$b
___
# NOTE: this must stay a C-style loop on the package variable $n. The
# chunk emitted after the loop relies on $n being left at 8 so that its
# back-ticked shift counts evaluate to 60 and 4.
for ($n=1;$n<8;$n++) {
    $code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$mask,$i1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	and	$b,$i1
	movq	(%rsp,$i0,8),$Tx
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	pslldq	\$$n,$Tx
	mov	$mask,$i0
	shr	\$4,$b
	xor	$t0,$hi
	and	$b,$i0
	shr	\$4,$b
	pxor	$Tx,$R
___
}
$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	movq	$R,$i0
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	psrldq	\$8,$R
	xor	$t0,$hi
	movq	$R,$i1
	xor	$i0,$lo
	xor	$i1,$hi

	add	\$128+8,%rsp
	ret
.Lend_mul_1x1:
.size	_mul_1x1,.-_mul_1x1
___

# Argument registers of bn_GF2m_mul_2x2(r, a1, a0, b1, b0). On Win64 the
# fifth argument lives on the stack and is loaded explicitly below;
# %r10 is only the register it is loaded into on the vanilla path.
($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order

# Entry point: tests bit 33 of OPENSSL_ia32cap_P and takes the PCLMULQDQ
# path when it is set, otherwise falls back to three _mul_1x1 calls
# (Karatsuba).
$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@abi-omnipotent
.align	16
bn_GF2m_mul_2x2:
	mov	OPENSSL_ia32cap_P(%rip),%rax
	bt	\$33,%rax
	jnc	.Lvanilla_mul_2x2

	movq	$a1,%xmm0
	movq	$b1,%xmm1
	movq	$a0,%xmm2
___
$code.=<<___ if ($win64);
	movq	40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
	movq	$b0,%xmm3
___
$code.=<<___;
	movdqa	%xmm0,%xmm4
	movdqa	%xmm1,%xmm5
	pclmulqdq	\$0,%xmm1,%xmm0	# a1*b1
	pxor	%xmm2,%xmm4
	pxor	%xmm3,%xmm5
	pclmulqdq	\$0,%xmm3,%xmm2	# a0*b0
	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)*(b0+b1)
	xorps	%xmm0,%xmm4
	xorps	%xmm2,%xmm4		# (a0+a1)*(b0+b1)-a0*b0-a1*b1
	movdqa	%xmm4,%xmm5
	pslldq	\$8,%xmm4
	psrldq	\$8,%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm0
	movdqu	%xmm2,0($rp)
	movdqu	%xmm0,16($rp)
	ret

.align	16
.Lvanilla_mul_2x2:
	lea	-8*17(%rsp),%rsp
___
$code.=<<___ if ($win64);
	mov	`8*17+40`(%rsp),$b0
	mov	%rdi,8*15(%rsp)
	mov	%rsi,8*16(%rsp)
___
$code.=<<___;
	mov	%r14,8*10(%rsp)
	mov	%r13,8*11(%rsp)
	mov	%r12,8*12(%rsp)
	mov	%rbp,8*13(%rsp)
	mov	%rbx,8*14(%rsp)
.Lbody_mul_2x2:
	mov	$rp,32(%rsp)		# save the arguments
	mov	$a1,40(%rsp)
	mov	$a0,48(%rsp)
	mov	$b1,56(%rsp)
	mov	$b0,64(%rsp)

	mov	\$0xf,$mask
	mov	$a1,$a
	mov	$b1,$b
	call	_mul_1x1		# a1*b1
	mov	$lo,16(%rsp)
	mov	$hi,24(%rsp)

	mov	48(%rsp),$a
	mov	64(%rsp),$b
	call	_mul_1x1		# a0*b0
	mov	$lo,0(%rsp)
	mov	$hi,8(%rsp)

	mov	40(%rsp),$a
	mov	56(%rsp),$b
	xor	48(%rsp),$a
	xor	64(%rsp),$b
	call	_mul_1x1		# (a0+a1)*(b0+b1)
___
# Scratch registers that receive the two partial products saved on the
# stack while the Karatsuba middle term is folded in.
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
	mov	0(%rsp),@r[0]
	mov	8(%rsp),@r[1]
	mov	16(%rsp),@r[2]
	mov	24(%rsp),@r[3]
	mov	32(%rsp),%rbp

	xor	$hi,$lo
	xor	@r[1],$hi
	xor	@r[0],$lo
	mov	@r[0],0(%rbp)
	xor	@r[2],$hi
	mov	@r[3],24(%rbp)
	xor	@r[3],$lo
	xor	@r[3],$hi
	xor	$hi,$lo
	mov	$hi,16(%rbp)
	mov	$lo,8(%rbp)

	mov	8*10(%rsp),%r14
	mov	8*11(%rsp),%r13
	mov	8*12(%rsp),%r12
	mov	8*13(%rsp),%rbp
	mov	8*14(%rsp),%rbx
___
$code.=<<___ if ($win64);
	mov	8*15(%rsp),%rdi
	mov	8*16(%rsp),%rsi
___
$code.=<<___;
	lea	8*17(%rsp),%rsp
	ret
.Lend_mul_2x2:
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# Win64 structured-exception support: a language-specific handler that
# mimics the vanilla epilogue, followed by the .pdata/.xdata tables
# describing _mul_1x1 and the vanilla part of bn_GF2m_mul_2x2.
$code.=<<___;
.extern __imp_RtlVirtualUnwind

.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_prologue

	mov	8*10(%rax),%r14		# mimic epilogue
	mov	8*11(%rax),%r13
	mov	8*12(%rax),%r12
	mov	8*13(%rax),%rbp
	mov	8*14(%rax),%rbx
	mov	8*15(%rax),%rdi
	mov	8*16(%rax),%rsi

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lin_prologue:
	lea	8*17(%rax),%rax
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	_mul_1x1
	.rva	.Lend_mul_1x1
	.rva	.LSEH_info_1x1

	.rva	.Lvanilla_mul_2x2
	.rva	.Lend_mul_2x2
	.rva	.LSEH_info_2x2
.section	.xdata
.align	8
.LSEH_info_1x1:
	.byte	0x01,0x07,0x02,0x00
	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
.LSEH_info_2x2:
	.byte	9,0,0,0
	.rva	se_handler
___
}

# Evaluate the back-ticked arithmetic expressions embedded in the
# template, then hand the result to the translator.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# Closing the pipe waits for the translator; a failure here means the
# generated assembly is missing or truncated, so it must be fatal.
close STDOUT or die "error closing STDOUT: $!";