#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and micro-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII		16%-30%
# P4		12%-12%
# Opteron	18%-40%
# Core2		19%-44%
# Atom		38%-64%
# Westmere	53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge	72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
# is more and more dominated by other subroutines, most notably by
# BN_GF2m_mod[_mul]_arr...

# Locate the perlasm helpers relative to this script and load the x86
# assembler front end.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# $x86only is true when the last command-line argument is "386"; in that
# case only the pure integer code path is generated.
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");

# The PCLMULQDQ path is generated only when -DOPENSSL_IA32_SSE2 appears
# among the arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Register allocation shared by both 1x1 helpers: $a and $b carry the two
# 32-bit inputs, $a1/$a2/$a4 hold a, a<<1 and a<<2 while the lookup table
# is being built.
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");

# MMX helper registers: $R accumulates the 64-bit result, @T are
# temporaries, $B30/$B31 handle the two top bits of a, and @i hold the
# 3-bit table indices.
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");

# _mul_1x1_mmx: 32x32->64-bit polynomial (XOR-accumulated) multiplication.
#
# In:	$a (eax), $b (ebx)
# Out:	$R (mm0)
#
# An 8-entry table of XOR combinations of a, a<<1 and a<<2 is built in a
# 36-byte scratch area on the stack, with the bits that would be shifted
# out of 32 bits masked off. b is then consumed three bits at a time, each
# 3-bit window selecting a table entry that is shifted into place and
# XOR-ed into $R. The two top bits of a are accounted for separately
# through $B30 and $B31. ebx, ecx, edx, ebp, esi and edi are overwritten.
# Not generated for "386" builds.
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
	&sub	("esp",32+4);
	&mov	($a1,$a);
	&lea	($a2,&DWP(0,$a,$a));
	&and	($a1,0x3fffffff);
	&lea	($a4,&DWP(0,$a2,$a2));
	&mov	(&DWP(0*4,"esp"),0);
	&and	($a2,0x7fffffff);
	&movd	($A,$a);
	&movd	($B,$b);
	&mov	(&DWP(1*4,"esp"),$a1);	# a1
	&xor	($a1,$a2);		# a1^a2
	&pxor	($B31,$B31);
	&pxor	($B30,$B30);
	&mov	(&DWP(2*4,"esp"),$a2);	# a2
	&xor	($a2,$a4);		# a2^a4
	&mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
	&pcmpgtd($B31,$A);		# broadcast 31st bit
	&paddd	($A,$A);		# $A<<=1
	&xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
	&mov	(&DWP(4*4,"esp"),$a4);	# a4
	&xor	($a4,$a2);		# a2=a4^a2^a4
	&pand	($B31,$B);
	&pcmpgtd($B30,$A);		# broadcast 30th bit
	&mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
	&xor	($a4,$a1);		# a1^a2^a4
	&psllq	($B31,31);
	&pand	($B30,$B);
	&mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
	&mov	(@i[0],0x7);
	&mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
	&mov	($a4,@i[0]);		# $a4 now serves as the 0x7 index mask
	&and	(@i[0],$b);
	&shr	($b,3);
	&mov	(@i[1],$a4);
	&psllq	($B30,30);
	&and	(@i[1],$b);
	&shr	($b,3);
	&movd	($R,&DWP(0,"esp",@i[0],4));
	&mov	(@i[0],$a4);
	&and	(@i[0],$b);
	&shr	($b,3);
	# Unrolled at generation time; @i and @T are rotated on every pass so
	# that consecutive lookups alternate between the two registers.
	for($n=1;$n<9;$n++) {
		&movd	(@T[1],&DWP(0,"esp",@i[1],4));
		&mov	(@i[1],$a4);
		&psllq	(@T[1],3*$n);
		&and	(@i[1],$b);
		&shr	($b,3);
		&pxor	($R,@T[1]);

		push(@i,shift(@i)); push(@T,shift(@T));
	}
	# Tail: the last two table lookups plus the contributions of the two
	# top bits of a.
	&movd	(@T[1],&DWP(0,"esp",@i[1],4));
	&pxor	($R,$B30);
	&psllq	(@T[1],3*$n);
	$n++;
	&pxor	($R,@T[1]);

	&movd	(@T[0],&DWP(0,"esp",@i[0],4));
	&pxor	($R,$B31);
	&psllq	(@T[0],3*$n);
	&add	("esp",32+4);
	&pxor	($R,@T[0]);
	&ret	();
&function_end_B("_mul_1x1_mmx");
}

# Integer helper registers: the result is returned in $hi:$lo and @T are
# re-bound to general-purpose temporaries.
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");

# _mul_1x1_ialu: same operation as _mul_1x1_mmx using integer
# instructions only.
#
# In:	$a (eax), $b (ebx)
# Out:	$hi:$lo (edx:eax)
#
# The same 8-entry table is built on the stack. Since there is no 64-bit
# register, every selected entry is split into a part shifted left into
# $lo and a part shifted right into $hi. ebx, ecx, ebp, esi and edi are
# overwritten.
&function_begin_B("_mul_1x1_ialu");
	&sub	("esp",32+4);
	&mov	($a1,$a);
	&lea	($a2,&DWP(0,$a,$a));
	&lea	($a4,&DWP(0,"",$a,4));
	&and	($a1,0x3fffffff);
	&lea	(@i[1],&DWP(0,$lo,$lo));
	&sar	($lo,31);		# broadcast 31st bit
	&mov	(&DWP(0*4,"esp"),0);
	&and	($a2,0x7fffffff);
	&mov	(&DWP(1*4,"esp"),$a1);	# a1
	&xor	($a1,$a2);		# a1^a2
	&mov	(&DWP(2*4,"esp"),$a2);	# a2
	&xor	($a2,$a4);		# a2^a4
	&mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
	&xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
	&mov	(&DWP(4*4,"esp"),$a4);	# a4
	&xor	($a4,$a2);		# a2=a4^a2^a4
	&mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
	&xor	($a4,$a1);		# a1^a2^a4
	&sar	(@i[1],31);		# broadcast 30th bit
	&and	($lo,$b);
	&mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
	&and	(@i[1],$b);
	&mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
	&mov	($hi,$lo);
	&shl	($lo,31);
	&mov	(@T[0],@i[1]);
	&shr	($hi,1);

	&mov	(@i[0],0x7);
	&shl	(@i[1],30);
	&and	(@i[0],$b);
	&shr	(@T[0],2);
	&xor	($lo,@i[1]);

	&shr	($b,3);
	&mov	(@i[1],0x7);		# 5-byte instruction!?
	&and	(@i[1],$b);
	&shr	($b,3);
	&xor	($hi,@T[0]);
	&xor	($lo,&DWP(0,"esp",@i[0],4));
	&mov	(@i[0],0x7);
	&and	(@i[0],$b);
	&shr	($b,3);
	# Unrolled at generation time; @i and @T are rotated on every pass.
	for($n=1;$n<9;$n++) {
		&mov	(@T[1],&DWP(0,"esp",@i[1],4));
		&mov	(@i[1],0x7);
		&mov	(@T[0],@T[1]);
		&shl	(@T[1],3*$n);
		&and	(@i[1],$b);
		&shr	(@T[0],32-3*$n);
		&xor	($lo,@T[1]);
		&shr	($b,3);
		&xor	($hi,@T[0]);

		push(@i,shift(@i)); push(@T,shift(@T));
	}
	# Tail: the last two table lookups; the index registers are reused as
	# temporaries for the final one.
	&mov	(@T[1],&DWP(0,"esp",@i[1],4));
	&mov	(@T[0],@T[1]);
	&shl	(@T[1],3*$n);
	&mov	(@i[1],&DWP(0,"esp",@i[0],4));
	&shr	(@T[0],32-3*$n);
	$n++;
	&mov	(@i[0],@i[1]);
	&xor	($lo,@T[1]);
	&shl	(@i[1],3*$n);
	&xor	($hi,@T[0]);
	&shr	(@i[0],32-3*$n);
	&xor	($lo,@i[1]);
	&xor	($hi,@i[0]);

	&add	("esp",32+4);
	&ret	();
&function_end_B("_mul_1x1_ialu");

# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
#
# Multiplies the two-word polynomials a1:a0 and b1:b0 and stores the
# four-word product at r. Unless this is a "386" build, the code path is
# selected at run time from OPENSSL_ia32cap_P: PCLMULQDQ (only generated
# with -DOPENSSL_IA32_SSE2), then MMX, then plain integer code. The MMX
# and integer paths compute a1*b1, a0*b0 and (a0+a1)*(b0+b1) with the 1x1
# helpers and recombine them Karatsuba-style, '+' and '-' both being XOR.
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
	&picmeup("edx","OPENSSL_ia32cap_P");
	&mov	("eax",&DWP(0,"edx"));	# first capability dword
	&mov	("edx",&DWP(4,"edx"));	# second capability dword
	&test	("eax",1<<23);		# check MMX bit
	&jz	(&label("ialu"));
if ($sse2) {
	&test	("eax",1<<24);		# check FXSR bit
	&jz	(&label("mmx"));
	&test	("edx",1<<1);		# check PCLMULQDQ bit
	&jz	(&label("mmx"));

	# Nothing has been pushed yet, so r is read from 4(%esp) and the 16
	# bytes at 8(%esp) are taken to be a1,a0,b1,b0. The shuffle swaps the
	# two dwords inside each 64-bit half, turning them into a0,a1 and
	# b0,b1; PCLMULQDQ then multiplies one half by the other.
	&movups		("xmm0",&QWP(8,"esp"));
	&shufps		("xmm0","xmm0",0b10110001);
	&pclmulqdq	("xmm0","xmm0",1);
	&mov		("eax",&DWP(4,"esp"));
	&movups		(&QWP(0,"eax"),"xmm0");
	&ret	();

&set_label("mmx",16);
}
	&push	("ebp");
	&push	("ebx");
	&push	("esi");
	&push	("edi");
	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&call	("_mul_1x1_mmx");	# a1*b1
	&movq	("mm7",$R);

	&mov	($a,&wparam(2));
	&mov	($b,&wparam(4));
	&call	("_mul_1x1_mmx");	# a0*b0
	&movq	("mm6",$R);

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&xor	($a,&wparam(2));
	&xor	($b,&wparam(4));
	&call	("_mul_1x1_mmx");	# (a0+a1)*(b0+b1)
	&pxor	($R,"mm7");
	&mov	($a,&wparam(0));
	&pxor	($R,"mm6");		# (a0+a1)*(b0+b1)-a1*b1-a0*b0

	# Fold the middle term into both halves: its low dword goes into the
	# upper dword of a0*b0, its high dword into the lower dword of a1*b1.
	&movq	($A,$R);
	&psllq	($R,32);
	&pop	("edi");
	&psrlq	($A,32);
	&pop	("esi");
	&pxor	($R,"mm6");
	&pop	("ebx");
	&pxor	($A,"mm7");
	&movq	(&QWP(0,$a),$R);
	&pop	("ebp");
	&movq	(&QWP(8,$a),$A);
	&emms	();
	&ret	();
&set_label("ialu",16);
}
	&push	("ebp");
	&push	("ebx");
	&push	("esi");
	&push	("edi");
	&stack_push(4+1);		# scratch for the two partial products

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&call	("_mul_1x1_ialu");	# a1*b1
	&mov	(&DWP(8,"esp"),$lo);
	&mov	(&DWP(12,"esp"),$hi);

	&mov	($a,&wparam(2));
	&mov	($b,&wparam(4));
	&call	("_mul_1x1_ialu");	# a0*b0
	&mov	(&DWP(0,"esp"),$lo);
	&mov	(&DWP(4,"esp"),$hi);

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&xor	($a,&wparam(2));
	&xor	($b,&wparam(4));
	&call	("_mul_1x1_ialu");	# (a0+a1)*(b0+b1)

	# Recombination. With l=a0*b0 in @r[0..1], h=a1*b1 in @r[2..3] and
	# m=(a0+a1)*(b0+b1) in $hi:$lo, the stores below amount to
	#	r[0] = l.lo
	#	r[1] = m.lo ^ l.lo ^ l.hi ^ h.lo
	#	r[2] = m.hi ^ l.hi ^ h.lo ^ h.hi
	#	r[3] = h.hi
	&mov	("ebp",&wparam(0));
	@r=("ebx","ecx","edi","esi");
	&mov	(@r[0],&DWP(0,"esp"));
	&mov	(@r[1],&DWP(4,"esp"));
	&mov	(@r[2],&DWP(8,"esp"));
	&mov	(@r[3],&DWP(12,"esp"));

	&xor	($lo,$hi);
	&xor	($hi,@r[1]);
	&xor	($lo,@r[0]);
	&mov	(&DWP(0,"ebp"),@r[0]);
	&xor	($hi,@r[2]);
	&mov	(&DWP(12,"ebp"),@r[3]);
	&xor	($lo,@r[3]);
	&stack_pop(4+1);
	&xor	($hi,@r[3]);
	&pop	("edi");
	&xor	($lo,$hi);
	&pop	("esi");
	&mov	(&DWP(8,"ebp"),$hi);
	&pop	("ebx");
	&mov	(&DWP(4,"ebp"),$lo);
	&pop	("ebp");
	&ret	();
&function_end_B("bn_GF2m_mul_2x2");

&asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();