#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# --------------------------------------------------------------------
# Generator set-up and bn_mul_mont prologue.
#
# Loads the perlasm helper (x86asm.pl), initialises it with the flavour
# passed in $ARGV[0], and records whether the SSE2 path was requested
# on the command line (-DOPENSSL_IA32_SSE2).  It then opens the
# bn_mul_mont function and emits its entry sequence:
#   * return 0 immediately (via "just_leave") when num < 4;
#   * carve a frame of $frame bytes plus num+2 words for tp[] off the
#     stack, positioned relative to the argument block and aligned to
#     64 bytes;
#   * copy rp/ap/bp/np and n0[0] into the frame, and remember the
#     caller's stack pointer in $_sp.
# On exit from this section $num (ebx) holds num-1.
# --------------------------------------------------------------------

# Directory of this script (with trailing separator) so that x86asm.pl
# can be found either next to it or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# The SSE2 code is generated only when the build defines
# OPENSSL_IA32_SSE2; the run-time capability check is emitted later.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&function_begin("bn_mul_mont");

# Register aliases.  Note that two pairs share a physical register.
$i="edx";
$j="ecx";
$ap="esi";	$tp="esi";		# overlapping variables!!!
$rp="edi";	$bp="edi";		# overlapping variables!!!
$np="ebp";
$num="ebx";

$_num=&DWP(4*0,"esp");			# stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;				# size of above frame rounded up to 16n

	&xor	("eax","eax");		# return value 0 if we bail out
	&mov	("edi",&wparam(5));	# int num
	&cmp	("edi",4);
	&jl	(&label("just_leave"));

	&lea	("esi",&wparam(0));	# put aside pointer to argument block
	# NOTE(review): lea yields the address of the ap argument slot,
	# not the ap pointer value itself - confirm this is the intended
	# reference for the alignment arithmetic below.
	&lea	("edx",&wparam(1));	# load ap
	&mov	("ebp","esp");		# saved stack pointer!
	&add	("edi",2);		# extra two words on top of tp
	&neg	("edi");
	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
	&neg	("edi");

	# minimize cache contention by arranging 2K window between stack
	# pointer and ap argument [np is also position sensitive vector,
	# but it's assumed to be near ap, as it's allocated at ~same
	# time].
	&mov	("eax","esp");
	&sub	("eax","edx");
	&and	("eax",2047);
	&sub	("esp","eax");		# this aligns sp and ap modulo 2048

	&xor	("edx","esp");
	&and	("edx",2048);
	&xor	("edx",2048);
	&sub	("esp","edx");		# this splits them apart modulo 4096

	&and	("esp",-64);		# align to cache line

	################################# load argument block...
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
	#&mov	("edi",&DWP(5*4,"esi"));# int num

	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
	&mov	($_rp,"eax");		# ... save a copy of argument block
	&mov	($_ap,"ebx");
	&mov	($_bp,"ecx");
	&mov	($_np,"edx");
	&mov	($_n0,"esi");
	# edi still holds num+2 here, hence the -3.
	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
	#&mov	($_num,$num);		# redundant as $num is not reused
	&mov	($_sp,"ebp");		# saved stack pointer!

# --------------------------------------------------------------------
# Body of bn_mul_mont.  Three parts are emitted:
#
#   1. SSE2 path (only generated when $sse2 is set).  At run time it is
#      skipped, via the "non_sse2" label, if bit 26 of
#      OPENSSL_ia32cap_P is clear.  It uses the MMX register bank with
#      pmuludq and finishes with emms before joining "common_tail".
#
#   2. Integer-only path.  A generic multiply ("mull"/"1stmadd"/
#      "2ndmadd") and a dedicated squaring branch ("bn_sqr_mont",
#      "sqr"/"3rdmadd"/"sqradd"), the latter taken when ap==bp and num
#      is even.  The "if (0)" arm is a disabled stub that would make
#      the function return 0 instead.
#
#   3. "common_tail".  Computes rp[] = tp[] - np[] with borrow, uses
#      the final borrow as a mask to pick either tp[] or rp[] as the
#      copy source, copies it to rp[] while overwriting tp[], restores
#      the caller's stack pointer and returns 1.
#
# Instruction order is deliberately interleaved (modulo-scheduled);
# several mov/lea instructions sit between an add and its adc because
# they leave the flags untouched.  Do not reorder.
# --------------------------------------------------------------------

if($sse2) {
$acc0="mm0";	# mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

	# Run-time capability check: fall through only if bit 26 is set.
	&picmeup("eax","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"eax"),26);
	&jnc	(&label("non_sse2"));

	&mov	("eax",-1);
	&movd	($mask,"eax");		# mask 32 lower bits

	&mov	($ap,$_ap);		# load input pointers
	&mov	($bp,$_bp);
	&mov	($np,$_np);

	&xor	($i,$i);		# i=0
	&xor	($j,$j);		# j=0

	&movd	($mul0,&DWP(0,$bp));	# bp[0]
	&movd	($mul1,&DWP(0,$ap));	# ap[0]
	&movd	($car1,&DWP(0,$np));	# np[0]

	&pmuludq($mul1,$mul0);		# ap[0]*bp[0]
	&movq	($car0,$mul1);
	&movq	($acc0,$mul1);		# I wish movd worked for
	&pand	($acc0,$mask);		# inter-register transfers

	&pmuludq($mul1,$_n0q);		# *=n0

	&pmuludq($car1,$mul1);		# "t[0]"*np[0]*n0
	&paddq	($car1,$acc0);

	&movd	($acc1,&DWP(4,$np));	# np[1]
	&movd	($acc0,&DWP(4,$ap));	# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&inc	($j);			# j++
&set_label("1st",16);
	&pmuludq($acc0,$mul0);		# ap[j]*bp[0]
	&pmuludq($acc1,$mul1);		# np[j]*m1
	&paddq	($car0,$acc0);		# +=c0
	&paddq	($car1,$acc1);		# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);		# +=ap[j]*bp[0];
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
	&psrlq	($car1,32);

	&lea	($j,&DWP(1,$j));
	&cmp	($j,$num);
	&jl	(&label("1st"));

	&pmuludq($acc0,$mul0);		# ap[num-1]*bp[0]
	&pmuludq($acc1,$mul1);		# np[num-1]*m1
	&paddq	($car0,$acc0);		# +=c0
	&paddq	($car1,$acc1);		# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);		# +=ap[num-1]*bp[0];
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&paddq	($car1,$car0);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&inc	($i);			# i++
&set_label("outer");
	&xor	($j,$j);		# j=0

	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
	&movd	($mul1,&DWP(0,$ap));	# ap[0]
	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
	&movd	($car1,&DWP(0,$np));	# np[0]
	&pmuludq($mul1,$mul0);		# ap[0]*bp[i]

	&paddq	($mul1,$temp);		# +=tp[0]
	&movq	($acc0,$mul1);
	&movq	($car0,$mul1);
	&pand	($acc0,$mask);

	&pmuludq($mul1,$_n0q);		# *=n0

	&pmuludq($car1,$mul1);
	&paddq	($car1,$acc0);

	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
	&movd	($acc1,&DWP(4,$np));	# np[1]
	&movd	($acc0,&DWP(4,$ap));	# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);
	&paddq	($car0,$temp);		# +=tp[1]

	&inc	($j);			# j++
	# $num doubles as the inner-loop down-counter; it is restored
	# from $j once the loop falls through.
	&dec	($num);
&set_label("inner");
	&pmuludq($acc0,$mul0);		# ap[j]*bp[i]
	&pmuludq($acc1,$mul1);		# np[j]*m1
	&paddq	($car0,$acc0);		# +=c0
	&paddq	($car1,$acc1);		# +=c1

	&movq	($acc0,$car0);
	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);		# +=ap[j]*bp[i]+tp[j]
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
	&psrlq	($car1,32);
	&paddq	($car0,$temp);		# +=tp[j+1]

	&dec	($num);
	&lea	($j,&DWP(1,$j));	# j++
	&jnz	(&label("inner"));

	&mov	($num,$j);
	&pmuludq($acc0,$mul0);		# ap[num-1]*bp[i]
	&pmuludq($acc1,$mul1);		# np[num-1]*m1
	&paddq	($car0,$acc0);		# +=c0
	&paddq	($car1,$acc1);		# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);		# +=ap[num-1]*bp[i]+tp[num-1]
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
	&paddq	($car1,$car0);
	&paddq	($car1,$temp);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&lea	($i,&DWP(1,$i));	# i++
	&cmp	($i,$num);
	&jle	(&label("outer"));

	&emms	();			# done with mmx bank
	&jmp	(&label("common_tail"));

&set_label("non_sse2",16);
}

if (0) {
	&mov	("esp",$_sp);
	&xor	("eax","eax");	# signal "not fast enough [yet]"
	&jmp	(&label("just_leave"));
	# While the below code provides competitive performance for
	# all key lengths on modern Intel cores, it's still more
	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
	# means compared to the original integer-only assembler.
	# 512-bit RSA sign is better by ~40%, but that's about all
	# one can say about all CPUs...
} else {
$inp="esi";	# integer path uses these registers differently
$word="edi";
$carry="ebp";

	&mov	($inp,$_ap);
	&lea	($carry,&DWP(1,$num));
	&mov	($word,$_bp);
	&xor	($j,$j);				# j=0
	&mov	("edx",$inp);
	&and	($carry,1);				# see if num is even
	&sub	("edx",$word);				# see if ap==bp
	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
	&or	($carry,"edx");
	&mov	($word,&DWP(0,$word));			# bp[0]
	&jz	(&label("bn_sqr_mont"));		# flags still from the "or"
	&mov	($_bpend,"eax");
	&mov	("eax",&DWP(0,$inp));
	&xor	("edx","edx");

&set_label("mull",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[0]
	&add	($carry,"eax");
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("mull"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[0]
	&mov	($word,$_n0);
	&add	("eax",$carry);
	&mov	($inp,$_np);
	&adc	("edx",0);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
	&xor	($j,$j);
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&inc	($j);

	&jmp	(&label("2ndmadd"));

&set_label("1stmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[i]
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("1stmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[i]
	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&mov	($word,$_n0);
	&adc	("edx",0);
	&mov	($inp,$_np);
	&add	($carry,"eax");
	&adc	("edx",0);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&xor	($j,$j);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
	&adc	($j,0);
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&mov	($j,1);

&set_label("2ndmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
	&jl	(&label("2ndmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	&xor	("eax","eax");
	&mov	($j,$_bp);				# &bp[i]
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&lea	($j,&DWP(4,$j));
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$_bpend);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(0,$j));			# bp[i+1]
	&mov	($inp,$_ap);
	&mov	($_bp,$j);				# &bp[++i]
	&xor	($j,$j);
	&xor	("edx","edx");
	&mov	("eax",&DWP(0,$inp));
	&jmp	(&label("1stmadd"));

&set_label("bn_sqr_mont",16);
# In the squaring branch the $num register is reused as $sbit; the
# word count is kept in the $_num frame slot instead, and $_bp holds
# the index i rather than a pointer.
$sbit=$num;
	&mov	($_num,$num);
	&mov	($_bp,$j);				# i=0

	&mov	("eax",$word);				# ap[0]
	&mul	($word);				# ap[0]*ap[0]
	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
	&mov	($sbit,"edx");
	&shr	("edx",1);
	&and	($sbit,1);
	&inc	($j);
&set_label("sqr",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[0]
	&add	("eax",$carry);
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&shr	("eax",31);
	&cmp	($j,$_num);
	&mov	($sbit,"eax");
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("sqr"));

	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*ap[0]
	&add	("eax",$carry);
	&mov	($word,$_n0);
	&adc	("edx",0);
	&mov	($inp,$_np);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
	&shr	("eax",31);
	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=

	&lea	($carry,&DWP(0,"eax","edx",2));
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&shr	("edx",31);
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	($num,$j);
	&adc	("edx",0);
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&mov	($j,1);

&set_label("3rdmadd",16);
	# Reduction loop of the squaring branch, two words per iteration.
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=

	&mov	($carry,"edx");
	&mul	($word);				# np[j+1]*m
	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
	&lea	($j,&DWP(2,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("3rdmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	&mov	($j,$_bp);				# i
	&xor	("eax","eax");
	&mov	($inp,$_ap);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$num);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
	&lea	($j,&DWP(1,$j));
	&mov	("eax",$word);
	&mov	($_bp,$j);				# ++i
	&mul	($word);				# ap[i]*ap[i]
	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
	&adc	("edx",0);
	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
	&xor	($carry,$carry);
	&cmp	($j,$num);
	&lea	($j,&DWP(1,$j));
	&je	(&label("sqrlast"));

	&mov	($sbit,"edx");				# zaps $num
	&shr	("edx",1);
	&and	($sbit,1);
&set_label("sqradd",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[i]
	&add	("eax",$carry);
	&lea	($carry,&DWP(0,"eax","eax"));
	&adc	("edx",0);
	&shr	("eax",31);
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("eax",0);
	&add	($carry,$sbit);
	&adc	("eax",0);
	&cmp	($j,$_num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&mov	($sbit,"eax");
	&jle	(&label("sqradd"));

	&mov	($carry,"edx");
	&add	("edx","edx");
	&shr	($carry,31);
	&add	("edx",$sbit);
	&adc	($carry,0);
&set_label("sqrlast");
	&mov	($word,$_n0);
	&mov	($inp,$_np);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&adc	($carry,0);
	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&lea	($num,&DWP(-1,$j));			# $num is a counter again
	&adc	("edx",0);
	&mov	($j,1);
	&mov	("eax",&DWP(4,$inp));			# np[1]

	&jmp	(&label("3rdmadd"));
}

&set_label("common_tail",16);
	&mov	($np,$_np);			# load modulus pointer
	&mov	($rp,$_rp);			# load result pointer
	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]

	&mov	("eax",&DWP(0,$tp));		# tp[0]
	&mov	($j,$num);			# j=num-1
	&xor	($i,$i);			# i=0 and clear CF!

&set_label("sub",16);
	# Only sbb touches CF inside this loop, so the borrow chains
	# across iterations; dec supplies the flags tested by jge.
	&sbb	("eax",&DWP(0,$np,$i,4));
	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
	&dec	($j);				# doesn't affect CF!
	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
	&lea	($i,&DWP(1,$i));		# i++
	&jge	(&label("sub"));

	# Branch-free select of the copy source: eax becomes an
	# all-ones/all-zeros mask derived from the final borrow.
	&sbb	("eax",0);			# handle upmost overflow bit
	&and	($tp,"eax");
	&not	("eax");
	&mov	($np,$rp);
	&and	($np,"eax");
	&or	($tp,$np);			# tp=carry?tp:rp

&set_label("copy",16);				# copy or in-place refresh
	&mov	("eax",&DWP(0,$tp,$num,4));
	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
	&dec	($num);
	&jge	(&label("copy"));

	&mov	("esp",$_sp);			# pull saved stack pointer
	&mov	("eax",1);			# return 1: result is in rp[]
&set_label("just_leave");
&function_end("bn_mul_mont");

&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();