1238384Sjkim#!/usr/bin/env perl 2238384Sjkim 3238384Sjkim# ==================================================================== 4238384Sjkim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and 6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further 7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/. 8238384Sjkim# ==================================================================== 9238384Sjkim 10238384Sjkim# August 2011. 11238384Sjkim# 12238384Sjkim# Companion to x86_64-mont.pl that optimizes cache-timing attack 13238384Sjkim# countermeasures. The subroutines are produced by replacing bp[i] 14238384Sjkim# references in their x86_64-mont.pl counterparts with cache-neutral 15238384Sjkim# references to powers table computed in BN_mod_exp_mont_consttime. 16238384Sjkim# In addition subroutine that scatters elements of the powers table 17238384Sjkim# is implemented, so that scatter-/gathering can be tuned without 18238384Sjkim# bn_exp.c modifications. 
19238384Sjkim 20238384Sjkim$flavour = shift; 21238384Sjkim$output = shift; 22238384Sjkimif ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 23238384Sjkim 24238384Sjkim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 25238384Sjkim 26238384Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 27238384Sjkim( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 28238384Sjkim( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 29238384Sjkimdie "can't locate x86_64-xlate.pl"; 30238384Sjkim 31246772Sjkimopen OUT,"| \"$^X\" $xlate $flavour $output"; 32246772Sjkim*STDOUT=*OUT; 33238384Sjkim 34238384Sjkim# int bn_mul_mont_gather5( 35238384Sjkim$rp="%rdi"; # BN_ULONG *rp, 36238384Sjkim$ap="%rsi"; # const BN_ULONG *ap, 37238384Sjkim$bp="%rdx"; # const BN_ULONG *bp, 38238384Sjkim$np="%rcx"; # const BN_ULONG *np, 39238384Sjkim$n0="%r8"; # const BN_ULONG *n0, 40238384Sjkim$num="%r9"; # int num, 41238384Sjkim # int idx); # 0 to 2^5-1, "index" in $bp holding 42238384Sjkim # pre-computed powers of a', interlaced 43238384Sjkim # in such manner that b[0] is $bp[idx], 44238384Sjkim # b[1] is [2^5+idx], etc. 
45238384Sjkim$lo0="%r10"; 46238384Sjkim$hi0="%r11"; 47238384Sjkim$hi1="%r13"; 48238384Sjkim$i="%r14"; 49238384Sjkim$j="%r15"; 50238384Sjkim$m0="%rbx"; 51238384Sjkim$m1="%rbp"; 52238384Sjkim 53238384Sjkim$code=<<___; 54238384Sjkim.text 55238384Sjkim 56238384Sjkim.globl bn_mul_mont_gather5 57238384Sjkim.type bn_mul_mont_gather5,\@function,6 58238384Sjkim.align 64 59238384Sjkimbn_mul_mont_gather5: 60238384Sjkim test \$3,${num}d 61238384Sjkim jnz .Lmul_enter 62238384Sjkim cmp \$8,${num}d 63238384Sjkim jb .Lmul_enter 64238384Sjkim jmp .Lmul4x_enter 65238384Sjkim 66238384Sjkim.align 16 67238384Sjkim.Lmul_enter: 68238384Sjkim mov ${num}d,${num}d 69238384Sjkim mov `($win64?56:8)`(%rsp),%r10d # load 7th argument 70238384Sjkim push %rbx 71238384Sjkim push %rbp 72238384Sjkim push %r12 73238384Sjkim push %r13 74238384Sjkim push %r14 75238384Sjkim push %r15 76238384Sjkim___ 77238384Sjkim$code.=<<___ if ($win64); 78238384Sjkim lea -0x28(%rsp),%rsp 79238384Sjkim movaps %xmm6,(%rsp) 80238384Sjkim movaps %xmm7,0x10(%rsp) 81238384Sjkim.Lmul_alloca: 82238384Sjkim___ 83238384Sjkim$code.=<<___; 84238384Sjkim mov %rsp,%rax 85238384Sjkim lea 2($num),%r11 86238384Sjkim neg %r11 87238384Sjkim lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) 88238384Sjkim and \$-1024,%rsp # minimize TLB usage 89238384Sjkim 90238384Sjkim mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 91238384Sjkim.Lmul_body: 92238384Sjkim mov $bp,%r12 # reassign $bp 93238384Sjkim___ 94238384Sjkim $bp="%r12"; 95238384Sjkim $STRIDE=2**5*8; # 5 is "window size" 96238384Sjkim $N=$STRIDE/4; # should match cache line size 97238384Sjkim$code.=<<___; 98238384Sjkim mov %r10,%r11 99238384Sjkim shr \$`log($N/8)/log(2)`,%r10 100238384Sjkim and \$`$N/8-1`,%r11 101238384Sjkim not %r10 102238384Sjkim lea .Lmagic_masks(%rip),%rax 103238384Sjkim and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 104238384Sjkim lea 96($bp,%r11,8),$bp # pointer within 1st cache line 105238384Sjkim movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 106238384Sjkim movq 
8(%rax,%r10,8),%xmm5 # cache line contains element 107238384Sjkim movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 108238384Sjkim movq 24(%rax,%r10,8),%xmm7 109238384Sjkim 110238384Sjkim movq `0*$STRIDE/4-96`($bp),%xmm0 111238384Sjkim movq `1*$STRIDE/4-96`($bp),%xmm1 112238384Sjkim pand %xmm4,%xmm0 113238384Sjkim movq `2*$STRIDE/4-96`($bp),%xmm2 114238384Sjkim pand %xmm5,%xmm1 115238384Sjkim movq `3*$STRIDE/4-96`($bp),%xmm3 116238384Sjkim pand %xmm6,%xmm2 117238384Sjkim por %xmm1,%xmm0 118238384Sjkim pand %xmm7,%xmm3 119238384Sjkim por %xmm2,%xmm0 120238384Sjkim lea $STRIDE($bp),$bp 121238384Sjkim por %xmm3,%xmm0 122238384Sjkim 123238384Sjkim movq %xmm0,$m0 # m0=bp[0] 124238384Sjkim 125238384Sjkim mov ($n0),$n0 # pull n0[0] value 126238384Sjkim mov ($ap),%rax 127238384Sjkim 128238384Sjkim xor $i,$i # i=0 129238384Sjkim xor $j,$j # j=0 130238384Sjkim 131238384Sjkim movq `0*$STRIDE/4-96`($bp),%xmm0 132238384Sjkim movq `1*$STRIDE/4-96`($bp),%xmm1 133238384Sjkim pand %xmm4,%xmm0 134238384Sjkim movq `2*$STRIDE/4-96`($bp),%xmm2 135238384Sjkim pand %xmm5,%xmm1 136238384Sjkim 137238384Sjkim mov $n0,$m1 138238384Sjkim mulq $m0 # ap[0]*bp[0] 139238384Sjkim mov %rax,$lo0 140238384Sjkim mov ($np),%rax 141238384Sjkim 142238384Sjkim movq `3*$STRIDE/4-96`($bp),%xmm3 143238384Sjkim pand %xmm6,%xmm2 144238384Sjkim por %xmm1,%xmm0 145238384Sjkim pand %xmm7,%xmm3 146238384Sjkim 147238384Sjkim imulq $lo0,$m1 # "tp[0]"*n0 148238384Sjkim mov %rdx,$hi0 149238384Sjkim 150238384Sjkim por %xmm2,%xmm0 151238384Sjkim lea $STRIDE($bp),$bp 152238384Sjkim por %xmm3,%xmm0 153238384Sjkim 154238384Sjkim mulq $m1 # np[0]*m1 155238384Sjkim add %rax,$lo0 # discarded 156238384Sjkim mov 8($ap),%rax 157238384Sjkim adc \$0,%rdx 158238384Sjkim mov %rdx,$hi1 159238384Sjkim 160238384Sjkim lea 1($j),$j # j++ 161238384Sjkim jmp .L1st_enter 162238384Sjkim 163238384Sjkim.align 16 164238384Sjkim.L1st: 165238384Sjkim add %rax,$hi1 166238384Sjkim mov ($ap,$j,8),%rax 167238384Sjkim adc \$0,%rdx 168238384Sjkim 
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 169238384Sjkim mov $lo0,$hi0 170238384Sjkim adc \$0,%rdx 171238384Sjkim mov $hi1,-16(%rsp,$j,8) # tp[j-1] 172238384Sjkim mov %rdx,$hi1 173238384Sjkim 174238384Sjkim.L1st_enter: 175238384Sjkim mulq $m0 # ap[j]*bp[0] 176238384Sjkim add %rax,$hi0 177238384Sjkim mov ($np,$j,8),%rax 178238384Sjkim adc \$0,%rdx 179238384Sjkim lea 1($j),$j # j++ 180238384Sjkim mov %rdx,$lo0 181238384Sjkim 182238384Sjkim mulq $m1 # np[j]*m1 183238384Sjkim cmp $num,$j 184238384Sjkim jne .L1st 185238384Sjkim 186238384Sjkim movq %xmm0,$m0 # bp[1] 187238384Sjkim 188238384Sjkim add %rax,$hi1 189238384Sjkim mov ($ap),%rax # ap[0] 190238384Sjkim adc \$0,%rdx 191238384Sjkim add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 192238384Sjkim adc \$0,%rdx 193238384Sjkim mov $hi1,-16(%rsp,$j,8) # tp[j-1] 194238384Sjkim mov %rdx,$hi1 195238384Sjkim mov $lo0,$hi0 196238384Sjkim 197238384Sjkim xor %rdx,%rdx 198238384Sjkim add $hi0,$hi1 199238384Sjkim adc \$0,%rdx 200238384Sjkim mov $hi1,-8(%rsp,$num,8) 201238384Sjkim mov %rdx,(%rsp,$num,8) # store upmost overflow bit 202238384Sjkim 203238384Sjkim lea 1($i),$i # i++ 204238384Sjkim jmp .Louter 205238384Sjkim.align 16 206238384Sjkim.Louter: 207238384Sjkim xor $j,$j # j=0 208238384Sjkim mov $n0,$m1 209238384Sjkim mov (%rsp),$lo0 210238384Sjkim 211238384Sjkim movq `0*$STRIDE/4-96`($bp),%xmm0 212238384Sjkim movq `1*$STRIDE/4-96`($bp),%xmm1 213238384Sjkim pand %xmm4,%xmm0 214238384Sjkim movq `2*$STRIDE/4-96`($bp),%xmm2 215238384Sjkim pand %xmm5,%xmm1 216238384Sjkim 217238384Sjkim mulq $m0 # ap[0]*bp[i] 218238384Sjkim add %rax,$lo0 # ap[0]*bp[i]+tp[0] 219238384Sjkim mov ($np),%rax 220238384Sjkim adc \$0,%rdx 221238384Sjkim 222238384Sjkim movq `3*$STRIDE/4-96`($bp),%xmm3 223238384Sjkim pand %xmm6,%xmm2 224238384Sjkim por %xmm1,%xmm0 225238384Sjkim pand %xmm7,%xmm3 226238384Sjkim 227238384Sjkim imulq $lo0,$m1 # tp[0]*n0 228238384Sjkim mov %rdx,$hi0 229238384Sjkim 230238384Sjkim por %xmm2,%xmm0 231238384Sjkim lea $STRIDE($bp),$bp 
232238384Sjkim por %xmm3,%xmm0 233238384Sjkim 234238384Sjkim mulq $m1 # np[0]*m1 235238384Sjkim add %rax,$lo0 # discarded 236238384Sjkim mov 8($ap),%rax 237238384Sjkim adc \$0,%rdx 238238384Sjkim mov 8(%rsp),$lo0 # tp[1] 239238384Sjkim mov %rdx,$hi1 240238384Sjkim 241238384Sjkim lea 1($j),$j # j++ 242238384Sjkim jmp .Linner_enter 243238384Sjkim 244238384Sjkim.align 16 245238384Sjkim.Linner: 246238384Sjkim add %rax,$hi1 247238384Sjkim mov ($ap,$j,8),%rax 248238384Sjkim adc \$0,%rdx 249238384Sjkim add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 250238384Sjkim mov (%rsp,$j,8),$lo0 251238384Sjkim adc \$0,%rdx 252238384Sjkim mov $hi1,-16(%rsp,$j,8) # tp[j-1] 253238384Sjkim mov %rdx,$hi1 254238384Sjkim 255238384Sjkim.Linner_enter: 256238384Sjkim mulq $m0 # ap[j]*bp[i] 257238384Sjkim add %rax,$hi0 258238384Sjkim mov ($np,$j,8),%rax 259238384Sjkim adc \$0,%rdx 260238384Sjkim add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 261238384Sjkim mov %rdx,$hi0 262238384Sjkim adc \$0,$hi0 263238384Sjkim lea 1($j),$j # j++ 264238384Sjkim 265238384Sjkim mulq $m1 # np[j]*m1 266238384Sjkim cmp $num,$j 267238384Sjkim jne .Linner 268238384Sjkim 269238384Sjkim movq %xmm0,$m0 # bp[i+1] 270238384Sjkim 271238384Sjkim add %rax,$hi1 272238384Sjkim mov ($ap),%rax # ap[0] 273238384Sjkim adc \$0,%rdx 274238384Sjkim add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 275238384Sjkim mov (%rsp,$j,8),$lo0 276238384Sjkim adc \$0,%rdx 277238384Sjkim mov $hi1,-16(%rsp,$j,8) # tp[j-1] 278238384Sjkim mov %rdx,$hi1 279238384Sjkim 280238384Sjkim xor %rdx,%rdx 281238384Sjkim add $hi0,$hi1 282238384Sjkim adc \$0,%rdx 283238384Sjkim add $lo0,$hi1 # pull upmost overflow bit 284238384Sjkim adc \$0,%rdx 285238384Sjkim mov $hi1,-8(%rsp,$num,8) 286238384Sjkim mov %rdx,(%rsp,$num,8) # store upmost overflow bit 287238384Sjkim 288238384Sjkim lea 1($i),$i # i++ 289238384Sjkim cmp $num,$i 290238384Sjkim jl .Louter 291238384Sjkim 292238384Sjkim xor $i,$i # i=0 and clear CF! 
293238384Sjkim mov (%rsp),%rax # tp[0] 294238384Sjkim lea (%rsp),$ap # borrow ap for tp 295238384Sjkim mov $num,$j # j=num 296238384Sjkim jmp .Lsub 297238384Sjkim.align 16 298238384Sjkim.Lsub: sbb ($np,$i,8),%rax 299238384Sjkim mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 300238384Sjkim mov 8($ap,$i,8),%rax # tp[i+1] 301238384Sjkim lea 1($i),$i # i++ 302238384Sjkim dec $j # doesnn't affect CF! 303238384Sjkim jnz .Lsub 304238384Sjkim 305238384Sjkim sbb \$0,%rax # handle upmost overflow bit 306238384Sjkim xor $i,$i 307238384Sjkim and %rax,$ap 308238384Sjkim not %rax 309238384Sjkim mov $rp,$np 310238384Sjkim and %rax,$np 311238384Sjkim mov $num,$j # j=num 312238384Sjkim or $np,$ap # ap=borrow?tp:rp 313238384Sjkim.align 16 314238384Sjkim.Lcopy: # copy or in-place refresh 315238384Sjkim mov ($ap,$i,8),%rax 316238384Sjkim mov $i,(%rsp,$i,8) # zap temporary vector 317238384Sjkim mov %rax,($rp,$i,8) # rp[i]=tp[i] 318238384Sjkim lea 1($i),$i 319238384Sjkim sub \$1,$j 320238384Sjkim jnz .Lcopy 321238384Sjkim 322238384Sjkim mov 8(%rsp,$num,8),%rsi # restore %rsp 323238384Sjkim mov \$1,%rax 324238384Sjkim___ 325238384Sjkim$code.=<<___ if ($win64); 326238384Sjkim movaps (%rsi),%xmm6 327238384Sjkim movaps 0x10(%rsi),%xmm7 328238384Sjkim lea 0x28(%rsi),%rsi 329238384Sjkim___ 330238384Sjkim$code.=<<___; 331238384Sjkim mov (%rsi),%r15 332238384Sjkim mov 8(%rsi),%r14 333238384Sjkim mov 16(%rsi),%r13 334238384Sjkim mov 24(%rsi),%r12 335238384Sjkim mov 32(%rsi),%rbp 336238384Sjkim mov 40(%rsi),%rbx 337238384Sjkim lea 48(%rsi),%rsp 338238384Sjkim.Lmul_epilogue: 339238384Sjkim ret 340238384Sjkim.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 341238384Sjkim___ 342238384Sjkim{{{ 343238384Sjkimmy @A=("%r10","%r11"); 344238384Sjkimmy @N=("%r13","%rdi"); 345238384Sjkim$code.=<<___; 346238384Sjkim.type bn_mul4x_mont_gather5,\@function,6 347238384Sjkim.align 16 348238384Sjkimbn_mul4x_mont_gather5: 349238384Sjkim.Lmul4x_enter: 350238384Sjkim mov ${num}d,${num}d 351238384Sjkim mov 
`($win64?56:8)`(%rsp),%r10d # load 7th argument 352238384Sjkim push %rbx 353238384Sjkim push %rbp 354238384Sjkim push %r12 355238384Sjkim push %r13 356238384Sjkim push %r14 357238384Sjkim push %r15 358238384Sjkim___ 359238384Sjkim$code.=<<___ if ($win64); 360238384Sjkim lea -0x28(%rsp),%rsp 361238384Sjkim movaps %xmm6,(%rsp) 362238384Sjkim movaps %xmm7,0x10(%rsp) 363238384Sjkim.Lmul4x_alloca: 364238384Sjkim___ 365238384Sjkim$code.=<<___; 366238384Sjkim mov %rsp,%rax 367238384Sjkim lea 4($num),%r11 368238384Sjkim neg %r11 369238384Sjkim lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) 370238384Sjkim and \$-1024,%rsp # minimize TLB usage 371238384Sjkim 372238384Sjkim mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 373238384Sjkim.Lmul4x_body: 374238384Sjkim mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp 375238384Sjkim mov %rdx,%r12 # reassign $bp 376238384Sjkim___ 377238384Sjkim $bp="%r12"; 378238384Sjkim $STRIDE=2**5*8; # 5 is "window size" 379238384Sjkim $N=$STRIDE/4; # should match cache line size 380238384Sjkim$code.=<<___; 381238384Sjkim mov %r10,%r11 382238384Sjkim shr \$`log($N/8)/log(2)`,%r10 383238384Sjkim and \$`$N/8-1`,%r11 384238384Sjkim not %r10 385238384Sjkim lea .Lmagic_masks(%rip),%rax 386238384Sjkim and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 387238384Sjkim lea 96($bp,%r11,8),$bp # pointer within 1st cache line 388238384Sjkim movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 389238384Sjkim movq 8(%rax,%r10,8),%xmm5 # cache line contains element 390238384Sjkim movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 391238384Sjkim movq 24(%rax,%r10,8),%xmm7 392238384Sjkim 393238384Sjkim movq `0*$STRIDE/4-96`($bp),%xmm0 394238384Sjkim movq `1*$STRIDE/4-96`($bp),%xmm1 395238384Sjkim pand %xmm4,%xmm0 396238384Sjkim movq `2*$STRIDE/4-96`($bp),%xmm2 397238384Sjkim pand %xmm5,%xmm1 398238384Sjkim movq `3*$STRIDE/4-96`($bp),%xmm3 399238384Sjkim pand %xmm6,%xmm2 400238384Sjkim por %xmm1,%xmm0 401238384Sjkim pand %xmm7,%xmm3 402238384Sjkim por %xmm2,%xmm0 403238384Sjkim 
lea $STRIDE($bp),$bp 404238384Sjkim por %xmm3,%xmm0 405238384Sjkim 406238384Sjkim movq %xmm0,$m0 # m0=bp[0] 407238384Sjkim mov ($n0),$n0 # pull n0[0] value 408238384Sjkim mov ($ap),%rax 409238384Sjkim 410238384Sjkim xor $i,$i # i=0 411238384Sjkim xor $j,$j # j=0 412238384Sjkim 413238384Sjkim movq `0*$STRIDE/4-96`($bp),%xmm0 414238384Sjkim movq `1*$STRIDE/4-96`($bp),%xmm1 415238384Sjkim pand %xmm4,%xmm0 416238384Sjkim movq `2*$STRIDE/4-96`($bp),%xmm2 417238384Sjkim pand %xmm5,%xmm1 418238384Sjkim 419238384Sjkim mov $n0,$m1 420238384Sjkim mulq $m0 # ap[0]*bp[0] 421238384Sjkim mov %rax,$A[0] 422238384Sjkim mov ($np),%rax 423238384Sjkim 424238384Sjkim movq `3*$STRIDE/4-96`($bp),%xmm3 425238384Sjkim pand %xmm6,%xmm2 426238384Sjkim por %xmm1,%xmm0 427238384Sjkim pand %xmm7,%xmm3 428238384Sjkim 429238384Sjkim imulq $A[0],$m1 # "tp[0]"*n0 430238384Sjkim mov %rdx,$A[1] 431238384Sjkim 432238384Sjkim por %xmm2,%xmm0 433238384Sjkim lea $STRIDE($bp),$bp 434238384Sjkim por %xmm3,%xmm0 435238384Sjkim 436238384Sjkim mulq $m1 # np[0]*m1 437238384Sjkim add %rax,$A[0] # discarded 438238384Sjkim mov 8($ap),%rax 439238384Sjkim adc \$0,%rdx 440238384Sjkim mov %rdx,$N[1] 441238384Sjkim 442238384Sjkim mulq $m0 443238384Sjkim add %rax,$A[1] 444238384Sjkim mov 8($np),%rax 445238384Sjkim adc \$0,%rdx 446238384Sjkim mov %rdx,$A[0] 447238384Sjkim 448238384Sjkim mulq $m1 449238384Sjkim add %rax,$N[1] 450238384Sjkim mov 16($ap),%rax 451238384Sjkim adc \$0,%rdx 452238384Sjkim add $A[1],$N[1] 453238384Sjkim lea 4($j),$j # j++ 454238384Sjkim adc \$0,%rdx 455238384Sjkim mov $N[1],(%rsp) 456238384Sjkim mov %rdx,$N[0] 457238384Sjkim jmp .L1st4x 458238384Sjkim.align 16 459238384Sjkim.L1st4x: 460238384Sjkim mulq $m0 # ap[j]*bp[0] 461238384Sjkim add %rax,$A[0] 462238384Sjkim mov -16($np,$j,8),%rax 463238384Sjkim adc \$0,%rdx 464238384Sjkim mov %rdx,$A[1] 465238384Sjkim 466238384Sjkim mulq $m1 # np[j]*m1 467238384Sjkim add %rax,$N[0] 468238384Sjkim mov -8($ap,$j,8),%rax 469238384Sjkim adc \$0,%rdx 
470238384Sjkim add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 471238384Sjkim adc \$0,%rdx 472238384Sjkim mov $N[0],-24(%rsp,$j,8) # tp[j-1] 473238384Sjkim mov %rdx,$N[1] 474238384Sjkim 475238384Sjkim mulq $m0 # ap[j]*bp[0] 476238384Sjkim add %rax,$A[1] 477238384Sjkim mov -8($np,$j,8),%rax 478238384Sjkim adc \$0,%rdx 479238384Sjkim mov %rdx,$A[0] 480238384Sjkim 481238384Sjkim mulq $m1 # np[j]*m1 482238384Sjkim add %rax,$N[1] 483238384Sjkim mov ($ap,$j,8),%rax 484238384Sjkim adc \$0,%rdx 485238384Sjkim add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 486238384Sjkim adc \$0,%rdx 487238384Sjkim mov $N[1],-16(%rsp,$j,8) # tp[j-1] 488238384Sjkim mov %rdx,$N[0] 489238384Sjkim 490238384Sjkim mulq $m0 # ap[j]*bp[0] 491238384Sjkim add %rax,$A[0] 492238384Sjkim mov ($np,$j,8),%rax 493238384Sjkim adc \$0,%rdx 494238384Sjkim mov %rdx,$A[1] 495238384Sjkim 496238384Sjkim mulq $m1 # np[j]*m1 497238384Sjkim add %rax,$N[0] 498238384Sjkim mov 8($ap,$j,8),%rax 499238384Sjkim adc \$0,%rdx 500238384Sjkim add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 501238384Sjkim adc \$0,%rdx 502238384Sjkim mov $N[0],-8(%rsp,$j,8) # tp[j-1] 503238384Sjkim mov %rdx,$N[1] 504238384Sjkim 505238384Sjkim mulq $m0 # ap[j]*bp[0] 506238384Sjkim add %rax,$A[1] 507238384Sjkim mov 8($np,$j,8),%rax 508238384Sjkim adc \$0,%rdx 509238384Sjkim lea 4($j),$j # j++ 510238384Sjkim mov %rdx,$A[0] 511238384Sjkim 512238384Sjkim mulq $m1 # np[j]*m1 513238384Sjkim add %rax,$N[1] 514238384Sjkim mov -16($ap,$j,8),%rax 515238384Sjkim adc \$0,%rdx 516238384Sjkim add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 517238384Sjkim adc \$0,%rdx 518238384Sjkim mov $N[1],-32(%rsp,$j,8) # tp[j-1] 519238384Sjkim mov %rdx,$N[0] 520238384Sjkim cmp $num,$j 521238384Sjkim jl .L1st4x 522238384Sjkim 523238384Sjkim mulq $m0 # ap[j]*bp[0] 524238384Sjkim add %rax,$A[0] 525238384Sjkim mov -16($np,$j,8),%rax 526238384Sjkim adc \$0,%rdx 527238384Sjkim mov %rdx,$A[1] 528238384Sjkim 529238384Sjkim mulq $m1 # np[j]*m1 530238384Sjkim add %rax,$N[0] 531238384Sjkim mov -8($ap,$j,8),%rax 
532238384Sjkim adc \$0,%rdx 533238384Sjkim add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 534238384Sjkim adc \$0,%rdx 535238384Sjkim mov $N[0],-24(%rsp,$j,8) # tp[j-1] 536238384Sjkim mov %rdx,$N[1] 537238384Sjkim 538238384Sjkim mulq $m0 # ap[j]*bp[0] 539238384Sjkim add %rax,$A[1] 540238384Sjkim mov -8($np,$j,8),%rax 541238384Sjkim adc \$0,%rdx 542238384Sjkim mov %rdx,$A[0] 543238384Sjkim 544238384Sjkim mulq $m1 # np[j]*m1 545238384Sjkim add %rax,$N[1] 546238384Sjkim mov ($ap),%rax # ap[0] 547238384Sjkim adc \$0,%rdx 548238384Sjkim add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 549238384Sjkim adc \$0,%rdx 550238384Sjkim mov $N[1],-16(%rsp,$j,8) # tp[j-1] 551238384Sjkim mov %rdx,$N[0] 552238384Sjkim 553238384Sjkim movq %xmm0,$m0 # bp[1] 554238384Sjkim 555238384Sjkim xor $N[1],$N[1] 556238384Sjkim add $A[0],$N[0] 557238384Sjkim adc \$0,$N[1] 558238384Sjkim mov $N[0],-8(%rsp,$j,8) 559238384Sjkim mov $N[1],(%rsp,$j,8) # store upmost overflow bit 560238384Sjkim 561238384Sjkim lea 1($i),$i # i++ 562238384Sjkim.align 4 563238384Sjkim.Louter4x: 564238384Sjkim xor $j,$j # j=0 565238384Sjkim movq `0*$STRIDE/4-96`($bp),%xmm0 566238384Sjkim movq `1*$STRIDE/4-96`($bp),%xmm1 567238384Sjkim pand %xmm4,%xmm0 568238384Sjkim movq `2*$STRIDE/4-96`($bp),%xmm2 569238384Sjkim pand %xmm5,%xmm1 570238384Sjkim 571238384Sjkim mov (%rsp),$A[0] 572238384Sjkim mov $n0,$m1 573238384Sjkim mulq $m0 # ap[0]*bp[i] 574238384Sjkim add %rax,$A[0] # ap[0]*bp[i]+tp[0] 575238384Sjkim mov ($np),%rax 576238384Sjkim adc \$0,%rdx 577238384Sjkim 578238384Sjkim movq `3*$STRIDE/4-96`($bp),%xmm3 579238384Sjkim pand %xmm6,%xmm2 580238384Sjkim por %xmm1,%xmm0 581238384Sjkim pand %xmm7,%xmm3 582238384Sjkim 583238384Sjkim imulq $A[0],$m1 # tp[0]*n0 584238384Sjkim mov %rdx,$A[1] 585238384Sjkim 586238384Sjkim por %xmm2,%xmm0 587238384Sjkim lea $STRIDE($bp),$bp 588238384Sjkim por %xmm3,%xmm0 589238384Sjkim 590238384Sjkim mulq $m1 # np[0]*m1 591238384Sjkim add %rax,$A[0] # "$N[0]", discarded 592238384Sjkim mov 8($ap),%rax 593238384Sjkim 
adc \$0,%rdx 594238384Sjkim mov %rdx,$N[1] 595238384Sjkim 596238384Sjkim mulq $m0 # ap[j]*bp[i] 597238384Sjkim add %rax,$A[1] 598238384Sjkim mov 8($np),%rax 599238384Sjkim adc \$0,%rdx 600238384Sjkim add 8(%rsp),$A[1] # +tp[1] 601238384Sjkim adc \$0,%rdx 602238384Sjkim mov %rdx,$A[0] 603238384Sjkim 604238384Sjkim mulq $m1 # np[j]*m1 605238384Sjkim add %rax,$N[1] 606238384Sjkim mov 16($ap),%rax 607238384Sjkim adc \$0,%rdx 608238384Sjkim add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 609238384Sjkim lea 4($j),$j # j+=2 610238384Sjkim adc \$0,%rdx 611238384Sjkim mov %rdx,$N[0] 612238384Sjkim jmp .Linner4x 613238384Sjkim.align 16 614238384Sjkim.Linner4x: 615238384Sjkim mulq $m0 # ap[j]*bp[i] 616238384Sjkim add %rax,$A[0] 617238384Sjkim mov -16($np,$j,8),%rax 618238384Sjkim adc \$0,%rdx 619238384Sjkim add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 620238384Sjkim adc \$0,%rdx 621238384Sjkim mov %rdx,$A[1] 622238384Sjkim 623238384Sjkim mulq $m1 # np[j]*m1 624238384Sjkim add %rax,$N[0] 625238384Sjkim mov -8($ap,$j,8),%rax 626238384Sjkim adc \$0,%rdx 627238384Sjkim add $A[0],$N[0] 628238384Sjkim adc \$0,%rdx 629238384Sjkim mov $N[1],-32(%rsp,$j,8) # tp[j-1] 630238384Sjkim mov %rdx,$N[1] 631238384Sjkim 632238384Sjkim mulq $m0 # ap[j]*bp[i] 633238384Sjkim add %rax,$A[1] 634238384Sjkim mov -8($np,$j,8),%rax 635238384Sjkim adc \$0,%rdx 636238384Sjkim add -8(%rsp,$j,8),$A[1] 637238384Sjkim adc \$0,%rdx 638238384Sjkim mov %rdx,$A[0] 639238384Sjkim 640238384Sjkim mulq $m1 # np[j]*m1 641238384Sjkim add %rax,$N[1] 642238384Sjkim mov ($ap,$j,8),%rax 643238384Sjkim adc \$0,%rdx 644238384Sjkim add $A[1],$N[1] 645238384Sjkim adc \$0,%rdx 646238384Sjkim mov $N[0],-24(%rsp,$j,8) # tp[j-1] 647238384Sjkim mov %rdx,$N[0] 648238384Sjkim 649238384Sjkim mulq $m0 # ap[j]*bp[i] 650238384Sjkim add %rax,$A[0] 651238384Sjkim mov ($np,$j,8),%rax 652238384Sjkim adc \$0,%rdx 653238384Sjkim add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 654238384Sjkim adc \$0,%rdx 655238384Sjkim mov %rdx,$A[1] 656238384Sjkim 
657238384Sjkim mulq $m1 # np[j]*m1 658238384Sjkim add %rax,$N[0] 659238384Sjkim mov 8($ap,$j,8),%rax 660238384Sjkim adc \$0,%rdx 661238384Sjkim add $A[0],$N[0] 662238384Sjkim adc \$0,%rdx 663238384Sjkim mov $N[1],-16(%rsp,$j,8) # tp[j-1] 664238384Sjkim mov %rdx,$N[1] 665238384Sjkim 666238384Sjkim mulq $m0 # ap[j]*bp[i] 667238384Sjkim add %rax,$A[1] 668238384Sjkim mov 8($np,$j,8),%rax 669238384Sjkim adc \$0,%rdx 670238384Sjkim add 8(%rsp,$j,8),$A[1] 671238384Sjkim adc \$0,%rdx 672238384Sjkim lea 4($j),$j # j++ 673238384Sjkim mov %rdx,$A[0] 674238384Sjkim 675238384Sjkim mulq $m1 # np[j]*m1 676238384Sjkim add %rax,$N[1] 677238384Sjkim mov -16($ap,$j,8),%rax 678238384Sjkim adc \$0,%rdx 679238384Sjkim add $A[1],$N[1] 680238384Sjkim adc \$0,%rdx 681238384Sjkim mov $N[0],-40(%rsp,$j,8) # tp[j-1] 682238384Sjkim mov %rdx,$N[0] 683238384Sjkim cmp $num,$j 684238384Sjkim jl .Linner4x 685238384Sjkim 686238384Sjkim mulq $m0 # ap[j]*bp[i] 687238384Sjkim add %rax,$A[0] 688238384Sjkim mov -16($np,$j,8),%rax 689238384Sjkim adc \$0,%rdx 690238384Sjkim add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 691238384Sjkim adc \$0,%rdx 692238384Sjkim mov %rdx,$A[1] 693238384Sjkim 694238384Sjkim mulq $m1 # np[j]*m1 695238384Sjkim add %rax,$N[0] 696238384Sjkim mov -8($ap,$j,8),%rax 697238384Sjkim adc \$0,%rdx 698238384Sjkim add $A[0],$N[0] 699238384Sjkim adc \$0,%rdx 700238384Sjkim mov $N[1],-32(%rsp,$j,8) # tp[j-1] 701238384Sjkim mov %rdx,$N[1] 702238384Sjkim 703238384Sjkim mulq $m0 # ap[j]*bp[i] 704238384Sjkim add %rax,$A[1] 705238384Sjkim mov -8($np,$j,8),%rax 706238384Sjkim adc \$0,%rdx 707238384Sjkim add -8(%rsp,$j,8),$A[1] 708238384Sjkim adc \$0,%rdx 709238384Sjkim lea 1($i),$i # i++ 710238384Sjkim mov %rdx,$A[0] 711238384Sjkim 712238384Sjkim mulq $m1 # np[j]*m1 713238384Sjkim add %rax,$N[1] 714238384Sjkim mov ($ap),%rax # ap[0] 715238384Sjkim adc \$0,%rdx 716238384Sjkim add $A[1],$N[1] 717238384Sjkim adc \$0,%rdx 718238384Sjkim mov $N[0],-24(%rsp,$j,8) # tp[j-1] 719238384Sjkim mov %rdx,$N[0] 
720238384Sjkim 721238384Sjkim movq %xmm0,$m0 # bp[i+1] 722238384Sjkim mov $N[1],-16(%rsp,$j,8) # tp[j-1] 723238384Sjkim 724238384Sjkim xor $N[1],$N[1] 725238384Sjkim add $A[0],$N[0] 726238384Sjkim adc \$0,$N[1] 727238384Sjkim add (%rsp,$num,8),$N[0] # pull upmost overflow bit 728238384Sjkim adc \$0,$N[1] 729238384Sjkim mov $N[0],-8(%rsp,$j,8) 730238384Sjkim mov $N[1],(%rsp,$j,8) # store upmost overflow bit 731238384Sjkim 732238384Sjkim cmp $num,$i 733238384Sjkim jl .Louter4x 734238384Sjkim___ 735238384Sjkim{ 736238384Sjkimmy @ri=("%rax","%rdx",$m0,$m1); 737238384Sjkim$code.=<<___; 738238384Sjkim mov 16(%rsp,$num,8),$rp # restore $rp 739238384Sjkim mov 0(%rsp),@ri[0] # tp[0] 740238384Sjkim pxor %xmm0,%xmm0 741238384Sjkim mov 8(%rsp),@ri[1] # tp[1] 742238384Sjkim shr \$2,$num # num/=4 743238384Sjkim lea (%rsp),$ap # borrow ap for tp 744238384Sjkim xor $i,$i # i=0 and clear CF! 745238384Sjkim 746238384Sjkim sub 0($np),@ri[0] 747238384Sjkim mov 16($ap),@ri[2] # tp[2] 748238384Sjkim mov 24($ap),@ri[3] # tp[3] 749238384Sjkim sbb 8($np),@ri[1] 750238384Sjkim lea -1($num),$j # j=num/4-1 751238384Sjkim jmp .Lsub4x 752238384Sjkim.align 16 753238384Sjkim.Lsub4x: 754238384Sjkim mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 755238384Sjkim mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 756238384Sjkim sbb 16($np,$i,8),@ri[2] 757238384Sjkim mov 32($ap,$i,8),@ri[0] # tp[i+1] 758238384Sjkim mov 40($ap,$i,8),@ri[1] 759238384Sjkim sbb 24($np,$i,8),@ri[3] 760238384Sjkim mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 761238384Sjkim mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 762238384Sjkim sbb 32($np,$i,8),@ri[0] 763238384Sjkim mov 48($ap,$i,8),@ri[2] 764238384Sjkim mov 56($ap,$i,8),@ri[3] 765238384Sjkim sbb 40($np,$i,8),@ri[1] 766238384Sjkim lea 4($i),$i # i++ 767238384Sjkim dec $j # doesnn't affect CF! 
768238384Sjkim jnz .Lsub4x 769238384Sjkim 770238384Sjkim mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 771238384Sjkim mov 32($ap,$i,8),@ri[0] # load overflow bit 772238384Sjkim sbb 16($np,$i,8),@ri[2] 773238384Sjkim mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 774238384Sjkim sbb 24($np,$i,8),@ri[3] 775238384Sjkim mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 776238384Sjkim 777238384Sjkim sbb \$0,@ri[0] # handle upmost overflow bit 778238384Sjkim mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 779238384Sjkim xor $i,$i # i=0 780238384Sjkim and @ri[0],$ap 781238384Sjkim not @ri[0] 782238384Sjkim mov $rp,$np 783238384Sjkim and @ri[0],$np 784238384Sjkim lea -1($num),$j 785238384Sjkim or $np,$ap # ap=borrow?tp:rp 786238384Sjkim 787238384Sjkim movdqu ($ap),%xmm1 788238384Sjkim movdqa %xmm0,(%rsp) 789238384Sjkim movdqu %xmm1,($rp) 790238384Sjkim jmp .Lcopy4x 791238384Sjkim.align 16 792238384Sjkim.Lcopy4x: # copy or in-place refresh 793238384Sjkim movdqu 16($ap,$i),%xmm2 794238384Sjkim movdqu 32($ap,$i),%xmm1 795238384Sjkim movdqa %xmm0,16(%rsp,$i) 796238384Sjkim movdqu %xmm2,16($rp,$i) 797238384Sjkim movdqa %xmm0,32(%rsp,$i) 798238384Sjkim movdqu %xmm1,32($rp,$i) 799238384Sjkim lea 32($i),$i 800238384Sjkim dec $j 801238384Sjkim jnz .Lcopy4x 802238384Sjkim 803238384Sjkim shl \$2,$num 804238384Sjkim movdqu 16($ap,$i),%xmm2 805238384Sjkim movdqa %xmm0,16(%rsp,$i) 806238384Sjkim movdqu %xmm2,16($rp,$i) 807238384Sjkim___ 808238384Sjkim} 809238384Sjkim$code.=<<___; 810238384Sjkim mov 8(%rsp,$num,8),%rsi # restore %rsp 811238384Sjkim mov \$1,%rax 812238384Sjkim___ 813238384Sjkim$code.=<<___ if ($win64); 814238384Sjkim movaps (%rsi),%xmm6 815238384Sjkim movaps 0x10(%rsi),%xmm7 816238384Sjkim lea 0x28(%rsi),%rsi 817238384Sjkim___ 818238384Sjkim$code.=<<___; 819238384Sjkim mov (%rsi),%r15 820238384Sjkim mov 8(%rsi),%r14 821238384Sjkim mov 16(%rsi),%r13 822238384Sjkim mov 24(%rsi),%r12 823238384Sjkim mov 32(%rsi),%rbp 824238384Sjkim mov 40(%rsi),%rbx 825238384Sjkim lea 48(%rsi),%rsp 
826238384Sjkim.Lmul4x_epilogue: 827238384Sjkim ret 828238384Sjkim.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 829238384Sjkim___ 830238384Sjkim}}} 831238384Sjkim 832238384Sjkim{ 833238384Sjkimmy ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order 834238384Sjkim ("%rdi","%rsi","%rdx","%rcx"); # Unix order 835238384Sjkimmy $out=$inp; 836238384Sjkimmy $STRIDE=2**5*8; 837238384Sjkimmy $N=$STRIDE/4; 838238384Sjkim 839238384Sjkim$code.=<<___; 840238384Sjkim.globl bn_scatter5 841238384Sjkim.type bn_scatter5,\@abi-omnipotent 842238384Sjkim.align 16 843238384Sjkimbn_scatter5: 844238384Sjkim cmp \$0, $num 845238384Sjkim jz .Lscatter_epilogue 846238384Sjkim lea ($tbl,$idx,8),$tbl 847238384Sjkim.Lscatter: 848238384Sjkim mov ($inp),%rax 849238384Sjkim lea 8($inp),$inp 850238384Sjkim mov %rax,($tbl) 851238384Sjkim lea 32*8($tbl),$tbl 852238384Sjkim sub \$1,$num 853238384Sjkim jnz .Lscatter 854238384Sjkim.Lscatter_epilogue: 855238384Sjkim ret 856238384Sjkim.size bn_scatter5,.-bn_scatter5 857238384Sjkim 858238384Sjkim.globl bn_gather5 859238384Sjkim.type bn_gather5,\@abi-omnipotent 860238384Sjkim.align 16 861238384Sjkimbn_gather5: 862238384Sjkim___ 863238384Sjkim$code.=<<___ if ($win64); 864238384Sjkim.LSEH_begin_bn_gather5: 865238384Sjkim # I can't trust assembler to use specific encoding:-( 866238384Sjkim .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp 867238384Sjkim .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 868238384Sjkim .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) 869238384Sjkim___ 870238384Sjkim$code.=<<___; 871238384Sjkim mov $idx,%r11 872238384Sjkim shr \$`log($N/8)/log(2)`,$idx 873238384Sjkim and \$`$N/8-1`,%r11 874238384Sjkim not $idx 875238384Sjkim lea .Lmagic_masks(%rip),%rax 876238384Sjkim and \$`2**5/($N/8)-1`,$idx # 5 is "window size" 877238384Sjkim lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line 878238384Sjkim movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which 879238384Sjkim movq 8(%rax,$idx,8),%xmm5 # cache line 
contains element 880238384Sjkim movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument 881238384Sjkim movq 24(%rax,$idx,8),%xmm7 882238384Sjkim jmp .Lgather 883238384Sjkim.align 16 884238384Sjkim.Lgather: 885238384Sjkim movq `0*$STRIDE/4-96`($tbl),%xmm0 886238384Sjkim movq `1*$STRIDE/4-96`($tbl),%xmm1 887238384Sjkim pand %xmm4,%xmm0 888238384Sjkim movq `2*$STRIDE/4-96`($tbl),%xmm2 889238384Sjkim pand %xmm5,%xmm1 890238384Sjkim movq `3*$STRIDE/4-96`($tbl),%xmm3 891238384Sjkim pand %xmm6,%xmm2 892238384Sjkim por %xmm1,%xmm0 893238384Sjkim pand %xmm7,%xmm3 894238384Sjkim por %xmm2,%xmm0 895238384Sjkim lea $STRIDE($tbl),$tbl 896238384Sjkim por %xmm3,%xmm0 897238384Sjkim 898238384Sjkim movq %xmm0,($out) # m0=bp[0] 899238384Sjkim lea 8($out),$out 900238384Sjkim sub \$1,$num 901238384Sjkim jnz .Lgather 902238384Sjkim___ 903238384Sjkim$code.=<<___ if ($win64); 904279264Sdelphij movaps (%rsp),%xmm6 905279264Sdelphij movaps 0x10(%rsp),%xmm7 906238384Sjkim lea 0x28(%rsp),%rsp 907238384Sjkim___ 908238384Sjkim$code.=<<___; 909238384Sjkim ret 910238384Sjkim.LSEH_end_bn_gather5: 911238384Sjkim.size bn_gather5,.-bn_gather5 912238384Sjkim___ 913238384Sjkim} 914238384Sjkim$code.=<<___; 915238384Sjkim.align 64 916238384Sjkim.Lmagic_masks: 917238384Sjkim .long 0,0, 0,0, 0,0, -1,-1 918238384Sjkim .long 0,0, 0,0, 0,0, 0,0 919238384Sjkim.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 920238384Sjkim___ 921238384Sjkim 922238384Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 923238384Sjkim# CONTEXT *context,DISPATCHER_CONTEXT *disp) 924238384Sjkimif ($win64) { 925238384Sjkim$rec="%rcx"; 926238384Sjkim$frame="%rdx"; 927238384Sjkim$context="%r8"; 928238384Sjkim$disp="%r9"; 929238384Sjkim 930238384Sjkim$code.=<<___; 931238384Sjkim.extern __imp_RtlVirtualUnwind 932238384Sjkim.type mul_handler,\@abi-omnipotent 933238384Sjkim.align 16 934238384Sjkimmul_handler: 935238384Sjkim push %rsi 936238384Sjkim push %rdi 
937238384Sjkim push %rbx 938238384Sjkim push %rbp 939238384Sjkim push %r12 940238384Sjkim push %r13 941238384Sjkim push %r14 942238384Sjkim push %r15 943238384Sjkim pushfq 944238384Sjkim sub \$64,%rsp 945238384Sjkim 946238384Sjkim mov 120($context),%rax # pull context->Rax 947238384Sjkim mov 248($context),%rbx # pull context->Rip 948238384Sjkim 949238384Sjkim mov 8($disp),%rsi # disp->ImageBase 950238384Sjkim mov 56($disp),%r11 # disp->HandlerData 951238384Sjkim 952238384Sjkim mov 0(%r11),%r10d # HandlerData[0] 953238384Sjkim lea (%rsi,%r10),%r10 # end of prologue label 954238384Sjkim cmp %r10,%rbx # context->Rip<end of prologue label 955238384Sjkim jb .Lcommon_seh_tail 956238384Sjkim 957238384Sjkim lea `40+48`(%rax),%rax 958238384Sjkim 959238384Sjkim mov 4(%r11),%r10d # HandlerData[1] 960238384Sjkim lea (%rsi,%r10),%r10 # end of alloca label 961238384Sjkim cmp %r10,%rbx # context->Rip<end of alloca label 962238384Sjkim jb .Lcommon_seh_tail 963238384Sjkim 964238384Sjkim mov 152($context),%rax # pull context->Rsp 965238384Sjkim 966238384Sjkim mov 8(%r11),%r10d # HandlerData[2] 967238384Sjkim lea (%rsi,%r10),%r10 # epilogue label 968238384Sjkim cmp %r10,%rbx # context->Rip>=epilogue label 969238384Sjkim jae .Lcommon_seh_tail 970238384Sjkim 971238384Sjkim mov 192($context),%r10 # pull $num 972238384Sjkim mov 8(%rax,%r10,8),%rax # pull saved stack pointer 973238384Sjkim 974238384Sjkim movaps (%rax),%xmm0 975238384Sjkim movaps 16(%rax),%xmm1 976238384Sjkim lea `40+48`(%rax),%rax 977238384Sjkim 978238384Sjkim mov -8(%rax),%rbx 979238384Sjkim mov -16(%rax),%rbp 980238384Sjkim mov -24(%rax),%r12 981238384Sjkim mov -32(%rax),%r13 982238384Sjkim mov -40(%rax),%r14 983238384Sjkim mov -48(%rax),%r15 984238384Sjkim mov %rbx,144($context) # restore context->Rbx 985238384Sjkim mov %rbp,160($context) # restore context->Rbp 986238384Sjkim mov %r12,216($context) # restore context->R12 987238384Sjkim mov %r13,224($context) # restore context->R13 988238384Sjkim mov %r14,232($context) 
# restore context->R14 989238384Sjkim mov %r15,240($context) # restore context->R15 990238384Sjkim movups %xmm0,512($context) # restore context->Xmm6 991238384Sjkim movups %xmm1,528($context) # restore context->Xmm7 992238384Sjkim 993238384Sjkim.Lcommon_seh_tail: 994238384Sjkim mov 8(%rax),%rdi 995238384Sjkim mov 16(%rax),%rsi 996238384Sjkim mov %rax,152($context) # restore context->Rsp 997238384Sjkim mov %rsi,168($context) # restore context->Rsi 998238384Sjkim mov %rdi,176($context) # restore context->Rdi 999238384Sjkim 1000238384Sjkim mov 40($disp),%rdi # disp->ContextRecord 1001238384Sjkim mov $context,%rsi # context 1002238384Sjkim mov \$154,%ecx # sizeof(CONTEXT) 1003238384Sjkim .long 0xa548f3fc # cld; rep movsq 1004238384Sjkim 1005238384Sjkim mov $disp,%rsi 1006238384Sjkim xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1007238384Sjkim mov 8(%rsi),%rdx # arg2, disp->ImageBase 1008238384Sjkim mov 0(%rsi),%r8 # arg3, disp->ControlPc 1009238384Sjkim mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1010238384Sjkim mov 40(%rsi),%r10 # disp->ContextRecord 1011238384Sjkim lea 56(%rsi),%r11 # &disp->HandlerData 1012238384Sjkim lea 24(%rsi),%r12 # &disp->EstablisherFrame 1013238384Sjkim mov %r10,32(%rsp) # arg5 1014238384Sjkim mov %r11,40(%rsp) # arg6 1015238384Sjkim mov %r12,48(%rsp) # arg7 1016238384Sjkim mov %rcx,56(%rsp) # arg8, (NULL) 1017238384Sjkim call *__imp_RtlVirtualUnwind(%rip) 1018238384Sjkim 1019238384Sjkim mov \$1,%eax # ExceptionContinueSearch 1020238384Sjkim add \$64,%rsp 1021238384Sjkim popfq 1022238384Sjkim pop %r15 1023238384Sjkim pop %r14 1024238384Sjkim pop %r13 1025238384Sjkim pop %r12 1026238384Sjkim pop %rbp 1027238384Sjkim pop %rbx 1028238384Sjkim pop %rdi 1029238384Sjkim pop %rsi 1030238384Sjkim ret 1031238384Sjkim.size mul_handler,.-mul_handler 1032238384Sjkim 1033238384Sjkim.section .pdata 1034238384Sjkim.align 4 1035238384Sjkim .rva .LSEH_begin_bn_mul_mont_gather5 1036238384Sjkim .rva .LSEH_end_bn_mul_mont_gather5 1037238384Sjkim .rva 
.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0d,0x05,0x00
	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6
	.byte	0x04,0x42,0x00,0x00	#sub rsp,0x28
.align	8
___
}

# Fold every `expr` written between backticks in the generated assembly
# (e.g. `40+48`) into its value: /e evaluates the replacement as Perl code,
# /g applies it to every occurrence in $code.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# STDOUT was re-pointed at a pipe into x86_64-xlate.pl at the top of this
# script, so this print feeds the translator rather than the terminal.
print $code;

# close() on a piped handle flushes the buffer and waits for the child; it
# returns false if the flush fails or the translator exits non-zero (in the
# latter case $! is 0 and $? carries the status).  Fail loudly so that a
# truncated or missing output file is never mistaken for success.
close STDOUT
    or die "error closing STDOUT: $! (child exit status $?)";