# x86_64-mont.pl revision 298999
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice, >2x, as fast. Most common rsa1024 sign is improved by
# respectful 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but in average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# Pick up perlasm arguments and route all output through the x86_64
# translator (x86_64-xlate.pl), which converts the perlasm dialect
# emitted below into gas/nasm/masm syntax as requested by $flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# Register assignments; the first six mirror the C prototype:
# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

# Entry point: dispatch to the 4x-unrolled multiply (num divisible by 4,
# num>=8) or the dedicated squaring routine (same constraints, ap==bp);
# otherwise fall through to the generic word-by-word loop below.
$code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	jmp	.Lsqr4x_enter

.align	16
.Lmul_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
100 sub %rsp,%r11 101 and \$-4096,%r11 102.Lmul_page_walk: 103 mov (%rsp,%r11),%r10 104 sub \$4096,%r11 105 .byte 0x66,0x2e # predict non-taken 106 jnc .Lmul_page_walk 107 108 mov $bp,%r12 # reassign $bp 109___ 110 $bp="%r12"; 111$code.=<<___; 112 mov ($n0),$n0 # pull n0[0] value 113 mov ($bp),$m0 # m0=bp[0] 114 mov ($ap),%rax 115 116 xor $i,$i # i=0 117 xor $j,$j # j=0 118 119 mov $n0,$m1 120 mulq $m0 # ap[0]*bp[0] 121 mov %rax,$lo0 122 mov ($np),%rax 123 124 imulq $lo0,$m1 # "tp[0]"*n0 125 mov %rdx,$hi0 126 127 mulq $m1 # np[0]*m1 128 add %rax,$lo0 # discarded 129 mov 8($ap),%rax 130 adc \$0,%rdx 131 mov %rdx,$hi1 132 133 lea 1($j),$j # j++ 134 jmp .L1st_enter 135 136.align 16 137.L1st: 138 add %rax,$hi1 139 mov ($ap,$j,8),%rax 140 adc \$0,%rdx 141 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 142 mov $lo0,$hi0 143 adc \$0,%rdx 144 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 145 mov %rdx,$hi1 146 147.L1st_enter: 148 mulq $m0 # ap[j]*bp[0] 149 add %rax,$hi0 150 mov ($np,$j,8),%rax 151 adc \$0,%rdx 152 lea 1($j),$j # j++ 153 mov %rdx,$lo0 154 155 mulq $m1 # np[j]*m1 156 cmp $num,$j 157 jne .L1st 158 159 add %rax,$hi1 160 mov ($ap),%rax # ap[0] 161 adc \$0,%rdx 162 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 163 adc \$0,%rdx 164 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 165 mov %rdx,$hi1 166 mov $lo0,$hi0 167 168 xor %rdx,%rdx 169 add $hi0,$hi1 170 adc \$0,%rdx 171 mov $hi1,-8(%rsp,$num,8) 172 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 173 174 lea 1($i),$i # i++ 175 jmp .Louter 176.align 16 177.Louter: 178 mov ($bp,$i,8),$m0 # m0=bp[i] 179 xor $j,$j # j=0 180 mov $n0,$m1 181 mov (%rsp),$lo0 182 mulq $m0 # ap[0]*bp[i] 183 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 184 mov ($np),%rax 185 adc \$0,%rdx 186 187 imulq $lo0,$m1 # tp[0]*n0 188 mov %rdx,$hi0 189 190 mulq $m1 # np[0]*m1 191 add %rax,$lo0 # discarded 192 mov 8($ap),%rax 193 adc \$0,%rdx 194 mov 8(%rsp),$lo0 # tp[1] 195 mov %rdx,$hi1 196 197 lea 1($j),$j # j++ 198 jmp .Linner_enter 199 200.align 16 201.Linner: 202 add %rax,$hi1 203 mov 
($ap,$j,8),%rax 204 adc \$0,%rdx 205 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 206 mov (%rsp,$j,8),$lo0 207 adc \$0,%rdx 208 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 209 mov %rdx,$hi1 210 211.Linner_enter: 212 mulq $m0 # ap[j]*bp[i] 213 add %rax,$hi0 214 mov ($np,$j,8),%rax 215 adc \$0,%rdx 216 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 217 mov %rdx,$hi0 218 adc \$0,$hi0 219 lea 1($j),$j # j++ 220 221 mulq $m1 # np[j]*m1 222 cmp $num,$j 223 jne .Linner 224 225 add %rax,$hi1 226 mov ($ap),%rax # ap[0] 227 adc \$0,%rdx 228 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 229 mov (%rsp,$j,8),$lo0 230 adc \$0,%rdx 231 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 232 mov %rdx,$hi1 233 234 xor %rdx,%rdx 235 add $hi0,$hi1 236 adc \$0,%rdx 237 add $lo0,$hi1 # pull upmost overflow bit 238 adc \$0,%rdx 239 mov $hi1,-8(%rsp,$num,8) 240 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 241 242 lea 1($i),$i # i++ 243 cmp $num,$i 244 jl .Louter 245 246 xor $i,$i # i=0 and clear CF! 247 mov (%rsp),%rax # tp[0] 248 lea (%rsp),$ap # borrow ap for tp 249 mov $num,$j # j=num 250 jmp .Lsub 251.align 16 252.Lsub: sbb ($np,$i,8),%rax 253 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 254 mov 8($ap,$i,8),%rax # tp[i+1] 255 lea 1($i),$i # i++ 256 dec $j # doesnn't affect CF! 
257 jnz .Lsub 258 259 sbb \$0,%rax # handle upmost overflow bit 260 xor $i,$i 261 and %rax,$ap 262 not %rax 263 mov $rp,$np 264 and %rax,$np 265 mov $num,$j # j=num 266 or $np,$ap # ap=borrow?tp:rp 267.align 16 268.Lcopy: # copy or in-place refresh 269 mov ($ap,$i,8),%rax 270 mov $i,(%rsp,$i,8) # zap temporary vector 271 mov %rax,($rp,$i,8) # rp[i]=tp[i] 272 lea 1($i),$i 273 sub \$1,$j 274 jnz .Lcopy 275 276 mov 8(%rsp,$num,8),%rsi # restore %rsp 277 mov \$1,%rax 278 mov (%rsi),%r15 279 mov 8(%rsi),%r14 280 mov 16(%rsi),%r13 281 mov 24(%rsi),%r12 282 mov 32(%rsi),%rbp 283 mov 40(%rsi),%rbx 284 lea 48(%rsi),%rsp 285.Lmul_epilogue: 286 ret 287.size bn_mul_mont,.-bn_mul_mont 288___ 289{{{ 290my @A=("%r10","%r11"); 291my @N=("%r13","%rdi"); 292$code.=<<___; 293.type bn_mul4x_mont,\@function,6 294.align 16 295bn_mul4x_mont: 296.Lmul4x_enter: 297 push %rbx 298 push %rbp 299 push %r12 300 push %r13 301 push %r14 302 push %r15 303 304 mov ${num}d,${num}d 305 lea 4($num),%r10 306 mov %rsp,%r11 307 neg %r10 308 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) 309 and \$-1024,%rsp # minimize TLB usage 310 311 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp 312.Lmul4x_body: 313 sub %rsp,%r11 314 and \$-4096,%r11 315.Lmul4x_page_walk: 316 mov (%rsp,%r11),%r10 317 sub \$4096,%r11 318 .byte 0x2e # predict non-taken 319 jnc .Lmul4x_page_walk 320 321 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp 322 mov %rdx,%r12 # reassign $bp 323___ 324 $bp="%r12"; 325$code.=<<___; 326 mov ($n0),$n0 # pull n0[0] value 327 mov ($bp),$m0 # m0=bp[0] 328 mov ($ap),%rax 329 330 xor $i,$i # i=0 331 xor $j,$j # j=0 332 333 mov $n0,$m1 334 mulq $m0 # ap[0]*bp[0] 335 mov %rax,$A[0] 336 mov ($np),%rax 337 338 imulq $A[0],$m1 # "tp[0]"*n0 339 mov %rdx,$A[1] 340 341 mulq $m1 # np[0]*m1 342 add %rax,$A[0] # discarded 343 mov 8($ap),%rax 344 adc \$0,%rdx 345 mov %rdx,$N[1] 346 347 mulq $m0 348 add %rax,$A[1] 349 mov 8($np),%rax 350 adc \$0,%rdx 351 mov %rdx,$A[0] 352 353 mulq $m1 354 add %rax,$N[1] 355 mov 16($ap),%rax 356 
adc \$0,%rdx 357 add $A[1],$N[1] 358 lea 4($j),$j # j++ 359 adc \$0,%rdx 360 mov $N[1],(%rsp) 361 mov %rdx,$N[0] 362 jmp .L1st4x 363.align 16 364.L1st4x: 365 mulq $m0 # ap[j]*bp[0] 366 add %rax,$A[0] 367 mov -16($np,$j,8),%rax 368 adc \$0,%rdx 369 mov %rdx,$A[1] 370 371 mulq $m1 # np[j]*m1 372 add %rax,$N[0] 373 mov -8($ap,$j,8),%rax 374 adc \$0,%rdx 375 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 376 adc \$0,%rdx 377 mov $N[0],-24(%rsp,$j,8) # tp[j-1] 378 mov %rdx,$N[1] 379 380 mulq $m0 # ap[j]*bp[0] 381 add %rax,$A[1] 382 mov -8($np,$j,8),%rax 383 adc \$0,%rdx 384 mov %rdx,$A[0] 385 386 mulq $m1 # np[j]*m1 387 add %rax,$N[1] 388 mov ($ap,$j,8),%rax 389 adc \$0,%rdx 390 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 391 adc \$0,%rdx 392 mov $N[1],-16(%rsp,$j,8) # tp[j-1] 393 mov %rdx,$N[0] 394 395 mulq $m0 # ap[j]*bp[0] 396 add %rax,$A[0] 397 mov ($np,$j,8),%rax 398 adc \$0,%rdx 399 mov %rdx,$A[1] 400 401 mulq $m1 # np[j]*m1 402 add %rax,$N[0] 403 mov 8($ap,$j,8),%rax 404 adc \$0,%rdx 405 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 406 adc \$0,%rdx 407 mov $N[0],-8(%rsp,$j,8) # tp[j-1] 408 mov %rdx,$N[1] 409 410 mulq $m0 # ap[j]*bp[0] 411 add %rax,$A[1] 412 mov 8($np,$j,8),%rax 413 adc \$0,%rdx 414 lea 4($j),$j # j++ 415 mov %rdx,$A[0] 416 417 mulq $m1 # np[j]*m1 418 add %rax,$N[1] 419 mov -16($ap,$j,8),%rax 420 adc \$0,%rdx 421 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 422 adc \$0,%rdx 423 mov $N[1],-32(%rsp,$j,8) # tp[j-1] 424 mov %rdx,$N[0] 425 cmp $num,$j 426 jl .L1st4x 427 428 mulq $m0 # ap[j]*bp[0] 429 add %rax,$A[0] 430 mov -16($np,$j,8),%rax 431 adc \$0,%rdx 432 mov %rdx,$A[1] 433 434 mulq $m1 # np[j]*m1 435 add %rax,$N[0] 436 mov -8($ap,$j,8),%rax 437 adc \$0,%rdx 438 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 439 adc \$0,%rdx 440 mov $N[0],-24(%rsp,$j,8) # tp[j-1] 441 mov %rdx,$N[1] 442 443 mulq $m0 # ap[j]*bp[0] 444 add %rax,$A[1] 445 mov -8($np,$j,8),%rax 446 adc \$0,%rdx 447 mov %rdx,$A[0] 448 449 mulq $m1 # np[j]*m1 450 add %rax,$N[1] 451 mov ($ap),%rax # ap[0] 452 adc 
\$0,%rdx 453 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 454 adc \$0,%rdx 455 mov $N[1],-16(%rsp,$j,8) # tp[j-1] 456 mov %rdx,$N[0] 457 458 xor $N[1],$N[1] 459 add $A[0],$N[0] 460 adc \$0,$N[1] 461 mov $N[0],-8(%rsp,$j,8) 462 mov $N[1],(%rsp,$j,8) # store upmost overflow bit 463 464 lea 1($i),$i # i++ 465.align 4 466.Louter4x: 467 mov ($bp,$i,8),$m0 # m0=bp[i] 468 xor $j,$j # j=0 469 mov (%rsp),$A[0] 470 mov $n0,$m1 471 mulq $m0 # ap[0]*bp[i] 472 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 473 mov ($np),%rax 474 adc \$0,%rdx 475 476 imulq $A[0],$m1 # tp[0]*n0 477 mov %rdx,$A[1] 478 479 mulq $m1 # np[0]*m1 480 add %rax,$A[0] # "$N[0]", discarded 481 mov 8($ap),%rax 482 adc \$0,%rdx 483 mov %rdx,$N[1] 484 485 mulq $m0 # ap[j]*bp[i] 486 add %rax,$A[1] 487 mov 8($np),%rax 488 adc \$0,%rdx 489 add 8(%rsp),$A[1] # +tp[1] 490 adc \$0,%rdx 491 mov %rdx,$A[0] 492 493 mulq $m1 # np[j]*m1 494 add %rax,$N[1] 495 mov 16($ap),%rax 496 adc \$0,%rdx 497 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 498 lea 4($j),$j # j+=2 499 adc \$0,%rdx 500 mov $N[1],(%rsp) # tp[j-1] 501 mov %rdx,$N[0] 502 jmp .Linner4x 503.align 16 504.Linner4x: 505 mulq $m0 # ap[j]*bp[i] 506 add %rax,$A[0] 507 mov -16($np,$j,8),%rax 508 adc \$0,%rdx 509 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 510 adc \$0,%rdx 511 mov %rdx,$A[1] 512 513 mulq $m1 # np[j]*m1 514 add %rax,$N[0] 515 mov -8($ap,$j,8),%rax 516 adc \$0,%rdx 517 add $A[0],$N[0] 518 adc \$0,%rdx 519 mov $N[0],-24(%rsp,$j,8) # tp[j-1] 520 mov %rdx,$N[1] 521 522 mulq $m0 # ap[j]*bp[i] 523 add %rax,$A[1] 524 mov -8($np,$j,8),%rax 525 adc \$0,%rdx 526 add -8(%rsp,$j,8),$A[1] 527 adc \$0,%rdx 528 mov %rdx,$A[0] 529 530 mulq $m1 # np[j]*m1 531 add %rax,$N[1] 532 mov ($ap,$j,8),%rax 533 adc \$0,%rdx 534 add $A[1],$N[1] 535 adc \$0,%rdx 536 mov $N[1],-16(%rsp,$j,8) # tp[j-1] 537 mov %rdx,$N[0] 538 539 mulq $m0 # ap[j]*bp[i] 540 add %rax,$A[0] 541 mov ($np,$j,8),%rax 542 adc \$0,%rdx 543 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 544 adc \$0,%rdx 545 mov %rdx,$A[1] 546 547 
mulq $m1 # np[j]*m1 548 add %rax,$N[0] 549 mov 8($ap,$j,8),%rax 550 adc \$0,%rdx 551 add $A[0],$N[0] 552 adc \$0,%rdx 553 mov $N[0],-8(%rsp,$j,8) # tp[j-1] 554 mov %rdx,$N[1] 555 556 mulq $m0 # ap[j]*bp[i] 557 add %rax,$A[1] 558 mov 8($np,$j,8),%rax 559 adc \$0,%rdx 560 add 8(%rsp,$j,8),$A[1] 561 adc \$0,%rdx 562 lea 4($j),$j # j++ 563 mov %rdx,$A[0] 564 565 mulq $m1 # np[j]*m1 566 add %rax,$N[1] 567 mov -16($ap,$j,8),%rax 568 adc \$0,%rdx 569 add $A[1],$N[1] 570 adc \$0,%rdx 571 mov $N[1],-32(%rsp,$j,8) # tp[j-1] 572 mov %rdx,$N[0] 573 cmp $num,$j 574 jl .Linner4x 575 576 mulq $m0 # ap[j]*bp[i] 577 add %rax,$A[0] 578 mov -16($np,$j,8),%rax 579 adc \$0,%rdx 580 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 581 adc \$0,%rdx 582 mov %rdx,$A[1] 583 584 mulq $m1 # np[j]*m1 585 add %rax,$N[0] 586 mov -8($ap,$j,8),%rax 587 adc \$0,%rdx 588 add $A[0],$N[0] 589 adc \$0,%rdx 590 mov $N[0],-24(%rsp,$j,8) # tp[j-1] 591 mov %rdx,$N[1] 592 593 mulq $m0 # ap[j]*bp[i] 594 add %rax,$A[1] 595 mov -8($np,$j,8),%rax 596 adc \$0,%rdx 597 add -8(%rsp,$j,8),$A[1] 598 adc \$0,%rdx 599 lea 1($i),$i # i++ 600 mov %rdx,$A[0] 601 602 mulq $m1 # np[j]*m1 603 add %rax,$N[1] 604 mov ($ap),%rax # ap[0] 605 adc \$0,%rdx 606 add $A[1],$N[1] 607 adc \$0,%rdx 608 mov $N[1],-16(%rsp,$j,8) # tp[j-1] 609 mov %rdx,$N[0] 610 611 xor $N[1],$N[1] 612 add $A[0],$N[0] 613 adc \$0,$N[1] 614 add (%rsp,$num,8),$N[0] # pull upmost overflow bit 615 adc \$0,$N[1] 616 mov $N[0],-8(%rsp,$j,8) 617 mov $N[1],(%rsp,$j,8) # store upmost overflow bit 618 619 cmp $num,$i 620 jl .Louter4x 621___ 622{ 623my @ri=("%rax","%rdx",$m0,$m1); 624$code.=<<___; 625 mov 16(%rsp,$num,8),$rp # restore $rp 626 mov 0(%rsp),@ri[0] # tp[0] 627 pxor %xmm0,%xmm0 628 mov 8(%rsp),@ri[1] # tp[1] 629 shr \$2,$num # num/=4 630 lea (%rsp),$ap # borrow ap for tp 631 xor $i,$i # i=0 and clear CF! 
632 633 sub 0($np),@ri[0] 634 mov 16($ap),@ri[2] # tp[2] 635 mov 24($ap),@ri[3] # tp[3] 636 sbb 8($np),@ri[1] 637 lea -1($num),$j # j=num/4-1 638 jmp .Lsub4x 639.align 16 640.Lsub4x: 641 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 642 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 643 sbb 16($np,$i,8),@ri[2] 644 mov 32($ap,$i,8),@ri[0] # tp[i+1] 645 mov 40($ap,$i,8),@ri[1] 646 sbb 24($np,$i,8),@ri[3] 647 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 648 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 649 sbb 32($np,$i,8),@ri[0] 650 mov 48($ap,$i,8),@ri[2] 651 mov 56($ap,$i,8),@ri[3] 652 sbb 40($np,$i,8),@ri[1] 653 lea 4($i),$i # i++ 654 dec $j # doesnn't affect CF! 655 jnz .Lsub4x 656 657 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 658 mov 32($ap,$i,8),@ri[0] # load overflow bit 659 sbb 16($np,$i,8),@ri[2] 660 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 661 sbb 24($np,$i,8),@ri[3] 662 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 663 664 sbb \$0,@ri[0] # handle upmost overflow bit 665 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 666 xor $i,$i # i=0 667 and @ri[0],$ap 668 not @ri[0] 669 mov $rp,$np 670 and @ri[0],$np 671 lea -1($num),$j 672 or $np,$ap # ap=borrow?tp:rp 673 674 movdqu ($ap),%xmm1 675 movdqa %xmm0,(%rsp) 676 movdqu %xmm1,($rp) 677 jmp .Lcopy4x 678.align 16 679.Lcopy4x: # copy or in-place refresh 680 movdqu 16($ap,$i),%xmm2 681 movdqu 32($ap,$i),%xmm1 682 movdqa %xmm0,16(%rsp,$i) 683 movdqu %xmm2,16($rp,$i) 684 movdqa %xmm0,32(%rsp,$i) 685 movdqu %xmm1,32($rp,$i) 686 lea 32($i),$i 687 dec $j 688 jnz .Lcopy4x 689 690 shl \$2,$num 691 movdqu 16($ap,$i),%xmm2 692 movdqa %xmm0,16(%rsp,$i) 693 movdqu %xmm2,16($rp,$i) 694___ 695} 696$code.=<<___; 697 mov 8(%rsp,$num,8),%rsi # restore %rsp 698 mov \$1,%rax 699 mov (%rsi),%r15 700 mov 8(%rsi),%r14 701 mov 16(%rsi),%r13 702 mov 24(%rsi),%r12 703 mov 32(%rsi),%rbp 704 mov 40(%rsi),%rbx 705 lea 48(%rsi),%rsp 706.Lmul4x_epilogue: 707 ret 708.size bn_mul4x_mont,.-bn_mul4x_mont 709___ 710}}} 711{{{ 
712###################################################################### 713# void bn_sqr4x_mont( 714my $rptr="%rdi"; # const BN_ULONG *rptr, 715my $aptr="%rsi"; # const BN_ULONG *aptr, 716my $bptr="%rdx"; # not used 717my $nptr="%rcx"; # const BN_ULONG *nptr, 718my $n0 ="%r8"; # const BN_ULONG *n0); 719my $num ="%r9"; # int num, has to be divisible by 4 and 720 # not less than 8 721 722my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 723my @A0=("%r10","%r11"); 724my @A1=("%r12","%r13"); 725my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 726 727$code.=<<___; 728.type bn_sqr4x_mont,\@function,6 729.align 16 730bn_sqr4x_mont: 731.Lsqr4x_enter: 732 mov %rsp,%rax 733 push %rbx 734 push %rbp 735 push %r12 736 push %r13 737 push %r14 738 push %r15 739 740 shl \$3,${num}d # convert $num to bytes 741 mov %rsp,%r11 # put aside %rsp 742 neg $num # -$num 743 mov ($n0),$n0 # *n0 744 lea -72(%rsp,$num,2),%rsp # alloca(frame+2*$num) 745 and \$-1024,%rsp # minimize TLB usage 746 747 sub %rsp,%r11 748 and \$-4096,%r11 749.Lsqr4x_page_walk: 750 mov (%rsp,%r11),%r10 751 sub \$4096,%r11 752 .byte 0x2e # predict non-taken 753 jnc .Lsqr4x_page_walk 754 755 mov $num,%r10 756 neg $num # restore $num 757 lea -48(%rax),%r11 # restore saved %rsp 758 ############################################################## 759 # Stack layout 760 # 761 # +0 saved $num, used in reduction section 762 # +8 &t[2*$num], used in reduction section 763 # +32 saved $rptr 764 # +40 saved $nptr 765 # +48 saved *n0 766 # +56 saved %rsp 767 # +64 t[2*$num] 768 # 769 mov $rptr,32(%rsp) # save $rptr 770 mov $nptr,40(%rsp) 771 mov $n0, 48(%rsp) 772 mov %r11, 56(%rsp) # save original %rsp 773.Lsqr4x_body: 774 ############################################################## 775 # Squaring part: 776 # 777 # a) multiply-n-add everything but a[i]*a[i]; 778 # b) shift result of a) by 1 to the left and accumulate 779 # a[i]*a[i] products; 780 # 781 lea 32(%r10),$i # $i=-($num-32) 782 lea ($aptr,$num),$aptr # end of a[] buffer, 
($aptr,$i)=&ap[2] 783 784 mov $num,$j # $j=$num 785 786 # comments apply to $num==8 case 787 mov -32($aptr,$i),$a0 # a[0] 788 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 789 mov -24($aptr,$i),%rax # a[1] 790 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 791 mov -16($aptr,$i),$ai # a[2] 792 mov %rax,$a1 793 794 mul $a0 # a[1]*a[0] 795 mov %rax,$A0[0] # a[1]*a[0] 796 mov $ai,%rax # a[2] 797 mov %rdx,$A0[1] 798 mov $A0[0],-24($tptr,$i) # t[1] 799 800 xor $A0[0],$A0[0] 801 mul $a0 # a[2]*a[0] 802 add %rax,$A0[1] 803 mov $ai,%rax 804 adc %rdx,$A0[0] 805 mov $A0[1],-16($tptr,$i) # t[2] 806 807 lea -16($i),$j # j=-16 808 809 810 mov 8($aptr,$j),$ai # a[3] 811 mul $a1 # a[2]*a[1] 812 mov %rax,$A1[0] # a[2]*a[1]+t[3] 813 mov $ai,%rax 814 mov %rdx,$A1[1] 815 816 xor $A0[1],$A0[1] 817 add $A1[0],$A0[0] 818 lea 16($j),$j 819 adc \$0,$A0[1] 820 mul $a0 # a[3]*a[0] 821 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 822 mov $ai,%rax 823 adc %rdx,$A0[1] 824 mov $A0[0],-8($tptr,$j) # t[3] 825 jmp .Lsqr4x_1st 826 827.align 16 828.Lsqr4x_1st: 829 mov ($aptr,$j),$ai # a[4] 830 xor $A1[0],$A1[0] 831 mul $a1 # a[3]*a[1] 832 add %rax,$A1[1] # a[3]*a[1]+t[4] 833 mov $ai,%rax 834 adc %rdx,$A1[0] 835 836 xor $A0[0],$A0[0] 837 add $A1[1],$A0[1] 838 adc \$0,$A0[0] 839 mul $a0 # a[4]*a[0] 840 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 841 mov $ai,%rax # a[3] 842 adc %rdx,$A0[0] 843 mov $A0[1],($tptr,$j) # t[4] 844 845 846 mov 8($aptr,$j),$ai # a[5] 847 xor $A1[1],$A1[1] 848 mul $a1 # a[4]*a[3] 849 add %rax,$A1[0] # a[4]*a[3]+t[5] 850 mov $ai,%rax 851 adc %rdx,$A1[1] 852 853 xor $A0[1],$A0[1] 854 add $A1[0],$A0[0] 855 adc \$0,$A0[1] 856 mul $a0 # a[5]*a[2] 857 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 858 mov $ai,%rax 859 adc %rdx,$A0[1] 860 mov $A0[0],8($tptr,$j) # t[5] 861 862 mov 16($aptr,$j),$ai # a[6] 863 xor $A1[0],$A1[0] 864 mul $a1 # a[5]*a[3] 865 add %rax,$A1[1] # a[5]*a[3]+t[6] 866 mov $ai,%rax 867 adc %rdx,$A1[0] 868 869 xor $A0[0],$A0[0] 870 add 
$A1[1],$A0[1] 871 adc \$0,$A0[0] 872 mul $a0 # a[6]*a[2] 873 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 874 mov $ai,%rax # a[3] 875 adc %rdx,$A0[0] 876 mov $A0[1],16($tptr,$j) # t[6] 877 878 879 mov 24($aptr,$j),$ai # a[7] 880 xor $A1[1],$A1[1] 881 mul $a1 # a[6]*a[5] 882 add %rax,$A1[0] # a[6]*a[5]+t[7] 883 mov $ai,%rax 884 adc %rdx,$A1[1] 885 886 xor $A0[1],$A0[1] 887 add $A1[0],$A0[0] 888 lea 32($j),$j 889 adc \$0,$A0[1] 890 mul $a0 # a[7]*a[4] 891 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 892 mov $ai,%rax 893 adc %rdx,$A0[1] 894 mov $A0[0],-8($tptr,$j) # t[7] 895 896 cmp \$0,$j 897 jne .Lsqr4x_1st 898 899 xor $A1[0],$A1[0] 900 add $A0[1],$A1[1] 901 adc \$0,$A1[0] 902 mul $a1 # a[7]*a[5] 903 add %rax,$A1[1] 904 adc %rdx,$A1[0] 905 906 mov $A1[1],($tptr) # t[8] 907 lea 16($i),$i 908 mov $A1[0],8($tptr) # t[9] 909 jmp .Lsqr4x_outer 910 911.align 16 912.Lsqr4x_outer: # comments apply to $num==6 case 913 mov -32($aptr,$i),$a0 # a[0] 914 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 915 mov -24($aptr,$i),%rax # a[1] 916 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 917 mov -16($aptr,$i),$ai # a[2] 918 mov %rax,$a1 919 920 mov -24($tptr,$i),$A0[0] # t[1] 921 xor $A0[1],$A0[1] 922 mul $a0 # a[1]*a[0] 923 add %rax,$A0[0] # a[1]*a[0]+t[1] 924 mov $ai,%rax # a[2] 925 adc %rdx,$A0[1] 926 mov $A0[0],-24($tptr,$i) # t[1] 927 928 xor $A0[0],$A0[0] 929 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 930 adc \$0,$A0[0] 931 mul $a0 # a[2]*a[0] 932 add %rax,$A0[1] 933 mov $ai,%rax 934 adc %rdx,$A0[0] 935 mov $A0[1],-16($tptr,$i) # t[2] 936 937 lea -16($i),$j # j=-16 938 xor $A1[0],$A1[0] 939 940 941 mov 8($aptr,$j),$ai # a[3] 942 xor $A1[1],$A1[1] 943 add 8($tptr,$j),$A1[0] 944 adc \$0,$A1[1] 945 mul $a1 # a[2]*a[1] 946 add %rax,$A1[0] # a[2]*a[1]+t[3] 947 mov $ai,%rax 948 adc %rdx,$A1[1] 949 950 xor $A0[1],$A0[1] 951 add $A1[0],$A0[0] 952 adc \$0,$A0[1] 953 mul $a0 # a[3]*a[0] 954 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 955 mov $ai,%rax 956 adc 
%rdx,$A0[1] 957 mov $A0[0],8($tptr,$j) # t[3] 958 959 lea 16($j),$j 960 jmp .Lsqr4x_inner 961 962.align 16 963.Lsqr4x_inner: 964 mov ($aptr,$j),$ai # a[4] 965 xor $A1[0],$A1[0] 966 add ($tptr,$j),$A1[1] 967 adc \$0,$A1[0] 968 mul $a1 # a[3]*a[1] 969 add %rax,$A1[1] # a[3]*a[1]+t[4] 970 mov $ai,%rax 971 adc %rdx,$A1[0] 972 973 xor $A0[0],$A0[0] 974 add $A1[1],$A0[1] 975 adc \$0,$A0[0] 976 mul $a0 # a[4]*a[0] 977 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 978 mov $ai,%rax # a[3] 979 adc %rdx,$A0[0] 980 mov $A0[1],($tptr,$j) # t[4] 981 982 mov 8($aptr,$j),$ai # a[5] 983 xor $A1[1],$A1[1] 984 add 8($tptr,$j),$A1[0] 985 adc \$0,$A1[1] 986 mul $a1 # a[4]*a[3] 987 add %rax,$A1[0] # a[4]*a[3]+t[5] 988 mov $ai,%rax 989 adc %rdx,$A1[1] 990 991 xor $A0[1],$A0[1] 992 add $A1[0],$A0[0] 993 lea 16($j),$j # j++ 994 adc \$0,$A0[1] 995 mul $a0 # a[5]*a[2] 996 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 997 mov $ai,%rax 998 adc %rdx,$A0[1] 999 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1000 1001 cmp \$0,$j 1002 jne .Lsqr4x_inner 1003 1004 xor $A1[0],$A1[0] 1005 add $A0[1],$A1[1] 1006 adc \$0,$A1[0] 1007 mul $a1 # a[5]*a[3] 1008 add %rax,$A1[1] 1009 adc %rdx,$A1[0] 1010 1011 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1012 mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below 1013 1014 add \$16,$i 1015 jnz .Lsqr4x_outer 1016 1017 # comments apply to $num==4 case 1018 mov -32($aptr),$a0 # a[0] 1019 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1020 mov -24($aptr),%rax # a[1] 1021 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1022 mov -16($aptr),$ai # a[2] 1023 mov %rax,$a1 1024 1025 xor $A0[1],$A0[1] 1026 mul $a0 # a[1]*a[0] 1027 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1028 mov $ai,%rax # a[2] 1029 adc %rdx,$A0[1] 1030 mov $A0[0],-24($tptr) # t[1] 1031 1032 xor $A0[0],$A0[0] 1033 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1034 adc \$0,$A0[0] 1035 mul $a0 # a[2]*a[0] 1036 add %rax,$A0[1] 1037 mov $ai,%rax 1038 adc 
%rdx,$A0[0] 1039 mov $A0[1],-16($tptr) # t[2] 1040 1041 mov -8($aptr),$ai # a[3] 1042 mul $a1 # a[2]*a[1] 1043 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1044 mov $ai,%rax 1045 adc \$0,%rdx 1046 1047 xor $A0[1],$A0[1] 1048 add $A1[0],$A0[0] 1049 mov %rdx,$A1[1] 1050 adc \$0,$A0[1] 1051 mul $a0 # a[3]*a[0] 1052 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1053 mov $ai,%rax 1054 adc %rdx,$A0[1] 1055 mov $A0[0],-8($tptr) # t[3] 1056 1057 xor $A1[0],$A1[0] 1058 add $A0[1],$A1[1] 1059 adc \$0,$A1[0] 1060 mul $a1 # a[3]*a[1] 1061 add %rax,$A1[1] 1062 mov -16($aptr),%rax # a[2] 1063 adc %rdx,$A1[0] 1064 1065 mov $A1[1],($tptr) # t[4] 1066 mov $A1[0],8($tptr) # t[5] 1067 1068 mul $ai # a[2]*a[3] 1069___ 1070{ 1071my ($shift,$carry)=($a0,$a1); 1072my @S=(@A1,$ai,$n0); 1073$code.=<<___; 1074 add \$16,$i 1075 xor $shift,$shift 1076 sub $num,$i # $i=16-$num 1077 xor $carry,$carry 1078 1079 add $A1[0],%rax # t[5] 1080 adc \$0,%rdx 1081 mov %rax,8($tptr) # t[5] 1082 mov %rdx,16($tptr) # t[6] 1083 mov $carry,24($tptr) # t[7] 1084 1085 mov -16($aptr,$i),%rax # a[0] 1086 lea 64(%rsp,$num,2),$tptr 1087 xor $A0[0],$A0[0] # t[0] 1088 mov -24($tptr,$i,2),$A0[1] # t[1] 1089 1090 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1091 shr \$63,$A0[0] 1092 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1093 shr \$63,$A0[1] 1094 or $A0[0],$S[1] # | t[2*i]>>63 1095 mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1096 mov $A0[1],$shift # shift=t[2*i+1]>>63 1097 mul %rax # a[i]*a[i] 1098 neg $carry # mov $carry,cf 1099 mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1100 adc %rax,$S[0] 1101 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1102 mov $S[0],-32($tptr,$i,2) 1103 adc %rdx,$S[1] 1104 1105 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1106 mov $S[1],-24($tptr,$i,2) 1107 sbb $carry,$carry # mov cf,$carry 1108 shr \$63,$A0[0] 1109 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1110 shr \$63,$A0[1] 1111 or $A0[0],$S[3] # | t[2*i]>>63 1112 mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1113 mov 
$A0[1],$shift # shift=t[2*i+1]>>63 1114 mul %rax # a[i]*a[i] 1115 neg $carry # mov $carry,cf 1116 mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1117 adc %rax,$S[2] 1118 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1119 mov $S[2],-16($tptr,$i,2) 1120 adc %rdx,$S[3] 1121 lea 16($i),$i 1122 mov $S[3],-40($tptr,$i,2) 1123 sbb $carry,$carry # mov cf,$carry 1124 jmp .Lsqr4x_shift_n_add 1125 1126.align 16 1127.Lsqr4x_shift_n_add: 1128 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1129 shr \$63,$A0[0] 1130 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1131 shr \$63,$A0[1] 1132 or $A0[0],$S[1] # | t[2*i]>>63 1133 mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1134 mov $A0[1],$shift # shift=t[2*i+1]>>63 1135 mul %rax # a[i]*a[i] 1136 neg $carry # mov $carry,cf 1137 mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1138 adc %rax,$S[0] 1139 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1140 mov $S[0],-32($tptr,$i,2) 1141 adc %rdx,$S[1] 1142 1143 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1144 mov $S[1],-24($tptr,$i,2) 1145 sbb $carry,$carry # mov cf,$carry 1146 shr \$63,$A0[0] 1147 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1148 shr \$63,$A0[1] 1149 or $A0[0],$S[3] # | t[2*i]>>63 1150 mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1151 mov $A0[1],$shift # shift=t[2*i+1]>>63 1152 mul %rax # a[i]*a[i] 1153 neg $carry # mov $carry,cf 1154 mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1155 adc %rax,$S[2] 1156 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1157 mov $S[2],-16($tptr,$i,2) 1158 adc %rdx,$S[3] 1159 1160 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1161 mov $S[3],-8($tptr,$i,2) 1162 sbb $carry,$carry # mov cf,$carry 1163 shr \$63,$A0[0] 1164 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1165 shr \$63,$A0[1] 1166 or $A0[0],$S[1] # | t[2*i]>>63 1167 mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1168 mov $A0[1],$shift # shift=t[2*i+1]>>63 1169 mul %rax # a[i]*a[i] 1170 neg $carry # mov $carry,cf 1171 mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1172 adc %rax,$S[0] 1173 mov 
8($aptr,$i),%rax # a[i+1] # prefetch 1174 mov $S[0],0($tptr,$i,2) 1175 adc %rdx,$S[1] 1176 1177 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1178 mov $S[1],8($tptr,$i,2) 1179 sbb $carry,$carry # mov cf,$carry 1180 shr \$63,$A0[0] 1181 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1182 shr \$63,$A0[1] 1183 or $A0[0],$S[3] # | t[2*i]>>63 1184 mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1185 mov $A0[1],$shift # shift=t[2*i+1]>>63 1186 mul %rax # a[i]*a[i] 1187 neg $carry # mov $carry,cf 1188 mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1189 adc %rax,$S[2] 1190 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1191 mov $S[2],16($tptr,$i,2) 1192 adc %rdx,$S[3] 1193 mov $S[3],24($tptr,$i,2) 1194 sbb $carry,$carry # mov cf,$carry 1195 add \$32,$i 1196 jnz .Lsqr4x_shift_n_add 1197 1198 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1199 shr \$63,$A0[0] 1200 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1201 shr \$63,$A0[1] 1202 or $A0[0],$S[1] # | t[2*i]>>63 1203 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1204 mov $A0[1],$shift # shift=t[2*i+1]>>63 1205 mul %rax # a[i]*a[i] 1206 neg $carry # mov $carry,cf 1207 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1208 adc %rax,$S[0] 1209 mov -8($aptr),%rax # a[i+1] # prefetch 1210 mov $S[0],-32($tptr) 1211 adc %rdx,$S[1] 1212 1213 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1214 mov $S[1],-24($tptr) 1215 sbb $carry,$carry # mov cf,$carry 1216 shr \$63,$A0[0] 1217 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1218 shr \$63,$A0[1] 1219 or $A0[0],$S[3] # | t[2*i]>>63 1220 mul %rax # a[i]*a[i] 1221 neg $carry # mov $carry,cf 1222 adc %rax,$S[2] 1223 adc %rdx,$S[3] 1224 mov $S[2],-16($tptr) 1225 mov $S[3],-8($tptr) 1226___ 1227} 1228############################################################## 1229# Montgomery reduction part, "word-by-word" algorithm. 
#
# Montgomery reduction pass of bn_sqr4x_mont: folds the 2*num-word square
# held in t[] back down to num words.  Each outer pass computes
# m0 = t[0]*n0 mod 2^64 and adds m0*n[] (and, one word later, m1*n[])
# into t[], consuming two t[] words per pass.  Lines tagged "modsched"
# belong to the *next* pass and are interleaved here to hide latency.
{
my ($topbit,$nptr)=("%rbp",$aptr);
my ($m0,$m1)=($a0,$a1);
my @Ni=("%rbx","%r9");
$code.=<<___;
	mov	40(%rsp),$nptr		# restore $nptr
	mov	48(%rsp),$n0		# restore *n0
	xor	$j,$j
	mov	$num,0(%rsp)		# save $num
	sub	$num,$j			# $j=-$num
	mov	64(%rsp),$A0[0]		# t[0]		# modsched #
	mov	$n0,$m0			#		# modsched #
	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
	lea	64(%rsp,$num),$tptr	# end of t[] window
	mov	%rax,8(%rsp)		# save end of t[] buffer
	lea	($nptr,$num),$nptr	# end of n[] buffer
	xor	$topbit,$topbit		# $topbit=0

	mov	0($nptr,$j),%rax	# n[0]		# modsched #
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
	mov	%rax,$Ni[0]		#		# modsched #
	jmp	.Lsqr4x_mont_outer

.align	16
.Lsqr4x_mont_outer:
	xor	$A0[1],$A0[1]
	mul	$m0			# n[0]*m0
	add	%rax,$A0[0]		# n[0]*m0+t[0]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]
	mov	$n0,$m1

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[1]*m0
	add	%rax,$A0[1]		# n[1]*m0+t[1]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]

	imulq	$A0[1],$m1

	mov	16($nptr,$j),$Ni[0]	# n[2]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[0]*m1
	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[1]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[2]*m0
	add	%rax,$A0[0]		# n[2]*m0+t[2]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[3]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[1]*m1
	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[2]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[3]*m0
	add	%rax,$A0[1]		# n[3]*m0+t[3]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	jmp	.Lsqr4x_mont_inner

.align	16
.Lsqr4x_mont_inner:
	mov	($nptr,$j),$Ni[0]	# n[4]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[2]*m1
	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr,$j)	# "t[3]"

	xor	$A0[1],$A0[1]
	add	($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[4]*m0
	add	%rax,$A0[0]		# n[4]*m0+t[4]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	8($nptr,$j),$Ni[1]	# n[5]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[3]*m1
	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr,$j)	# "t[4]"

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[5]*m0
	add	%rax,$A0[1]		# n[5]*m0+t[5]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]


	mov	16($nptr,$j),$Ni[0]	# n[6]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[4]*m1
	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[5]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[6]*m0
	add	%rax,$A0[0]		# n[6]*m0+t[6]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[7]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[5]*m1
	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[6]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[7]*m0
	add	%rax,$A0[1]		# n[7]*m0+t[7]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	cmp	\$0,$j
	jne	.Lsqr4x_mont_inner

	sub	0(%rsp),$j		# $j=-$num	# modsched #
	mov	$n0,$m0			#		# modsched #

	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[6]*m1
	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr)	# "t[7]"

	xor	$A0[1],$A0[1]
	add	($tptr),$A0[0]		# +t[8]
	adc	\$0,$A0[1]
	mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
	add	$topbit,$A0[0]
	adc	\$0,$A0[1]

	imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
	xor	$A1[0],$A1[0]
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	add	$A0[0],$A1[1]
	mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
	adc	\$0,$A1[0]
	mul	$m1			# n[7]*m1
	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
	mov	$Ni[0],%rax		#		# modsched #
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr)		# "t[8]"

	xor	$topbit,$topbit
	add	8($tptr),$A1[0]		# +t[9]
	adc	$topbit,$topbit
	add	$A0[1],$A1[0]
	lea	16($tptr),$tptr		# "t[$num]>>128"
	adc	\$0,$topbit
	mov	$A1[0],-8($tptr)	# "t[9]"
	cmp	8(%rsp),$tptr		# are we done?
	jb	.Lsqr4x_mont_outer

	mov	0(%rsp),$num		# restore $num
	mov	$topbit,($tptr)		# save $topbit
___
}
##############################################################
# Post-condition, 4x unrolled copy from bn_mul_mont
#
# Conditionally subtracts the modulus from the reduced result
# (.Lsqr4x_sub), then selects tp or tp-np by masking and copies it
# to rp while zapping the temporary vector (.Lsqr4x_copy).
{
my ($tptr,$nptr)=("%rbx",$aptr);
my @ri=("%rax","%rdx","%r10","%r11");
$code.=<<___;
	mov	64(%rsp,$num),@ri[0]	# tp[0]
	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
	mov	40(%rsp),$nptr		# restore $nptr
	shr	\$5,$num		# num/4
	mov	8($tptr),@ri[1]		# t[1]
	xor	$i,$i			# i=0 and clear CF!

	mov	32(%rsp),$rptr		# restore $rptr
	sub	0($nptr),@ri[0]
	mov	16($tptr),@ri[2]	# t[2]
	mov	24($tptr),@ri[3]	# t[3]
	sbb	8($nptr),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsqr4x_sub
.align	16
.Lsqr4x_sub:
	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($nptr,$i,8),@ri[2]
	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
	mov	40($tptr,$i,8),@ri[1]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($nptr,$i,8),@ri[0]
	mov	48($tptr,$i,8),@ri[2]
	mov	56($tptr,$i,8),@ri[3]
	sbb	40($nptr,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsqr4x_sub

	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
	sbb	16($nptr,$i,8),@ri[2]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$tptr
	not	@ri[0]
	mov	$rptr,$nptr
	and	@ri[0],$nptr
	lea	-1($num),$j
	or	$nptr,$tptr		# tp=borrow?tp:rp

	pxor	%xmm0,%xmm0
	lea	64(%rsp,$num,8),$nptr
	movdqu	($tptr),%xmm1
	lea	($nptr,$num,8),$nptr
	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
	movdqu	%xmm1,($rptr)
	jmp	.Lsqr4x_copy
.align	16
.Lsqr4x_copy:				# copy or in-place refresh
	movdqu	16($tptr,$i),%xmm2
	movdqu	32($tptr,$i),%xmm1
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
	movdqu	%xmm1,32($rptr,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lsqr4x_copy

	movdqu	16($tptr,$i),%xmm2
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
___
}
# Epilogue: restore callee-saved registers from the saved stack frame
# and return 1 (success) in %rax.
$code.=<<___;
	mov	56(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lsqr4x_epilogue:
	ret
.size	bn_sqr4x_mont,.-bn_sqr4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 SEH unwind handlers, emitted only for Windows flavours.
# mul_handler locates the prologue/epilogue labels via HandlerData[]
# (see .xdata below); sqr_handler hard-codes .Lsqr4x_body/.Lsqr4x_epilogue.
# Both recover the original %rsp saved by the prologue, restore the
# callee-saved registers into the CONTEXT, and resume the unwind via
# RtlVirtualUnwind.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lsqr4x_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lsqr4x_epilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	56(%rax),%rax		# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr4x_mont
	.rva	.LSEH_end_bn_sqr4x_mont
	.rva	.LSEH_info_bn_sqr4x_mont

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
___
}

# Pipe the accumulated assembly through the x86_64-xlate.pl filter
# attached to STDOUT in the preamble.
# NOTE(review): close STDOUT is unchecked, so a failure in the xlate
# child process is silently ignored — upstream convention, but worth
# confirming ("close STDOUT or die" would surface it).
print $code;
close STDOUT;