#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice, >2x, as fast. Most common rsa1024 sign is improved by
# respectful 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...

# Usage: x86_64-mont.pl <output>
#
# This script emits perlasm source and pipes it through the
# x86_64-xlate.pl translator, which renders it in the assembler
# dialect selected by <output>.

$output=shift;

# Locate the translator either next to this script or in the
# canonical perlasm directory of an OpenSSL source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# BUGFIX: the piped open was unchecked, so a failure to spawn the
# translator went unnoticed and produced empty assembler output.
open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";

# Register assignments mirror the C prototype:
# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$bp="%r12";	# reassign $bp ($bp arrives in %rdx, which mulq clobbers)
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%rax
	mov	%rsp,%rbp
	neg	%rax
	lea	(%rsp,%rax,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rbp,8(%rsp,$num,8)	# tp[num+1]=%rsp
	mov	%rdx,$bp		# \$bp reassigned, remember?

	mov	($n0),$n0		# pull n0[0] value

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# "tp[0]"*n0
	mov	%rax,$m1

	mulq	($np)			# np[0]*m1
	add	$lo0,%rax		# discarded
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
.L1st:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[0]
	add	$hi0,%rax
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	($np,$j,8),%rax
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
	cmp	$num,$j
	mov	%rdx,$hi1
	jl	.L1st

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter:
	xor	$j,$j			# j=0

	mov	($bp,$i,8),$m0		# m0=bp[i]
	mov	($ap),%rax		# ap[0]
	mulq	$m0			# ap[0]*bp[i]
	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# tp[0]*n0
	mov	%rax,$m1

	mulq	($np,$j,8)		# np[0]*m1
	add	$lo0,%rax		# discarded
	mov	8(%rsp),$lo0		# tp[1]
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
.align	4
.Linner:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[i]
	add	$hi0,%rax
	adc	\$0,%rdx
	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	(%rsp,$j,8),$lo0
	cmp	$num,$j
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	jl	.Linner

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	lea	(%rsp),$ap		# borrow ap for tp
	lea	-1($num),$j		# j=num-1

	mov	($ap),%rax		# tp[0]
	xor	$i,$i			# i=0 and clear CF!
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	dec	$j			# doesn't affect CF!
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	jge	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$j,8),%rax
	mov	%rax,($rp,$j,8)		# rp[i]=tp[i]
	mov	$i,(%rsp,$j,8)		# zap temporary vector
	dec	$j
	jge	.Lcopy

	mov	8(%rsp,$num,8),%rsp	# restore %rsp
	mov	\$1,%rax
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
# BUGFIX: close was unchecked; on a piped STDOUT this is where buffered
# write errors and a failed translator child would otherwise be lost.
close STDOUT or die "error closing STDOUT: $!";