#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice, >2x, as fast. Most common rsa1024 sign is improved by
# respectful 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...

# This script emits AT&T-syntax x86_64 assembly on STDOUT, piped
# through the perlasm translator (x86_64-xlate.pl) which adapts it to
# the target assembler dialect.  Usage: x86_64-mont.pl [output-file]

use strict;
use warnings;

# Optional output file name forwarded to the translator; when absent
# the translator writes to its own STDOUT (original behaviour).
my $output = shift;

# Directory containing this script, so the translator can be located
# relative to it.  List-context match avoids relying on a stale $1
# when $0 carries no path component.
my ($dir) = $0 =~ m{(.*[/\\])[^/\\]+$};
$dir = '' unless defined $dir;	# invoked as a bare name: look in cwd

# Locate the perlasm translator next to this script or in the
# canonical ../../perlasm directory.
my $xlate;
( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate ) or
die "can't locate x86_64-xlate.pl";

# Reopen STDOUT as a pipe into the translator.  List-form open
# bypasses the shell, so $output can never be misparsed as shell
# syntax, and the open itself is checked so a missing interpreter or
# translator fails loudly instead of silently discarding output.
my @xlate_cmd = ($^X, $xlate);
push @xlate_cmd, $output if defined $output && $output ne '';
open STDOUT, '|-', @xlate_cmd
	or die "can't call $xlate: $!";

# Register allocation for the generated routine.  The first six are
# the System V AMD64 integer argument registers.
# int bn_mul_mont(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $bp="%rdx";	# const BN_ULONG *bp,
my $np="%rcx";	# const BN_ULONG *np,
my $n0="%r8";	# const BN_ULONG *n0,
my $num="%r9";	# int num);
my $lo0="%r10";
my $hi0="%r11";
$bp="%r12";	# reassign $bp: %rdx is clobbered by mulq below
my $hi1="%r13";
my $i="%r14";
my $j="%r15";
my $m0="%rbx";
my $m1="%rbp";

# The assembly template below is emitted verbatim (after variable
# interpolation); its text is part of the program's output and must
# not be altered.
my $code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%rax
	mov	%rsp,%rbp
	neg	%rax
	lea	(%rsp,%rax,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rbp,8(%rsp,$num,8)	# tp[num+1]=%rsp
	mov	%rdx,$bp		# \$bp reassigned, remember?

	mov	($n0),$n0		# pull n0[0] value

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# "tp[0]"*n0
	mov	%rax,$m1

	mulq	($np)			# np[0]*m1
	add	$lo0,%rax		# discarded
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
.L1st:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[0]
	add	$hi0,%rax
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	($np,$j,8),%rax
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
	cmp	$num,$j
	mov	%rdx,$hi1
	jl	.L1st

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter:
	xor	$j,$j			# j=0

	mov	($bp,$i,8),$m0		# m0=bp[i]
	mov	($ap),%rax		# ap[0]
	mulq	$m0			# ap[0]*bp[i]
	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# tp[0]*n0
	mov	%rax,$m1

	mulq	($np,$j,8)		# np[0]*m1
	add	$lo0,%rax		# discarded
	mov	8(%rsp),$lo0		# tp[1]
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
.align	4
.Linner:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[i]
	add	$hi0,%rax
	adc	\$0,%rdx
	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	(%rsp,$j,8),$lo0
	cmp	$num,$j
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	jl	.Linner

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	lea	(%rsp),$ap		# borrow ap for tp
	lea	-1($num),$j		# j=num-1

	mov	($ap),%rax		# tp[0]
	xor	$i,$i			# i=0 and clear CF!
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	dec	$j			# doesn't affect CF!
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	jge	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$j,8),%rax
	mov	%rax,($rp,$j,8)		# rp[i]=tp[i]
	mov	$i,(%rsp,$j,8)		# zap temporary vector
	dec	$j
	jge	.Lcopy

	mov	8(%rsp,$num,8),%rsp	# restore %rsp
	mov	\$1,%rax
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;

# On a pipe, write errors and a failing translator only surface at
# close (which also reaps the child), so the close must be checked.
close STDOUT or die "error closing pipe to $xlate: $! (status $?)";