#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives only a
# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
# more than twice as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...
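#
# For reference, the routine performs word-by-word Montgomery
# multiplication with the multiplication and reduction interleaved.
# The C fragment below is an illustrative model of what the generated
# code computes, assuming a 64-bit BN_ULONG and a compiler providing
# unsigned __int128 (e.g. gcc or clang); it is a sketch for reading
# along, not the code that is emitted. Note that the assembly
# special-cases the i==0 pass instead of pre-zeroing tp[].
#
#	typedef unsigned long long BN_ULONG;
#	typedef unsigned __int128 u128;
#
#	/* rp = ap*bp/2^(64*num) mod np, n0 = -np[0]^(-1) mod 2^64,
#	 * tp = scratch vector of num+1 words */
#	static void mont_mul_ref(BN_ULONG *rp, const BN_ULONG *ap,
#			const BN_ULONG *bp, const BN_ULONG *np,
#			BN_ULONG n0, int num, BN_ULONG *tp)
#	{
#	BN_ULONG lo0,hi0,hi1,m1,b;
#	u128 t,u;
#	int i,j;
#
#	for (j=0;j<=num;j++) tp[j]=0;
#
#	for (i=0;i<num;i++) {
#		t   = (u128)ap[0]*bp[i] + tp[0];
#		lo0 = (BN_ULONG)t; hi0 = (BN_ULONG)(t>>64);
#		m1  = lo0*n0;			/* mod 2^64 */
#		u   = (u128)np[0]*m1 + lo0;	/* low word is discarded */
#		hi1 = (BN_ULONG)(u>>64);
#		for (j=1;j<num;j++) {
#			t   = (u128)ap[j]*bp[i] + hi0 + tp[j];
#			lo0 = (BN_ULONG)t; hi0 = (BN_ULONG)(t>>64);
#			u   = (u128)np[j]*m1 + hi1 + lo0;
#			tp[j-1] = (BN_ULONG)u; hi1 = (BN_ULONG)(u>>64);
#		}
#		u = (u128)hi0 + hi1 + tp[num];
#		tp[num-1] = (BN_ULONG)u;
#		tp[num]   = (BN_ULONG)(u>>64);	/* upmost overflow bit */
#	}
#
#	/* rp = tp<np ? tp : tp-np */
#	b = 0;
#	for (j=0;j<num;j++) {
#		t = (u128)tp[j] - np[j] - b;
#		rp[j] = (BN_ULONG)t;
#		b = (BN_ULONG)(t>>64)&1;	/* borrow out */
#	}
#	if (tp[num] < b)	/* borrow past the top word: keep tp */
#		for (j=0;j<num;j++) rp[j]=tp[j];
#	}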

$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$bp="%r12";	# reassign $bp, %rdx is clobbered by mulq
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d		# num is int, zero-extend it
	lea	2($num),%rax
	mov	%rsp,%rbp		# save %rsp
	neg	%rax
	lea	(%rsp,%rax,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rbp,8(%rsp,$num,8)	# tp[num+1]=%rsp
	mov	%rdx,$bp		# $bp reassigned, remember?
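	# Stack layout: tp[0..num-1] is the scratch vector, tp[num]
	# will hold the upmost overflow word and tp[num+1] the %rsp
	# value saved above.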

	mov	($n0),$n0		# pull n0[0] value

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# "tp[0]"*n0
	mov	%rax,$m1

	mulq	($np)			# np[0]*m1
	add	$lo0,%rax		# discarded
	adc	\$0,%rdx
	mov	%rdx,$hi1

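	# First pass (i=0): tp[] = (ap[]*bp[0] + m1*np[])/2^64, computed
	# one word per round; carries travel in registers (hi0/hi1).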
	lea	1($j),$j		# j++
.L1st:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[0]
	add	$hi0,%rax
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	($np,$j,8),%rax
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
	cmp	$num,$j
	mov	%rdx,$hi1
	jl	.L1st

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
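
	# Outer loop: fold ap[]*bp[i] into tp[] and reduce with m1*np[],
	# shifting the result down one word each iteration; tp[num] holds
	# the overflow bit carried over from the previous iteration.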
.align	4
.Louter:
	xor	$j,$j			# j=0

	mov	($bp,$i,8),$m0		# m0=bp[i]
	mov	($ap),%rax		# ap[0]
	mulq	$m0			# ap[0]*bp[i]
	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# tp[0]*n0
	mov	%rax,$m1

	mulq	($np,$j,8)		# np[0]*m1
	add	$lo0,%rax		# discarded
	mov	8(%rsp),$lo0		# tp[1]
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
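
	# Inner loop: as .L1st, but the previous-pass tp[j] is folded in;
	# it is loaded one round ahead into lo0.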
.align	4
.Linner:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[i]
	add	$hi0,%rax
	adc	\$0,%rdx
	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	(%rsp,$j,8),$lo0	# tp[j], for next round
	cmp	$num,$j
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	jl	.Linner

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	lea	(%rsp),$ap		# borrow ap for tp
	lea	-1($num),$j		# j=num-1

	mov	($ap),%rax		# tp[0]
	xor	$i,$i			# i=0 and clear CF!
	jmp	.Lsub
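
	# tp is known to be < 2*np: subtract np and keep the difference,
	# unless the subtraction borrows past tp[num], in which case tp
	# itself is the result.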
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	dec	$j			# doesn't affect CF!
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	jge	.Lsub

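	# %rax holds tp[num] here; subtracting the borrow leaves 0 if
	# tp>=np, or all ones if the subtraction underflowed, giving a
	# select mask for the copy below.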
	sbb	\$0,%rax		# handle upmost overflow bit
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp
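
	# Copy the selected vector to rp and overwrite the stack scratch
	# with a public value (i=num at this point) so no intermediate
	# data is left behind.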
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$j,8),%rax
	mov	%rax,($rp,$j,8)		# rp[j]=tp[j]
	mov	$i,(%rsp,$j,8)		# zap temporary vector
	dec	$j
	jge	.Lcopy

	mov	8(%rsp,$num,8),%rsp	# restore %rsp
	mov	\$1,%rax
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT;