1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim
10238384Sjkim# October 2005
11238384Sjkim#
12238384Sjkim# This is a "teaser" code, as it can be improved in several ways...
13238384Sjkim# First of all non-SSE2 path should be implemented (yes, for now it
14238384Sjkim# performs Montgomery multiplication/convolution only on SSE2-capable
15238384Sjkim# CPUs such as P4, others fall down to original code). Then inner loop
16238384Sjkim# can be unrolled and modulo-scheduled to improve ILP and possibly
17238384Sjkim# moved to 128-bit XMM register bank (though it would require input
18238384Sjkim# rearrangement and/or increase bus bandwidth utilization). Dedicated
19238384Sjkim# squaring procedure should give further performance improvement...
20238384Sjkim# Yet, for being draft, the code improves rsa512 *sign* benchmark by
21238384Sjkim# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22238384Sjkim
23238384Sjkim# December 2006
24238384Sjkim#
25238384Sjkim# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26238384Sjkim# Integer-only code [being equipped with dedicated squaring procedure]
27238384Sjkim# gives ~40% on rsa512 sign benchmark...
28238384Sjkim
# Script setup: take the directory part of $0 (everything up to the last
# '/' or '\'), add it and "<dir>../../perlasm" to @INC, and load the
# x86asm.pl helper that provides the &-prefixed emitter calls used below.
29238384Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30238384Sjkimpush(@INC,"${dir}","${dir}../../perlasm");
31238384Sjkimrequire "x86asm.pl";
32238384Sjkim
33238384Sjkim&asm_init($ARGV[0],$0);
34238384Sjkim
# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# among the command-line arguments.
35238384Sjkim$sse2=0;
36238384Sjkimfor (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37238384Sjkim
# The capability vector is referenced only by the SSE2 path.
38238384Sjkim&external_label("OPENSSL_ia32cap_P") if ($sse2);
39238384Sjkim
# Emit bn_mul_mont.  Arguments, in the order they are loaded from the
# argument block below: rp, ap, bp, np, n0, num.  The generated code leaves
# eax=0 when num<4 (nothing is computed) and eax=1 once the result has been
# written to rp.
40238384Sjkim&function_begin("bn_mul_mont");
41238384Sjkim
# Register aliases.  Some names share a register, so only one name of each
# overlapping pair is meaningful at any point in the code.
42238384Sjkim$i="edx";
43238384Sjkim$j="ecx";
44238384Sjkim$ap="esi";	$tp="esi";		# overlapping variables!!!
45238384Sjkim$rp="edi";	$bp="edi";		# overlapping variables!!!
46238384Sjkim$np="ebp";
47238384Sjkim$num="ebx";
48238384Sjkim
# Scratch slots at the bottom of the allocated area; the temporary vector
# tp[] starts $frame bytes above esp.  $_n0q is a 64-bit-wide operand at the
# same offset as $_n0 (used as a memory operand by the SSE2 path).
49238384Sjkim$_num=&DWP(4*0,"esp");			# stack top layout
50238384Sjkim$_rp=&DWP(4*1,"esp");
51238384Sjkim$_ap=&DWP(4*2,"esp");
52238384Sjkim$_bp=&DWP(4*3,"esp");
53238384Sjkim$_np=&DWP(4*4,"esp");
54238384Sjkim$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
55238384Sjkim$_sp=&DWP(4*6,"esp");
56238384Sjkim$_bpend=&DWP(4*7,"esp");
57238384Sjkim$frame=32;				# size of above frame rounded up to 16n
58238384Sjkim
	# eax is pre-cleared so that the early exit below leaves eax=0.
59238384Sjkim	&xor	("eax","eax");
60238384Sjkim	&mov	("edi",&wparam(5));	# int num
61238384Sjkim	&cmp	("edi",4);
62238384Sjkim	&jl	(&label("just_leave"));
63238384Sjkim
	# NOTE(review): the next two instructions are lea, so esi/edx receive
	# the addresses of the wparam(0)/wparam(1) slots, not the values stored
	# there.  edx is used only for the alignment arithmetic below — confirm
	# that aligning against the slot address (rather than ap itself) is
	# what is intended.
64238384Sjkim	&lea	("esi",&wparam(0));	# put aside pointer to argument block
65238384Sjkim	&lea	("edx",&wparam(1));	# load ap
66238384Sjkim	&mov	("ebp","esp");		# saved stack pointer!
	# edi = num+2, negated so the scaled-index lea subtracts 4*(num+2);
	# the second neg restores edi = num+2.
67238384Sjkim	&add	("edi",2);		# extra two words on top of tp
68238384Sjkim	&neg	("edi");
69238384Sjkim	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
70238384Sjkim	&neg	("edi");
71238384Sjkim
72238384Sjkim	# minimize cache contention by arranging 2K window between stack
73238384Sjkim	# pointer and ap argument [np is also position sensitive vector,
74238384Sjkim	# but it's assumed to be near ap, as it's allocated at ~same
75238384Sjkim	# time].
76238384Sjkim	&mov	("eax","esp");
77238384Sjkim	&sub	("eax","edx");
78238384Sjkim	&and	("eax",2047);
79238384Sjkim	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
80238384Sjkim
	# edx becomes 2048 when bit 11 of esp and edx agree, 0 otherwise, so
	# the subtraction makes that bit differ between the two.
81238384Sjkim	&xor	("edx","esp");
82238384Sjkim	&and	("edx",2048);
83238384Sjkim	&xor	("edx",2048);
84238384Sjkim	&sub	("esp","edx");		# this splits them apart modulo 4096
85238384Sjkim
86238384Sjkim	&and	("esp",-64);		# align to cache line
87238384Sjkim
88238384Sjkim	################################# load argument block...
89238384Sjkim	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90238384Sjkim	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91238384Sjkim	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92238384Sjkim	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
	# esi is overwritten with the n0 pointer here, so the argument block
	# is no longer reachable through it afterwards.
93238384Sjkim	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94238384Sjkim	#&mov	("edi",&DWP(5*4,"esi"));# int num
95238384Sjkim
96238384Sjkim	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
97238384Sjkim	&mov	($_rp,"eax");		# ... save a copy of argument block
98238384Sjkim	&mov	($_ap,"ebx");
99238384Sjkim	&mov	($_bp,"ecx");
100238384Sjkim	&mov	($_np,"edx");
101238384Sjkim	&mov	($_n0,"esi");
	# edi still holds num+2 from the allocation above, hence -3 gives num-1.
102238384Sjkim	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
103238384Sjkim	#&mov	($_num,$num);		# redundant as $num is not reused
104238384Sjkim	&mov	($_sp,"ebp");		# saved stack pointer!
105238384Sjkim
# MMX/SSE2 path, generated only when $sse2 is set.  At run time it is taken
# only if bit 26 of the first dword of OPENSSL_ia32cap_P is set; otherwise
# control transfers to the non_sse2 label at the end of this block.
106238384Sjkimif($sse2) {
107238384Sjkim$acc0="mm0";	# mmx register bank layout
108238384Sjkim$acc1="mm1";
109238384Sjkim$car0="mm2";
110238384Sjkim$car1="mm3";
111238384Sjkim$mul0="mm4";
112238384Sjkim$mul1="mm5";
113238384Sjkim$temp="mm6";
114238384Sjkim$mask="mm7";
115238384Sjkim
116238384Sjkim	&picmeup("eax","OPENSSL_ia32cap_P");
117238384Sjkim	&bt	(&DWP(0,"eax"),26);
118238384Sjkim	&jnc	(&label("non_sse2"));
119238384Sjkim
120238384Sjkim	&mov	("eax",-1);
121238384Sjkim	&movd	($mask,"eax");		# mask 32 lower bits
122238384Sjkim
123238384Sjkim	&mov	($ap,$_ap);		# load input pointers
124238384Sjkim	&mov	($bp,$_bp);
125238384Sjkim	&mov	($np,$_np);
126238384Sjkim
127238384Sjkim	&xor	($i,$i);		# i=0
128238384Sjkim	&xor	($j,$j);		# j=0
129238384Sjkim
	# Pass i=0: tp[] is only written in this pass, never read.
	# $car0 accumulates the ap[]*bp[0] chain, $car1 the np[]*m1 chain.
130238384Sjkim	&movd	($mul0,&DWP(0,$bp));		# bp[0]
131238384Sjkim	&movd	($mul1,&DWP(0,$ap));		# ap[0]
132238384Sjkim	&movd	($car1,&DWP(0,$np));		# np[0]
133238384Sjkim
134238384Sjkim	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
135238384Sjkim	&movq	($car0,$mul1);
136238384Sjkim	&movq	($acc0,$mul1);			# I wish movd worked for
137238384Sjkim	&pand	($acc0,$mask);			# inter-register transfers
138238384Sjkim
	# From here on $mul1 holds the per-pass multiplier the comments call m1.
139238384Sjkim	&pmuludq($mul1,$_n0q);			# *=n0
140238384Sjkim
141238384Sjkim	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
142238384Sjkim	&paddq	($car1,$acc0);
143238384Sjkim
144238384Sjkim	&movd	($acc1,&DWP(4,$np));		# np[1]
145238384Sjkim	&movd	($acc0,&DWP(4,$ap));		# ap[1]
146238384Sjkim
147238384Sjkim	&psrlq	($car0,32);
148238384Sjkim	&psrlq	($car1,32);
149238384Sjkim
150238384Sjkim	&inc	($j);				# j++
	# Loop body is software-pipelined: the operands for j+1 are loaded
	# while the result for j is being finished and stored at tp[j-1].
151238384Sjkim&set_label("1st",16);
152238384Sjkim	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
153238384Sjkim	&pmuludq($acc1,$mul1);			# np[j]*m1
154238384Sjkim	&paddq	($car0,$acc0);			# +=c0
155238384Sjkim	&paddq	($car1,$acc1);			# +=c1
156238384Sjkim
157238384Sjkim	&movq	($acc0,$car0);
158238384Sjkim	&pand	($acc0,$mask);
159238384Sjkim	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
160238384Sjkim	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
161238384Sjkim	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
162238384Sjkim	&psrlq	($car0,32);
163238384Sjkim	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
164238384Sjkim	&psrlq	($car1,32);
165238384Sjkim
166238384Sjkim	&lea	($j,&DWP(1,$j));
167238384Sjkim	&cmp	($j,$num);
168238384Sjkim	&jl	(&label("1st"));
169238384Sjkim
	# Final word of pass 0, using the operands loaded in the last iteration.
170238384Sjkim	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
171238384Sjkim	&pmuludq($acc1,$mul1);			# np[num-1]*m1
172238384Sjkim	&paddq	($car0,$acc0);			# +=c0
173238384Sjkim	&paddq	($car1,$acc1);			# +=c1
174238384Sjkim
175238384Sjkim	&movq	($acc0,$car0);
176238384Sjkim	&pand	($acc0,$mask);
177238384Sjkim	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
178238384Sjkim	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
179238384Sjkim
180238384Sjkim	&psrlq	($car0,32);
181238384Sjkim	&psrlq	($car1,32);
182238384Sjkim
183238384Sjkim	&paddq	($car1,$car0);
184238384Sjkim	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
185238384Sjkim
186238384Sjkim	&inc	($i);				# i++
	# Remaining passes: each one adds ap[]*bp[i] and np[]*m1 into tp[],
	# storing every result one word lower (tp[j-1]).  The loop continues
	# while i <= $num, where $num holds num-1.
187238384Sjkim&set_label("outer");
188238384Sjkim	&xor	($j,$j);			# j=0
189238384Sjkim
190238384Sjkim	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
191238384Sjkim	&movd	($mul1,&DWP(0,$ap));		# ap[0]
192238384Sjkim	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
193238384Sjkim	&movd	($car1,&DWP(0,$np));		# np[0]
194238384Sjkim	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
195238384Sjkim
196238384Sjkim	&paddq	($mul1,$temp);			# +=tp[0]
197238384Sjkim	&movq	($acc0,$mul1);
198238384Sjkim	&movq	($car0,$mul1);
199238384Sjkim	&pand	($acc0,$mask);
200238384Sjkim
201238384Sjkim	&pmuludq($mul1,$_n0q);			# *=n0
202238384Sjkim
203238384Sjkim	&pmuludq($car1,$mul1);
204238384Sjkim	&paddq	($car1,$acc0);
205238384Sjkim
206238384Sjkim	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
207238384Sjkim	&movd	($acc1,&DWP(4,$np));		# np[1]
208238384Sjkim	&movd	($acc0,&DWP(4,$ap));		# ap[1]
209238384Sjkim
210238384Sjkim	&psrlq	($car0,32);
211238384Sjkim	&psrlq	($car1,32);
212238384Sjkim	&paddq	($car0,$temp);			# +=tp[1]
213238384Sjkim
214238384Sjkim	&inc	($j);				# j++
	# $num is borrowed as the inner loop's down-counter; it is restored
	# from $j once the loop has finished.
215238384Sjkim	&dec	($num);
216238384Sjkim&set_label("inner");
217238384Sjkim	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
218238384Sjkim	&pmuludq($acc1,$mul1);			# np[j]*m1
219238384Sjkim	&paddq	($car0,$acc0);			# +=c0
220238384Sjkim	&paddq	($car1,$acc1);			# +=c1
221238384Sjkim
222238384Sjkim	&movq	($acc0,$car0);
223238384Sjkim	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
224238384Sjkim	&pand	($acc0,$mask);
225238384Sjkim	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
226238384Sjkim	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
227238384Sjkim	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
228238384Sjkim	&psrlq	($car0,32);
229238384Sjkim	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
230238384Sjkim	&psrlq	($car1,32);
231238384Sjkim	&paddq	($car0,$temp);			# +=tp[j+1]
232238384Sjkim
	# j is advanced with lea so the flags set by dec survive until jnz.
233238384Sjkim	&dec	($num);
234238384Sjkim	&lea	($j,&DWP(1,$j));		# j++
235238384Sjkim	&jnz	(&label("inner"));
236238384Sjkim
	# j has counted up to the value $num had before it was borrowed.
237238384Sjkim	&mov	($num,$j);
238238384Sjkim	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
239238384Sjkim	&pmuludq($acc1,$mul1);			# np[num-1]*m1
240238384Sjkim	&paddq	($car0,$acc0);			# +=c0
241238384Sjkim	&paddq	($car1,$acc1);			# +=c1
242238384Sjkim
243238384Sjkim	&movq	($acc0,$car0);
244238384Sjkim	&pand	($acc0,$mask);
245238384Sjkim	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
246238384Sjkim	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
247238384Sjkim	&psrlq	($car0,32);
248238384Sjkim	&psrlq	($car1,32);
249238384Sjkim
250238384Sjkim	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
251238384Sjkim	&paddq	($car1,$car0);
252238384Sjkim	&paddq	($car1,$temp);
253238384Sjkim	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
254238384Sjkim
255238384Sjkim	&lea	($i,&DWP(1,$i));		# i++
256238384Sjkim	&cmp	($i,$num);
257238384Sjkim	&jle	(&label("outer"));
258238384Sjkim
259238384Sjkim	&emms	();				# done with mmx bank
260238384Sjkim	&jmp	(&label("common_tail"));
261238384Sjkim
	# Entry point of the integer code when the capability bit is clear.
262238384Sjkim&set_label("non_sse2",16);
263238384Sjkim}
264238384Sjkim
# Integer-only path.  The first branch is disabled at generation time
# (if (0)); it would restore esp and leave with eax=0 instead of using the
# integer code in the else branch.
265238384Sjkimif (0) {
266238384Sjkim	&mov	("esp",$_sp);
267238384Sjkim	&xor	("eax","eax");	# signal "not fast enough [yet]"
268238384Sjkim	&jmp	(&label("just_leave"));
269238384Sjkim	# While the below code provides competitive performance for
270238384Sjkim	# all key lengths on modern Intel cores, it's still more
271238384Sjkim	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
272238384Sjkim	# means compared to the original integer-only assembler.
273238384Sjkim	# 512-bit RSA sign is better by ~40%, but that's about all
274238384Sjkim	# one can say about all CPUs...
275238384Sjkim} else {
276238384Sjkim$inp="esi";	# integer path uses these registers differently
277238384Sjkim$word="edi";
278238384Sjkim$carry="ebp";
279238384Sjkim
	# Dispatch: $carry ends up zero only when ($num+1) is even and ap==bp,
	# in which case the dedicated squaring code is used.  The flags tested
	# by jz come from the or; the mov in between does not alter them.
280238384Sjkim	&mov	($inp,$_ap);
281238384Sjkim	&lea	($carry,&DWP(1,$num));
282238384Sjkim	&mov	($word,$_bp);
283238384Sjkim	&xor	($j,$j);				# j=0
284238384Sjkim	&mov	("edx",$inp);
285238384Sjkim	&and	($carry,1);				# see if num is even
286238384Sjkim	&sub	("edx",$word);				# see if ap==bp
287238384Sjkim	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
288238384Sjkim	&or	($carry,"edx");
289238384Sjkim	&mov	($word,&DWP(0,$word));			# bp[0]
290238384Sjkim	&jz	(&label("bn_sqr_mont"));
291238384Sjkim	&mov	($_bpend,"eax");
292238384Sjkim	&mov	("eax",&DWP(0,$inp));
293238384Sjkim	&xor	("edx","edx");
294238384Sjkim
	# General multiplication, first pass: tp[] = ap[]*bp[0].  tp[] is only
	# written here.  edx carries the high word into the next iteration.
295238384Sjkim&set_label("mull",16);
296238384Sjkim	&mov	($carry,"edx");
297238384Sjkim	&mul	($word);				# ap[j]*bp[0]
298238384Sjkim	&add	($carry,"eax");
299238384Sjkim	&lea	($j,&DWP(1,$j));
300238384Sjkim	&adc	("edx",0);
301238384Sjkim	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
302238384Sjkim	&cmp	($j,$num);
303238384Sjkim	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
304238384Sjkim	&jl	(&label("mull"));
305238384Sjkim
	# Last word of the pass, interleaved (extra-indented lines) with the
	# setup for the reduction: $word = n0*tp[0], $inp = np.
306238384Sjkim	&mov	($carry,"edx");
307238384Sjkim	&mul	($word);				# ap[num-1]*bp[0]
308238384Sjkim	 &mov	($word,$_n0);
309238384Sjkim	&add	("eax",$carry);
310238384Sjkim	 &mov	($inp,$_np);
311238384Sjkim	&adc	("edx",0);
312238384Sjkim	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
313238384Sjkim
314238384Sjkim	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
315238384Sjkim	&xor	($j,$j);
316238384Sjkim	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
317238384Sjkim	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
318238384Sjkim
	# Word 0 of the reduction: only the carry-out of np[0]*m + tp[0] is
	# kept (eax is immediately reloaded with np[1]).
319238384Sjkim	&mov	("eax",&DWP(0,$inp));			# np[0]
320238384Sjkim	&mul	($word);				# np[0]*m
321238384Sjkim	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
322238384Sjkim	&mov	("eax",&DWP(4,$inp));			# np[1]
323238384Sjkim	&adc	("edx",0);
324238384Sjkim	&inc	($j);
325238384Sjkim
326238384Sjkim	&jmp	(&label("2ndmadd"));
327238384Sjkim
	# Passes i>=1, multiplication half: tp[] += ap[]*bp[i].
328238384Sjkim&set_label("1stmadd",16);
329238384Sjkim	&mov	($carry,"edx");
330238384Sjkim	&mul	($word);				# ap[j]*bp[i]
331238384Sjkim	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
332238384Sjkim	&lea	($j,&DWP(1,$j));
333238384Sjkim	&adc	("edx",0);
334238384Sjkim	&add	($carry,"eax");
335238384Sjkim	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
336238384Sjkim	&adc	("edx",0);
337238384Sjkim	&cmp	($j,$num);
338238384Sjkim	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
339238384Sjkim	&jl	(&label("1stmadd"));
340238384Sjkim
341238384Sjkim	&mov	($carry,"edx");
342238384Sjkim	&mul	($word);				# ap[num-1]*bp[i]
343238384Sjkim	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
344238384Sjkim	 &mov	($word,$_n0);
345238384Sjkim	&adc	("edx",0);
346238384Sjkim	 &mov	($inp,$_np);
347238384Sjkim	&add	($carry,"eax");
348238384Sjkim	&adc	("edx",0);
349238384Sjkim	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
350238384Sjkim
	# $j is cleared and then collects the carry out of the tp[num] update.
351238384Sjkim	&xor	($j,$j);
352238384Sjkim	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
353238384Sjkim	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
354238384Sjkim	&adc	($j,0);
355238384Sjkim	 &mov	("eax",&DWP(0,$inp));			# np[0]
356238384Sjkim	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
357238384Sjkim	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
358238384Sjkim
359238384Sjkim	&mul	($word);				# np[0]*m
360238384Sjkim	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
361238384Sjkim	&mov	("eax",&DWP(4,$inp));			# np[1]
362238384Sjkim	&adc	("edx",0);
363238384Sjkim	&mov	($j,1);
364238384Sjkim
	# Reduction half, shared by every pass: tp[] += np[]*m, with each
	# result stored one word lower (tp[j-1]).
365238384Sjkim&set_label("2ndmadd",16);
366238384Sjkim	&mov	($carry,"edx");
367238384Sjkim	&mul	($word);				# np[j]*m
368238384Sjkim	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
369238384Sjkim	&lea	($j,&DWP(1,$j));
370238384Sjkim	&adc	("edx",0);
371238384Sjkim	&add	($carry,"eax");
372238384Sjkim	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
373238384Sjkim	&adc	("edx",0);
374238384Sjkim	&cmp	($j,$num);
375238384Sjkim	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
376238384Sjkim	&jl	(&label("2ndmadd"));
377238384Sjkim
378238384Sjkim	&mov	($carry,"edx");
379238384Sjkim	&mul	($word);				# np[j]*m
380238384Sjkim	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
381238384Sjkim	&adc	("edx",0);
382238384Sjkim	&add	($carry,"eax");
383238384Sjkim	&adc	("edx",0);
384238384Sjkim	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
385238384Sjkim
	# Fold the top words and advance the saved bp pointer by one word
	# (extra-indented lines); finished when it reaches $_bpend.
386238384Sjkim	&xor	("eax","eax");
387238384Sjkim	 &mov	($j,$_bp);				# &bp[i]
388238384Sjkim	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
389238384Sjkim	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
390238384Sjkim	 &lea	($j,&DWP(4,$j));
391238384Sjkim	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
392238384Sjkim	 &cmp	($j,$_bpend);
393238384Sjkim	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
394238384Sjkim	&je	(&label("common_tail"));
395238384Sjkim
	# Set up the next pass: $word = next bp word, $inp = ap, j = 0, carry = 0.
396238384Sjkim	&mov	($word,&DWP(0,$j));			# bp[i+1]
397238384Sjkim	&mov	($inp,$_ap);
398238384Sjkim	&mov	($_bp,$j);				# &bp[++i]
399238384Sjkim	&xor	($j,$j);
400238384Sjkim	&xor	("edx","edx");
401238384Sjkim	&mov	("eax",&DWP(0,$inp));
402238384Sjkim	&jmp	(&label("1stmadd"));
403238384Sjkim
	# Dedicated squaring code (reached when ap==bp, see the dispatch above).
	# $sbit shares a register with $num, so $num is parked in $_num, and
	# the $_bp slot is reused to hold the outer index i.
404238384Sjkim&set_label("bn_sqr_mont",16);
405238384Sjkim$sbit=$num;
406238384Sjkim	&mov	($_num,$num);
407238384Sjkim	&mov	($_bp,$j);				# i=0
408238384Sjkim
	# $word was loaded from bp[0] above; ap==bp on this path, hence ap[0].
409238384Sjkim	&mov	("eax",$word);				# ap[0]
410238384Sjkim	&mul	($word);				# ap[0]*ap[0]
411238384Sjkim	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
	# The high word of the square is split into its low bit ($sbit) and
	# the remaining bits (edx>>1) before entering the doubling loop.
412238384Sjkim	&mov	($sbit,"edx");
413238384Sjkim	&shr	("edx",1);
414238384Sjkim	&and	($sbit,1);
415238384Sjkim	&inc	($j);
	# Each cross product is doubled via lea (eax*2 + $sbit); the bit
	# shifted out of eax becomes $sbit for the next word.
416238384Sjkim&set_label("sqr",16);
417238384Sjkim	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
418238384Sjkim	&mov	($carry,"edx");
419238384Sjkim	&mul	($word);				# ap[j]*ap[0]
420238384Sjkim	&add	("eax",$carry);
421238384Sjkim	&lea	($j,&DWP(1,$j));
422238384Sjkim	&adc	("edx",0);
423238384Sjkim	&lea	($carry,&DWP(0,$sbit,"eax",2));
424238384Sjkim	&shr	("eax",31);
425238384Sjkim	&cmp	($j,$_num);
426238384Sjkim	&mov	($sbit,"eax");
427238384Sjkim	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
428238384Sjkim	&jl	(&label("sqr"));
429238384Sjkim
430238384Sjkim	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
431238384Sjkim	&mov	($carry,"edx");
432238384Sjkim	&mul	($word);				# ap[num-1]*ap[0]
433238384Sjkim	&add	("eax",$carry);
434238384Sjkim	 &mov	($word,$_n0);
435238384Sjkim	&adc	("edx",0);
436238384Sjkim	 &mov	($inp,$_np);
437238384Sjkim	&lea	($carry,&DWP(0,$sbit,"eax",2));
438238384Sjkim	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
439238384Sjkim	&shr	("eax",31);
440238384Sjkim	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
441238384Sjkim
442238384Sjkim	&lea	($carry,&DWP(0,"eax","edx",2));
443238384Sjkim	 &mov	("eax",&DWP(0,$inp));			# np[0]
444238384Sjkim	&shr	("edx",31);
445238384Sjkim	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
446238384Sjkim	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
447238384Sjkim
	# $num (clobbered as $sbit) is re-established from $j here.
448238384Sjkim	&mul	($word);				# np[0]*m
449238384Sjkim	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
450238384Sjkim	&mov	($num,$j);
451238384Sjkim	&adc	("edx",0);
452238384Sjkim	&mov	("eax",&DWP(4,$inp));			# np[1]
453238384Sjkim	&mov	($j,1);
454238384Sjkim
	# Reduction loop of the squaring path: tp[] += np[]*m, handling two
	# words per iteration (j advances by 2).
455238384Sjkim&set_label("3rdmadd",16);
456238384Sjkim	&mov	($carry,"edx");
457238384Sjkim	&mul	($word);				# np[j]*m
458238384Sjkim	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
459238384Sjkim	&adc	("edx",0);
460238384Sjkim	&add	($carry,"eax");
461238384Sjkim	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
462238384Sjkim	&adc	("edx",0);
463238384Sjkim	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
464238384Sjkim
465238384Sjkim	&mov	($carry,"edx");
466238384Sjkim	&mul	($word);				# np[j+1]*m
467238384Sjkim	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
468238384Sjkim	&lea	($j,&DWP(2,$j));
469238384Sjkim	&adc	("edx",0);
470238384Sjkim	&add	($carry,"eax");
471238384Sjkim	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
472238384Sjkim	&adc	("edx",0);
473238384Sjkim	&cmp	($j,$num);
474238384Sjkim	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
475238384Sjkim	&jl	(&label("3rdmadd"));
476238384Sjkim
477238384Sjkim	&mov	($carry,"edx");
478238384Sjkim	&mul	($word);				# np[j]*m
479238384Sjkim	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
480238384Sjkim	&adc	("edx",0);
481238384Sjkim	&add	($carry,"eax");
482238384Sjkim	&adc	("edx",0);
483238384Sjkim	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
484238384Sjkim
	# Fold the top words; the squaring is complete when the outer index
	# (kept in the $_bp slot) equals $num.
485238384Sjkim	&mov	($j,$_bp);				# i
486238384Sjkim	&xor	("eax","eax");
487238384Sjkim	&mov	($inp,$_ap);
488238384Sjkim	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
489238384Sjkim	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
490238384Sjkim	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
491238384Sjkim	&cmp	($j,$num);
492238384Sjkim	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
493238384Sjkim	&je	(&label("common_tail"));
494238384Sjkim
	# Next pass: the word is fetched at index j+1 and j is then
	# incremented, so the "i" in the comments below is the new index.
495238384Sjkim	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
496238384Sjkim	&lea	($j,&DWP(1,$j));
497238384Sjkim	&mov	("eax",$word);
498238384Sjkim	&mov	($_bp,$j);				# ++i
499238384Sjkim	&mul	($word);				# ap[i]*ap[i]
500238384Sjkim	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
501238384Sjkim	&adc	("edx",0);
502238384Sjkim	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
503238384Sjkim	&xor	($carry,$carry);
	# The flags from cmp are consumed by je; the lea in between leaves
	# them untouched.
504238384Sjkim	&cmp	($j,$num);
505238384Sjkim	&lea	($j,&DWP(1,$j));
506238384Sjkim	&je	(&label("sqrlast"));
507238384Sjkim
508238384Sjkim	&mov	($sbit,"edx");				# zaps $num
509238384Sjkim	&shr	("edx",1);
510238384Sjkim	&and	($sbit,1);
	# Doubled cross products ap[j]*ap[i] are added into tp[j]; eax collects
	# the bit shifted out plus the carries and becomes the next $sbit.
	# The loop bound is read from $_num because $num is in use as $sbit.
511238384Sjkim&set_label("sqradd",16);
512238384Sjkim	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
513238384Sjkim	&mov	($carry,"edx");
514238384Sjkim	&mul	($word);				# ap[j]*ap[i]
515238384Sjkim	&add	("eax",$carry);
516238384Sjkim	&lea	($carry,&DWP(0,"eax","eax"));
517238384Sjkim	&adc	("edx",0);
518238384Sjkim	&shr	("eax",31);
519238384Sjkim	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
520238384Sjkim	&lea	($j,&DWP(1,$j));
521238384Sjkim	&adc	("eax",0);
522238384Sjkim	&add	($carry,$sbit);
523238384Sjkim	&adc	("eax",0);
524238384Sjkim	&cmp	($j,$_num);
525238384Sjkim	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
526238384Sjkim	&mov	($sbit,"eax");
527238384Sjkim	&jle	(&label("sqradd"));
528238384Sjkim
	# Double the final high word: edx = 2*edx + $sbit, with the bits that
	# overflow collected in $carry.
529238384Sjkim	&mov	($carry,"edx");
530238384Sjkim	&add	("edx","edx");
531238384Sjkim	&shr	($carry,31);
532238384Sjkim	&add	("edx",$sbit);
533238384Sjkim	&adc	($carry,0);
	# Common end of a squaring pass: compute m = n0*tp[0], fold edx/$carry
	# into the top two tp words, re-derive $num as j-1 and re-enter the
	# reduction loop with j=1.
534238384Sjkim&set_label("sqrlast");
535238384Sjkim	&mov	($word,$_n0);
536238384Sjkim	&mov	($inp,$_np);
537238384Sjkim	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
538238384Sjkim
539238384Sjkim	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
540238384Sjkim	&mov	("eax",&DWP(0,$inp));			# np[0]
541238384Sjkim	&adc	($carry,0);
542238384Sjkim	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
543238384Sjkim	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
544238384Sjkim
545238384Sjkim	&mul	($word);				# np[0]*m
546238384Sjkim	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
547238384Sjkim	&lea	($num,&DWP(-1,$j));
548238384Sjkim	&adc	("edx",0);
549238384Sjkim	&mov	($j,1);
550238384Sjkim	&mov	("eax",&DWP(4,$inp));			# np[1]
551238384Sjkim
552238384Sjkim	&jmp	(&label("3rdmadd"));
553238384Sjkim}
554238384Sjkim
# Shared epilogue for all code paths: subtract the modulus from tp[] into
# rp[], pick either that difference or tp[] itself depending on the final
# borrow, copy the chosen vector into rp[], overwrite tp[], restore esp
# and leave with eax=1.
555238384Sjkim&set_label("common_tail",16);
556238384Sjkim	&mov	($np,$_np);			# load modulus pointer
557238384Sjkim	&mov	($rp,$_rp);			# load result pointer
558238384Sjkim	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
559238384Sjkim
560238384Sjkim	&mov	("eax",&DWP(0,$tp));		# tp[0]
561238384Sjkim	&mov	($j,$num);			# j=num-1
562238384Sjkim	&xor	($i,$i);			# i=0 and clear CF!
563238384Sjkim
	# The borrow lives in CF across iterations, so the loop body uses only
	# instructions that leave CF alone (mov, dec, lea); jge tests the
	# flags set by dec.
564238384Sjkim&set_label("sub",16);
565238384Sjkim	&sbb	("eax",&DWP(0,$np,$i,4));
566238384Sjkim	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
567238384Sjkim	&dec	($j);				# doesn't affect CF!
568238384Sjkim	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
569238384Sjkim	&lea	($i,&DWP(1,$i));		# i++
570238384Sjkim	&jge	(&label("sub"));
571238384Sjkim
	# eax (the tp word just past the subtracted range) minus the final
	# borrow is used as a select mask: $tp = ($tp & eax) | ($rp & ~eax).
	# NOTE(review): this relies on eax being 0 or all-ones here — confirm.
572238384Sjkim	&sbb	("eax",0);			# handle upmost overflow bit
573238384Sjkim	&and	($tp,"eax");
574238384Sjkim	&not	("eax");
575238384Sjkim	&mov	($np,$rp);
576238384Sjkim	&and	($np,"eax");
577238384Sjkim	&or	($tp,$np);			# tp=carry?tp:rp
578238384Sjkim
	# Runs from index $num down to 0.  Each tp word is overwritten with
	# the value left in $j by the sub loop.
579238384Sjkim&set_label("copy",16);				# copy or in-place refresh
580238384Sjkim	&mov	("eax",&DWP(0,$tp,$num,4));
581238384Sjkim	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
582238384Sjkim	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
583238384Sjkim	&dec	($num);
584238384Sjkim	&jge	(&label("copy"));
585238384Sjkim
586238384Sjkim	&mov	("esp",$_sp);		# pull saved stack pointer
587238384Sjkim	&mov	("eax",1);
	# Early-exit target: reached directly with eax=0 when num<4.
588238384Sjkim&set_label("just_leave");
589238384Sjkim&function_end("bn_mul_mont");
590238384Sjkim
591238384Sjkim&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
592238384Sjkim
593238384Sjkim&asm_finish();
594