x86_64-mont.pl revision 298999
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005.
11#
12# Montgomery multiplication routine for x86_64. While it gives a modest
13# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
14# than twice as fast. The most common case, rsa1024 sign, is improved
15# by a respectable 50%. It remains to be seen whether loop unrolling
16# and a dedicated squaring routine can provide further improvement...
17
18# July 2011.
19#
20# Add a dedicated squaring procedure. Performance improvement varies
21# from platform to platform, but on average it's ~5%/15%/25%/33%
22# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24# August 2011.
25#
26# Unroll and modulo-schedule the inner loops in such a manner that they
27# are "fallen through" for input lengths of 8, which is critical for
28# 1024-bit RSA *sign*. Average performance improvement in comparison
29# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
30# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32$flavour = shift;
33$output  = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
38$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41die "can't locate x86_64-xlate.pl";
42
43open OUT,"| \"$^X\" $xlate $flavour $output";
44*STDOUT=*OUT;
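# Typical invocation (by the build system) is, for example,
#	perl x86_64-mont.pl elf x86_64-mont.s
# where the first argument is the perlasm "flavour" and the second the
# output file; the flavour name above is only an example. Everything this
# script prints to STDOUT is post-processed by x86_64-xlate.pl (opened as
# a pipe above) into the assembler syntax for that flavour.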
45
46# int bn_mul_mont(
47$rp="%rdi";	# BN_ULONG *rp,
48$ap="%rsi";	# const BN_ULONG *ap,
49$bp="%rdx";	# const BN_ULONG *bp,
50$np="%rcx";	# const BN_ULONG *np,
51$n0="%r8";	# const BN_ULONG *n0,
52$num="%r9";	# int num);
53$lo0="%r10";
54$hi0="%r11";
55$hi1="%r13";
56$i="%r14";
57$j="%r15";
58$m0="%rbx";
59$m1="%rbp";
60
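# For reference, a compact model of what bn_mul_mont() computes: the
# word-serial Montgomery product rp = ap*bp*2^(-64*num) mod np, consuming
# one word of bp per outer iteration just like the .Louter loop emitted
# below. This sub is illustrative only and is never called by the
# generator; its name and the use of Math::BigInt are not part of the
# original module.
sub mont_mul_reference {
	require Math::BigInt;
	my ($a,$b,$n,$n0,$num) = @_;	# Math::BigInt values, $n0 = -$n^-1 mod 2^64
	my $word = Math::BigInt->new(1)->blsft(64);
	my $t = Math::BigInt->new(0);
	for my $i (0..$num-1) {
		my $bi = $b->copy()->brsft(64*$i)->bmod($word);		# bp[i]
		$t->badd($a->copy()->bmul($bi));			# t += ap[]*bp[i]
		my $m1 = $t->copy()->bmod($word)->bmul($n0)->bmod($word); # "tp[0]"*n0
		$t->badd($n->copy()->bmul($m1));			# t += np[]*m1, low word becomes 0
		$t->brsft(64);						# t /= 2^64
	}
	$t->bsub($n) if ($t->bcmp($n) >= 0);	# final conditional subtraction (.Lsub)
	return $t;
}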
61$code=<<___;
62.text
63
64.globl	bn_mul_mont
65.type	bn_mul_mont,\@function,6
66.align	16
67bn_mul_mont:
68	test	\$3,${num}d
69	jnz	.Lmul_enter
70	cmp	\$8,${num}d
71	jb	.Lmul_enter
72	cmp	$ap,$bp
73	jne	.Lmul4x_enter
74	jmp	.Lsqr4x_enter
75
76.align	16
77.Lmul_enter:
78	push	%rbx
79	push	%rbp
80	push	%r12
81	push	%r13
82	push	%r14
83	push	%r15
84
85	mov	${num}d,${num}d
86	lea	2($num),%r10
87	mov	%rsp,%r11
88	neg	%r10
89	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
90	and	\$-1024,%rsp		# minimize TLB usage
91
92	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
93.Lmul_body:
94	# Some OSes, *cough*-dows, insist on the stack being "wired" to
95	# physical memory in strictly sequential manner, i.e. if a stack
96	# allocation spans two pages, then a reference to the farther one
97	# can be punished with SEGV. But page walking does good even on
98	# other OSes, because it guarantees that a villain thread hits
99	# the guard page before it can do damage to an innocent one...
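	# The loop below touches one word in every 4096-byte page of the
	# newly allocated area, working from just below the caller's stack
	# pointer down to the new one.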
100	sub	%rsp,%r11
101	and	\$-4096,%r11
102.Lmul_page_walk:
103	mov	(%rsp,%r11),%r10
104	sub	\$4096,%r11
105	.byte	0x66,0x2e		# predict non-taken
106	jnc	.Lmul_page_walk
107
108	mov	$bp,%r12		# reassign $bp
109___
110		$bp="%r12";
111$code.=<<___;
112	mov	($n0),$n0		# pull n0[0] value
113	mov	($bp),$m0		# m0=bp[0]
114	mov	($ap),%rax
115
116	xor	$i,$i			# i=0
117	xor	$j,$j			# j=0
118
119	mov	$n0,$m1
120	mulq	$m0			# ap[0]*bp[0]
121	mov	%rax,$lo0
122	mov	($np),%rax
123
124	imulq	$lo0,$m1		# "tp[0]"*n0
125	mov	%rdx,$hi0
126
127	mulq	$m1			# np[0]*m1
128	add	%rax,$lo0		# discarded
129	mov	8($ap),%rax
130	adc	\$0,%rdx
131	mov	%rdx,$hi1
132
133	lea	1($j),$j		# j++
134	jmp	.L1st_enter
135
136.align	16
137.L1st:
138	add	%rax,$hi1
139	mov	($ap,$j,8),%rax
140	adc	\$0,%rdx
141	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
142	mov	$lo0,$hi0
143	adc	\$0,%rdx
144	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
145	mov	%rdx,$hi1
146
147.L1st_enter:
148	mulq	$m0			# ap[j]*bp[0]
149	add	%rax,$hi0
150	mov	($np,$j,8),%rax
151	adc	\$0,%rdx
152	lea	1($j),$j		# j++
153	mov	%rdx,$lo0
154
155	mulq	$m1			# np[j]*m1
156	cmp	$num,$j
157	jne	.L1st
158
159	add	%rax,$hi1
160	mov	($ap),%rax		# ap[0]
161	adc	\$0,%rdx
162	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
163	adc	\$0,%rdx
164	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
165	mov	%rdx,$hi1
166	mov	$lo0,$hi0
167
168	xor	%rdx,%rdx
169	add	$hi0,$hi1
170	adc	\$0,%rdx
171	mov	$hi1,-8(%rsp,$num,8)
172	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
173
174	lea	1($i),$i		# i++
175	jmp	.Louter
176.align	16
177.Louter:
178	mov	($bp,$i,8),$m0		# m0=bp[i]
179	xor	$j,$j			# j=0
180	mov	$n0,$m1
181	mov	(%rsp),$lo0
182	mulq	$m0			# ap[0]*bp[i]
183	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
184	mov	($np),%rax
185	adc	\$0,%rdx
186
187	imulq	$lo0,$m1		# tp[0]*n0
188	mov	%rdx,$hi0
189
190	mulq	$m1			# np[0]*m1
191	add	%rax,$lo0		# discarded
192	mov	8($ap),%rax
193	adc	\$0,%rdx
194	mov	8(%rsp),$lo0		# tp[1]
195	mov	%rdx,$hi1
196
197	lea	1($j),$j		# j++
198	jmp	.Linner_enter
199
200.align	16
201.Linner:
202	add	%rax,$hi1
203	mov	($ap,$j,8),%rax
204	adc	\$0,%rdx
205	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
206	mov	(%rsp,$j,8),$lo0
207	adc	\$0,%rdx
208	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
209	mov	%rdx,$hi1
210
211.Linner_enter:
212	mulq	$m0			# ap[j]*bp[i]
213	add	%rax,$hi0
214	mov	($np,$j,8),%rax
215	adc	\$0,%rdx
216	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
217	mov	%rdx,$hi0
218	adc	\$0,$hi0
219	lea	1($j),$j		# j++
220
221	mulq	$m1			# np[j]*m1
222	cmp	$num,$j
223	jne	.Linner
224
225	add	%rax,$hi1
226	mov	($ap),%rax		# ap[0]
227	adc	\$0,%rdx
228	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
229	mov	(%rsp,$j,8),$lo0
230	adc	\$0,%rdx
231	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
232	mov	%rdx,$hi1
233
234	xor	%rdx,%rdx
235	add	$hi0,$hi1
236	adc	\$0,%rdx
237	add	$lo0,$hi1		# pull upmost overflow bit
238	adc	\$0,%rdx
239	mov	$hi1,-8(%rsp,$num,8)
240	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
241
242	lea	1($i),$i		# i++
243	cmp	$num,$i
244	jl	.Louter
245
246	xor	$i,$i			# i=0 and clear CF!
247	mov	(%rsp),%rax		# tp[0]
248	lea	(%rsp),$ap		# borrow ap for tp
249	mov	$num,$j			# j=num
250	jmp	.Lsub
251.align	16
252.Lsub:	sbb	($np,$i,8),%rax
253	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
254	mov	8($ap,$i,8),%rax	# tp[i+1]
255	lea	1($i),$i		# i++
256	dec	$j			# doesn't affect CF!
257	jnz	.Lsub
258
259	sbb	\$0,%rax		# handle upmost overflow bit
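	# %rax is now -(borrow): all ones if tp < np (tp is already fully
	# reduced and is copied as is), all zeros if tp >= np (rp already
	# holds tp-np). The and/not/or sequence below selects the source
	# for the .Lcopy loop without taking a branch.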
260	xor	$i,$i
261	and	%rax,$ap
262	not	%rax
263	mov	$rp,$np
264	and	%rax,$np
265	mov	$num,$j			# j=num
266	or	$np,$ap			# ap=borrow?tp:rp
267.align	16
268.Lcopy:					# copy or in-place refresh
269	mov	($ap,$i,8),%rax
270	mov	$i,(%rsp,$i,8)		# zap temporary vector
271	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
272	lea	1($i),$i
273	sub	\$1,$j
274	jnz	.Lcopy
275
276	mov	8(%rsp,$num,8),%rsi	# restore %rsp
277	mov	\$1,%rax
278	mov	(%rsi),%r15
279	mov	8(%rsi),%r14
280	mov	16(%rsi),%r13
281	mov	24(%rsi),%r12
282	mov	32(%rsi),%rbp
283	mov	40(%rsi),%rbx
284	lea	48(%rsi),%rsp
285.Lmul_epilogue:
286	ret
287.size	bn_mul_mont,.-bn_mul_mont
288___
289{{{
290my @A=("%r10","%r11");
291my @N=("%r13","%rdi");
292$code.=<<___;
293.type	bn_mul4x_mont,\@function,6
294.align	16
295bn_mul4x_mont:
296.Lmul4x_enter:
297	push	%rbx
298	push	%rbp
299	push	%r12
300	push	%r13
301	push	%r14
302	push	%r15
303
304	mov	${num}d,${num}d
305	lea	4($num),%r10
306	mov	%rsp,%r11
307	neg	%r10
308	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
309	and	\$-1024,%rsp		# minimize TLB usage
310
311	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
312.Lmul4x_body:
313	sub	%rsp,%r11
314	and	\$-4096,%r11
315.Lmul4x_page_walk:
316	mov	(%rsp,%r11),%r10
317	sub	\$4096,%r11
318	.byte	0x2e			# predict non-taken
319	jnc	.Lmul4x_page_walk
320
321	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
322	mov	%rdx,%r12		# reassign $bp
323___
324		$bp="%r12";
325$code.=<<___;
326	mov	($n0),$n0		# pull n0[0] value
327	mov	($bp),$m0		# m0=bp[0]
328	mov	($ap),%rax
329
330	xor	$i,$i			# i=0
331	xor	$j,$j			# j=0
332
333	mov	$n0,$m1
334	mulq	$m0			# ap[0]*bp[0]
335	mov	%rax,$A[0]
336	mov	($np),%rax
337
338	imulq	$A[0],$m1		# "tp[0]"*n0
339	mov	%rdx,$A[1]
340
341	mulq	$m1			# np[0]*m1
342	add	%rax,$A[0]		# discarded
343	mov	8($ap),%rax
344	adc	\$0,%rdx
345	mov	%rdx,$N[1]
346
347	mulq	$m0
348	add	%rax,$A[1]
349	mov	8($np),%rax
350	adc	\$0,%rdx
351	mov	%rdx,$A[0]
352
353	mulq	$m1
354	add	%rax,$N[1]
355	mov	16($ap),%rax
356	adc	\$0,%rdx
357	add	$A[1],$N[1]
358	lea	4($j),$j		# j++
359	adc	\$0,%rdx
360	mov	$N[1],(%rsp)
361	mov	%rdx,$N[0]
362	jmp	.L1st4x
363.align	16
364.L1st4x:
365	mulq	$m0			# ap[j]*bp[0]
366	add	%rax,$A[0]
367	mov	-16($np,$j,8),%rax
368	adc	\$0,%rdx
369	mov	%rdx,$A[1]
370
371	mulq	$m1			# np[j]*m1
372	add	%rax,$N[0]
373	mov	-8($ap,$j,8),%rax
374	adc	\$0,%rdx
375	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
376	adc	\$0,%rdx
377	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
378	mov	%rdx,$N[1]
379
380	mulq	$m0			# ap[j]*bp[0]
381	add	%rax,$A[1]
382	mov	-8($np,$j,8),%rax
383	adc	\$0,%rdx
384	mov	%rdx,$A[0]
385
386	mulq	$m1			# np[j]*m1
387	add	%rax,$N[1]
388	mov	($ap,$j,8),%rax
389	adc	\$0,%rdx
390	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
391	adc	\$0,%rdx
392	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
393	mov	%rdx,$N[0]
394
395	mulq	$m0			# ap[j]*bp[0]
396	add	%rax,$A[0]
397	mov	($np,$j,8),%rax
398	adc	\$0,%rdx
399	mov	%rdx,$A[1]
400
401	mulq	$m1			# np[j]*m1
402	add	%rax,$N[0]
403	mov	8($ap,$j,8),%rax
404	adc	\$0,%rdx
405	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
406	adc	\$0,%rdx
407	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
408	mov	%rdx,$N[1]
409
410	mulq	$m0			# ap[j]*bp[0]
411	add	%rax,$A[1]
412	mov	8($np,$j,8),%rax
413	adc	\$0,%rdx
414	lea	4($j),$j		# j++
415	mov	%rdx,$A[0]
416
417	mulq	$m1			# np[j]*m1
418	add	%rax,$N[1]
419	mov	-16($ap,$j,8),%rax
420	adc	\$0,%rdx
421	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
422	adc	\$0,%rdx
423	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
424	mov	%rdx,$N[0]
425	cmp	$num,$j
426	jl	.L1st4x
427
428	mulq	$m0			# ap[j]*bp[0]
429	add	%rax,$A[0]
430	mov	-16($np,$j,8),%rax
431	adc	\$0,%rdx
432	mov	%rdx,$A[1]
433
434	mulq	$m1			# np[j]*m1
435	add	%rax,$N[0]
436	mov	-8($ap,$j,8),%rax
437	adc	\$0,%rdx
438	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
439	adc	\$0,%rdx
440	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
441	mov	%rdx,$N[1]
442
443	mulq	$m0			# ap[j]*bp[0]
444	add	%rax,$A[1]
445	mov	-8($np,$j,8),%rax
446	adc	\$0,%rdx
447	mov	%rdx,$A[0]
448
449	mulq	$m1			# np[j]*m1
450	add	%rax,$N[1]
451	mov	($ap),%rax		# ap[0]
452	adc	\$0,%rdx
453	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
454	adc	\$0,%rdx
455	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
456	mov	%rdx,$N[0]
457
458	xor	$N[1],$N[1]
459	add	$A[0],$N[0]
460	adc	\$0,$N[1]
461	mov	$N[0],-8(%rsp,$j,8)
462	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
463
464	lea	1($i),$i		# i++
465.align	4
466.Louter4x:
467	mov	($bp,$i,8),$m0		# m0=bp[i]
468	xor	$j,$j			# j=0
469	mov	(%rsp),$A[0]
470	mov	$n0,$m1
471	mulq	$m0			# ap[0]*bp[i]
472	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
473	mov	($np),%rax
474	adc	\$0,%rdx
475
476	imulq	$A[0],$m1		# tp[0]*n0
477	mov	%rdx,$A[1]
478
479	mulq	$m1			# np[0]*m1
480	add	%rax,$A[0]		# "$N[0]", discarded
481	mov	8($ap),%rax
482	adc	\$0,%rdx
483	mov	%rdx,$N[1]
484
485	mulq	$m0			# ap[j]*bp[i]
486	add	%rax,$A[1]
487	mov	8($np),%rax
488	adc	\$0,%rdx
489	add	8(%rsp),$A[1]		# +tp[1]
490	adc	\$0,%rdx
491	mov	%rdx,$A[0]
492
493	mulq	$m1			# np[j]*m1
494	add	%rax,$N[1]
495	mov	16($ap),%rax
496	adc	\$0,%rdx
497	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
498	lea	4($j),$j		# j+=2
499	adc	\$0,%rdx
500	mov	$N[1],(%rsp)		# tp[j-1]
501	mov	%rdx,$N[0]
502	jmp	.Linner4x
503.align	16
504.Linner4x:
505	mulq	$m0			# ap[j]*bp[i]
506	add	%rax,$A[0]
507	mov	-16($np,$j,8),%rax
508	adc	\$0,%rdx
509	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
510	adc	\$0,%rdx
511	mov	%rdx,$A[1]
512
513	mulq	$m1			# np[j]*m1
514	add	%rax,$N[0]
515	mov	-8($ap,$j,8),%rax
516	adc	\$0,%rdx
517	add	$A[0],$N[0]
518	adc	\$0,%rdx
519	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
520	mov	%rdx,$N[1]
521
522	mulq	$m0			# ap[j]*bp[i]
523	add	%rax,$A[1]
524	mov	-8($np,$j,8),%rax
525	adc	\$0,%rdx
526	add	-8(%rsp,$j,8),$A[1]
527	adc	\$0,%rdx
528	mov	%rdx,$A[0]
529
530	mulq	$m1			# np[j]*m1
531	add	%rax,$N[1]
532	mov	($ap,$j,8),%rax
533	adc	\$0,%rdx
534	add	$A[1],$N[1]
535	adc	\$0,%rdx
536	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
537	mov	%rdx,$N[0]
538
539	mulq	$m0			# ap[j]*bp[i]
540	add	%rax,$A[0]
541	mov	($np,$j,8),%rax
542	adc	\$0,%rdx
543	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
544	adc	\$0,%rdx
545	mov	%rdx,$A[1]
546
547	mulq	$m1			# np[j]*m1
548	add	%rax,$N[0]
549	mov	8($ap,$j,8),%rax
550	adc	\$0,%rdx
551	add	$A[0],$N[0]
552	adc	\$0,%rdx
553	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
554	mov	%rdx,$N[1]
555
556	mulq	$m0			# ap[j]*bp[i]
557	add	%rax,$A[1]
558	mov	8($np,$j,8),%rax
559	adc	\$0,%rdx
560	add	8(%rsp,$j,8),$A[1]
561	adc	\$0,%rdx
562	lea	4($j),$j		# j++
563	mov	%rdx,$A[0]
564
565	mulq	$m1			# np[j]*m1
566	add	%rax,$N[1]
567	mov	-16($ap,$j,8),%rax
568	adc	\$0,%rdx
569	add	$A[1],$N[1]
570	adc	\$0,%rdx
571	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
572	mov	%rdx,$N[0]
573	cmp	$num,$j
574	jl	.Linner4x
575
576	mulq	$m0			# ap[j]*bp[i]
577	add	%rax,$A[0]
578	mov	-16($np,$j,8),%rax
579	adc	\$0,%rdx
580	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
581	adc	\$0,%rdx
582	mov	%rdx,$A[1]
583
584	mulq	$m1			# np[j]*m1
585	add	%rax,$N[0]
586	mov	-8($ap,$j,8),%rax
587	adc	\$0,%rdx
588	add	$A[0],$N[0]
589	adc	\$0,%rdx
590	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
591	mov	%rdx,$N[1]
592
593	mulq	$m0			# ap[j]*bp[i]
594	add	%rax,$A[1]
595	mov	-8($np,$j,8),%rax
596	adc	\$0,%rdx
597	add	-8(%rsp,$j,8),$A[1]
598	adc	\$0,%rdx
599	lea	1($i),$i		# i++
600	mov	%rdx,$A[0]
601
602	mulq	$m1			# np[j]*m1
603	add	%rax,$N[1]
604	mov	($ap),%rax		# ap[0]
605	adc	\$0,%rdx
606	add	$A[1],$N[1]
607	adc	\$0,%rdx
608	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
609	mov	%rdx,$N[0]
610
611	xor	$N[1],$N[1]
612	add	$A[0],$N[0]
613	adc	\$0,$N[1]
614	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
615	adc	\$0,$N[1]
616	mov	$N[0],-8(%rsp,$j,8)
617	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
618
619	cmp	$num,$i
620	jl	.Louter4x
621___
622{
623my @ri=("%rax","%rdx",$m0,$m1);
624$code.=<<___;
625	mov	16(%rsp,$num,8),$rp	# restore $rp
626	mov	0(%rsp),@ri[0]		# tp[0]
627	pxor	%xmm0,%xmm0
628	mov	8(%rsp),@ri[1]		# tp[1]
629	shr	\$2,$num		# num/=4
630	lea	(%rsp),$ap		# borrow ap for tp
631	xor	$i,$i			# i=0 and clear CF!
632
633	sub	0($np),@ri[0]
634	mov	16($ap),@ri[2]		# tp[2]
635	mov	24($ap),@ri[3]		# tp[3]
636	sbb	8($np),@ri[1]
637	lea	-1($num),$j		# j=num/4-1
638	jmp	.Lsub4x
639.align	16
640.Lsub4x:
641	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
642	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
643	sbb	16($np,$i,8),@ri[2]
644	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
645	mov	40($ap,$i,8),@ri[1]
646	sbb	24($np,$i,8),@ri[3]
647	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
648	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
649	sbb	32($np,$i,8),@ri[0]
650	mov	48($ap,$i,8),@ri[2]
651	mov	56($ap,$i,8),@ri[3]
652	sbb	40($np,$i,8),@ri[1]
653	lea	4($i),$i		# i++
654	dec	$j			# doesn't affect CF!
655	jnz	.Lsub4x
656
657	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
658	mov	32($ap,$i,8),@ri[0]	# load overflow bit
659	sbb	16($np,$i,8),@ri[2]
660	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
661	sbb	24($np,$i,8),@ri[3]
662	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
663
664	sbb	\$0,@ri[0]		# handle upmost overflow bit
665	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
666	xor	$i,$i			# i=0
667	and	@ri[0],$ap
668	not	@ri[0]
669	mov	$rp,$np
670	and	@ri[0],$np
671	lea	-1($num),$j
672	or	$np,$ap			# ap=borrow?tp:rp
673
674	movdqu	($ap),%xmm1
675	movdqa	%xmm0,(%rsp)
676	movdqu	%xmm1,($rp)
677	jmp	.Lcopy4x
678.align	16
679.Lcopy4x:					# copy or in-place refresh
680	movdqu	16($ap,$i),%xmm2
681	movdqu	32($ap,$i),%xmm1
682	movdqa	%xmm0,16(%rsp,$i)
683	movdqu	%xmm2,16($rp,$i)
684	movdqa	%xmm0,32(%rsp,$i)
685	movdqu	%xmm1,32($rp,$i)
686	lea	32($i),$i
687	dec	$j
688	jnz	.Lcopy4x
689
690	shl	\$2,$num
691	movdqu	16($ap,$i),%xmm2
692	movdqa	%xmm0,16(%rsp,$i)
693	movdqu	%xmm2,16($rp,$i)
694___
695}
696$code.=<<___;
697	mov	8(%rsp,$num,8),%rsi	# restore %rsp
698	mov	\$1,%rax
699	mov	(%rsi),%r15
700	mov	8(%rsi),%r14
701	mov	16(%rsi),%r13
702	mov	24(%rsi),%r12
703	mov	32(%rsi),%rbp
704	mov	40(%rsi),%rbx
705	lea	48(%rsi),%rsp
706.Lmul4x_epilogue:
707	ret
708.size	bn_mul4x_mont,.-bn_mul4x_mont
709___
710}}}
711{{{
712######################################################################
713# void bn_sqr4x_mont(
714my $rptr="%rdi";	# const BN_ULONG *rptr,
715my $aptr="%rsi";	# const BN_ULONG *aptr,
716my $bptr="%rdx";	# not used
717my $nptr="%rcx";	# const BN_ULONG *nptr,
718my $n0  ="%r8";		# const BN_ULONG *n0,
719my $num ="%r9";		# int num); has to be divisible by 4 and
720			# not less than 8
721
722my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
723my @A0=("%r10","%r11");
724my @A1=("%r12","%r13");
725my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
726
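# Reference model of the squaring strategy used below (illustrative only,
# never called by the generator; the helper name and use of Math::BigInt
# are not part of the original module): a^2 is assembled from the cross
# products a[i]*a[j], i<j, each computed once, which are then doubled
# ("shifted by 1 to the left") with the diagonal terms a[i]*a[i] added in,
# the a)/b) split described in the comments at the top of the squaring
# code.
sub sqr_reference {
	require Math::BigInt;
	my @a = @_;			# little-endian 64-bit words (numbers or decimal strings)
	my $t = Math::BigInt->new(0);
	for my $i (0..$#a) {		# a) everything but a[i]*a[i]
		for my $j ($i+1..$#a) {
			$t->badd(Math::BigInt->new($a[$i])->bmul($a[$j])->blsft(64*($i+$j)));
		}
	}
	$t->blsft(1);			# b) shift result of a) by 1 to the left ...
	for my $i (0..$#a) {		#    ... and accumulate a[i]*a[i] products
		$t->badd(Math::BigInt->new($a[$i])->bmul($a[$i])->blsft(128*$i));
	}
	return $t;			# == (sum a[i]*2^(64*i))^2
}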
727$code.=<<___;
728.type	bn_sqr4x_mont,\@function,6
729.align	16
730bn_sqr4x_mont:
731.Lsqr4x_enter:
732	mov	%rsp,%rax
733	push	%rbx
734	push	%rbp
735	push	%r12
736	push	%r13
737	push	%r14
738	push	%r15
739
740	shl	\$3,${num}d		# convert $num to bytes
741	mov	%rsp,%r11		# put aside %rsp
742	neg	$num			# -$num
743	mov	($n0),$n0		# *n0
744	lea	-72(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
745	and	\$-1024,%rsp		# minimize TLB usage
746
747	sub	%rsp,%r11
748	and	\$-4096,%r11
749.Lsqr4x_page_walk:
750	mov	(%rsp,%r11),%r10
751	sub	\$4096,%r11
752	.byte	0x2e			# predict non-taken
753	jnc	.Lsqr4x_page_walk
754
755	mov	$num,%r10
756	neg	$num			# restore $num
757	lea	-48(%rax),%r11		# restore saved %rsp
758	##############################################################
759	# Stack layout
760	#
761	# +0	saved $num, used in reduction section
762	# +8	&t[2*$num], used in reduction section
763	# +32	saved $rptr
764	# +40	saved $nptr
765	# +48	saved *n0
766	# +56	saved %rsp
767	# +64	t[2*$num]
768	#
769	mov	$rptr,32(%rsp)		# save $rptr
770	mov	$nptr,40(%rsp)
771	mov	$n0,  48(%rsp)
772	mov	%r11, 56(%rsp)		# save original %rsp
773.Lsqr4x_body:
774	##############################################################
775	# Squaring part:
776	#
777	# a) multiply-n-add everything but a[i]*a[i];
778	# b) shift result of a) by 1 to the left and accumulate
779	#    a[i]*a[i] products;
780	#
781	lea	32(%r10),$i		# $i=-($num-32)
782	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
783
784	mov	$num,$j			# $j=$num
785
786					# comments apply to $num==8 case
787	mov	-32($aptr,$i),$a0	# a[0]
788	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
789	mov	-24($aptr,$i),%rax	# a[1]
790	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
791	mov	-16($aptr,$i),$ai	# a[2]
792	mov	%rax,$a1
793
794	mul	$a0			# a[1]*a[0]
795	mov	%rax,$A0[0]		# a[1]*a[0]
796	 mov	$ai,%rax		# a[2]
797	mov	%rdx,$A0[1]
798	mov	$A0[0],-24($tptr,$i)	# t[1]
799
800	xor	$A0[0],$A0[0]
801	mul	$a0			# a[2]*a[0]
802	add	%rax,$A0[1]
803	 mov	$ai,%rax
804	adc	%rdx,$A0[0]
805	mov	$A0[1],-16($tptr,$i)	# t[2]
806
807	lea	-16($i),$j		# j=-16
808
809
810	 mov	8($aptr,$j),$ai		# a[3]
811	mul	$a1			# a[2]*a[1]
812	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
813	 mov	$ai,%rax
814	mov	%rdx,$A1[1]
815
816	xor	$A0[1],$A0[1]
817	add	$A1[0],$A0[0]
818	 lea	16($j),$j
819	adc	\$0,$A0[1]
820	mul	$a0			# a[3]*a[0]
821	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
822	 mov	$ai,%rax
823	adc	%rdx,$A0[1]
824	mov	$A0[0],-8($tptr,$j)	# t[3]
825	jmp	.Lsqr4x_1st
826
827.align	16
828.Lsqr4x_1st:
829	 mov	($aptr,$j),$ai		# a[4]
830	xor	$A1[0],$A1[0]
831	mul	$a1			# a[3]*a[1]
832	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
833	 mov	$ai,%rax
834	adc	%rdx,$A1[0]
835
836	xor	$A0[0],$A0[0]
837	add	$A1[1],$A0[1]
838	adc	\$0,$A0[0]
839	mul	$a0			# a[4]*a[0]
840	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
841	 mov	$ai,%rax		# a[3]
842	adc	%rdx,$A0[0]
843	mov	$A0[1],($tptr,$j)	# t[4]
844
845
846	 mov	8($aptr,$j),$ai		# a[5]
847	xor	$A1[1],$A1[1]
848	mul	$a1			# a[4]*a[3]
849	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
850	 mov	$ai,%rax
851	adc	%rdx,$A1[1]
852
853	xor	$A0[1],$A0[1]
854	add	$A1[0],$A0[0]
855	adc	\$0,$A0[1]
856	mul	$a0			# a[5]*a[2]
857	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
858	 mov	$ai,%rax
859	adc	%rdx,$A0[1]
860	mov	$A0[0],8($tptr,$j)	# t[5]
861
862	 mov	16($aptr,$j),$ai	# a[6]
863	xor	$A1[0],$A1[0]
864	mul	$a1			# a[5]*a[3]
865	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
866	 mov	$ai,%rax
867	adc	%rdx,$A1[0]
868
869	xor	$A0[0],$A0[0]
870	add	$A1[1],$A0[1]
871	adc	\$0,$A0[0]
872	mul	$a0			# a[6]*a[2]
873	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
874	 mov	$ai,%rax		# a[3]
875	adc	%rdx,$A0[0]
876	mov	$A0[1],16($tptr,$j)	# t[6]
877
878
879	 mov	24($aptr,$j),$ai	# a[7]
880	xor	$A1[1],$A1[1]
881	mul	$a1			# a[6]*a[5]
882	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
883	 mov	$ai,%rax
884	adc	%rdx,$A1[1]
885
886	xor	$A0[1],$A0[1]
887	add	$A1[0],$A0[0]
888	 lea	32($j),$j
889	adc	\$0,$A0[1]
890	mul	$a0			# a[7]*a[4]
891	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
892	 mov	$ai,%rax
893	adc	%rdx,$A0[1]
894	mov	$A0[0],-8($tptr,$j)	# t[7]
895
896	cmp	\$0,$j
897	jne	.Lsqr4x_1st
898
899	xor	$A1[0],$A1[0]
900	add	$A0[1],$A1[1]
901	adc	\$0,$A1[0]
902	mul	$a1			# a[7]*a[5]
903	add	%rax,$A1[1]
904	adc	%rdx,$A1[0]
905
906	mov	$A1[1],($tptr)		# t[8]
907	lea	16($i),$i
908	mov	$A1[0],8($tptr)		# t[9]
909	jmp	.Lsqr4x_outer
910
911.align	16
912.Lsqr4x_outer:				# comments apply to $num==6 case
913	mov	-32($aptr,$i),$a0	# a[0]
914	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
915	mov	-24($aptr,$i),%rax	# a[1]
916	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
917	mov	-16($aptr,$i),$ai	# a[2]
918	mov	%rax,$a1
919
920	mov	-24($tptr,$i),$A0[0]	# t[1]
921	xor	$A0[1],$A0[1]
922	mul	$a0			# a[1]*a[0]
923	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
924	 mov	$ai,%rax		# a[2]
925	adc	%rdx,$A0[1]
926	mov	$A0[0],-24($tptr,$i)	# t[1]
927
928	xor	$A0[0],$A0[0]
929	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
930	adc	\$0,$A0[0]
931	mul	$a0			# a[2]*a[0]
932	add	%rax,$A0[1]
933	 mov	$ai,%rax
934	adc	%rdx,$A0[0]
935	mov	$A0[1],-16($tptr,$i)	# t[2]
936
937	lea	-16($i),$j		# j=-16
938	xor	$A1[0],$A1[0]
939
940
941	 mov	8($aptr,$j),$ai		# a[3]
942	xor	$A1[1],$A1[1]
943	add	8($tptr,$j),$A1[0]
944	adc	\$0,$A1[1]
945	mul	$a1			# a[2]*a[1]
946	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
947	 mov	$ai,%rax
948	adc	%rdx,$A1[1]
949
950	xor	$A0[1],$A0[1]
951	add	$A1[0],$A0[0]
952	adc	\$0,$A0[1]
953	mul	$a0			# a[3]*a[0]
954	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
955	 mov	$ai,%rax
956	adc	%rdx,$A0[1]
957	mov	$A0[0],8($tptr,$j)	# t[3]
958
959	lea	16($j),$j
960	jmp	.Lsqr4x_inner
961
962.align	16
963.Lsqr4x_inner:
964	 mov	($aptr,$j),$ai		# a[4]
965	xor	$A1[0],$A1[0]
966	add	($tptr,$j),$A1[1]
967	adc	\$0,$A1[0]
968	mul	$a1			# a[3]*a[1]
969	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
970	 mov	$ai,%rax
971	adc	%rdx,$A1[0]
972
973	xor	$A0[0],$A0[0]
974	add	$A1[1],$A0[1]
975	adc	\$0,$A0[0]
976	mul	$a0			# a[4]*a[0]
977	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
978	 mov	$ai,%rax		# a[3]
979	adc	%rdx,$A0[0]
980	mov	$A0[1],($tptr,$j)	# t[4]
981
982	 mov	8($aptr,$j),$ai		# a[5]
983	xor	$A1[1],$A1[1]
984	add	8($tptr,$j),$A1[0]
985	adc	\$0,$A1[1]
986	mul	$a1			# a[4]*a[3]
987	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
988	 mov	$ai,%rax
989	adc	%rdx,$A1[1]
990
991	xor	$A0[1],$A0[1]
992	add	$A1[0],$A0[0]
993	lea	16($j),$j		# j++
994	adc	\$0,$A0[1]
995	mul	$a0			# a[5]*a[2]
996	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
997	 mov	$ai,%rax
998	adc	%rdx,$A0[1]
999	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
1000
1001	cmp	\$0,$j
1002	jne	.Lsqr4x_inner
1003
1004	xor	$A1[0],$A1[0]
1005	add	$A0[1],$A1[1]
1006	adc	\$0,$A1[0]
1007	mul	$a1			# a[5]*a[3]
1008	add	%rax,$A1[1]
1009	adc	%rdx,$A1[0]
1010
1011	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
1012	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
1013
1014	add	\$16,$i
1015	jnz	.Lsqr4x_outer
1016
1017					# comments apply to $num==4 case
1018	mov	-32($aptr),$a0		# a[0]
1019	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1020	mov	-24($aptr),%rax		# a[1]
1021	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1022	mov	-16($aptr),$ai		# a[2]
1023	mov	%rax,$a1
1024
1025	xor	$A0[1],$A0[1]
1026	mul	$a0			# a[1]*a[0]
1027	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
1028	 mov	$ai,%rax		# a[2]
1029	adc	%rdx,$A0[1]
1030	mov	$A0[0],-24($tptr)	# t[1]
1031
1032	xor	$A0[0],$A0[0]
1033	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1034	adc	\$0,$A0[0]
1035	mul	$a0			# a[2]*a[0]
1036	add	%rax,$A0[1]
1037	 mov	$ai,%rax
1038	adc	%rdx,$A0[0]
1039	mov	$A0[1],-16($tptr)	# t[2]
1040
1041	 mov	-8($aptr),$ai		# a[3]
1042	mul	$a1			# a[2]*a[1]
1043	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1044	 mov	$ai,%rax
1045	adc	\$0,%rdx
1046
1047	xor	$A0[1],$A0[1]
1048	add	$A1[0],$A0[0]
1049	 mov	%rdx,$A1[1]
1050	adc	\$0,$A0[1]
1051	mul	$a0			# a[3]*a[0]
1052	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1053	 mov	$ai,%rax
1054	adc	%rdx,$A0[1]
1055	mov	$A0[0],-8($tptr)	# t[3]
1056
1057	xor	$A1[0],$A1[0]
1058	add	$A0[1],$A1[1]
1059	adc	\$0,$A1[0]
1060	mul	$a1			# a[3]*a[1]
1061	add	%rax,$A1[1]
1062	 mov	-16($aptr),%rax		# a[2]
1063	adc	%rdx,$A1[0]
1064
1065	mov	$A1[1],($tptr)		# t[4]
1066	mov	$A1[0],8($tptr)		# t[5]
1067
1068	mul	$ai			# a[2]*a[3]
1069___
1070{
1071my ($shift,$carry)=($a0,$a1);
1072my @S=(@A1,$ai,$n0);
1073$code.=<<___;
1074	 add	\$16,$i
1075	 xor	$shift,$shift
1076	 sub	$num,$i			# $i=16-$num
1077	 xor	$carry,$carry
1078
1079	add	$A1[0],%rax		# t[5]
1080	adc	\$0,%rdx
1081	mov	%rax,8($tptr)		# t[5]
1082	mov	%rdx,16($tptr)		# t[6]
1083	mov	$carry,24($tptr)	# t[7]
1084
1085	 mov	-16($aptr,$i),%rax	# a[0]
1086	lea	64(%rsp,$num,2),$tptr
1087	 xor	$A0[0],$A0[0]		# t[0]
1088	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
1089
1090	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1091	shr	\$63,$A0[0]
1092	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1093	shr	\$63,$A0[1]
1094	or	$A0[0],$S[1]		# | t[2*i]>>63
1095	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1096	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1097	mul	%rax			# a[i]*a[i]
1098	neg	$carry			# mov $carry,cf
1099	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1100	adc	%rax,$S[0]
1101	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1102	mov	$S[0],-32($tptr,$i,2)
1103	adc	%rdx,$S[1]
1104
1105	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1106	 mov	$S[1],-24($tptr,$i,2)
1107	 sbb	$carry,$carry		# mov cf,$carry
1108	shr	\$63,$A0[0]
1109	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1110	shr	\$63,$A0[1]
1111	or	$A0[0],$S[3]		# | t[2*i]>>63
1112	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1113	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1114	mul	%rax			# a[i]*a[i]
1115	neg	$carry			# mov $carry,cf
1116	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1117	adc	%rax,$S[2]
1118	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1119	mov	$S[2],-16($tptr,$i,2)
1120	adc	%rdx,$S[3]
1121	lea	16($i),$i
1122	mov	$S[3],-40($tptr,$i,2)
1123	sbb	$carry,$carry		# mov cf,$carry
1124	jmp	.Lsqr4x_shift_n_add
1125
1126.align	16
1127.Lsqr4x_shift_n_add:
1128	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1129	shr	\$63,$A0[0]
1130	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1131	shr	\$63,$A0[1]
1132	or	$A0[0],$S[1]		# | t[2*i]>>63
1133	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1134	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1135	mul	%rax			# a[i]*a[i]
1136	neg	$carry			# mov $carry,cf
1137	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1138	adc	%rax,$S[0]
1139	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1140	mov	$S[0],-32($tptr,$i,2)
1141	adc	%rdx,$S[1]
1142
1143	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1144	 mov	$S[1],-24($tptr,$i,2)
1145	 sbb	$carry,$carry		# mov cf,$carry
1146	shr	\$63,$A0[0]
1147	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1148	shr	\$63,$A0[1]
1149	or	$A0[0],$S[3]		# | t[2*i]>>63
1150	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1151	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1152	mul	%rax			# a[i]*a[i]
1153	neg	$carry			# mov $carry,cf
1154	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1155	adc	%rax,$S[2]
1156	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1157	mov	$S[2],-16($tptr,$i,2)
1158	adc	%rdx,$S[3]
1159
1160	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1161	 mov	$S[3],-8($tptr,$i,2)
1162	 sbb	$carry,$carry		# mov cf,$carry
1163	shr	\$63,$A0[0]
1164	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1165	shr	\$63,$A0[1]
1166	or	$A0[0],$S[1]		# | t[2*i]>>63
1167	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1168	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1169	mul	%rax			# a[i]*a[i]
1170	neg	$carry			# mov $carry,cf
1171	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1172	adc	%rax,$S[0]
1173	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1174	mov	$S[0],0($tptr,$i,2)
1175	adc	%rdx,$S[1]
1176
1177	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1178	 mov	$S[1],8($tptr,$i,2)
1179	 sbb	$carry,$carry		# mov cf,$carry
1180	shr	\$63,$A0[0]
1181	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1182	shr	\$63,$A0[1]
1183	or	$A0[0],$S[3]		# | t[2*i]>>63
1184	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1185	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1186	mul	%rax			# a[i]*a[i]
1187	neg	$carry			# mov $carry,cf
1188	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1189	adc	%rax,$S[2]
1190	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1191	mov	$S[2],16($tptr,$i,2)
1192	adc	%rdx,$S[3]
1193	mov	$S[3],24($tptr,$i,2)
1194	sbb	$carry,$carry		# mov cf,$carry
1195	add	\$32,$i
1196	jnz	.Lsqr4x_shift_n_add
1197
1198	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1199	shr	\$63,$A0[0]
1200	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1201	shr	\$63,$A0[1]
1202	or	$A0[0],$S[1]		# | t[2*i]>>63
1203	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1204	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1205	mul	%rax			# a[i]*a[i]
1206	neg	$carry			# mov $carry,cf
1207	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1208	adc	%rax,$S[0]
1209	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1210	mov	$S[0],-32($tptr)
1211	adc	%rdx,$S[1]
1212
1213	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1214	 mov	$S[1],-24($tptr)
1215	 sbb	$carry,$carry		# mov cf,$carry
1216	shr	\$63,$A0[0]
1217	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1218	shr	\$63,$A0[1]
1219	or	$A0[0],$S[3]		# | t[2*i]>>63
1220	mul	%rax			# a[i]*a[i]
1221	neg	$carry			# mov $carry,cf
1222	adc	%rax,$S[2]
1223	adc	%rdx,$S[3]
1224	mov	$S[2],-16($tptr)
1225	mov	$S[3],-8($tptr)
1226___
1227}
1228##############################################################
1229# Montgomery reduction part, "word-by-word" algorithm.
1230#
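# For reference, the "word-by-word" reduction performed below (illustrative
# only, never called by the generator; the helper name and use of
# Math::BigInt are not part of the original module): each of the $num
# iterations adds a multiple of n chosen to clear the lowest word of the
# double-width t and then shifts t right by one word, so the loop returns
# t*2^(-64*num) mod n, possibly plus n until the final subtraction.
sub mont_reduce_reference {
	require Math::BigInt;
	my ($t,$n,$n0,$num) = @_;	# Math::BigInt values, $n0 = -$n^-1 mod 2^64
	my $word = Math::BigInt->new(1)->blsft(64);
	for (1..$num) {
		my $m0 = $t->copy()->bmod($word)->bmul($n0)->bmod($word);	# m0=t[0]*n0
		$t->badd($n->copy()->bmul($m0));	# t += n*m0, low word becomes 0
		$t->brsft(64);				# t >>= 64
	}
	$t->bsub($n) if ($t->bcmp($n) >= 0);	# handled by .Lsqr4x_sub below
	return $t;
}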
1231{
1232my ($topbit,$nptr)=("%rbp",$aptr);
1233my ($m0,$m1)=($a0,$a1);
1234my @Ni=("%rbx","%r9");
1235$code.=<<___;
1236	mov	40(%rsp),$nptr		# restore $nptr
1237	mov	48(%rsp),$n0		# restore *n0
1238	xor	$j,$j
1239	mov	$num,0(%rsp)		# save $num
1240	sub	$num,$j			# $j=-$num
1241	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
1242	 mov	$n0,$m0			#		# modsched #
1243	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
1244	lea	64(%rsp,$num),$tptr	# end of t[] window
1245	mov	%rax,8(%rsp)		# save end of t[] buffer
1246	lea	($nptr,$num),$nptr	# end of n[] buffer
1247	xor	$topbit,$topbit		# $topbit=0
1248
1249	mov	0($nptr,$j),%rax	# n[0]		# modsched #
1250	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1251	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
1252	 mov	%rax,$Ni[0]		#		# modsched #
1253	jmp	.Lsqr4x_mont_outer
1254
1255.align	16
1256.Lsqr4x_mont_outer:
1257	xor	$A0[1],$A0[1]
1258	mul	$m0			# n[0]*m0
1259	add	%rax,$A0[0]		# n[0]*m0+t[0]
1260	 mov	$Ni[1],%rax
1261	adc	%rdx,$A0[1]
1262	mov	$n0,$m1
1263
1264	xor	$A0[0],$A0[0]
1265	add	8($tptr,$j),$A0[1]
1266	adc	\$0,$A0[0]
1267	mul	$m0			# n[1]*m0
1268	add	%rax,$A0[1]		# n[1]*m0+t[1]
1269	 mov	$Ni[0],%rax
1270	adc	%rdx,$A0[0]
1271
1272	imulq	$A0[1],$m1
1273
1274	mov	16($nptr,$j),$Ni[0]	# n[2]
1275	xor	$A1[1],$A1[1]
1276	add	$A0[1],$A1[0]
1277	adc	\$0,$A1[1]
1278	mul	$m1			# n[0]*m1
1279	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
1280	 mov	$Ni[0],%rax
1281	adc	%rdx,$A1[1]
1282	mov	$A1[0],8($tptr,$j)	# "t[1]"
1283
1284	xor	$A0[1],$A0[1]
1285	add	16($tptr,$j),$A0[0]
1286	adc	\$0,$A0[1]
1287	mul	$m0			# n[2]*m0
1288	add	%rax,$A0[0]		# n[2]*m0+t[2]
1289	 mov	$Ni[1],%rax
1290	adc	%rdx,$A0[1]
1291
1292	mov	24($nptr,$j),$Ni[1]	# n[3]
1293	xor	$A1[0],$A1[0]
1294	add	$A0[0],$A1[1]
1295	adc	\$0,$A1[0]
1296	mul	$m1			# n[1]*m1
1297	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
1298	 mov	$Ni[1],%rax
1299	adc	%rdx,$A1[0]
1300	mov	$A1[1],16($tptr,$j)	# "t[2]"
1301
1302	xor	$A0[0],$A0[0]
1303	add	24($tptr,$j),$A0[1]
1304	lea	32($j),$j
1305	adc	\$0,$A0[0]
1306	mul	$m0			# n[3]*m0
1307	add	%rax,$A0[1]		# n[3]*m0+t[3]
1308	 mov	$Ni[0],%rax
1309	adc	%rdx,$A0[0]
1310	jmp	.Lsqr4x_mont_inner
1311
1312.align	16
1313.Lsqr4x_mont_inner:
1314	mov	($nptr,$j),$Ni[0]	# n[4]
1315	xor	$A1[1],$A1[1]
1316	add	$A0[1],$A1[0]
1317	adc	\$0,$A1[1]
1318	mul	$m1			# n[2]*m1
1319	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
1320	 mov	$Ni[0],%rax
1321	adc	%rdx,$A1[1]
1322	mov	$A1[0],-8($tptr,$j)	# "t[3]"
1323
1324	xor	$A0[1],$A0[1]
1325	add	($tptr,$j),$A0[0]
1326	adc	\$0,$A0[1]
1327	mul	$m0			# n[4]*m0
1328	add	%rax,$A0[0]		# n[4]*m0+t[4]
1329	 mov	$Ni[1],%rax
1330	adc	%rdx,$A0[1]
1331
1332	mov	8($nptr,$j),$Ni[1]	# n[5]
1333	xor	$A1[0],$A1[0]
1334	add	$A0[0],$A1[1]
1335	adc	\$0,$A1[0]
1336	mul	$m1			# n[3]*m1
1337	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
1338	 mov	$Ni[1],%rax
1339	adc	%rdx,$A1[0]
1340	mov	$A1[1],($tptr,$j)	# "t[4]"
1341
1342	xor	$A0[0],$A0[0]
1343	add	8($tptr,$j),$A0[1]
1344	adc	\$0,$A0[0]
1345	mul	$m0			# n[5]*m0
1346	add	%rax,$A0[1]		# n[5]*m0+t[5]
1347	 mov	$Ni[0],%rax
1348	adc	%rdx,$A0[0]
1349
1350
1351	mov	16($nptr,$j),$Ni[0]	# n[6]
1352	xor	$A1[1],$A1[1]
1353	add	$A0[1],$A1[0]
1354	adc	\$0,$A1[1]
1355	mul	$m1			# n[4]*m1
1356	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
1357	 mov	$Ni[0],%rax
1358	adc	%rdx,$A1[1]
1359	mov	$A1[0],8($tptr,$j)	# "t[5]"
1360
1361	xor	$A0[1],$A0[1]
1362	add	16($tptr,$j),$A0[0]
1363	adc	\$0,$A0[1]
1364	mul	$m0			# n[6]*m0
1365	add	%rax,$A0[0]		# n[6]*m0+t[6]
1366	 mov	$Ni[1],%rax
1367	adc	%rdx,$A0[1]
1368
1369	mov	24($nptr,$j),$Ni[1]	# n[7]
1370	xor	$A1[0],$A1[0]
1371	add	$A0[0],$A1[1]
1372	adc	\$0,$A1[0]
1373	mul	$m1			# n[5]*m1
1374	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
1375	 mov	$Ni[1],%rax
1376	adc	%rdx,$A1[0]
1377	mov	$A1[1],16($tptr,$j)	# "t[6]"
1378
1379	xor	$A0[0],$A0[0]
1380	add	24($tptr,$j),$A0[1]
1381	lea	32($j),$j
1382	adc	\$0,$A0[0]
1383	mul	$m0			# n[7]*m0
1384	add	%rax,$A0[1]		# n[7]*m0+t[7]
1385	 mov	$Ni[0],%rax
1386	adc	%rdx,$A0[0]
1387	cmp	\$0,$j
1388	jne	.Lsqr4x_mont_inner
1389
1390	 sub	0(%rsp),$j		# $j=-$num	# modsched #
1391	 mov	$n0,$m0			#		# modsched #
1392
1393	xor	$A1[1],$A1[1]
1394	add	$A0[1],$A1[0]
1395	adc	\$0,$A1[1]
1396	mul	$m1			# n[6]*m1
1397	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
1398	mov	$Ni[1],%rax
1399	adc	%rdx,$A1[1]
1400	mov	$A1[0],-8($tptr)	# "t[7]"
1401
1402	xor	$A0[1],$A0[1]
1403	add	($tptr),$A0[0]		# +t[8]
1404	adc	\$0,$A0[1]
1405	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
1406	add	$topbit,$A0[0]
1407	adc	\$0,$A0[1]
1408
1409	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
1410	xor	$A1[0],$A1[0]
1411	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1412	add	$A0[0],$A1[1]
1413	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
1414	adc	\$0,$A1[0]
1415	mul	$m1			# n[7]*m1
1416	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
1417	 mov	$Ni[0],%rax		#		# modsched #
1418	adc	%rdx,$A1[0]
1419	mov	$A1[1],($tptr)		# "t[8]"
1420
1421	xor	$topbit,$topbit
1422	add	8($tptr),$A1[0]		# +t[9]
1423	adc	$topbit,$topbit
1424	add	$A0[1],$A1[0]
1425	lea	16($tptr),$tptr		# "t[$num]>>128"
1426	adc	\$0,$topbit
1427	mov	$A1[0],-8($tptr)	# "t[9]"
1428	cmp	8(%rsp),$tptr		# are we done?
1429	jb	.Lsqr4x_mont_outer
1430
1431	mov	0(%rsp),$num		# restore $num
1432	mov	$topbit,($tptr)		# save $topbit
1433___
1434}
1435##############################################################
1436# Post-condition, 4x unrolled copy from bn_mul_mont
1437#
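# As in bn_mul_mont, the subtraction loop leaves a borrow mask in @ri[0]
# that is used to pick the copy source without a branch, and the copy loop
# clears the t[] scratch area on the stack as it goes.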
1438{
1439my ($tptr,$nptr)=("%rbx",$aptr);
1440my @ri=("%rax","%rdx","%r10","%r11");
1441$code.=<<___;
1442	mov	64(%rsp,$num),@ri[0]	# tp[0]
1443	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
1444	mov	40(%rsp),$nptr		# restore $nptr
1445	shr	\$5,$num		# num/4
1446	mov	8($tptr),@ri[1]		# t[1]
1447	xor	$i,$i			# i=0 and clear CF!
1448
1449	mov	32(%rsp),$rptr		# restore $rptr
1450	sub	0($nptr),@ri[0]
1451	mov	16($tptr),@ri[2]	# t[2]
1452	mov	24($tptr),@ri[3]	# t[3]
1453	sbb	8($nptr),@ri[1]
1454	lea	-1($num),$j		# j=num/4-1
1455	jmp	.Lsqr4x_sub
1456.align	16
1457.Lsqr4x_sub:
1458	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1459	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1460	sbb	16($nptr,$i,8),@ri[2]
1461	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
1462	mov	40($tptr,$i,8),@ri[1]
1463	sbb	24($nptr,$i,8),@ri[3]
1464	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1465	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1466	sbb	32($nptr,$i,8),@ri[0]
1467	mov	48($tptr,$i,8),@ri[2]
1468	mov	56($tptr,$i,8),@ri[3]
1469	sbb	40($nptr,$i,8),@ri[1]
1470	lea	4($i),$i		# i++
1471	dec	$j			# doesn't affect CF!
1472	jnz	.Lsqr4x_sub
1473
1474	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1475	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
1476	sbb	16($nptr,$i,8),@ri[2]
1477	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1478	sbb	24($nptr,$i,8),@ri[3]
1479	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1480
1481	sbb	\$0,@ri[0]		# handle upmost overflow bit
1482	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1483	xor	$i,$i			# i=0
1484	and	@ri[0],$tptr
1485	not	@ri[0]
1486	mov	$rptr,$nptr
1487	and	@ri[0],$nptr
1488	lea	-1($num),$j
1489	or	$nptr,$tptr		# tp=borrow?tp:rp
1490
1491	pxor	%xmm0,%xmm0
1492	lea	64(%rsp,$num,8),$nptr
1493	movdqu	($tptr),%xmm1
1494	lea	($nptr,$num,8),$nptr
1495	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
1496	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
1497	movdqu	%xmm1,($rptr)
1498	jmp	.Lsqr4x_copy
1499.align	16
1500.Lsqr4x_copy:				# copy or in-place refresh
1501	movdqu	16($tptr,$i),%xmm2
1502	movdqu	32($tptr,$i),%xmm1
1503	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1504	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
1505	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1506	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
1507	movdqu	%xmm2,16($rptr,$i)
1508	movdqu	%xmm1,32($rptr,$i)
1509	lea	32($i),$i
1510	dec	$j
1511	jnz	.Lsqr4x_copy
1512
1513	movdqu	16($tptr,$i),%xmm2
1514	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1515	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1516	movdqu	%xmm2,16($rptr,$i)
1517___
1518}
1519$code.=<<___;
1520	mov	56(%rsp),%rsi		# restore %rsp
1521	mov	\$1,%rax
1522	mov	0(%rsi),%r15
1523	mov	8(%rsi),%r14
1524	mov	16(%rsi),%r13
1525	mov	24(%rsi),%r12
1526	mov	32(%rsi),%rbp
1527	mov	40(%rsi),%rbx
1528	lea	48(%rsi),%rsp
1529.Lsqr4x_epilogue:
1530	ret
1531.size	bn_sqr4x_mont,.-bn_sqr4x_mont
1532___
1533}}}
1534$code.=<<___;
1535.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1536.align	16
1537___
1538
1539# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1540#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1541if ($win64) {
1542$rec="%rcx";
1543$frame="%rdx";
1544$context="%r8";
1545$disp="%r9";
1546
1547$code.=<<___;
1548.extern	__imp_RtlVirtualUnwind
1549.type	mul_handler,\@abi-omnipotent
1550.align	16
1551mul_handler:
1552	push	%rsi
1553	push	%rdi
1554	push	%rbx
1555	push	%rbp
1556	push	%r12
1557	push	%r13
1558	push	%r14
1559	push	%r15
1560	pushfq
1561	sub	\$64,%rsp
1562
1563	mov	120($context),%rax	# pull context->Rax
1564	mov	248($context),%rbx	# pull context->Rip
1565
1566	mov	8($disp),%rsi		# disp->ImageBase
1567	mov	56($disp),%r11		# disp->HandlerData
1568
1569	mov	0(%r11),%r10d		# HandlerData[0]
1570	lea	(%rsi,%r10),%r10	# end of prologue label
1571	cmp	%r10,%rbx		# context->Rip<end of prologue label
1572	jb	.Lcommon_seh_tail
1573
1574	mov	152($context),%rax	# pull context->Rsp
1575
1576	mov	4(%r11),%r10d		# HandlerData[1]
1577	lea	(%rsi,%r10),%r10	# epilogue label
1578	cmp	%r10,%rbx		# context->Rip>=epilogue label
1579	jae	.Lcommon_seh_tail
1580
1581	mov	192($context),%r10	# pull $num
1582	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
1583	lea	48(%rax),%rax
1584
1585	mov	-8(%rax),%rbx
1586	mov	-16(%rax),%rbp
1587	mov	-24(%rax),%r12
1588	mov	-32(%rax),%r13
1589	mov	-40(%rax),%r14
1590	mov	-48(%rax),%r15
1591	mov	%rbx,144($context)	# restore context->Rbx
1592	mov	%rbp,160($context)	# restore context->Rbp
1593	mov	%r12,216($context)	# restore context->R12
1594	mov	%r13,224($context)	# restore context->R13
1595	mov	%r14,232($context)	# restore context->R14
1596	mov	%r15,240($context)	# restore context->R15
1597
1598	jmp	.Lcommon_seh_tail
1599.size	mul_handler,.-mul_handler
1600
1601.type	sqr_handler,\@abi-omnipotent
1602.align	16
1603sqr_handler:
1604	push	%rsi
1605	push	%rdi
1606	push	%rbx
1607	push	%rbp
1608	push	%r12
1609	push	%r13
1610	push	%r14
1611	push	%r15
1612	pushfq
1613	sub	\$64,%rsp
1614
1615	mov	120($context),%rax	# pull context->Rax
1616	mov	248($context),%rbx	# pull context->Rip
1617
1618	lea	.Lsqr4x_body(%rip),%r10
1619	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
1620	jb	.Lcommon_seh_tail
1621
1622	mov	152($context),%rax	# pull context->Rsp
1623
1624	lea	.Lsqr4x_epilogue(%rip),%r10
1625	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
1626	jae	.Lcommon_seh_tail
1627
1628	mov	56(%rax),%rax		# pull saved stack pointer
1629	lea	48(%rax),%rax
1630
1631	mov	-8(%rax),%rbx
1632	mov	-16(%rax),%rbp
1633	mov	-24(%rax),%r12
1634	mov	-32(%rax),%r13
1635	mov	-40(%rax),%r14
1636	mov	-48(%rax),%r15
1637	mov	%rbx,144($context)	# restore context->Rbx
1638	mov	%rbp,160($context)	# restore context->Rbp
1639	mov	%r12,216($context)	# restore context->R12
1640	mov	%r13,224($context)	# restore context->R13
1641	mov	%r14,232($context)	# restore context->R14
1642	mov	%r15,240($context)	# restore context->R15
1643
1644.Lcommon_seh_tail:
1645	mov	8(%rax),%rdi
1646	mov	16(%rax),%rsi
1647	mov	%rax,152($context)	# restore context->Rsp
1648	mov	%rsi,168($context)	# restore context->Rsi
1649	mov	%rdi,176($context)	# restore context->Rdi
1650
1651	mov	40($disp),%rdi		# disp->ContextRecord
1652	mov	$context,%rsi		# context
1653	mov	\$154,%ecx		# sizeof(CONTEXT)
1654	.long	0xa548f3fc		# cld; rep movsq
1655
1656	mov	$disp,%rsi
1657	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1658	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1659	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1660	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1661	mov	40(%rsi),%r10		# disp->ContextRecord
1662	lea	56(%rsi),%r11		# &disp->HandlerData
1663	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1664	mov	%r10,32(%rsp)		# arg5
1665	mov	%r11,40(%rsp)		# arg6
1666	mov	%r12,48(%rsp)		# arg7
1667	mov	%rcx,56(%rsp)		# arg8, (NULL)
1668	call	*__imp_RtlVirtualUnwind(%rip)
1669
1670	mov	\$1,%eax		# ExceptionContinueSearch
1671	add	\$64,%rsp
1672	popfq
1673	pop	%r15
1674	pop	%r14
1675	pop	%r13
1676	pop	%r12
1677	pop	%rbp
1678	pop	%rbx
1679	pop	%rdi
1680	pop	%rsi
1681	ret
1682.size	sqr_handler,.-sqr_handler
1683
1684.section	.pdata
1685.align	4
1686	.rva	.LSEH_begin_bn_mul_mont
1687	.rva	.LSEH_end_bn_mul_mont
1688	.rva	.LSEH_info_bn_mul_mont
1689
1690	.rva	.LSEH_begin_bn_mul4x_mont
1691	.rva	.LSEH_end_bn_mul4x_mont
1692	.rva	.LSEH_info_bn_mul4x_mont
1693
1694	.rva	.LSEH_begin_bn_sqr4x_mont
1695	.rva	.LSEH_end_bn_sqr4x_mont
1696	.rva	.LSEH_info_bn_sqr4x_mont
1697
1698.section	.xdata
1699.align	8
1700.LSEH_info_bn_mul_mont:
1701	.byte	9,0,0,0
1702	.rva	mul_handler
1703	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
1704.LSEH_info_bn_mul4x_mont:
1705	.byte	9,0,0,0
1706	.rva	mul_handler
1707	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1708.LSEH_info_bn_sqr4x_mont:
1709	.byte	9,0,0,0
1710	.rva	sqr_handler
1711___
1712}
1713
1714print $code;
1715close STDOUT;
1716