x86_64-mont.S revision 299966
1# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/x86_64-mont.S 299966 2016-05-16 19:30:27Z jkim $
2# Do not modify. This file is auto-generated from x86_64-mont.pl.
# bn_mul_mont: word-serial Montgomery multiplication. This block holds
# the dispatch to the 4x-unrolled multiply and squaring variants and the
# generic loop.
#
# Arguments as used by this code (System V AMD64 register order):
#   %rdi  rp, result vector (written in .Lsub and .Lcopy)
#   %rsi  ap, first operand vector
#   %rdx  bp, second operand vector (kept in %r12)
#   %rcx  np, vector subtracted from the result in .Lsub (presumably the
#         modulus - TODO confirm against the C prototype)
#   %r8   pointer to one word that scales the running sum into the
#         per-row reduction factor (presumably n0 - TODO confirm)
#   %r9d  num, count of 64-bit words in each vector
# Returns 1 in %rax. tp below names the temporary vector at (%rsp).
3.text
4
5.globl	bn_mul_mont
6.type	bn_mul_mont,@function
7.align	16
8bn_mul_mont:
# Dispatch on num: a count that is not a multiple of 4, or is below 8,
# takes the generic loop.
9	testl	$3,%r9d
10	jnz	.Lmul_enter
11	cmpl	$8,%r9d
12	jb	.Lmul_enter
# With bp != ap use the 4x-unrolled multiply, with bp == ap the squaring
# code.
13	cmpq	%rsi,%rdx
14	jne	.Lmul4x_enter
15	jmp	.Lsqr4x_enter
16
17.align	16
18.Lmul_enter:
# Save the callee-saved registers.
19	pushq	%rbx
20	pushq	%rbp
21	pushq	%r12
22	pushq	%r13
23	pushq	%r14
24	pushq	%r15
25
# Zero-extend num, then allocate num+2 words of stack for tp and round
# the new stack pointer down to a 1024-byte boundary.
26	movl	%r9d,%r9d
27	leaq	2(%r9),%r10
28	movq	%rsp,%r11
29	negq	%r10
30	leaq	(%rsp,%r10,8),%rsp
31	andq	$-1024,%rsp
32
# Slot num+1 of tp keeps the pre-allocation stack pointer; the epilogue
# reloads it to find the saved registers.
33	movq	%r11,8(%rsp,%r9,8)
34.Lmul_body:
35
36
37
38
39
40
# Page walk: read one word at every 4096-byte step between the old and
# the new stack pointer, highest address first, until the offset in
# %r11 borrows.
41	subq	%rsp,%r11
42	andq	$-4096,%r11
43.Lmul_page_walk:
44	movq	(%rsp,%r11,1),%r10
45	subq	$4096,%r11
# Two raw prefix bytes emitted in front of the jnc below (presumably
# padding or a branch hint - TODO confirm against the generator).
46.byte	0x66,0x2e
47	jnc	.Lmul_page_walk
48
# Row 0 setup: %r12 = bp, %r8 = word loaded through the fifth argument,
# %rbx = bp[0], %rax = ap[0]. %r14 is the row index i, %r15 the word
# index j.
49	movq	%rdx,%r12
50	movq	(%r8),%r8
51	movq	(%r12),%rbx
52	movq	(%rsi),%rax
53
54	xorq	%r14,%r14
55	xorq	%r15,%r15
56
57	movq	%r8,%rbp
58	mulq	%rbx
59	movq	%rax,%r10
60	movq	(%rcx),%rax
61
# %rbp = low 64 bits of (low word of ap[0]*bp[0]) * %r8: the reduction
# factor m for this row.
62	imulq	%r10,%rbp
63	movq	%rdx,%r11
64
# np[0]*m is added to the low word only for its carry; the sum itself
# is not stored.
65	mulq	%rbp
66	addq	%rax,%r10
67	movq	8(%rsi),%rax
68	adcq	$0,%rdx
69	movq	%rdx,%r13
70
71	leaq	1(%r15),%r15
72	jmp	.L1st_enter
73
74.align	16
# Row 0 inner loop: each pass forms ap[j]*bp[0] (carry chain in
# %r10/%r11) and np[j]*m (carry chain in %r13) and stores one word of
# tp. %r15 has already been advanced when the store executes, hence the
# -16 displacement.
75.L1st:
76	addq	%rax,%r13
77	movq	(%rsi,%r15,8),%rax
78	adcq	$0,%rdx
79	addq	%r11,%r13
80	movq	%r10,%r11
81	adcq	$0,%rdx
82	movq	%r13,-16(%rsp,%r15,8)
83	movq	%rdx,%r13
84
85.L1st_enter:
86	mulq	%rbx
87	addq	%rax,%r11
88	movq	(%rcx,%r15,8),%rax
89	adcq	$0,%rdx
90	leaq	1(%r15),%r15
91	movq	%rdx,%r10
92
93	mulq	%rbp
94	cmpq	%r9,%r15
95	jne	.L1st
96
# Loop tail: finish tp[num-2] and reload ap[0] into %rax for the next
# row.
97	addq	%rax,%r13
98	movq	(%rsi),%rax
99	adcq	$0,%rdx
100	addq	%r11,%r13
101	adcq	$0,%rdx
102	movq	%r13,-16(%rsp,%r15,8)
103	movq	%rdx,%r13
104	movq	%r10,%r11
105
# Combine the two remaining high words into tp[num-1] and keep the
# carry-out in tp[num].
106	xorq	%rdx,%rdx
107	addq	%r11,%r13
108	adcq	$0,%rdx
109	movq	%r13,-8(%rsp,%r9,8)
110	movq	%rdx,(%rsp,%r9,8)
111
112	leaq	1(%r14),%r14
113	jmp	.Louter
114.align	16
# Rows 1..num-1: same shape as row 0, but every product is also added
# to the word of tp left by the previous row. %rax holds ap[0] on entry.
115.Louter:
116	movq	(%r12,%r14,8),%rbx
117	xorq	%r15,%r15
118	movq	%r8,%rbp
119	movq	(%rsp),%r10
120	mulq	%rbx
121	addq	%rax,%r10
122	movq	(%rcx),%rax
123	adcq	$0,%rdx
124
# m for this row = low 64 bits of (tp[0] + low word of ap[0]*bp[i]) * %r8.
125	imulq	%r10,%rbp
126	movq	%rdx,%r11
127
128	mulq	%rbp
129	addq	%rax,%r10
130	movq	8(%rsi),%rax
131	adcq	$0,%rdx
132	movq	8(%rsp),%r10
133	movq	%rdx,%r13
134
135	leaq	1(%r15),%r15
136	jmp	.Linner_enter
137
138.align	16
139.Linner:
140	addq	%rax,%r13
141	movq	(%rsi,%r15,8),%rax
142	adcq	$0,%rdx
143	addq	%r10,%r13
144	movq	(%rsp,%r15,8),%r10
145	adcq	$0,%rdx
146	movq	%r13,-16(%rsp,%r15,8)
147	movq	%rdx,%r13
148
149.Linner_enter:
150	mulq	%rbx
151	addq	%rax,%r11
152	movq	(%rcx,%r15,8),%rax
153	adcq	$0,%rdx
154	addq	%r11,%r10
155	movq	%rdx,%r11
156	adcq	$0,%r11
157	leaq	1(%r15),%r15
158
159	mulq	%rbp
160	cmpq	%r9,%r15
161	jne	.Linner
162
# Row tail: store tp[num-2], reload ap[0], and pick up the previous
# carry word tp[num] in %r10.
163	addq	%rax,%r13
164	movq	(%rsi),%rax
165	adcq	$0,%rdx
166	addq	%r10,%r13
167	movq	(%rsp,%r15,8),%r10
168	adcq	$0,%rdx
169	movq	%r13,-16(%rsp,%r15,8)
170	movq	%rdx,%r13
171
172	xorq	%rdx,%rdx
173	addq	%r11,%r13
174	adcq	$0,%rdx
175	addq	%r10,%r13
176	adcq	$0,%rdx
177	movq	%r13,-8(%rsp,%r9,8)
178	movq	%rdx,(%rsp,%r9,8)
179
180	leaq	1(%r14),%r14
181	cmpq	%r9,%r14
182	jl	.Louter
183
# Final subtraction rp = tp - np, one word per pass. The xorq clears CF
# before the first sbbq; movq, leaq and decq do not modify CF, so the
# borrow is carried from one pass to the next.
184	xorq	%r14,%r14
185	movq	(%rsp),%rax
186	leaq	(%rsp),%rsi
187	movq	%r9,%r15
188	jmp	.Lsub
189.align	16
190.Lsub:	sbbq	(%rcx,%r14,8),%rax
191	movq	%rax,(%rdi,%r14,8)
192	movq	8(%rsi,%r14,8),%rax
193	leaq	1(%r14),%r14
194	decq	%r15
195	jnz	.Lsub
196
# %rax now holds tp[num]; minus the final borrow it serves as a mask
# (expected to be 0 or all ones) that selects the copy source without a
# branch: tp where the mask is set, rp (the difference) where it is
# clear.
197	sbbq	$0,%rax
198	xorq	%r14,%r14
199	andq	%rax,%rsi
200	notq	%rax
201	movq	%rdi,%rcx
202	andq	%rax,%rcx
203	movq	%r9,%r15
204	orq	%rcx,%rsi
205.align	16
# Copy the selected vector into rp and overwrite each word of tp with
# the loop index along the way.
206.Lcopy:
207	movq	(%rsi,%r14,8),%rax
208	movq	%r14,(%rsp,%r14,8)
209	movq	%rax,(%rdi,%r14,8)
210	leaq	1(%r14),%r14
211	subq	$1,%r15
212	jnz	.Lcopy
213
# Epilogue: reload the saved stack pointer from tp[num+1], set the
# return value to 1, restore the callee-saved registers and release the
# frame.
214	movq	8(%rsp,%r9,8),%rsi
215	movq	$1,%rax
216	movq	(%rsi),%r15
217	movq	8(%rsi),%r14
218	movq	16(%rsi),%r13
219	movq	24(%rsi),%r12
220	movq	32(%rsi),%rbp
221	movq	40(%rsi),%rbx
222	leaq	48(%rsi),%rsp
223.Lmul_epilogue:
# Bytes 0xf3,0xc3 encode rep ret.
224	.byte	0xf3,0xc3
225.size	bn_mul_mont,.-bn_mul_mont
# bn_mul4x_mont: Montgomery multiplication with the inner loops unrolled
# four words per iteration. No .globl directive for this symbol appears
# in this file; bn_mul_mont jumps to .Lmul4x_enter with its argument
# registers untouched:
#   %rdi rp (result), %rsi ap, %rdx bp, %rcx np, %r8 pointer to the
#   reduction word, %r9d num (word count; a multiple of 4 and at least
#   8 when reached through the bn_mul_mont dispatch).
# tp names the temporary vector at (%rsp). Returns 1 in %rax.
226.type	bn_mul4x_mont,@function
227.align	16
228bn_mul4x_mont:
229.Lmul4x_enter:
# Save the callee-saved registers.
230	pushq	%rbx
231	pushq	%rbp
232	pushq	%r12
233	pushq	%r13
234	pushq	%r14
235	pushq	%r15
236
# Zero-extend num, allocate num+4 words of stack and round the new
# stack pointer down to a 1024-byte boundary.
237	movl	%r9d,%r9d
238	leaq	4(%r9),%r10
239	movq	%rsp,%r11
240	negq	%r10
241	leaq	(%rsp,%r10,8),%rsp
242	andq	$-1024,%rsp
243
# Slot num+1 of tp keeps the pre-allocation stack pointer.
244	movq	%r11,8(%rsp,%r9,8)
245.Lmul4x_body:
# Page walk: read one word at every 4096-byte step between the old and
# the new stack pointer, highest address first, until the offset in
# %r11 borrows.
246	subq	%rsp,%r11
247	andq	$-4096,%r11
248.Lmul4x_page_walk:
249	movq	(%rsp,%r11,1),%r10
250	subq	$4096,%r11
# One raw prefix byte emitted in front of the jnc below (presumably a
# branch hint or padding - TODO confirm against the generator).
251.byte	0x2e
252	jnc	.Lmul4x_page_walk
253
# %rdi is reused as an accumulator below, so rp is parked in slot num+2
# of tp.
254	movq	%rdi,16(%rsp,%r9,8)
# Row 0 setup: %r12 = bp, %r8 = word loaded through the fifth argument,
# %rbx = bp[0], %rax = ap[0]; %r14 = row index i, %r15 = word index j.
255	movq	%rdx,%r12
256	movq	(%r8),%r8
257	movq	(%r12),%rbx
258	movq	(%rsi),%rax
259
260	xorq	%r14,%r14
261	xorq	%r15,%r15
262
263	movq	%r8,%rbp
264	mulq	%rbx
265	movq	%rax,%r10
266	movq	(%rcx),%rax
267
# %rbp = low 64 bits of (low word of ap[0]*bp[0]) * %r8: the reduction
# factor m for this row.
268	imulq	%r10,%rbp
269	movq	%rdx,%r11
270
# np[0]*m is added to the low word only for its carry.
271	mulq	%rbp
272	addq	%rax,%r10
273	movq	8(%rsi),%rax
274	adcq	$0,%rdx
275	movq	%rdx,%rdi
276
277	mulq	%rbx
278	addq	%rax,%r11
279	movq	8(%rcx),%rax
280	adcq	$0,%rdx
281	movq	%rdx,%r10
282
283	mulq	%rbp
284	addq	%rax,%rdi
285	movq	16(%rsi),%rax
286	adcq	$0,%rdx
287	addq	%r11,%rdi
288	leaq	4(%r15),%r15
289	adcq	$0,%rdx
290	movq	%rdi,(%rsp)
291	movq	%rdx,%r13
292	jmp	.L1st4x
293.align	16
# Row 0 inner loop: four ap[j]*bp[0] and four np[j]*m products per pass.
# The ap products chain through %r10/%r11, the np products through
# %r13/%rdi; each pair yields one stored word of tp.
294.L1st4x:
295	mulq	%rbx
296	addq	%rax,%r10
297	movq	-16(%rcx,%r15,8),%rax
298	adcq	$0,%rdx
299	movq	%rdx,%r11
300
301	mulq	%rbp
302	addq	%rax,%r13
303	movq	-8(%rsi,%r15,8),%rax
304	adcq	$0,%rdx
305	addq	%r10,%r13
306	adcq	$0,%rdx
307	movq	%r13,-24(%rsp,%r15,8)
308	movq	%rdx,%rdi
309
310	mulq	%rbx
311	addq	%rax,%r11
312	movq	-8(%rcx,%r15,8),%rax
313	adcq	$0,%rdx
314	movq	%rdx,%r10
315
316	mulq	%rbp
317	addq	%rax,%rdi
318	movq	(%rsi,%r15,8),%rax
319	adcq	$0,%rdx
320	addq	%r11,%rdi
321	adcq	$0,%rdx
322	movq	%rdi,-16(%rsp,%r15,8)
323	movq	%rdx,%r13
324
325	mulq	%rbx
326	addq	%rax,%r10
327	movq	(%rcx,%r15,8),%rax
328	adcq	$0,%rdx
329	movq	%rdx,%r11
330
331	mulq	%rbp
332	addq	%rax,%r13
333	movq	8(%rsi,%r15,8),%rax
334	adcq	$0,%rdx
335	addq	%r10,%r13
336	adcq	$0,%rdx
337	movq	%r13,-8(%rsp,%r15,8)
338	movq	%rdx,%rdi
339
340	mulq	%rbx
341	addq	%rax,%r11
342	movq	8(%rcx,%r15,8),%rax
343	adcq	$0,%rdx
344	leaq	4(%r15),%r15
345	movq	%rdx,%r10
346
347	mulq	%rbp
348	addq	%rax,%rdi
349	movq	-16(%rsi,%r15,8),%rax
350	adcq	$0,%rdx
351	addq	%r11,%rdi
352	adcq	$0,%rdx
353	movq	%rdi,-32(%rsp,%r15,8)
354	movq	%rdx,%r13
355	cmpq	%r9,%r15
356	jl	.L1st4x
357
# Row 0 tail: the last two word positions, handled outside the loop.
358	mulq	%rbx
359	addq	%rax,%r10
360	movq	-16(%rcx,%r15,8),%rax
361	adcq	$0,%rdx
362	movq	%rdx,%r11
363
364	mulq	%rbp
365	addq	%rax,%r13
366	movq	-8(%rsi,%r15,8),%rax
367	adcq	$0,%rdx
368	addq	%r10,%r13
369	adcq	$0,%rdx
370	movq	%r13,-24(%rsp,%r15,8)
371	movq	%rdx,%rdi
372
373	mulq	%rbx
374	addq	%rax,%r11
375	movq	-8(%rcx,%r15,8),%rax
376	adcq	$0,%rdx
377	movq	%rdx,%r10
378
# %rax is reloaded with ap[0] here for the next row.
379	mulq	%rbp
380	addq	%rax,%rdi
381	movq	(%rsi),%rax
382	adcq	$0,%rdx
383	addq	%r11,%rdi
384	adcq	$0,%rdx
385	movq	%rdi,-16(%rsp,%r15,8)
386	movq	%rdx,%r13
387
# Combine the two remaining high words into tp[num-1]; the carry-out
# goes to tp[num].
388	xorq	%rdi,%rdi
389	addq	%r10,%r13
390	adcq	$0,%rdi
391	movq	%r13,-8(%rsp,%r15,8)
392	movq	%rdi,(%rsp,%r15,8)
393
394	leaq	1(%r14),%r14
395.align	4
# Rows 1..num-1: same shape as row 0, with the words of tp left by the
# previous row added into the ap[j]*bp[i] chain.
396.Louter4x:
397	movq	(%r12,%r14,8),%rbx
398	xorq	%r15,%r15
399	movq	(%rsp),%r10
400	movq	%r8,%rbp
401	mulq	%rbx
402	addq	%rax,%r10
403	movq	(%rcx),%rax
404	adcq	$0,%rdx
405
# m for this row = low 64 bits of (tp[0] + low word of ap[0]*bp[i]) * %r8.
406	imulq	%r10,%rbp
407	movq	%rdx,%r11
408
409	mulq	%rbp
410	addq	%rax,%r10
411	movq	8(%rsi),%rax
412	adcq	$0,%rdx
413	movq	%rdx,%rdi
414
415	mulq	%rbx
416	addq	%rax,%r11
417	movq	8(%rcx),%rax
418	adcq	$0,%rdx
419	addq	8(%rsp),%r11
420	adcq	$0,%rdx
421	movq	%rdx,%r10
422
423	mulq	%rbp
424	addq	%rax,%rdi
425	movq	16(%rsi),%rax
426	adcq	$0,%rdx
427	addq	%r11,%rdi
428	leaq	4(%r15),%r15
429	adcq	$0,%rdx
430	movq	%rdi,(%rsp)
431	movq	%rdx,%r13
432	jmp	.Linner4x
433.align	16
434.Linner4x:
435	mulq	%rbx
436	addq	%rax,%r10
437	movq	-16(%rcx,%r15,8),%rax
438	adcq	$0,%rdx
439	addq	-16(%rsp,%r15,8),%r10
440	adcq	$0,%rdx
441	movq	%rdx,%r11
442
443	mulq	%rbp
444	addq	%rax,%r13
445	movq	-8(%rsi,%r15,8),%rax
446	adcq	$0,%rdx
447	addq	%r10,%r13
448	adcq	$0,%rdx
449	movq	%r13,-24(%rsp,%r15,8)
450	movq	%rdx,%rdi
451
452	mulq	%rbx
453	addq	%rax,%r11
454	movq	-8(%rcx,%r15,8),%rax
455	adcq	$0,%rdx
456	addq	-8(%rsp,%r15,8),%r11
457	adcq	$0,%rdx
458	movq	%rdx,%r10
459
460	mulq	%rbp
461	addq	%rax,%rdi
462	movq	(%rsi,%r15,8),%rax
463	adcq	$0,%rdx
464	addq	%r11,%rdi
465	adcq	$0,%rdx
466	movq	%rdi,-16(%rsp,%r15,8)
467	movq	%rdx,%r13
468
469	mulq	%rbx
470	addq	%rax,%r10
471	movq	(%rcx,%r15,8),%rax
472	adcq	$0,%rdx
473	addq	(%rsp,%r15,8),%r10
474	adcq	$0,%rdx
475	movq	%rdx,%r11
476
477	mulq	%rbp
478	addq	%rax,%r13
479	movq	8(%rsi,%r15,8),%rax
480	adcq	$0,%rdx
481	addq	%r10,%r13
482	adcq	$0,%rdx
483	movq	%r13,-8(%rsp,%r15,8)
484	movq	%rdx,%rdi
485
486	mulq	%rbx
487	addq	%rax,%r11
488	movq	8(%rcx,%r15,8),%rax
489	adcq	$0,%rdx
490	addq	8(%rsp,%r15,8),%r11
491	adcq	$0,%rdx
492	leaq	4(%r15),%r15
493	movq	%rdx,%r10
494
495	mulq	%rbp
496	addq	%rax,%rdi
497	movq	-16(%rsi,%r15,8),%rax
498	adcq	$0,%rdx
499	addq	%r11,%rdi
500	adcq	$0,%rdx
501	movq	%rdi,-32(%rsp,%r15,8)
502	movq	%rdx,%r13
503	cmpq	%r9,%r15
504	jl	.Linner4x
505
# Row tail: the last two word positions; the row index in %r14 is
# advanced in the middle of this sequence.
506	mulq	%rbx
507	addq	%rax,%r10
508	movq	-16(%rcx,%r15,8),%rax
509	adcq	$0,%rdx
510	addq	-16(%rsp,%r15,8),%r10
511	adcq	$0,%rdx
512	movq	%rdx,%r11
513
514	mulq	%rbp
515	addq	%rax,%r13
516	movq	-8(%rsi,%r15,8),%rax
517	adcq	$0,%rdx
518	addq	%r10,%r13
519	adcq	$0,%rdx
520	movq	%r13,-24(%rsp,%r15,8)
521	movq	%rdx,%rdi
522
523	mulq	%rbx
524	addq	%rax,%r11
525	movq	-8(%rcx,%r15,8),%rax
526	adcq	$0,%rdx
527	addq	-8(%rsp,%r15,8),%r11
528	adcq	$0,%rdx
529	leaq	1(%r14),%r14
530	movq	%rdx,%r10
531
532	mulq	%rbp
533	addq	%rax,%rdi
534	movq	(%rsi),%rax
535	adcq	$0,%rdx
536	addq	%r11,%rdi
537	adcq	$0,%rdx
538	movq	%rdi,-16(%rsp,%r15,8)
539	movq	%rdx,%r13
540
# Add the previous carry word tp[num] into the top of this row and
# store the new tp[num-1] and tp[num].
541	xorq	%rdi,%rdi
542	addq	%r10,%r13
543	adcq	$0,%rdi
544	addq	(%rsp,%r9,8),%r13
545	adcq	$0,%rdi
546	movq	%r13,-8(%rsp,%r15,8)
547	movq	%rdi,(%rsp,%r15,8)
548
549	cmpq	%r9,%r14
550	jl	.Louter4x
# Final subtraction rp = tp - np, four words per pass. rp is reloaded
# from slot num+2, %xmm0 is zeroed for the scrub further down, and %r9
# temporarily becomes num/4. The borrow produced by subq is carried by
# the sbbq chain; the movq, leaq and decq in between do not modify CF.
550	movq	16(%rsp,%r9,8),%rdi
551	movq	0(%rsp),%rax
552	pxor	%xmm0,%xmm0
553	movq	8(%rsp),%rdx
554	shrq	$2,%r9
555	leaq	(%rsp),%rsi
556	xorq	%r14,%r14
557
558	subq	0(%rcx),%rax
559	movq	16(%rsi),%rbx
560	movq	24(%rsi),%rbp
561	sbbq	8(%rcx),%rdx
562	leaq	-1(%r9),%r15
563	jmp	.Lsub4x
564.align	16
565.Lsub4x:
566	movq	%rax,0(%rdi,%r14,8)
567	movq	%rdx,8(%rdi,%r14,8)
568	sbbq	16(%rcx,%r14,8),%rbx
569	movq	32(%rsi,%r14,8),%rax
570	movq	40(%rsi,%r14,8),%rdx
571	sbbq	24(%rcx,%r14,8),%rbp
572	movq	%rbx,16(%rdi,%r14,8)
573	movq	%rbp,24(%rdi,%r14,8)
574	sbbq	32(%rcx,%r14,8),%rax
575	movq	48(%rsi,%r14,8),%rbx
576	movq	56(%rsi,%r14,8),%rbp
577	sbbq	40(%rcx,%r14,8),%rdx
578	leaq	4(%r14),%r14
579	decq	%r15
580	jnz	.Lsub4x
581
# Last group of four words; %rax is then loaded with the carry word
# tp[num].
582	movq	%rax,0(%rdi,%r14,8)
583	movq	32(%rsi,%r14,8),%rax
584	sbbq	16(%rcx,%r14,8),%rbx
585	movq	%rdx,8(%rdi,%r14,8)
586	sbbq	24(%rcx,%r14,8),%rbp
587	movq	%rbx,16(%rdi,%r14,8)
588
# tp[num] minus the final borrow serves as a mask (expected to be 0 or
# all ones) that selects the copy source without a branch: tp where the
# mask is set, rp (the difference) where it is clear.
589	sbbq	$0,%rax
590	movq	%rbp,24(%rdi,%r14,8)
591	xorq	%r14,%r14
592	andq	%rax,%rsi
593	notq	%rax
594	movq	%rdi,%rcx
595	andq	%rax,%rcx
596	leaq	-1(%r9),%r15
597	orq	%rcx,%rsi
598
# Copy the selected vector to rp 16 bytes at a time and zero the same
# span of tp with %xmm0. Each chunk is loaded before the matching tp
# slot is cleared, so the copy is safe when the source is tp itself.
599	movdqu	(%rsi),%xmm1
600	movdqa	%xmm0,(%rsp)
601	movdqu	%xmm1,(%rdi)
602	jmp	.Lcopy4x
603.align	16
604.Lcopy4x:
605	movdqu	16(%rsi,%r14,1),%xmm2
606	movdqu	32(%rsi,%r14,1),%xmm1
607	movdqa	%xmm0,16(%rsp,%r14,1)
608	movdqu	%xmm2,16(%rdi,%r14,1)
609	movdqa	%xmm0,32(%rsp,%r14,1)
610	movdqu	%xmm1,32(%rdi,%r14,1)
611	leaq	32(%r14),%r14
612	decq	%r15
613	jnz	.Lcopy4x
614
# Restore %r9 to num so that slot num+1 (saved stack pointer) can be
# addressed, and move the final 16 bytes.
615	shlq	$2,%r9
616	movdqu	16(%rsi,%r14,1),%xmm2
617	movdqa	%xmm0,16(%rsp,%r14,1)
618	movdqu	%xmm2,16(%rdi,%r14,1)
# Epilogue: return 1, restore the callee-saved registers and release
# the frame.
619	movq	8(%rsp,%r9,8),%rsi
620	movq	$1,%rax
621	movq	(%rsi),%r15
622	movq	8(%rsi),%r14
623	movq	16(%rsi),%r13
624	movq	24(%rsi),%r12
625	movq	32(%rsi),%rbp
626	movq	40(%rsi),%rbx
627	leaq	48(%rsi),%rsp
628.Lmul4x_epilogue:
# Bytes 0xf3,0xc3 encode rep ret.
629	.byte	0xf3,0xc3
630.size	bn_mul4x_mont,.-bn_mul4x_mont
# bn_sqr4x_mont: Montgomery squaring. No .globl directive for this
# symbol appears in this file; bn_mul_mont jumps to .Lsqr4x_enter when
# both operand pointers are equal, with its argument registers untouched:
#   %rdi rp (result), %rsi ap, %rcx np, %r8 pointer to the reduction
#   word, %r9d num (word count; a multiple of 4 and at least 8 when
#   reached through the bn_mul_mont dispatch). %rdx is not read.
# Returns 1 in %rax.
#
# Frame layout after the prologue, relative to %rsp:
#   at  0  num*8 (byte length, stored before the reduction phase)
#   at  8  address one past the end of tp
#   at 32  rp
#   at 40  np
#   at 48  word loaded through %r8
#   at 56  address of the six saved callee-saved registers
#   at 64  tp, the temporary vector of 2*num words
#
# Change relative to the generated original: the scrub before
# .Lsqr4x_copy now starts its second stream at the upper half of tp
# (see the note there). The same correction belongs in the generator
# script this file is produced from.
631.type	bn_sqr4x_mont,@function
632.align	16
633bn_sqr4x_mont:
634.Lsqr4x_enter:
# Keep the entry stack pointer in %rax and save the callee-saved
# registers.
635	movq	%rsp,%rax
636	pushq	%rbx
637	pushq	%rbp
638	pushq	%r12
639	pushq	%r13
640	pushq	%r14
641	pushq	%r15
642
# %r9 = num*8 bytes (shll zero-extends), negated for the allocation:
# the stack pointer drops by 2*num*8 + 72 bytes and is rounded down to a
# 1024-byte boundary. %r8 is replaced by the word it points to.
643	shll	$3,%r9d
644	movq	%rsp,%r11
645	negq	%r9
646	movq	(%r8),%r8
647	leaq	-72(%rsp,%r9,2),%rsp
648	andq	$-1024,%rsp
649
# Page walk: read one word at every 4096-byte step between the old and
# the new stack pointer, highest address first, until the offset in
# %r11 borrows.
650	subq	%rsp,%r11
651	andq	$-4096,%r11
652.Lsqr4x_page_walk:
653	movq	(%rsp,%r11,1),%r10
654	subq	$4096,%r11
# One raw prefix byte emitted in front of the jnc below (presumably a
# branch hint or padding - TODO confirm against the generator).
655.byte	0x2e
656	jnc	.Lsqr4x_page_walk
657
# %r10 = -num*8, %r9 = +num*8, %r11 = entry %rsp - 48, which is the
# address of the six registers pushed above.
658	movq	%r9,%r10
659	negq	%r9
660	leaq	-48(%rax),%r11
661
662
663
664
665
666
667
668
669
670
671
672
# Park rp, np, the reduction word and the saved-register address in the
# frame.
673	movq	%rdi,32(%rsp)
674	movq	%rcx,40(%rsp)
675	movq	%r8,48(%rsp)
676	movq	%r11,56(%rsp)
677.Lsqr4x_body:
678
679
680
681
682
683
684
# Phase 1: accumulate products of distinct words of ap into tp.
# %rsi is moved to one past the end of ap; %rbp and %rcx hold negative
# byte offsets that count up towards zero.
685	leaq	32(%r10),%rbp
686	leaq	(%rsi,%r9,1),%rsi
687
688	movq	%r9,%rcx
689
690
# First pass: with %rbp = 32 - num*8 the three loads below fetch ap[0]
# (%r14), ap[1] (%rax, %r15) and ap[2] (%rbx). %rdi is derived from the
# end of tp.
691	movq	-32(%rsi,%rbp,1),%r14
692	leaq	64(%rsp,%r9,2),%rdi
693	movq	-24(%rsi,%rbp,1),%rax
694	leaq	-32(%rdi,%rbp,1),%rdi
695	movq	-16(%rsi,%rbp,1),%rbx
696	movq	%rax,%r15
697
698	mulq	%r14
699	movq	%rax,%r10
700	movq	%rbx,%rax
701	movq	%rdx,%r11
702	movq	%r10,-24(%rdi,%rbp,1)
703
704	xorq	%r10,%r10
705	mulq	%r14
706	addq	%rax,%r11
707	movq	%rbx,%rax
708	adcq	%rdx,%r10
709	movq	%r11,-16(%rdi,%rbp,1)
710
711	leaq	-16(%rbp),%rcx
712
713
714	movq	8(%rsi,%rcx,1),%rbx
715	mulq	%r15
716	movq	%rax,%r12
717	movq	%rbx,%rax
718	movq	%rdx,%r13
719
720	xorq	%r11,%r11
721	addq	%r12,%r10
722	leaq	16(%rcx),%rcx
723	adcq	$0,%r11
724	mulq	%r14
725	addq	%rax,%r10
726	movq	%rbx,%rax
727	adcq	%rdx,%r11
728	movq	%r10,-8(%rdi,%rcx,1)
729	jmp	.Lsqr4x_1st
730
731.align	16
# Each fetched word is multiplied by both %r15 and %r14; the loop
# handles four words per pass and ends when %rcx reaches zero.
732.Lsqr4x_1st:
733	movq	(%rsi,%rcx,1),%rbx
734	xorq	%r12,%r12
735	mulq	%r15
736	addq	%rax,%r13
737	movq	%rbx,%rax
738	adcq	%rdx,%r12
739
740	xorq	%r10,%r10
741	addq	%r13,%r11
742	adcq	$0,%r10
743	mulq	%r14
744	addq	%rax,%r11
745	movq	%rbx,%rax
746	adcq	%rdx,%r10
747	movq	%r11,(%rdi,%rcx,1)
748
749
750	movq	8(%rsi,%rcx,1),%rbx
751	xorq	%r13,%r13
752	mulq	%r15
753	addq	%rax,%r12
754	movq	%rbx,%rax
755	adcq	%rdx,%r13
756
757	xorq	%r11,%r11
758	addq	%r12,%r10
759	adcq	$0,%r11
760	mulq	%r14
761	addq	%rax,%r10
762	movq	%rbx,%rax
763	adcq	%rdx,%r11
764	movq	%r10,8(%rdi,%rcx,1)
765
766	movq	16(%rsi,%rcx,1),%rbx
767	xorq	%r12,%r12
768	mulq	%r15
769	addq	%rax,%r13
770	movq	%rbx,%rax
771	adcq	%rdx,%r12
772
773	xorq	%r10,%r10
774	addq	%r13,%r11
775	adcq	$0,%r10
776	mulq	%r14
777	addq	%rax,%r11
778	movq	%rbx,%rax
779	adcq	%rdx,%r10
780	movq	%r11,16(%rdi,%rcx,1)
781
782
783	movq	24(%rsi,%rcx,1),%rbx
784	xorq	%r13,%r13
785	mulq	%r15
786	addq	%rax,%r12
787	movq	%rbx,%rax
788	adcq	%rdx,%r13
789
790	xorq	%r11,%r11
791	addq	%r12,%r10
792	leaq	32(%rcx),%rcx
793	adcq	$0,%r11
794	mulq	%r14
795	addq	%rax,%r10
796	movq	%rbx,%rax
797	adcq	%rdx,%r11
798	movq	%r10,-8(%rdi,%rcx,1)
799
800	cmpq	$0,%rcx
801	jne	.Lsqr4x_1st
802
803	xorq	%r12,%r12
804	addq	%r11,%r13
805	adcq	$0,%r12
806	mulq	%r15
807	addq	%rax,%r13
808	adcq	%rdx,%r12
809
810	movq	%r13,(%rdi)
811	leaq	16(%rbp),%rbp
812	movq	%r12,8(%rdi)
813	jmp	.Lsqr4x_outer
814
815.align	16
# Later passes: the next pair of multiplier words (%r14, %r15) is taken
# two words further into ap, and the products are added to what tp
# already holds. The loop ends when %rbp reaches zero.
816.Lsqr4x_outer:
817	movq	-32(%rsi,%rbp,1),%r14
818	leaq	64(%rsp,%r9,2),%rdi
819	movq	-24(%rsi,%rbp,1),%rax
820	leaq	-32(%rdi,%rbp,1),%rdi
821	movq	-16(%rsi,%rbp,1),%rbx
822	movq	%rax,%r15
823
824	movq	-24(%rdi,%rbp,1),%r10
825	xorq	%r11,%r11
826	mulq	%r14
827	addq	%rax,%r10
828	movq	%rbx,%rax
829	adcq	%rdx,%r11
830	movq	%r10,-24(%rdi,%rbp,1)
831
832	xorq	%r10,%r10
833	addq	-16(%rdi,%rbp,1),%r11
834	adcq	$0,%r10
835	mulq	%r14
836	addq	%rax,%r11
837	movq	%rbx,%rax
838	adcq	%rdx,%r10
839	movq	%r11,-16(%rdi,%rbp,1)
840
841	leaq	-16(%rbp),%rcx
842	xorq	%r12,%r12
843
844
845	movq	8(%rsi,%rcx,1),%rbx
846	xorq	%r13,%r13
847	addq	8(%rdi,%rcx,1),%r12
848	adcq	$0,%r13
849	mulq	%r15
850	addq	%rax,%r12
851	movq	%rbx,%rax
852	adcq	%rdx,%r13
853
854	xorq	%r11,%r11
855	addq	%r12,%r10
856	adcq	$0,%r11
857	mulq	%r14
858	addq	%rax,%r10
859	movq	%rbx,%rax
860	adcq	%rdx,%r11
861	movq	%r10,8(%rdi,%rcx,1)
862
863	leaq	16(%rcx),%rcx
864	jmp	.Lsqr4x_inner
865
866.align	16
867.Lsqr4x_inner:
868	movq	(%rsi,%rcx,1),%rbx
869	xorq	%r12,%r12
870	addq	(%rdi,%rcx,1),%r13
871	adcq	$0,%r12
872	mulq	%r15
873	addq	%rax,%r13
874	movq	%rbx,%rax
875	adcq	%rdx,%r12
876
877	xorq	%r10,%r10
878	addq	%r13,%r11
879	adcq	$0,%r10
880	mulq	%r14
881	addq	%rax,%r11
882	movq	%rbx,%rax
883	adcq	%rdx,%r10
884	movq	%r11,(%rdi,%rcx,1)
885
886	movq	8(%rsi,%rcx,1),%rbx
887	xorq	%r13,%r13
888	addq	8(%rdi,%rcx,1),%r12
889	adcq	$0,%r13
890	mulq	%r15
891	addq	%rax,%r12
892	movq	%rbx,%rax
893	adcq	%rdx,%r13
894
895	xorq	%r11,%r11
896	addq	%r12,%r10
897	leaq	16(%rcx),%rcx
898	adcq	$0,%r11
899	mulq	%r14
900	addq	%rax,%r10
901	movq	%rbx,%rax
902	adcq	%rdx,%r11
903	movq	%r10,-8(%rdi,%rcx,1)
904
905	cmpq	$0,%rcx
906	jne	.Lsqr4x_inner
907
908	xorq	%r12,%r12
909	addq	%r11,%r13
910	adcq	$0,%r12
911	mulq	%r15
912	addq	%rax,%r13
913	adcq	%rdx,%r12
914
915	movq	%r13,(%rdi)
916	movq	%r12,8(%rdi)
917
918	addq	$16,%rbp
919	jnz	.Lsqr4x_outer
920
921
# Last pass: the final four words of ap, addressed directly from the
# end pointer in %rsi (%rbp is zero here).
922	movq	-32(%rsi),%r14
923	leaq	64(%rsp,%r9,2),%rdi
924	movq	-24(%rsi),%rax
925	leaq	-32(%rdi,%rbp,1),%rdi
926	movq	-16(%rsi),%rbx
927	movq	%rax,%r15
928
929	xorq	%r11,%r11
930	mulq	%r14
931	addq	%rax,%r10
932	movq	%rbx,%rax
933	adcq	%rdx,%r11
934	movq	%r10,-24(%rdi)
935
936	xorq	%r10,%r10
937	addq	%r13,%r11
938	adcq	$0,%r10
939	mulq	%r14
940	addq	%rax,%r11
941	movq	%rbx,%rax
942	adcq	%rdx,%r10
943	movq	%r11,-16(%rdi)
944
945	movq	-8(%rsi),%rbx
946	mulq	%r15
947	addq	%rax,%r12
948	movq	%rbx,%rax
949	adcq	$0,%rdx
950
951	xorq	%r11,%r11
952	addq	%r12,%r10
953	movq	%rdx,%r13
954	adcq	$0,%r11
955	mulq	%r14
956	addq	%rax,%r10
957	movq	%rbx,%rax
958	adcq	%rdx,%r11
959	movq	%r10,-8(%rdi)
960
961	xorq	%r12,%r12
962	addq	%r11,%r13
963	adcq	$0,%r12
964	mulq	%r15
965	addq	%rax,%r13
966	movq	-16(%rsi),%rax
967	adcq	%rdx,%r12
968
969	movq	%r13,(%rdi)
970	movq	%r12,8(%rdi)
971
# Product of the last two words of ap; %rbp becomes 16 - num*8 and
# %r14/%r15 are cleared for the next phase.
972	mulq	%rbx
973	addq	$16,%rbp
974	xorq	%r14,%r14
975	subq	%r9,%rbp
976	xorq	%r15,%r15
977
978	addq	%r12,%rax
979	adcq	$0,%rdx
980	movq	%rax,8(%rdi)
981	movq	%rdx,16(%rdi)
982	movq	%r15,24(%rdi)
983
# Phase 2: double the accumulated cross products and add the squares
# of the individual words. Doubling is a one-bit left shift across
# words: leaq (hi_bit,word,2) forms 2*word plus the bit shifted out of
# the previous word, and shrq $63 extracts the bit to pass on. %rcx is
# zero here (the loops above exit when it reaches zero). mulq %rax
# squares one word of ap. The carry of the addition chain is parked in
# %r15 by sbbq %r15,%r15 and turned back into CF by negq %r15.
984	movq	-16(%rsi,%rbp,1),%rax
985	leaq	64(%rsp,%r9,2),%rdi
986	xorq	%r10,%r10
987	movq	-24(%rdi,%rbp,2),%r11
988
989	leaq	(%r14,%r10,2),%r12
990	shrq	$63,%r10
991	leaq	(%rcx,%r11,2),%r13
992	shrq	$63,%r11
993	orq	%r10,%r13
994	movq	-16(%rdi,%rbp,2),%r10
995	movq	%r11,%r14
996	mulq	%rax
997	negq	%r15
998	movq	-8(%rdi,%rbp,2),%r11
999	adcq	%rax,%r12
1000	movq	-8(%rsi,%rbp,1),%rax
1001	movq	%r12,-32(%rdi,%rbp,2)
1002	adcq	%rdx,%r13
1003
1004	leaq	(%r14,%r10,2),%rbx
1005	movq	%r13,-24(%rdi,%rbp,2)
1006	sbbq	%r15,%r15
1007	shrq	$63,%r10
1008	leaq	(%rcx,%r11,2),%r8
1009	shrq	$63,%r11
1010	orq	%r10,%r8
1011	movq	0(%rdi,%rbp,2),%r10
1012	movq	%r11,%r14
1013	mulq	%rax
1014	negq	%r15
1015	movq	8(%rdi,%rbp,2),%r11
1016	adcq	%rax,%rbx
1017	movq	0(%rsi,%rbp,1),%rax
1018	movq	%rbx,-16(%rdi,%rbp,2)
1019	adcq	%rdx,%r8
1020	leaq	16(%rbp),%rbp
1021	movq	%r8,-40(%rdi,%rbp,2)
1022	sbbq	%r15,%r15
1023	jmp	.Lsqr4x_shift_n_add
1024
1025.align	16
# Four words of ap (eight words of tp) per pass; ends when %rbp reaches
# zero.
1026.Lsqr4x_shift_n_add:
1027	leaq	(%r14,%r10,2),%r12
1028	shrq	$63,%r10
1029	leaq	(%rcx,%r11,2),%r13
1030	shrq	$63,%r11
1031	orq	%r10,%r13
1032	movq	-16(%rdi,%rbp,2),%r10
1033	movq	%r11,%r14
1034	mulq	%rax
1035	negq	%r15
1036	movq	-8(%rdi,%rbp,2),%r11
1037	adcq	%rax,%r12
1038	movq	-8(%rsi,%rbp,1),%rax
1039	movq	%r12,-32(%rdi,%rbp,2)
1040	adcq	%rdx,%r13
1041
1042	leaq	(%r14,%r10,2),%rbx
1043	movq	%r13,-24(%rdi,%rbp,2)
1044	sbbq	%r15,%r15
1045	shrq	$63,%r10
1046	leaq	(%rcx,%r11,2),%r8
1047	shrq	$63,%r11
1048	orq	%r10,%r8
1049	movq	0(%rdi,%rbp,2),%r10
1050	movq	%r11,%r14
1051	mulq	%rax
1052	negq	%r15
1053	movq	8(%rdi,%rbp,2),%r11
1054	adcq	%rax,%rbx
1055	movq	0(%rsi,%rbp,1),%rax
1056	movq	%rbx,-16(%rdi,%rbp,2)
1057	adcq	%rdx,%r8
1058
1059	leaq	(%r14,%r10,2),%r12
1060	movq	%r8,-8(%rdi,%rbp,2)
1061	sbbq	%r15,%r15
1062	shrq	$63,%r10
1063	leaq	(%rcx,%r11,2),%r13
1064	shrq	$63,%r11
1065	orq	%r10,%r13
1066	movq	16(%rdi,%rbp,2),%r10
1067	movq	%r11,%r14
1068	mulq	%rax
1069	negq	%r15
1070	movq	24(%rdi,%rbp,2),%r11
1071	adcq	%rax,%r12
1072	movq	8(%rsi,%rbp,1),%rax
1073	movq	%r12,0(%rdi,%rbp,2)
1074	adcq	%rdx,%r13
1075
1076	leaq	(%r14,%r10,2),%rbx
1077	movq	%r13,8(%rdi,%rbp,2)
1078	sbbq	%r15,%r15
1079	shrq	$63,%r10
1080	leaq	(%rcx,%r11,2),%r8
1081	shrq	$63,%r11
1082	orq	%r10,%r8
1083	movq	32(%rdi,%rbp,2),%r10
1084	movq	%r11,%r14
1085	mulq	%rax
1086	negq	%r15
1087	movq	40(%rdi,%rbp,2),%r11
1088	adcq	%rax,%rbx
1089	movq	16(%rsi,%rbp,1),%rax
1090	movq	%rbx,16(%rdi,%rbp,2)
1091	adcq	%rdx,%r8
1092	movq	%r8,24(%rdi,%rbp,2)
1093	sbbq	%r15,%r15
1094	addq	$32,%rbp
1095	jnz	.Lsqr4x_shift_n_add
1096
# Tail of phase 2: the top four words of tp.
1097	leaq	(%r14,%r10,2),%r12
1098	shrq	$63,%r10
1099	leaq	(%rcx,%r11,2),%r13
1100	shrq	$63,%r11
1101	orq	%r10,%r13
1102	movq	-16(%rdi),%r10
1103	movq	%r11,%r14
1104	mulq	%rax
1105	negq	%r15
1106	movq	-8(%rdi),%r11
1107	adcq	%rax,%r12
1108	movq	-8(%rsi),%rax
1109	movq	%r12,-32(%rdi)
1110	adcq	%rdx,%r13
1111
1112	leaq	(%r14,%r10,2),%rbx
1113	movq	%r13,-24(%rdi)
1114	sbbq	%r15,%r15
1115	shrq	$63,%r10
1116	leaq	(%rcx,%r11,2),%r8
1117	shrq	$63,%r11
1118	orq	%r10,%r8
1119	mulq	%rax
1120	negq	%r15
1121	adcq	%rax,%rbx
1122	adcq	%rdx,%r8
1123	movq	%rbx,-16(%rdi)
1124	movq	%r8,-8(%rdi)
# Phase 3: Montgomery reduction of the 2*num-word square in tp.
# %rsi = one past the end of np, %r8 = reduction word, %rcx = -num*8,
# %rdi = middle of tp, %rbp = running top carry. num*8 is saved at
# 0(%rsp) and the end-of-tp address at 8(%rsp). %r14 = reduction
# factor for the lowest word (low 64 bits of %r8 * tp[0]).
1125	movq	40(%rsp),%rsi
1126	movq	48(%rsp),%r8
1127	xorq	%rcx,%rcx
1128	movq	%r9,0(%rsp)
1129	subq	%r9,%rcx
1130	movq	64(%rsp),%r10
1131	movq	%r8,%r14
1132	leaq	64(%rsp,%r9,2),%rax
1133	leaq	64(%rsp,%r9,1),%rdi
1134	movq	%rax,8(%rsp)
1135	leaq	(%rsi,%r9,1),%rsi
1136	xorq	%rbp,%rbp
1137
1138	movq	0(%rsi,%rcx,1),%rax
1139	movq	8(%rsi,%rcx,1),%r9
1140	imulq	%r10,%r14
1141	movq	%rax,%rbx
1142	jmp	.Lsqr4x_mont_outer
1143
1144.align	16
# Each outer pass uses two reduction factors, %r14 and %r15 (the second
# is derived by the imulq below from the updated next word), and
# advances %rdi by two words. %rbx and %r9 alternate as holders of the
# current np words.
1145.Lsqr4x_mont_outer:
1146	xorq	%r11,%r11
1147	mulq	%r14
1148	addq	%rax,%r10
1149	movq	%r9,%rax
1150	adcq	%rdx,%r11
1151	movq	%r8,%r15
1152
1153	xorq	%r10,%r10
1154	addq	8(%rdi,%rcx,1),%r11
1155	adcq	$0,%r10
1156	mulq	%r14
1157	addq	%rax,%r11
1158	movq	%rbx,%rax
1159	adcq	%rdx,%r10
1160
1161	imulq	%r11,%r15
1162
1163	movq	16(%rsi,%rcx,1),%rbx
1164	xorq	%r13,%r13
1165	addq	%r11,%r12
1166	adcq	$0,%r13
1167	mulq	%r15
1168	addq	%rax,%r12
1169	movq	%rbx,%rax
1170	adcq	%rdx,%r13
1171	movq	%r12,8(%rdi,%rcx,1)
1172
1173	xorq	%r11,%r11
1174	addq	16(%rdi,%rcx,1),%r10
1175	adcq	$0,%r11
1176	mulq	%r14
1177	addq	%rax,%r10
1178	movq	%r9,%rax
1179	adcq	%rdx,%r11
1180
1181	movq	24(%rsi,%rcx,1),%r9
1182	xorq	%r12,%r12
1183	addq	%r10,%r13
1184	adcq	$0,%r12
1185	mulq	%r15
1186	addq	%rax,%r13
1187	movq	%r9,%rax
1188	adcq	%rdx,%r12
1189	movq	%r13,16(%rdi,%rcx,1)
1190
1191	xorq	%r10,%r10
1192	addq	24(%rdi,%rcx,1),%r11
1193	leaq	32(%rcx),%rcx
1194	adcq	$0,%r10
1195	mulq	%r14
1196	addq	%rax,%r11
1197	movq	%rbx,%rax
1198	adcq	%rdx,%r10
1199	jmp	.Lsqr4x_mont_inner
1200
1201.align	16
1202.Lsqr4x_mont_inner:
1203	movq	(%rsi,%rcx,1),%rbx
1204	xorq	%r13,%r13
1205	addq	%r11,%r12
1206	adcq	$0,%r13
1207	mulq	%r15
1208	addq	%rax,%r12
1209	movq	%rbx,%rax
1210	adcq	%rdx,%r13
1211	movq	%r12,-8(%rdi,%rcx,1)
1212
1213	xorq	%r11,%r11
1214	addq	(%rdi,%rcx,1),%r10
1215	adcq	$0,%r11
1216	mulq	%r14
1217	addq	%rax,%r10
1218	movq	%r9,%rax
1219	adcq	%rdx,%r11
1220
1221	movq	8(%rsi,%rcx,1),%r9
1222	xorq	%r12,%r12
1223	addq	%r10,%r13
1224	adcq	$0,%r12
1225	mulq	%r15
1226	addq	%rax,%r13
1227	movq	%r9,%rax
1228	adcq	%rdx,%r12
1229	movq	%r13,(%rdi,%rcx,1)
1230
1231	xorq	%r10,%r10
1232	addq	8(%rdi,%rcx,1),%r11
1233	adcq	$0,%r10
1234	mulq	%r14
1235	addq	%rax,%r11
1236	movq	%rbx,%rax
1237	adcq	%rdx,%r10
1238
1239
1240	movq	16(%rsi,%rcx,1),%rbx
1241	xorq	%r13,%r13
1242	addq	%r11,%r12
1243	adcq	$0,%r13
1244	mulq	%r15
1245	addq	%rax,%r12
1246	movq	%rbx,%rax
1247	adcq	%rdx,%r13
1248	movq	%r12,8(%rdi,%rcx,1)
1249
1250	xorq	%r11,%r11
1251	addq	16(%rdi,%rcx,1),%r10
1252	adcq	$0,%r11
1253	mulq	%r14
1254	addq	%rax,%r10
1255	movq	%r9,%rax
1256	adcq	%rdx,%r11
1257
1258	movq	24(%rsi,%rcx,1),%r9
1259	xorq	%r12,%r12
1260	addq	%r10,%r13
1261	adcq	$0,%r12
1262	mulq	%r15
1263	addq	%rax,%r13
1264	movq	%r9,%rax
1265	adcq	%rdx,%r12
1266	movq	%r13,16(%rdi,%rcx,1)
1267
1268	xorq	%r10,%r10
1269	addq	24(%rdi,%rcx,1),%r11
1270	leaq	32(%rcx),%rcx
1271	adcq	$0,%r10
1272	mulq	%r14
1273	addq	%rax,%r11
1274	movq	%rbx,%rax
1275	adcq	%rdx,%r10
1276	cmpq	$0,%rcx
1277	jne	.Lsqr4x_mont_inner
1278
# End of one outer pass: %rcx is reset to -num*8, the carry in %rbp
# from the previous pass is folded in, and %r14 is prepared for the
# next pass from the next low word of tp.
1279	subq	0(%rsp),%rcx
1280	movq	%r8,%r14
1281
1282	xorq	%r13,%r13
1283	addq	%r11,%r12
1284	adcq	$0,%r13
1285	mulq	%r15
1286	addq	%rax,%r12
1287	movq	%r9,%rax
1288	adcq	%rdx,%r13
1289	movq	%r12,-8(%rdi)
1290
1291	xorq	%r11,%r11
1292	addq	(%rdi),%r10
1293	adcq	$0,%r11
1294	movq	0(%rsi,%rcx,1),%rbx
1295	addq	%rbp,%r10
1296	adcq	$0,%r11
1297
1298	imulq	16(%rdi,%rcx,1),%r14
1299	xorq	%r12,%r12
1300	movq	8(%rsi,%rcx,1),%r9
1301	addq	%r10,%r13
1302	movq	16(%rdi,%rcx,1),%r10
1303	adcq	$0,%r12
1304	mulq	%r15
1305	addq	%rax,%r13
1306	movq	%rbx,%rax
1307	adcq	%rdx,%r12
1308	movq	%r13,(%rdi)
1309
1310	xorq	%rbp,%rbp
1311	addq	8(%rdi),%r12
1312	adcq	%rbp,%rbp
1313	addq	%r11,%r12
1314	leaq	16(%rdi),%rdi
1315	adcq	$0,%rbp
1316	movq	%r12,-8(%rdi)
1317	cmpq	8(%rsp),%rdi
1318	jb	.Lsqr4x_mont_outer
1319
# Phase 4: final subtraction rp = (upper half of tp) - np, four words
# per pass. The top carry in %rbp is stored just past the end of tp,
# %rbx points at the upper half, %rsi = np, %rdi = rp and %r9 becomes
# num/4. The xorq clears CF before the subq/sbbq chain; the movq, leaq
# and decq in between do not modify CF.
1320	movq	0(%rsp),%r9
1321	movq	%rbp,(%rdi)
1322	movq	64(%rsp,%r9,1),%rax
1323	leaq	64(%rsp,%r9,1),%rbx
1324	movq	40(%rsp),%rsi
1325	shrq	$5,%r9
1326	movq	8(%rbx),%rdx
1327	xorq	%rbp,%rbp
1328
1329	movq	32(%rsp),%rdi
1330	subq	0(%rsi),%rax
1331	movq	16(%rbx),%r10
1332	movq	24(%rbx),%r11
1333	sbbq	8(%rsi),%rdx
1334	leaq	-1(%r9),%rcx
1335	jmp	.Lsqr4x_sub
1336.align	16
1337.Lsqr4x_sub:
1338	movq	%rax,0(%rdi,%rbp,8)
1339	movq	%rdx,8(%rdi,%rbp,8)
1340	sbbq	16(%rsi,%rbp,8),%r10
1341	movq	32(%rbx,%rbp,8),%rax
1342	movq	40(%rbx,%rbp,8),%rdx
1343	sbbq	24(%rsi,%rbp,8),%r11
1344	movq	%r10,16(%rdi,%rbp,8)
1345	movq	%r11,24(%rdi,%rbp,8)
1346	sbbq	32(%rsi,%rbp,8),%rax
1347	movq	48(%rbx,%rbp,8),%r10
1348	movq	56(%rbx,%rbp,8),%r11
1349	sbbq	40(%rsi,%rbp,8),%rdx
1350	leaq	4(%rbp),%rbp
1351	decq	%rcx
1352	jnz	.Lsqr4x_sub
1353
# Last group of four words; %rax is then loaded with the top carry word
# stored past the end of tp.
1354	movq	%rax,0(%rdi,%rbp,8)
1355	movq	32(%rbx,%rbp,8),%rax
1356	sbbq	16(%rsi,%rbp,8),%r10
1357	movq	%rdx,8(%rdi,%rbp,8)
1358	sbbq	24(%rsi,%rbp,8),%r11
1359	movq	%r10,16(%rdi,%rbp,8)
1360
# Top carry minus the final borrow serves as a mask (expected to be 0
# or all ones) that selects the copy source without a branch: the
# upper half of tp where the mask is set, rp (the difference) where it
# is clear.
1361	sbbq	$0,%rax
1362	movq	%r11,24(%rdi,%rbp,8)
1363	xorq	%rbp,%rbp
1364	andq	%rax,%rbx
1365	notq	%rax
1366	movq	%rdi,%rsi
1367	andq	%rax,%rsi
1368	leaq	-1(%r9),%rcx
1369	orq	%rsi,%rbx
1370
# Copy the selected vector to rp and zero tp with %xmm0, 16 bytes at a
# time: one stream clears the lower half starting at 64(%rsp), the
# other clears the upper half through %rsi.
#
# Fix: %rsi must point at the upper half of tp, 64 + num*8 bytes above
# %rsp. 0(%rsp) still holds num*8, so it is loaded from there. %r9 is
# num/4 at this point; the previous sequence scaled it by 8 twice and
# only reached 64 + num*4, the middle of the lower half, which left the
# top quarter of tp (part of the unreduced result) on the stack.
# Every source chunk is still loaded before its tp slot is cleared.
1371	pxor	%xmm0,%xmm0
1372	movq	0(%rsp),%rsi
1373	movdqu	(%rbx),%xmm1
1374	leaq	64(%rsp,%rsi,1),%rsi
1375	movdqa	%xmm0,64(%rsp)
1376	movdqa	%xmm0,(%rsi)
1377	movdqu	%xmm1,(%rdi)
1378	jmp	.Lsqr4x_copy
1379.align	16
1380.Lsqr4x_copy:
1381	movdqu	16(%rbx,%rbp,1),%xmm2
1382	movdqu	32(%rbx,%rbp,1),%xmm1
1383	movdqa	%xmm0,80(%rsp,%rbp,1)
1384	movdqa	%xmm0,96(%rsp,%rbp,1)
1385	movdqa	%xmm0,16(%rsi,%rbp,1)
1386	movdqa	%xmm0,32(%rsi,%rbp,1)
1387	movdqu	%xmm2,16(%rdi,%rbp,1)
1388	movdqu	%xmm1,32(%rdi,%rbp,1)
1389	leaq	32(%rbp),%rbp
1390	decq	%rcx
1391	jnz	.Lsqr4x_copy
1392
# Final 16 bytes of the copy and of both scrub streams.
1393	movdqu	16(%rbx,%rbp,1),%xmm2
1394	movdqa	%xmm0,80(%rsp,%rbp,1)
1395	movdqa	%xmm0,16(%rsi,%rbp,1)
1396	movdqu	%xmm2,16(%rdi,%rbp,1)
# Epilogue: return 1, restore the callee-saved registers from the
# address kept at 56(%rsp) and release the frame.
1397	movq	56(%rsp),%rsi
1398	movq	$1,%rax
1399	movq	0(%rsi),%r15
1400	movq	8(%rsi),%r14
1401	movq	16(%rsi),%r13
1402	movq	24(%rsi),%r12
1403	movq	32(%rsi),%rbp
1404	movq	40(%rsi),%rbx
1405	leaq	48(%rsi),%rsp
1406.Lsqr4x_epilogue:
# Bytes 0xf3,0xc3 encode rep ret.
1407	.byte	0xf3,0xc3
1408.size	bn_sqr4x_mont,.-bn_sqr4x_mont
# ASCII, NUL-terminated: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
1409.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1410.align	16
1411