modexp512-x86_64.S revision 299966
1# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/modexp512-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
2# Do not modify. This file is auto-generated from modexp512-x86_64.pl.
3.text
4
# ----------------------------------------------------------------------
# MULADD_128x512 -- file-local helper (no .globl), called from mont_reduce.
#
# Multiplies the eight 64-bit words at (%rsi) by a two-word multiplier and
# adds the products into the running value held in %r8..%r15.
#
#   In:   %rbp       first multiplier word (must be preloaded by the caller)
#         %rdi       pointer; only 8(%rdi) is read (second multiplier word)
#         %rsi       pointer to eight 64-bit words (offsets 0..56)
#         %rcx       pointer; 0(%rcx) and 8(%rcx) receive the two lowest
#                    result words
#         %r8..%r15  accumulator, lowest word in %r8
#   Out:  %r10..%r15,%r8,%r9  remaining eight words, lowest in %r10
#   Clobbers: %rax, %rdx, %rbx, %rbp, flags
# ----------------------------------------------------------------------
5.type	MULADD_128x512,@function
6.align	16
7MULADD_128x512:
# Pass 1: accumulate (%rsi)[0..7] * %rbp into %r8..%r15.
# %rbx carries the high half of each product into the next word.
8	movq	0(%rsi),%rax
9	mulq	%rbp
10	addq	%rax,%r8
11	adcq	$0,%rdx
12	movq	%r8,0(%rcx)	# lowest word is final: store it
13	movq	%rdx,%rbx
14
15	movq	8(%rsi),%rax
16	mulq	%rbp
17	addq	%rax,%r9
18	adcq	$0,%rdx
19	addq	%rbx,%r9
20	adcq	$0,%rdx
21	movq	%rdx,%rbx
22
23	movq	16(%rsi),%rax
24	mulq	%rbp
25	addq	%rax,%r10
26	adcq	$0,%rdx
27	addq	%rbx,%r10
28	adcq	$0,%rdx
29	movq	%rdx,%rbx
30
31	movq	24(%rsi),%rax
32	mulq	%rbp
33	addq	%rax,%r11
34	adcq	$0,%rdx
35	addq	%rbx,%r11
36	adcq	$0,%rdx
37	movq	%rdx,%rbx
38
39	movq	32(%rsi),%rax
40	mulq	%rbp
41	addq	%rax,%r12
42	adcq	$0,%rdx
43	addq	%rbx,%r12
44	adcq	$0,%rdx
45	movq	%rdx,%rbx
46
47	movq	40(%rsi),%rax
48	mulq	%rbp
49	addq	%rax,%r13
50	adcq	$0,%rdx
51	addq	%rbx,%r13
52	adcq	$0,%rdx
53	movq	%rdx,%rbx
54
55	movq	48(%rsi),%rax
56	mulq	%rbp
57	addq	%rax,%r14
58	adcq	$0,%rdx
59	addq	%rbx,%r14
60	adcq	$0,%rdx
61	movq	%rdx,%rbx
62
63	movq	56(%rsi),%rax
64	mulq	%rbp
65	addq	%rax,%r15
66	adcq	$0,%rdx
67	addq	%rbx,%r15
68	adcq	$0,%rdx
69	movq	%rdx,%r8	# %r8 (already stored) is reused as the new top word
# Pass 2: same chain with the second multiplier word, shifted up one word
# (accumulates into %r9..%r15,%r8).
70	movq	8(%rdi),%rbp
71	movq	0(%rsi),%rax
72	mulq	%rbp
73	addq	%rax,%r9
74	adcq	$0,%rdx
75	movq	%r9,8(%rcx)	# second-lowest word is final: store it
76	movq	%rdx,%rbx
77
78	movq	8(%rsi),%rax
79	mulq	%rbp
80	addq	%rax,%r10
81	adcq	$0,%rdx
82	addq	%rbx,%r10
83	adcq	$0,%rdx
84	movq	%rdx,%rbx
85
86	movq	16(%rsi),%rax
87	mulq	%rbp
88	addq	%rax,%r11
89	adcq	$0,%rdx
90	addq	%rbx,%r11
91	adcq	$0,%rdx
92	movq	%rdx,%rbx
93
94	movq	24(%rsi),%rax
95	mulq	%rbp
96	addq	%rax,%r12
97	adcq	$0,%rdx
98	addq	%rbx,%r12
99	adcq	$0,%rdx
100	movq	%rdx,%rbx
101
102	movq	32(%rsi),%rax
103	mulq	%rbp
104	addq	%rax,%r13
105	adcq	$0,%rdx
106	addq	%rbx,%r13
107	adcq	$0,%rdx
108	movq	%rdx,%rbx
109
110	movq	40(%rsi),%rax
111	mulq	%rbp
112	addq	%rax,%r14
113	adcq	$0,%rdx
114	addq	%rbx,%r14
115	adcq	$0,%rdx
116	movq	%rdx,%rbx
117
118	movq	48(%rsi),%rax
119	mulq	%rbp
120	addq	%rax,%r15
121	adcq	$0,%rdx
122	addq	%rbx,%r15
123	adcq	$0,%rdx
124	movq	%rdx,%rbx
125
126	movq	56(%rsi),%rax
127	mulq	%rbp
128	addq	%rax,%r8
129	adcq	$0,%rdx
130	addq	%rbx,%r8
131	adcq	$0,%rdx
132	movq	%rdx,%r9	# %r9 (already stored) becomes the new top word
133	.byte	0xf3,0xc3	# bytes F3 C3 = "rep ret"
134.size	MULADD_128x512,.-MULADD_128x512
# ----------------------------------------------------------------------
# mont_reduce -- file-local helper (no .globl).
#
# Reached by `call` from mod_exp_512 and by tail `jmp` from mont_mul_a3b
# and sqr_reduce, so one return address sits on the stack: a slot that
# mod_exp_512 addresses as N(%rsp) is (N+8)(%rsp) in this routine.
#
#   In (stack):  520..647(%rsp)  16-word input value, lowest word first
#                32(%rsp)        pointer that mod_exp_512 saved from its
#                                %rcx argument (constant-table base)
#                144(%rsp)       pointer to the 8-word destination
#   Scratch:     192(%rsp).., 296(%rsp).., 384(%rsp) (carry save)
#   Out:         8 result words in %r10..%r15,%r8,%r9 (lowest in %r10),
#                also stored at the destination
#   Clobbers:    %rax, %rbx, %rcx, %rdx, %rsi, %rdi, %rbp, flags
#
# NOTE(review): going by the symbol name this is the Montgomery reduction
# step; the layout of the table behind 32(%rsp) is not defined in this
# file, so the comments below describe offsets only -- confirm against
# the code that builds the table.
# ----------------------------------------------------------------------
135.type	mont_reduce,@function
136.align	16
137mont_reduce:
138	leaq	192(%rsp),%rdi	# scratch output area
139	movq	32(%rsp),%rsi
140	addq	$576,%rsi	# %rsi = table base + 576
141	leaq	520(%rsp),%rcx	# %rcx = 16-word input value
142
# Step 1: input words 0..7 plus (8 words at %rsi) * (input words 12..15).
# Each pass consumes one multiplier word, stores the lowest finished word
# to (%rdi), and rotates the register window up by one word.
# Pass for input word 12 (also loads input words 0..7 into %r8..%r15).
143	movq	96(%rcx),%rbp
144	movq	0(%rsi),%rax
145	mulq	%rbp
146	movq	(%rcx),%r8
147	addq	%rax,%r8
148	adcq	$0,%rdx
149	movq	%r8,0(%rdi)
150	movq	%rdx,%rbx
151
152	movq	8(%rsi),%rax
153	mulq	%rbp
154	movq	8(%rcx),%r9
155	addq	%rax,%r9
156	adcq	$0,%rdx
157	addq	%rbx,%r9
158	adcq	$0,%rdx
159	movq	%rdx,%rbx
160
161	movq	16(%rsi),%rax
162	mulq	%rbp
163	movq	16(%rcx),%r10
164	addq	%rax,%r10
165	adcq	$0,%rdx
166	addq	%rbx,%r10
167	adcq	$0,%rdx
168	movq	%rdx,%rbx
169
170	movq	24(%rsi),%rax
171	mulq	%rbp
172	movq	24(%rcx),%r11
173	addq	%rax,%r11
174	adcq	$0,%rdx
175	addq	%rbx,%r11
176	adcq	$0,%rdx
177	movq	%rdx,%rbx
178
179	movq	32(%rsi),%rax
180	mulq	%rbp
181	movq	32(%rcx),%r12
182	addq	%rax,%r12
183	adcq	$0,%rdx
184	addq	%rbx,%r12
185	adcq	$0,%rdx
186	movq	%rdx,%rbx
187
188	movq	40(%rsi),%rax
189	mulq	%rbp
190	movq	40(%rcx),%r13
191	addq	%rax,%r13
192	adcq	$0,%rdx
193	addq	%rbx,%r13
194	adcq	$0,%rdx
195	movq	%rdx,%rbx
196
197	movq	48(%rsi),%rax
198	mulq	%rbp
199	movq	48(%rcx),%r14
200	addq	%rax,%r14
201	adcq	$0,%rdx
202	addq	%rbx,%r14
203	adcq	$0,%rdx
204	movq	%rdx,%rbx
205
206	movq	56(%rsi),%rax
207	mulq	%rbp
208	movq	56(%rcx),%r15
209	addq	%rax,%r15
210	adcq	$0,%rdx
211	addq	%rbx,%r15
212	adcq	$0,%rdx
213	movq	%rdx,%r8
# Pass for input word 13.
214	movq	104(%rcx),%rbp
215	movq	0(%rsi),%rax
216	mulq	%rbp
217	addq	%rax,%r9
218	adcq	$0,%rdx
219	movq	%r9,8(%rdi)
220	movq	%rdx,%rbx
221
222	movq	8(%rsi),%rax
223	mulq	%rbp
224	addq	%rax,%r10
225	adcq	$0,%rdx
226	addq	%rbx,%r10
227	adcq	$0,%rdx
228	movq	%rdx,%rbx
229
230	movq	16(%rsi),%rax
231	mulq	%rbp
232	addq	%rax,%r11
233	adcq	$0,%rdx
234	addq	%rbx,%r11
235	adcq	$0,%rdx
236	movq	%rdx,%rbx
237
238	movq	24(%rsi),%rax
239	mulq	%rbp
240	addq	%rax,%r12
241	adcq	$0,%rdx
242	addq	%rbx,%r12
243	adcq	$0,%rdx
244	movq	%rdx,%rbx
245
246	movq	32(%rsi),%rax
247	mulq	%rbp
248	addq	%rax,%r13
249	adcq	$0,%rdx
250	addq	%rbx,%r13
251	adcq	$0,%rdx
252	movq	%rdx,%rbx
253
254	movq	40(%rsi),%rax
255	mulq	%rbp
256	addq	%rax,%r14
257	adcq	$0,%rdx
258	addq	%rbx,%r14
259	adcq	$0,%rdx
260	movq	%rdx,%rbx
261
262	movq	48(%rsi),%rax
263	mulq	%rbp
264	addq	%rax,%r15
265	adcq	$0,%rdx
266	addq	%rbx,%r15
267	adcq	$0,%rdx
268	movq	%rdx,%rbx
269
270	movq	56(%rsi),%rax
271	mulq	%rbp
272	addq	%rax,%r8
273	adcq	$0,%rdx
274	addq	%rbx,%r8
275	adcq	$0,%rdx
276	movq	%rdx,%r9
# Pass for input word 14.
277	movq	112(%rcx),%rbp
278	movq	0(%rsi),%rax
279	mulq	%rbp
280	addq	%rax,%r10
281	adcq	$0,%rdx
282	movq	%r10,16(%rdi)
283	movq	%rdx,%rbx
284
285	movq	8(%rsi),%rax
286	mulq	%rbp
287	addq	%rax,%r11
288	adcq	$0,%rdx
289	addq	%rbx,%r11
290	adcq	$0,%rdx
291	movq	%rdx,%rbx
292
293	movq	16(%rsi),%rax
294	mulq	%rbp
295	addq	%rax,%r12
296	adcq	$0,%rdx
297	addq	%rbx,%r12
298	adcq	$0,%rdx
299	movq	%rdx,%rbx
300
301	movq	24(%rsi),%rax
302	mulq	%rbp
303	addq	%rax,%r13
304	adcq	$0,%rdx
305	addq	%rbx,%r13
306	adcq	$0,%rdx
307	movq	%rdx,%rbx
308
309	movq	32(%rsi),%rax
310	mulq	%rbp
311	addq	%rax,%r14
312	adcq	$0,%rdx
313	addq	%rbx,%r14
314	adcq	$0,%rdx
315	movq	%rdx,%rbx
316
317	movq	40(%rsi),%rax
318	mulq	%rbp
319	addq	%rax,%r15
320	adcq	$0,%rdx
321	addq	%rbx,%r15
322	adcq	$0,%rdx
323	movq	%rdx,%rbx
324
325	movq	48(%rsi),%rax
326	mulq	%rbp
327	addq	%rax,%r8
328	adcq	$0,%rdx
329	addq	%rbx,%r8
330	adcq	$0,%rdx
331	movq	%rdx,%rbx
332
333	movq	56(%rsi),%rax
334	mulq	%rbp
335	addq	%rax,%r9
336	adcq	$0,%rdx
337	addq	%rbx,%r9
338	adcq	$0,%rdx
339	movq	%rdx,%r10
# Pass for input word 15.
340	movq	120(%rcx),%rbp
341	movq	0(%rsi),%rax
342	mulq	%rbp
343	addq	%rax,%r11
344	adcq	$0,%rdx
345	movq	%r11,24(%rdi)
346	movq	%rdx,%rbx
347
348	movq	8(%rsi),%rax
349	mulq	%rbp
350	addq	%rax,%r12
351	adcq	$0,%rdx
352	addq	%rbx,%r12
353	adcq	$0,%rdx
354	movq	%rdx,%rbx
355
356	movq	16(%rsi),%rax
357	mulq	%rbp
358	addq	%rax,%r13
359	adcq	$0,%rdx
360	addq	%rbx,%r13
361	adcq	$0,%rdx
362	movq	%rdx,%rbx
363
364	movq	24(%rsi),%rax
365	mulq	%rbp
366	addq	%rax,%r14
367	adcq	$0,%rdx
368	addq	%rbx,%r14
369	adcq	$0,%rdx
370	movq	%rdx,%rbx
371
372	movq	32(%rsi),%rax
373	mulq	%rbp
374	addq	%rax,%r15
375	adcq	$0,%rdx
376	addq	%rbx,%r15
377	adcq	$0,%rdx
378	movq	%rdx,%rbx
379
380	movq	40(%rsi),%rax
381	mulq	%rbp
382	addq	%rax,%r8
383	adcq	$0,%rdx
384	addq	%rbx,%r8
385	adcq	$0,%rdx
386	movq	%rdx,%rbx
387
388	movq	48(%rsi),%rax
389	mulq	%rbp
390	addq	%rax,%r9
391	adcq	$0,%rdx
392	addq	%rbx,%r9
393	adcq	$0,%rdx
394	movq	%rdx,%rbx
395
396	movq	56(%rsi),%rax
397	mulq	%rbp
398	addq	%rax,%r10
399	adcq	$0,%rdx
400	addq	%rbx,%r10
401	adcq	$0,%rdx
402	movq	%rdx,%r11
# Add input words 8..11 into the top four words; %rax collects the carry.
403	xorq	%rax,%rax
404
405	addq	64(%rcx),%r8
406	adcq	72(%rcx),%r9
407	adcq	80(%rcx),%r10
408	adcq	88(%rcx),%r11
409	adcq	$0,%rax
410
411
412
413
# Park the top four words: %r8,%r9 go to scratch, %r10 stays in %rbp as the
# first multiplier word for MULADD_128x512, %r11 goes to 88(%rdi), which
# becomes 8(%rdi) (the second multiplier word) after the addq $80 below.
414	movq	%r8,64(%rdi)
415	movq	%r9,72(%rdi)
416	movq	%r10,%rbp
417	movq	%r11,88(%rdi)
418
419	movq	%rax,384(%rsp)	# save the carry word
420
# Reload the four low words stored during step 1; %r12..%r15 are still live.
421	movq	0(%rdi),%r8
422	movq	8(%rdi),%r9
423	movq	16(%rdi),%r10
424	movq	24(%rdi),%r11
425
426
427
428
429
430
431
432
433	addq	$80,%rdi
434
435	addq	$64,%rsi	# next 8-word table block (base + 640)
436	leaq	296(%rsp),%rcx	# MULADD writes its two low words here
437
# Step 2: accumulate (8 words at %rsi) * (%rbp, 8(%rdi)).
438	call	MULADD_128x512
439
440	movq	384(%rsp),%rax
441
442
# Add the two parked words (-16/-8(%rdi) are the 64/72 slots written above).
443	addq	-16(%rdi),%r8
444	adcq	-8(%rdi),%r9
445	movq	%r8,64(%rcx)
446	movq	%r9,72(%rcx)
447
448	adcq	%rax,%rax	# carry word = 2*previous + CF
449	movq	%rax,384(%rsp)
450
451	leaq	192(%rsp),%rdi
452	addq	$64,%rsi	# base + 704: a two-word constant
453
454
455
456
457
# Step 3: low 128 bits of (0(%rcx),8(%rcx)) * (two words at %rsi).
# The low word goes to %rbp, the second word to 8(%rdi); the high halves of
# the two cross products are deliberately dropped.
458	movq	(%rsi),%r8
459	movq	8(%rsi),%rbx
460
461	movq	(%rcx),%rax
462	mulq	%r8
463	movq	%rax,%rbp
464	movq	%rdx,%r9
465
466	movq	8(%rcx),%rax
467	mulq	%r8
468	addq	%rax,%r9
469
470	movq	(%rcx),%rax
471	mulq	%rbx
472	addq	%rax,%r9
473
474	movq	%r9,8(%rdi)
475
476
477	subq	$192,%rsi	# back to table base + 512
478
479	movq	(%rcx),%r8
480	movq	8(%rcx),%r9
481
# Step 4: accumulate (8 words at base+512) * (%rbp, 8(%rdi)) onto the value
# in %r8,%r9 (just reloaded) and %r10..%r15 (left by the previous call).
482	call	MULADD_128x512
483
484
485
486
# Preload the low four words at base+512 for the conditional subtract below.
487	movq	0(%rsi),%rax
488	movq	8(%rsi),%rbx
489	movq	16(%rsi),%rdi
490	movq	24(%rsi),%rdx
491
492
493	movq	384(%rsp),%rbp
494
495	addq	64(%rcx),%r8
496	adcq	72(%rcx),%r9
497
498
499	adcq	%rbp,%rbp	# carry word = 2*previous + CF
500
501
502
# Use the carry word as an index: %rbp = table base + 8*carry, then add the
# eight words found at a 64-byte stride from there.
503	shlq	$3,%rbp
504	movq	32(%rsp),%rcx
505	addq	%rcx,%rbp
506
507
508	xorq	%rsi,%rsi
509
510	addq	0(%rbp),%r10
511	adcq	64(%rbp),%r11
512	adcq	128(%rbp),%r12
513	adcq	192(%rbp),%r13
514	adcq	256(%rbp),%r14
515	adcq	320(%rbp),%r15
516	adcq	384(%rbp),%r8
517	adcq	448(%rbp),%r9
518
519
520
521	sbbq	$0,%rsi	# %rsi = 0 - CF: all-ones mask if the add carried
522
523
# Step 5: subtract (mask AND the 8 words at base+512) without branching.
524	andq	%rsi,%rax
525	andq	%rsi,%rbx
526	andq	%rsi,%rdi
527	andq	%rsi,%rdx
528
529	movq	$1,%rbp
530	subq	%rax,%r10
531	sbbq	%rbx,%r11
532	sbbq	%rdi,%r12
533	sbbq	%rdx,%r13
534
535
536
537
538	sbbq	$0,%rbp	# stash the borrow: %rbp = 1 - borrow
539
540
541
# The addq/andq below overwrite CF, hence the stash above.
542	addq	$512,%rcx
543	movq	32(%rcx),%rax
544	movq	40(%rcx),%rbx
545	movq	48(%rcx),%rdi
546	movq	56(%rcx),%rdx
547
548
549
550	andq	%rsi,%rax
551	andq	%rsi,%rbx
552	andq	%rsi,%rdi
553	andq	%rsi,%rdx
554
555
556
557	subq	$1,%rbp	# recreate CF = stashed borrow
558
559	sbbq	%rax,%r14
560	sbbq	%rbx,%r15
561	sbbq	%rdi,%r8
562	sbbq	%rdx,%r9
563
564
565
# Store the 8-word result through the destination pointer.
566	movq	144(%rsp),%rsi
567	movq	%r10,0(%rsi)
568	movq	%r11,8(%rsi)
569	movq	%r12,16(%rsi)
570	movq	%r13,24(%rsi)
571	movq	%r14,32(%rsi)
572	movq	%r15,40(%rsi)
573	movq	%r8,48(%rsi)
574	movq	%r9,56(%rsi)
575
576	.byte	0xf3,0xc3	# bytes F3 C3 = "rep ret"
577.size	mont_reduce,.-mont_reduce
# ----------------------------------------------------------------------
# mont_mul_a3b -- file-local helper (no .globl), called from mod_exp_512.
#
# Schoolbook 8x8-word multiply; writes the 16-word product to
# 520..647(%rsp) and tail-jumps to mont_reduce, which returns to this
# routine's caller.
#
#   In:   %rdi  pointer to the 8-word multiplier (words 0..56(%rdi))
#         %rsi  pointer to the 8-word multiplicand (words 0..56(%rsi))
#         %r10..%r15,%r8,%r9  the same multiplicand, lowest word in %r10.
#               The first pass multiplies from these registers, later
#               passes read (%rsi); both call sites in this file load the
#               registers from the buffer %rsi points to.
#   Out:  whatever mont_reduce produces
#   Clobbers: %rax, %rdx, %rbx, %rbp, %r8..%r15, flags (plus mont_reduce's)
# ----------------------------------------------------------------------
578.type	mont_mul_a3b,@function
579.align	16
580mont_mul_a3b:
581
582
583
584
# Pass 0: registers * multiplier word 0; nothing to accumulate yet.
585	movq	0(%rdi),%rbp
586
587	movq	%r10,%rax
588	mulq	%rbp
589	movq	%rax,520(%rsp)	# product word 0
590	movq	%rdx,%r10
591	movq	%r11,%rax
592	mulq	%rbp
593	addq	%rax,%r10
594	adcq	$0,%rdx
595	movq	%rdx,%r11
596	movq	%r12,%rax
597	mulq	%rbp
598	addq	%rax,%r11
599	adcq	$0,%rdx
600	movq	%rdx,%r12
601	movq	%r13,%rax
602	mulq	%rbp
603	addq	%rax,%r12
604	adcq	$0,%rdx
605	movq	%rdx,%r13
606	movq	%r14,%rax
607	mulq	%rbp
608	addq	%rax,%r13
609	adcq	$0,%rdx
610	movq	%rdx,%r14
611	movq	%r15,%rax
612	mulq	%rbp
613	addq	%rax,%r14
614	adcq	$0,%rdx
615	movq	%rdx,%r15
616	movq	%r8,%rax
617	mulq	%rbp
618	addq	%rax,%r15
619	adcq	$0,%rdx
620	movq	%rdx,%r8
621	movq	%r9,%rax
622	mulq	%rbp
623	addq	%rax,%r8
624	adcq	$0,%rdx
625	movq	%rdx,%r9
# Pass 1: (%rsi)[0..7] * multiplier word 1, accumulated one word higher.
# From here on %rbx carries the high half of each product to the next word.
626	movq	8(%rdi),%rbp
627	movq	0(%rsi),%rax
628	mulq	%rbp
629	addq	%rax,%r10
630	adcq	$0,%rdx
631	movq	%r10,528(%rsp)	# product word 1
632	movq	%rdx,%rbx
633
634	movq	8(%rsi),%rax
635	mulq	%rbp
636	addq	%rax,%r11
637	adcq	$0,%rdx
638	addq	%rbx,%r11
639	adcq	$0,%rdx
640	movq	%rdx,%rbx
641
642	movq	16(%rsi),%rax
643	mulq	%rbp
644	addq	%rax,%r12
645	adcq	$0,%rdx
646	addq	%rbx,%r12
647	adcq	$0,%rdx
648	movq	%rdx,%rbx
649
650	movq	24(%rsi),%rax
651	mulq	%rbp
652	addq	%rax,%r13
653	adcq	$0,%rdx
654	addq	%rbx,%r13
655	adcq	$0,%rdx
656	movq	%rdx,%rbx
657
658	movq	32(%rsi),%rax
659	mulq	%rbp
660	addq	%rax,%r14
661	adcq	$0,%rdx
662	addq	%rbx,%r14
663	adcq	$0,%rdx
664	movq	%rdx,%rbx
665
666	movq	40(%rsi),%rax
667	mulq	%rbp
668	addq	%rax,%r15
669	adcq	$0,%rdx
670	addq	%rbx,%r15
671	adcq	$0,%rdx
672	movq	%rdx,%rbx
673
674	movq	48(%rsi),%rax
675	mulq	%rbp
676	addq	%rax,%r8
677	adcq	$0,%rdx
678	addq	%rbx,%r8
679	adcq	$0,%rdx
680	movq	%rdx,%rbx
681
682	movq	56(%rsi),%rax
683	mulq	%rbp
684	addq	%rax,%r9
685	adcq	$0,%rdx
686	addq	%rbx,%r9
687	adcq	$0,%rdx
688	movq	%rdx,%r10
# Pass 2: multiplier word 2.
689	movq	16(%rdi),%rbp
690	movq	0(%rsi),%rax
691	mulq	%rbp
692	addq	%rax,%r11
693	adcq	$0,%rdx
694	movq	%r11,536(%rsp)	# product word 2
695	movq	%rdx,%rbx
696
697	movq	8(%rsi),%rax
698	mulq	%rbp
699	addq	%rax,%r12
700	adcq	$0,%rdx
701	addq	%rbx,%r12
702	adcq	$0,%rdx
703	movq	%rdx,%rbx
704
705	movq	16(%rsi),%rax
706	mulq	%rbp
707	addq	%rax,%r13
708	adcq	$0,%rdx
709	addq	%rbx,%r13
710	adcq	$0,%rdx
711	movq	%rdx,%rbx
712
713	movq	24(%rsi),%rax
714	mulq	%rbp
715	addq	%rax,%r14
716	adcq	$0,%rdx
717	addq	%rbx,%r14
718	adcq	$0,%rdx
719	movq	%rdx,%rbx
720
721	movq	32(%rsi),%rax
722	mulq	%rbp
723	addq	%rax,%r15
724	adcq	$0,%rdx
725	addq	%rbx,%r15
726	adcq	$0,%rdx
727	movq	%rdx,%rbx
728
729	movq	40(%rsi),%rax
730	mulq	%rbp
731	addq	%rax,%r8
732	adcq	$0,%rdx
733	addq	%rbx,%r8
734	adcq	$0,%rdx
735	movq	%rdx,%rbx
736
737	movq	48(%rsi),%rax
738	mulq	%rbp
739	addq	%rax,%r9
740	adcq	$0,%rdx
741	addq	%rbx,%r9
742	adcq	$0,%rdx
743	movq	%rdx,%rbx
744
745	movq	56(%rsi),%rax
746	mulq	%rbp
747	addq	%rax,%r10
748	adcq	$0,%rdx
749	addq	%rbx,%r10
750	adcq	$0,%rdx
751	movq	%rdx,%r11
# Pass 3: multiplier word 3.
752	movq	24(%rdi),%rbp
753	movq	0(%rsi),%rax
754	mulq	%rbp
755	addq	%rax,%r12
756	adcq	$0,%rdx
757	movq	%r12,544(%rsp)	# product word 3
758	movq	%rdx,%rbx
759
760	movq	8(%rsi),%rax
761	mulq	%rbp
762	addq	%rax,%r13
763	adcq	$0,%rdx
764	addq	%rbx,%r13
765	adcq	$0,%rdx
766	movq	%rdx,%rbx
767
768	movq	16(%rsi),%rax
769	mulq	%rbp
770	addq	%rax,%r14
771	adcq	$0,%rdx
772	addq	%rbx,%r14
773	adcq	$0,%rdx
774	movq	%rdx,%rbx
775
776	movq	24(%rsi),%rax
777	mulq	%rbp
778	addq	%rax,%r15
779	adcq	$0,%rdx
780	addq	%rbx,%r15
781	adcq	$0,%rdx
782	movq	%rdx,%rbx
783
784	movq	32(%rsi),%rax
785	mulq	%rbp
786	addq	%rax,%r8
787	adcq	$0,%rdx
788	addq	%rbx,%r8
789	adcq	$0,%rdx
790	movq	%rdx,%rbx
791
792	movq	40(%rsi),%rax
793	mulq	%rbp
794	addq	%rax,%r9
795	adcq	$0,%rdx
796	addq	%rbx,%r9
797	adcq	$0,%rdx
798	movq	%rdx,%rbx
799
800	movq	48(%rsi),%rax
801	mulq	%rbp
802	addq	%rax,%r10
803	adcq	$0,%rdx
804	addq	%rbx,%r10
805	adcq	$0,%rdx
806	movq	%rdx,%rbx
807
808	movq	56(%rsi),%rax
809	mulq	%rbp
810	addq	%rax,%r11
811	adcq	$0,%rdx
812	addq	%rbx,%r11
813	adcq	$0,%rdx
814	movq	%rdx,%r12
# Pass 4: multiplier word 4.
815	movq	32(%rdi),%rbp
816	movq	0(%rsi),%rax
817	mulq	%rbp
818	addq	%rax,%r13
819	adcq	$0,%rdx
820	movq	%r13,552(%rsp)	# product word 4
821	movq	%rdx,%rbx
822
823	movq	8(%rsi),%rax
824	mulq	%rbp
825	addq	%rax,%r14
826	adcq	$0,%rdx
827	addq	%rbx,%r14
828	adcq	$0,%rdx
829	movq	%rdx,%rbx
830
831	movq	16(%rsi),%rax
832	mulq	%rbp
833	addq	%rax,%r15
834	adcq	$0,%rdx
835	addq	%rbx,%r15
836	adcq	$0,%rdx
837	movq	%rdx,%rbx
838
839	movq	24(%rsi),%rax
840	mulq	%rbp
841	addq	%rax,%r8
842	adcq	$0,%rdx
843	addq	%rbx,%r8
844	adcq	$0,%rdx
845	movq	%rdx,%rbx
846
847	movq	32(%rsi),%rax
848	mulq	%rbp
849	addq	%rax,%r9
850	adcq	$0,%rdx
851	addq	%rbx,%r9
852	adcq	$0,%rdx
853	movq	%rdx,%rbx
854
855	movq	40(%rsi),%rax
856	mulq	%rbp
857	addq	%rax,%r10
858	adcq	$0,%rdx
859	addq	%rbx,%r10
860	adcq	$0,%rdx
861	movq	%rdx,%rbx
862
863	movq	48(%rsi),%rax
864	mulq	%rbp
865	addq	%rax,%r11
866	adcq	$0,%rdx
867	addq	%rbx,%r11
868	adcq	$0,%rdx
869	movq	%rdx,%rbx
870
871	movq	56(%rsi),%rax
872	mulq	%rbp
873	addq	%rax,%r12
874	adcq	$0,%rdx
875	addq	%rbx,%r12
876	adcq	$0,%rdx
877	movq	%rdx,%r13
# Pass 5: multiplier word 5.
878	movq	40(%rdi),%rbp
879	movq	0(%rsi),%rax
880	mulq	%rbp
881	addq	%rax,%r14
882	adcq	$0,%rdx
883	movq	%r14,560(%rsp)	# product word 5
884	movq	%rdx,%rbx
885
886	movq	8(%rsi),%rax
887	mulq	%rbp
888	addq	%rax,%r15
889	adcq	$0,%rdx
890	addq	%rbx,%r15
891	adcq	$0,%rdx
892	movq	%rdx,%rbx
893
894	movq	16(%rsi),%rax
895	mulq	%rbp
896	addq	%rax,%r8
897	adcq	$0,%rdx
898	addq	%rbx,%r8
899	adcq	$0,%rdx
900	movq	%rdx,%rbx
901
902	movq	24(%rsi),%rax
903	mulq	%rbp
904	addq	%rax,%r9
905	adcq	$0,%rdx
906	addq	%rbx,%r9
907	adcq	$0,%rdx
908	movq	%rdx,%rbx
909
910	movq	32(%rsi),%rax
911	mulq	%rbp
912	addq	%rax,%r10
913	adcq	$0,%rdx
914	addq	%rbx,%r10
915	adcq	$0,%rdx
916	movq	%rdx,%rbx
917
918	movq	40(%rsi),%rax
919	mulq	%rbp
920	addq	%rax,%r11
921	adcq	$0,%rdx
922	addq	%rbx,%r11
923	adcq	$0,%rdx
924	movq	%rdx,%rbx
925
926	movq	48(%rsi),%rax
927	mulq	%rbp
928	addq	%rax,%r12
929	adcq	$0,%rdx
930	addq	%rbx,%r12
931	adcq	$0,%rdx
932	movq	%rdx,%rbx
933
934	movq	56(%rsi),%rax
935	mulq	%rbp
936	addq	%rax,%r13
937	adcq	$0,%rdx
938	addq	%rbx,%r13
939	adcq	$0,%rdx
940	movq	%rdx,%r14
# Pass 6: multiplier word 6.
941	movq	48(%rdi),%rbp
942	movq	0(%rsi),%rax
943	mulq	%rbp
944	addq	%rax,%r15
945	adcq	$0,%rdx
946	movq	%r15,568(%rsp)	# product word 6
947	movq	%rdx,%rbx
948
949	movq	8(%rsi),%rax
950	mulq	%rbp
951	addq	%rax,%r8
952	adcq	$0,%rdx
953	addq	%rbx,%r8
954	adcq	$0,%rdx
955	movq	%rdx,%rbx
956
957	movq	16(%rsi),%rax
958	mulq	%rbp
959	addq	%rax,%r9
960	adcq	$0,%rdx
961	addq	%rbx,%r9
962	adcq	$0,%rdx
963	movq	%rdx,%rbx
964
965	movq	24(%rsi),%rax
966	mulq	%rbp
967	addq	%rax,%r10
968	adcq	$0,%rdx
969	addq	%rbx,%r10
970	adcq	$0,%rdx
971	movq	%rdx,%rbx
972
973	movq	32(%rsi),%rax
974	mulq	%rbp
975	addq	%rax,%r11
976	adcq	$0,%rdx
977	addq	%rbx,%r11
978	adcq	$0,%rdx
979	movq	%rdx,%rbx
980
981	movq	40(%rsi),%rax
982	mulq	%rbp
983	addq	%rax,%r12
984	adcq	$0,%rdx
985	addq	%rbx,%r12
986	adcq	$0,%rdx
987	movq	%rdx,%rbx
988
989	movq	48(%rsi),%rax
990	mulq	%rbp
991	addq	%rax,%r13
992	adcq	$0,%rdx
993	addq	%rbx,%r13
994	adcq	$0,%rdx
995	movq	%rdx,%rbx
996
997	movq	56(%rsi),%rax
998	mulq	%rbp
999	addq	%rax,%r14
1000	adcq	$0,%rdx
1001	addq	%rbx,%r14
1002	adcq	$0,%rdx
1003	movq	%rdx,%r15
# Pass 7: multiplier word 7.
1004	movq	56(%rdi),%rbp
1005	movq	0(%rsi),%rax
1006	mulq	%rbp
1007	addq	%rax,%r8
1008	adcq	$0,%rdx
1009	movq	%r8,576(%rsp)	# product word 7
1010	movq	%rdx,%rbx
1011
1012	movq	8(%rsi),%rax
1013	mulq	%rbp
1014	addq	%rax,%r9
1015	adcq	$0,%rdx
1016	addq	%rbx,%r9
1017	adcq	$0,%rdx
1018	movq	%rdx,%rbx
1019
1020	movq	16(%rsi),%rax
1021	mulq	%rbp
1022	addq	%rax,%r10
1023	adcq	$0,%rdx
1024	addq	%rbx,%r10
1025	adcq	$0,%rdx
1026	movq	%rdx,%rbx
1027
1028	movq	24(%rsi),%rax
1029	mulq	%rbp
1030	addq	%rax,%r11
1031	adcq	$0,%rdx
1032	addq	%rbx,%r11
1033	adcq	$0,%rdx
1034	movq	%rdx,%rbx
1035
1036	movq	32(%rsi),%rax
1037	mulq	%rbp
1038	addq	%rax,%r12
1039	adcq	$0,%rdx
1040	addq	%rbx,%r12
1041	adcq	$0,%rdx
1042	movq	%rdx,%rbx
1043
1044	movq	40(%rsi),%rax
1045	mulq	%rbp
1046	addq	%rax,%r13
1047	adcq	$0,%rdx
1048	addq	%rbx,%r13
1049	adcq	$0,%rdx
1050	movq	%rdx,%rbx
1051
1052	movq	48(%rsi),%rax
1053	mulq	%rbp
1054	addq	%rax,%r14
1055	adcq	$0,%rdx
1056	addq	%rbx,%r14
1057	adcq	$0,%rdx
1058	movq	%rdx,%rbx
1059
1060	movq	56(%rsi),%rax
1061	mulq	%rbp
1062	addq	%rax,%r15
1063	adcq	$0,%rdx
1064	addq	%rbx,%r15
1065	adcq	$0,%rdx
1066	movq	%rdx,%r8
# Flush the upper eight product words (words 8..15).
1067	movq	%r9,584(%rsp)
1068	movq	%r10,592(%rsp)
1069	movq	%r11,600(%rsp)
1070	movq	%r12,608(%rsp)
1071	movq	%r13,616(%rsp)
1072	movq	%r14,624(%rsp)
1073	movq	%r15,632(%rsp)
1074	movq	%r8,640(%rsp)
1075
1076
1077
1078
1079
# Tail call: mont_reduce consumes 520..647(%rsp) and returns to our caller.
1080	jmp	mont_reduce
1081
1082
1083.size	mont_mul_a3b,.-mont_mul_a3b
# ----------------------------------------------------------------------
# sqr_reduce -- file-local helper (no .globl), called from mod_exp_512.
#
# Squares an 8-word value, writes the 16-word square to 520..647(%rsp)
# and tail-jumps to mont_reduce, which returns to this routine's caller.
#
#   In:   %r10..%r15,%r8,%r9  operand words 0..7 (lowest in %r10)
#         16(%rsp)            pointer to the same operand in memory; this
#                             is the slot mod_exp_512 writes as 8(%rsp),
#                             seen past the pushed return address
#   Out:  whatever mont_reduce produces
#   Clobbers: %rax, %rbx, %rcx, %rdx, %rsi, %rdi, %rbp, %r8..%r15, flags
#
# Method as coded: sum the cross products a[i]*a[j] for i<j, double that
# sum, then add the squares a[i]*a[i]. Operand words 6 and 7 are taken
# from %r8/%r9 in the cross-product rows; the other words are re-read
# through %rcx once the registers have been reused.
# ----------------------------------------------------------------------
1084.type	sqr_reduce,@function
1085.align	16
1086sqr_reduce:
1087	movq	16(%rsp),%rcx
1088
1089
1090
# Row 0: a[0] * a[1..7], operands still in registers.
1091	movq	%r10,%rbx
1092
1093	movq	%r11,%rax
1094	mulq	%rbx
1095	movq	%rax,528(%rsp)
1096	movq	%rdx,%r10
1097	movq	%r12,%rax
1098	mulq	%rbx
1099	addq	%rax,%r10
1100	adcq	$0,%rdx
1101	movq	%rdx,%r11
1102	movq	%r13,%rax
1103	mulq	%rbx
1104	addq	%rax,%r11
1105	adcq	$0,%rdx
1106	movq	%rdx,%r12
1107	movq	%r14,%rax
1108	mulq	%rbx
1109	addq	%rax,%r12
1110	adcq	$0,%rdx
1111	movq	%rdx,%r13
1112	movq	%r15,%rax
1113	mulq	%rbx
1114	addq	%rax,%r13
1115	adcq	$0,%rdx
1116	movq	%rdx,%r14
1117	movq	%r8,%rax
1118	mulq	%rbx
1119	addq	%rax,%r14
1120	adcq	$0,%rdx
1121	movq	%rdx,%r15
1122	movq	%r9,%rax
1123	mulq	%rbx
1124	addq	%rax,%r15
1125	adcq	$0,%rdx
1126	movq	%rdx,%rsi
1127
1128	movq	%r10,536(%rsp)
1129
1130
1131
1132
1133
# Row 1: a[1] * a[2..7]; %r10 is the inter-word carry from here on.
1134	movq	8(%rcx),%rbx
1135
1136	movq	16(%rcx),%rax
1137	mulq	%rbx
1138	addq	%rax,%r11
1139	adcq	$0,%rdx
1140	movq	%r11,544(%rsp)
1141
1142	movq	%rdx,%r10
1143	movq	24(%rcx),%rax
1144	mulq	%rbx
1145	addq	%rax,%r12
1146	adcq	$0,%rdx
1147	addq	%r10,%r12
1148	adcq	$0,%rdx
1149	movq	%r12,552(%rsp)
1150
1151	movq	%rdx,%r10
1152	movq	32(%rcx),%rax
1153	mulq	%rbx
1154	addq	%rax,%r13
1155	adcq	$0,%rdx
1156	addq	%r10,%r13
1157	adcq	$0,%rdx
1158
1159	movq	%rdx,%r10
1160	movq	40(%rcx),%rax
1161	mulq	%rbx
1162	addq	%rax,%r14
1163	adcq	$0,%rdx
1164	addq	%r10,%r14
1165	adcq	$0,%rdx
1166
1167	movq	%rdx,%r10
1168	movq	%r8,%rax
1169	mulq	%rbx
1170	addq	%rax,%r15
1171	adcq	$0,%rdx
1172	addq	%r10,%r15
1173	adcq	$0,%rdx
1174
1175	movq	%rdx,%r10
1176	movq	%r9,%rax
1177	mulq	%rbx
1178	addq	%rax,%rsi
1179	adcq	$0,%rdx
1180	addq	%r10,%rsi
1181	adcq	$0,%rdx
1182
1183	movq	%rdx,%r11
1184
1185
1186
1187
# Row 2: a[2] * a[3..7].
1188	movq	16(%rcx),%rbx
1189
1190	movq	24(%rcx),%rax
1191	mulq	%rbx
1192	addq	%rax,%r13
1193	adcq	$0,%rdx
1194	movq	%r13,560(%rsp)
1195
1196	movq	%rdx,%r10
1197	movq	32(%rcx),%rax
1198	mulq	%rbx
1199	addq	%rax,%r14
1200	adcq	$0,%rdx
1201	addq	%r10,%r14
1202	adcq	$0,%rdx
1203	movq	%r14,568(%rsp)
1204
1205	movq	%rdx,%r10
1206	movq	40(%rcx),%rax
1207	mulq	%rbx
1208	addq	%rax,%r15
1209	adcq	$0,%rdx
1210	addq	%r10,%r15
1211	adcq	$0,%rdx
1212
1213	movq	%rdx,%r10
1214	movq	%r8,%rax
1215	mulq	%rbx
1216	addq	%rax,%rsi
1217	adcq	$0,%rdx
1218	addq	%r10,%rsi
1219	adcq	$0,%rdx
1220
1221	movq	%rdx,%r10
1222	movq	%r9,%rax
1223	mulq	%rbx
1224	addq	%rax,%r11
1225	adcq	$0,%rdx
1226	addq	%r10,%r11
1227	adcq	$0,%rdx
1228
1229	movq	%rdx,%r12
1230
1231
1232
1233
1234
# Row 3: a[3] * a[4..7].
1235	movq	24(%rcx),%rbx
1236
1237	movq	32(%rcx),%rax
1238	mulq	%rbx
1239	addq	%rax,%r15
1240	adcq	$0,%rdx
1241	movq	%r15,576(%rsp)
1242
1243	movq	%rdx,%r10
1244	movq	40(%rcx),%rax
1245	mulq	%rbx
1246	addq	%rax,%rsi
1247	adcq	$0,%rdx
1248	addq	%r10,%rsi
1249	adcq	$0,%rdx
1250	movq	%rsi,584(%rsp)
1251
1252	movq	%rdx,%r10
1253	movq	%r8,%rax
1254	mulq	%rbx
1255	addq	%rax,%r11
1256	adcq	$0,%rdx
1257	addq	%r10,%r11
1258	adcq	$0,%rdx
1259
1260	movq	%rdx,%r10
1261	movq	%r9,%rax
1262	mulq	%rbx
1263	addq	%rax,%r12
1264	adcq	$0,%rdx
1265	addq	%r10,%r12
1266	adcq	$0,%rdx
1267
1268	movq	%rdx,%r15
1269
1270
1271
1272
# Row 4: a[4] * a[5..7].
1273	movq	32(%rcx),%rbx
1274
1275	movq	40(%rcx),%rax
1276	mulq	%rbx
1277	addq	%rax,%r11
1278	adcq	$0,%rdx
1279	movq	%r11,592(%rsp)
1280
1281	movq	%rdx,%r10
1282	movq	%r8,%rax
1283	mulq	%rbx
1284	addq	%rax,%r12
1285	adcq	$0,%rdx
1286	addq	%r10,%r12
1287	adcq	$0,%rdx
1288	movq	%r12,600(%rsp)
1289
1290	movq	%rdx,%r10
1291	movq	%r9,%rax
1292	mulq	%rbx
1293	addq	%rax,%r15
1294	adcq	$0,%rdx
1295	addq	%r10,%r15
1296	adcq	$0,%rdx
1297
1298	movq	%rdx,%r11
1299
1300
1301
1302
# Row 5: a[5] * a[6..7].
1303	movq	40(%rcx),%rbx
1304
1305	movq	%r8,%rax
1306	mulq	%rbx
1307	addq	%rax,%r15
1308	adcq	$0,%rdx
1309	movq	%r15,608(%rsp)
1310
1311	movq	%rdx,%r10
1312	movq	%r9,%rax
1313	mulq	%rbx
1314	addq	%rax,%r11
1315	adcq	$0,%rdx
1316	addq	%r10,%r11
1317	adcq	$0,%rdx
1318	movq	%r11,616(%rsp)
1319
1320	movq	%rdx,%r12
1321
1322
1323
1324
# Row 6: a[6] * a[7].
1325	movq	%r8,%rbx
1326
1327	movq	%r9,%rax
1328	mulq	%rbx
1329	addq	%rax,%r12
1330	adcq	$0,%rdx
1331	movq	%r12,624(%rsp)
1332
1333	movq	%rdx,632(%rsp)
1334
1335
# Low half: reload cross-product words 1..6 for doubling.
1336	movq	528(%rsp),%r10
1337	movq	536(%rsp),%r11
1338	movq	544(%rsp),%r12
1339	movq	552(%rsp),%r13
1340	movq	560(%rsp),%r14
1341	movq	568(%rsp),%r15
1342
# a[3]^2 is computed first so the carry out of the doubling can be folded
# into its high word (%r8); its low word waits in %rdi.
1343	movq	24(%rcx),%rax
1344	mulq	%rax
1345	movq	%rax,%rdi
1346	movq	%rdx,%r8
1347
1348	addq	%r10,%r10
1349	adcq	%r11,%r11
1350	adcq	%r12,%r12
1351	adcq	%r13,%r13
1352	adcq	%r14,%r14
1353	adcq	%r15,%r15
1354	adcq	$0,%r8
1355
# Add the squares a[0]^2, a[1]^2, a[2]^2 and the low word of a[3]^2.
1356	movq	0(%rcx),%rax
1357	mulq	%rax
1358	movq	%rax,520(%rsp)
1359	movq	%rdx,%rbx
1360
1361	movq	8(%rcx),%rax
1362	mulq	%rax
1363
1364	addq	%rbx,%r10
1365	adcq	%rax,%r11
1366	adcq	$0,%rdx
1367
1368	movq	%rdx,%rbx
1369	movq	%r10,528(%rsp)
1370	movq	%r11,536(%rsp)
1371
1372	movq	16(%rcx),%rax
1373	mulq	%rax
1374
1375	addq	%rbx,%r12
1376	adcq	%rax,%r13
1377	adcq	$0,%rdx
1378
1379	movq	%rdx,%rbx
1380
1381	movq	%r12,544(%rsp)
1382	movq	%r13,552(%rsp)
1383
1384	xorq	%rbp,%rbp
1385	addq	%rbx,%r14
1386	adcq	%rdi,%r15
1387	adcq	$0,%rbp	# %rbp = carry into the high half
1388
1389	movq	%r14,560(%rsp)
1390	movq	%r15,568(%rsp)
1391
1392
1393
1394
# High half: reload cross-product words 7..14 for doubling.
1395	movq	576(%rsp),%r10
1396	movq	584(%rsp),%r11
1397	movq	592(%rsp),%r12
1398	movq	600(%rsp),%r13
1399	movq	608(%rsp),%r14
1400	movq	616(%rsp),%r15
1401	movq	624(%rsp),%rdi
1402	movq	632(%rsp),%rsi
1403
# a[7]^2 from %r9; its high word (%rbx) absorbs the carry out of the doubling.
1404	movq	%r9,%rax
1405	mulq	%rax
1406	movq	%rax,%r9
1407	movq	%rdx,%rbx
1408
1409	addq	%r10,%r10
1410	adcq	%r11,%r11
1411	adcq	%r12,%r12
1412	adcq	%r13,%r13
1413	adcq	%r14,%r14
1414	adcq	%r15,%r15
1415	adcq	%rdi,%rdi
1416	adcq	%rsi,%rsi
1417	adcq	$0,%rbx
1418
# %r10 was just doubled (bit 0 clear) and %rbp is 0 or 1, so this cannot carry.
1419	addq	%rbp,%r10
1420
# Add the high word of a[3]^2 (%r8), then a[4]^2, a[5]^2, a[6]^2, a[7]^2.
1421	movq	32(%rcx),%rax
1422	mulq	%rax
1423
1424	addq	%r8,%r10
1425	adcq	%rax,%r11
1426	adcq	$0,%rdx
1427
1428	movq	%rdx,%rbp
1429
1430	movq	%r10,576(%rsp)
1431	movq	%r11,584(%rsp)
1432
1433	movq	40(%rcx),%rax
1434	mulq	%rax
1435
1436	addq	%rbp,%r12
1437	adcq	%rax,%r13
1438	adcq	$0,%rdx
1439
1440	movq	%rdx,%rbp
1441
1442	movq	%r12,592(%rsp)
1443	movq	%r13,600(%rsp)
1444
1445	movq	48(%rcx),%rax
1446	mulq	%rax
1447
1448	addq	%rbp,%r14
1449	adcq	%rax,%r15
1450	adcq	$0,%rdx
1451
1452	movq	%r14,608(%rsp)
1453	movq	%r15,616(%rsp)
1454
1455	addq	%rdx,%rdi
1456	adcq	%r9,%rsi
1457	adcq	$0,%rbx
1458
1459	movq	%rdi,624(%rsp)
1460	movq	%rsi,632(%rsp)
1461	movq	%rbx,640(%rsp)
1462
# Tail call: mont_reduce consumes 520..647(%rsp) and returns to our caller.
1463	jmp	mont_reduce
1464
1465
1466.size	sqr_reduce,.-sqr_reduce
# ----------------------------------------------------------------------
# mod_exp_512 -- the only exported symbol in this file.
#
# Register use on entry (matches the System V AMD64 argument order):
#   %rdi  pointer to the 8-word result buffer (also used as the working
#         accumulator during the main loop)
#   %rsi  pointer to an 8-word input (read with movdqu, 64 bytes)
#   %rdx  pointer to a 64-byte exponent (read with movdqu)
#   %rcx  pointer to a constant table consumed by mont_reduce; the 8 words
#         at offset 512 are also used for the final conditional subtract
# NOTE(review): the table layout and the exact mathematical contract are
# defined by the C caller, which is not part of this file -- confirm there.
#
# Frame (offsets from the 64-byte-aligned %rsp set up below):
#     0   caller's %rsp (after the six pushes)
#     8   saved %rdi        16  saved %rsi        24  saved %rcx
#    32   init_loop counter 40  table write cursor
#    48   current exponent bit index
#    64..127  working copy of the exponent; 128 is zeroed so that 32-bit
#             reads at 126 see zeros above the exponent
#   136   destination pointer handed to mont_reduce
#   384   8-word value produced by the first mont_reduce call
#   448   8-word scratch operand
#   512..639  16-word input area for mont_reduce
#   640..     32-entry table of powers, stored interleaved: entry k is the
#             2-byte column at byte offset 2*k of every 64-byte row
# Callee-saved registers are pushed on entry and restored before return.
# ----------------------------------------------------------------------
1467.globl	mod_exp_512
1468.type	mod_exp_512,@function
1469mod_exp_512:
1470	pushq	%rbp
1471	pushq	%rbx
1472	pushq	%r12
1473	pushq	%r13
1474	pushq	%r14
1475	pushq	%r15
1476
1477
1478	movq	%rsp,%r8
1479	subq	$2688,%rsp
1480	andq	$-64,%rsp	# 64-byte align the frame
1481
1482
1483	movq	%r8,0(%rsp)
1484	movq	%rdi,8(%rsp)
1485	movq	%rsi,16(%rsp)
1486	movq	%rcx,24(%rsp)
1487.Lbody:
1488
1489
1490
# Build a 16-word value with the input in words 4..11 and zeros elsewhere,
# then reduce it into 384(%rsp).
1491	pxor	%xmm4,%xmm4
1492	movdqu	0(%rsi),%xmm0
1493	movdqu	16(%rsi),%xmm1
1494	movdqu	32(%rsi),%xmm2
1495	movdqu	48(%rsi),%xmm3
1496	movdqa	%xmm4,512(%rsp)
1497	movdqa	%xmm4,528(%rsp)
1498	movdqa	%xmm4,608(%rsp)
1499	movdqa	%xmm4,624(%rsp)
1500	movdqa	%xmm0,544(%rsp)
1501	movdqa	%xmm1,560(%rsp)
1502	movdqa	%xmm2,576(%rsp)
1503	movdqa	%xmm3,592(%rsp)
1504
1505
# Load the exponent now; %xmm0..%xmm3 are not touched by the helper
# routines in this file and are spilled to 64(%rsp) after init_loop.
1506	movdqu	0(%rdx),%xmm0
1507	movdqu	16(%rdx),%xmm1
1508	movdqu	32(%rdx),%xmm2
1509	movdqu	48(%rdx),%xmm3
1510
1511	leaq	384(%rsp),%rbx
1512	movq	%rbx,136(%rsp)	# destination for mont_reduce
1513	call	mont_reduce
1514
1515
# 448(%rsp) = 8-word value with only word 2 set to 1; this is table entry 0.
1516	leaq	448(%rsp),%rcx
1517	xorq	%rax,%rax
1518	movq	%rax,0(%rcx)
1519	movq	%rax,8(%rcx)
1520	movq	%rax,24(%rcx)
1521	movq	%rax,32(%rcx)
1522	movq	%rax,40(%rcx)
1523	movq	%rax,48(%rcx)
1524	movq	%rax,56(%rcx)
1525	movq	%rax,128(%rsp)	# zero padding just above the exponent copy
1526	movq	$1,16(%rcx)
1527
1528	leaq	640(%rsp),%rbp
1529	movq	%rcx,%rsi
1530	movq	%rbp,%rdi
1531	movq	$8,%rax
# Scatter entry 0: each 64-bit word is split into four 16-bit pieces stored
# 64 bytes apart; successive words are 256 bytes apart.
1532loop_0:
1533	movq	(%rcx),%rbx
1534	movw	%bx,(%rdi)
1535	shrq	$16,%rbx
1536	movw	%bx,64(%rdi)
1537	shrq	$16,%rbx
1538	movw	%bx,128(%rdi)
1539	shrq	$16,%rbx
1540	movw	%bx,192(%rdi)
1541	leaq	8(%rcx),%rcx
1542	leaq	256(%rdi),%rdi
1543	decq	%rax
1544	jnz	loop_0
1545	movq	$31,%rax
1546	movq	%rax,32(%rsp)	# 31 more table entries to generate
1547	movq	%rbp,40(%rsp)	# table write cursor
1548
1549	movq	%rsi,136(%rsp)	# mont_reduce now writes to 448(%rsp)
1550	movq	0(%rsi),%r10
1551	movq	8(%rsi),%r11
1552	movq	16(%rsi),%r12
1553	movq	24(%rsi),%r13
1554	movq	32(%rsi),%r14
1555	movq	40(%rsi),%r15
1556	movq	48(%rsi),%r8
1557	movq	56(%rsi),%r9
# Each iteration multiplies the running value (448(%rsp), mirrored in
# registers) by the value at 384(%rsp) and scatters the result into the
# next 2-byte column of the table.
1558init_loop:
1559	leaq	384(%rsp),%rdi
1560	call	mont_mul_a3b
1561	leaq	448(%rsp),%rsi
1562	movq	40(%rsp),%rbp
1563	addq	$2,%rbp	# next column
1564	movq	%rbp,40(%rsp)
1565	movq	%rsi,%rcx
1566	movq	$8,%rax
1567loop_1:
1568	movq	(%rcx),%rbx
1569	movw	%bx,(%rbp)
1570	shrq	$16,%rbx
1571	movw	%bx,64(%rbp)
1572	shrq	$16,%rbx
1573	movw	%bx,128(%rbp)
1574	shrq	$16,%rbx
1575	movw	%bx,192(%rbp)
1576	leaq	8(%rcx),%rcx
1577	leaq	256(%rbp),%rbp
1578	decq	%rax
1579	jnz	loop_1
1580	movq	32(%rsp),%rax
1581	subq	$1,%rax
1582	movq	%rax,32(%rsp)
1583	jne	init_loop
1584
1585
1586
# Spill the exponent to its working copy at 64..127(%rsp).
1587	movdqa	%xmm0,64(%rsp)
1588	movdqa	%xmm1,80(%rsp)
1589	movdqa	%xmm2,96(%rsp)
1590	movdqa	%xmm3,112(%rsp)
1591
1592
1593
1594
1595
# Top window: bytes 126..129 hold exponent bits 496..511 followed by the
# zero padding. Bits 507..511 select the starting table entry and are then
# cleared in the working copy (only bits 496..506 are written back).
1596	movl	126(%rsp),%eax
1597	movq	%rax,%rdx
1598	shrq	$11,%rax
1599	andl	$2047,%edx
1600	movl	%edx,126(%rsp)
1601	leaq	640(%rsp,%rax,2),%rsi	# column of the selected entry
1602	movq	8(%rsp),%rdx	# gather straight into the result buffer
1603	movq	$4,%rbp
# Gather: rebuild two 64-bit words per iteration from their 16-bit pieces.
1604loop_2:
1605	movzwq	192(%rsi),%rbx
1606	movzwq	448(%rsi),%rax
1607	shlq	$16,%rbx
1608	shlq	$16,%rax
1609	movw	128(%rsi),%bx
1610	movw	384(%rsi),%ax
1611	shlq	$16,%rbx
1612	shlq	$16,%rax
1613	movw	64(%rsi),%bx
1614	movw	320(%rsi),%ax
1615	shlq	$16,%rbx
1616	shlq	$16,%rax
1617	movw	0(%rsi),%bx
1618	movw	256(%rsi),%ax
1619	movq	%rbx,0(%rdx)
1620	movq	%rax,8(%rdx)
1621	leaq	512(%rsi),%rsi
1622	leaq	16(%rdx),%rdx
1623	subq	$1,%rbp
1624	jnz	loop_2
1625	movq	$505,48(%rsp)	# bit index of the next 5-bit window
1626
1627	movq	8(%rsp),%rcx
1628	movq	%rcx,136(%rsp)	# mont_reduce now writes to the result buffer
1629	movq	0(%rcx),%r10
1630	movq	8(%rcx),%r11
1631	movq	16(%rcx),%r12
1632	movq	24(%rcx),%r13
1633	movq	32(%rcx),%r14
1634	movq	40(%rcx),%r15
1635	movq	48(%rcx),%r8
1636	movq	56(%rcx),%r9
# Enter mid-loop: only two squarings precede the first multiply.
1637	jmp	sqr_2
1638
# Main loop: five squarings, then one multiply by the table entry selected
# by the next five exponent bits.
1639main_loop_a3b:
1640	call	sqr_reduce
1641	call	sqr_reduce
1642	call	sqr_reduce
1643sqr_2:
1644	call	sqr_reduce
1645	call	sqr_reduce
1646
1647
1648
# Extract the 5-bit window starting at bit index 48(%rsp): load 32 bits at
# the enclosing 16-bit word, shift right by (index mod 16), mask to 5 bits.
1649	movq	48(%rsp),%rcx
1650	movq	%rcx,%rax
1651	shrq	$4,%rax
1652	movl	64(%rsp,%rax,2),%edx
1653	andq	$15,%rcx
1654	shrq	%cl,%rdx
1655	andq	$31,%rdx
1656
1657	leaq	640(%rsp,%rdx,2),%rsi	# column of the selected entry
1658	leaq	448(%rsp),%rdx
1659	movq	%rdx,%rdi	# multiplier operand for mont_mul_a3b
1660	movq	$4,%rbp
# Gather the selected entry into 448(%rsp) (same layout as loop_2).
1661loop_3:
1662	movzwq	192(%rsi),%rbx
1663	movzwq	448(%rsi),%rax
1664	shlq	$16,%rbx
1665	shlq	$16,%rax
1666	movw	128(%rsi),%bx
1667	movw	384(%rsi),%ax
1668	shlq	$16,%rbx
1669	shlq	$16,%rax
1670	movw	64(%rsi),%bx
1671	movw	320(%rsi),%ax
1672	shlq	$16,%rbx
1673	shlq	$16,%rax
1674	movw	0(%rsi),%bx
1675	movw	256(%rsi),%ax
1676	movq	%rbx,0(%rdx)
1677	movq	%rax,8(%rdx)
1678	leaq	512(%rsi),%rsi
1679	leaq	16(%rdx),%rdx
1680	subq	$1,%rbp
1681	jnz	loop_3
1682	movq	8(%rsp),%rsi	# multiplicand = accumulator (still mirrored in registers)
1683	call	mont_mul_a3b
1684
1685
1686
1687	movq	48(%rsp),%rcx
1688	subq	$5,%rcx
1689	movq	%rcx,48(%rsp)
1690	jge	main_loop_a3b	# continue while the bit index is >= 0
1691
1692
1693
1694end_main_loop_a3b:
1695
1696
# Final reduction: result in words 0..7, zeros in words 8..15.
1697	movq	8(%rsp),%rdx
1698	pxor	%xmm4,%xmm4
1699	movdqu	0(%rdx),%xmm0
1700	movdqu	16(%rdx),%xmm1
1701	movdqu	32(%rdx),%xmm2
1702	movdqu	48(%rdx),%xmm3
1703	movdqa	%xmm4,576(%rsp)
1704	movdqa	%xmm4,592(%rsp)
1705	movdqa	%xmm4,608(%rsp)
1706	movdqa	%xmm4,624(%rsp)
1707	movdqa	%xmm0,512(%rsp)
1708	movdqa	%xmm1,528(%rsp)
1709	movdqa	%xmm2,544(%rsp)
1710	movdqa	%xmm3,560(%rsp)
1711	call	mont_reduce
1712
1713
1714
# Conditional subtract: compute result minus the 8 words at table+512 and
# keep the difference only if the subtraction did not borrow.
1715	movq	8(%rsp),%rax
1716	movq	0(%rax),%r8
1717	movq	8(%rax),%r9
1718	movq	16(%rax),%r10
1719	movq	24(%rax),%r11
1720	movq	32(%rax),%r12
1721	movq	40(%rax),%r13
1722	movq	48(%rax),%r14
1723	movq	56(%rax),%r15
1724
1725
1726	movq	24(%rsp),%rbx
1727	addq	$512,%rbx
1728
1729	subq	0(%rbx),%r8
1730	sbbq	8(%rbx),%r9
1731	sbbq	16(%rbx),%r10
1732	sbbq	24(%rbx),%r11
1733	sbbq	32(%rbx),%r12
1734	sbbq	40(%rbx),%r13
1735	sbbq	48(%rbx),%r14
1736	sbbq	56(%rbx),%r15
1737
1738
# Only mov/cmov follow, none of which modify flags, so every cmovnc below
# still tests the borrow left by the final sbbq.
1739	movq	0(%rax),%rsi
1740	movq	8(%rax),%rdi
1741	movq	16(%rax),%rcx
1742	movq	24(%rax),%rdx
1743	cmovncq	%r8,%rsi
1744	cmovncq	%r9,%rdi
1745	cmovncq	%r10,%rcx
1746	cmovncq	%r11,%rdx
1747	movq	%rsi,0(%rax)
1748	movq	%rdi,8(%rax)
1749	movq	%rcx,16(%rax)
1750	movq	%rdx,24(%rax)
1751
1752	movq	32(%rax),%rsi
1753	movq	40(%rax),%rdi
1754	movq	48(%rax),%rcx
1755	movq	56(%rax),%rdx
1756	cmovncq	%r12,%rsi
1757	cmovncq	%r13,%rdi
1758	cmovncq	%r14,%rcx
1759	cmovncq	%r15,%rdx
1760	movq	%rsi,32(%rax)
1761	movq	%rdi,40(%rax)
1762	movq	%rcx,48(%rax)
1763	movq	%rdx,56(%rax)
1764
# Epilogue: reload the callee-saved registers from the original stack
# (reverse push order) and drop the six pushed slots.
1765	movq	0(%rsp),%rsi
1766	movq	0(%rsi),%r15
1767	movq	8(%rsi),%r14
1768	movq	16(%rsi),%r13
1769	movq	24(%rsi),%r12
1770	movq	32(%rsi),%rbx
1771	movq	40(%rsi),%rbp
1772	leaq	48(%rsi),%rsp
1773.Lepilogue:
1774	.byte	0xf3,0xc3	# bytes F3 C3 = "rep ret"
1775.size	mod_exp_512, . - mod_exp_512
1776