	# $FreeBSD$
.text

# bn_mul_mont_gather5
#
# Montgomery multiplication, one 64-bit word at a time, in which the
# multiplier is not read from a plain vector: each multiplier word is
# gathered from a table of interleaved vectors (the layout written by
# bn_scatter5 in this file).
#
# Syntax: GNU as, AT&T operand order.  ABI: System V AMD64.
#
# Inputs, as used by the code below:
#   %rdi      result vector, num words (written by .Lsub and .Lcopy)
#   %rsi      multiplicand vector, num words
#   %rdx      base of the interleaved multiplier table
#   %rcx      modulus vector, num words
#   %r8       pointer to a single word that multiplies the low result
#             word to form the per-pass reduction factor
#   %r9d      num, the vector length in 64-bit words
#   8(%rsp)   32-bit table index (seventh argument, on the stack)
# NOTE(review): upstream presumably calls these rp, ap, table, np, n0,
# num and power; the C prototype is not visible here -- confirm.
#
# Output:   %rax = 1
# Saved and restored: %rbx, %rbp, %r12-%r15.
# Other general registers used here, %xmm0-%xmm7 and flags are clobbered.
#
# Stack: a scratch vector tp of num+2 words is carved out below the
# saved registers and aligned down to 1024 bytes.  tp[0..num-1] holds
# the running result, tp[num] the top carry word, tp[num+1] the value
# of %rsp taken right after the pushes.
#
# Dispatch: a length that is not a multiple of 4, or is below 8, takes
# the generic path in this function; anything else jumps to the
# four-way unrolled variant at .Lmul4x_enter.
.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
	testl	$3,%r9d			# num % 4 != 0 -> generic path
	jnz	.Lmul_enter
	cmpl	$8,%r9d			# num < 8 (unsigned) -> generic path
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movl	8(%rsp),%r10d		# table index, read before the pushes move %rsp
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rsp,%rax		# remember %rsp for the epilogue
	leaq	2(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp	# reserve num+2 words for tp
	andq	$-1024,%rsp		# align tp down to a 1024-byte boundary

	movq	%rax,8(%rsp,%r9,8)	# tp[num+1] = saved %rsp
.Lmul_body:
	# Split the index: the low 3 bits pick a word inside a 64-byte
	# line, the next 2 bits pick one of four lines via the masks in
	# %xmm4-%xmm7 (exactly one of them ends up all-ones).
	movq	%rdx,%r12
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12	# bias by 96 so the four lines sit at -96,-32,32,96
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	# Gather multiplier word 0: load all four candidates, mask, OR.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12		# next multiplier word is 256 bytes further on
	por	%xmm3,%xmm0

.byte	102,72,15,126,195		# hand-encoded movq %xmm0,%rbx

	movq	(%r8),%r8		# load the reduction constant itself
	movq	(%rsi),%rax		# first multiplicand word

	xorq	%r14,%r14		# outer index i = 0
	xorq	%r15,%r15		# inner index j = 0

	# First pass (i = 0).  The gather of the next multiplier word is
	# interleaved with the integer multiplies.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx			# multiplicand[0] * multiplier word
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# reduction factor = low word * constant
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp			# modulus[0] * reduction factor
	addq	%rax,%r10		# low word is dropped, only the carry survives
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-2]: %r15 has already been advanced
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

.byte	102,72,15,126,195		# movq %xmm0,%rbx: multiplier word for the next pass

	addq	%rax,%r13
	movq	(%rsi),%rax		# reload multiplicand[0] for the next pass
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num] = top carry

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	# Passes i = 1 .. num-1: same as the first pass, but the previous
	# contents of tp are added in as well.
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# next old tp word
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

.byte	102,72,15,126,195		# movq %xmm0,%rbx

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# old tp[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jl	.Louter

	# Final step: write tp - modulus to the result, then pick either
	# that difference or tp itself without branching on the borrow.
	xorq	%r14,%r14		# index = 0; also clears CF for the first sbbq
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi		# %rsi now points at tp
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14		# lea and dec leave CF intact for the chain
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# tp[num] minus borrow: all-ones when tp < modulus, else 0
	xorq	%r14,%r14
	andq	%rax,%rsi		# keep the tp address when the mask is all-ones
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx		# keep the result address when the mask was 0
	movq	%r9,%r15
	orq	%rcx,%rsi		# %rsi = chosen source
.align	16
.Lcopy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	# overwrite tp[j] with the loop index
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# tp[num+1]: %rsp as it was after the pushes
	movq	$1,%rax			# return value
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
# bn_mul4x_mont_gather5
#
# Four-way unrolled variant of bn_mul_mont_gather5.  The symbol is not
# declared global; within this file it is reached only through
# .Lmul4x_enter, to which the dispatcher in bn_mul_mont_gather5 jumps
# when the length is a multiple of 4 and at least 8.
#
# Inputs are the same registers as bn_mul_mont_gather5:
#   %rdi result, %rsi multiplicand, %rdx table base, %rcx modulus,
#   %r8 pointer to the reduction constant, %r9d length in words,
#   8(%rsp) table index.
# Output:   %rax = 1
# Saved and restored: %rbx, %rbp, %r12-%r15.
#
# Stack: a scratch vector tp of num+4 words, aligned down to 1024
# bytes.  tp[0..num] is the running result and top carry, tp[num+1]
# the saved %rsp, tp[num+2] the saved result pointer (%rdi is reused
# as an accumulator inside the loops).
.type	bn_mul4x_mont_gather5,@function
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movl	8(%rsp),%r10d		# table index, read before the pushes move %rsp
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rsp,%rax		# remember %rsp for the epilogue
	leaq	4(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp	# reserve num+4 words for tp
	andq	$-1024,%rsp		# align tp down to a 1024-byte boundary

	movq	%rax,8(%rsp,%r9,8)	# tp[num+1] = saved %rsp
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# tp[num+2] = result pointer
	# Split the index and load the four select masks, as in the
	# generic path: exactly one of %xmm4-%xmm7 ends up all-ones.
	movq	%rdx,%r12
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	# Gather multiplier word 0.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12		# next multiplier word is 256 bytes further on
	por	%xmm3,%xmm0

.byte	102,72,15,126,195		# hand-encoded movq %xmm0,%rbx
	movq	(%r8),%r8		# load the reduction constant itself
	movq	(%rsi),%rax

	xorq	%r14,%r14		# outer index i = 0
	xorq	%r15,%r15		# inner index j = 0

	# First pass (i = 0); the gather of the next multiplier word is
	# interleaved with the multiplies.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# reduction factor = low word * constant
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10		# low word is dropped, only the carry survives
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[0]
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	# Four multiplicand/modulus word pairs per iteration; %r15 runs
	# ahead of the words being stored, hence the negative offsets.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jl	.L1st4x

	# Tail of the first pass: the last two word pairs.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# reload multiplicand[0] for the next pass
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx: multiplier word for the next pass

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)	# tp[num-1]
	movq	%rdi,(%rsp,%r15,8)	# tp[num] = top carry

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	# Passes i = 1 .. num-1: as the first pass, with the previous
	# contents of tp added in.
	xorq	%r15,%r15
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	(%rsp),%r10		# tp[0]
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		# + tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	# Stores lag one step behind the arithmetic here: the word held in
	# %rdi or %r13 is written only after the next product is under way.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-40(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jl	.Linner4x

	# Tail of the pass: the last two word pairs.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i++ (lea leaves the flags alone)
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# reload multiplicand[0] for the next pass
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx
	movq	%rdi,-16(%rsp,%r15,8)

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# + old tp[num]
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jl	.Louter4x
	# Final step: write tp - modulus to the result four words at a
	# time, then pick that difference or tp without branching.
	movq	16(%rsp,%r9,8),%rdi	# recover the result pointer from tp[num+2]
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0		# zero, used below to wipe tp
	movq	8(%rsp),%rdx
	shrq	$2,%r9			# num / 4
	leaq	(%rsp),%rsi		# %rsi now points at tp
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15		# num/4 - 1 loop iterations; the tail does the rest
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14		# lea and dec leave CF intact for the chain
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	# tp[num]
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# tp[num] minus borrow: all-ones when tp < modulus, else 0
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi		# keep the tp address when the mask is all-ones
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx		# keep the result address when the mask was 0
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi		# %rsi = chosen source

	# Copy the chosen vector to the result 16 bytes at a time while
	# zeroing tp.  %r14 is a byte offset in this loop.
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	shlq	$2,%r9			# back to num in words
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi	# tp[num+1]: %rsp as it was after the pushes
	movq	$1,%rax			# return value
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
# bn_scatter5
#
# Store a vector of 64-bit words into an interleaved table: word j of
# the source goes to table + index*8 + j*256.
#
# ABI: System V AMD64.
# In:   %rdi = source vector
#       %rsi = number of words to store (0 is allowed and stores nothing)
#       %rdx = table base
#       %rcx = index of the slot inside each 256-byte row
# Out:  nothing defined; %rax holds the last word copied
# Clobbers: %rax, %rdx, %rdi, %rsi, flags
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	testq	%rsi,%rsi		# nothing to do for a zero count
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	# first destination slot
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# same slot in the next row
	subq	$1,%rsi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_scatter5,.-bn_scatter5

# bn_gather5
#
# Read one vector back out of an interleaved table of the layout
# written by bn_scatter5.  For every output word all four 64-byte
# lines of the 256-byte row are loaded and combined with AND/OR masks,
# so the load addresses depend only on the low 3 bits of the index.
#
# ABI: System V AMD64.
# In:   %rdi = output vector
#       %rsi = number of words to produce (0 returns at once)
#       %rdx = table base
#       %rcx = index of the vector to fetch
# Out:  nothing defined in %rax
# Clobbers: %rax, %rcx, %rdx, %rdi, %rsi, %r11, %xmm0-%xmm7, flags
.globl	bn_gather5
.type	bn_gather5,@function
.align	16
bn_gather5:
	testq	%rsi,%rsi		# a zero count would wrap the down-counter below
	jz	.Lgather_done
	movq	%rcx,%r11
	shrq	$3,%rcx			# bits 3-4 of the index pick the line
	andq	$7,%r11			# bits 0-2 pick the word inside a line
	notq	%rcx
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%rcx
	leaq	96(%rdx,%r11,8),%rdx	# bias by 96 so the four lines sit at -96,-32,32,96
	movq	0(%rax,%rcx,8),%xmm4	# exactly one of xmm4-xmm7 becomes all-ones
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	-96(%rdx),%xmm0
	movq	-32(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx		# next row
	por	%xmm3,%xmm0

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subq	$1,%rsi
	jnz	.Lgather
.Lgather_done:
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
# Select masks for the gather code in this file.
#
# Eight quadwords, of which only quadword 3 is all-ones.  The gather
# code loads four consecutive quadwords starting at position
# (~(index >> 3)) & 3, so exactly one of the four loaded masks is
# all-ones and the other three are zero.
.align	64
.Lmagic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0,  0,0
# NUL-terminated ASCII identification string:
# "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
