1	# $FreeBSD$
2.text
3
# _mul_1x1 -- 64x64 -> 128-bit carry-less multiply (partial products are
# combined with XOR, so no carries propagate). File-local helper used by
# bn_GF2m_mul_2x2.
#
# Private register convention (this is not a C-callable routine):
#   In:    %rax = multiplicand a
#          %rbp = multiplier b
#          %r8  = nibble mask; the caller in this file passes 15
#   Out:   %rax = low 64 bits of the product
#          %rdx = high 64 bits of the product
#   Clobbers: %rbx %rcx %rsi %rdi %rbp %r9-%r14 %xmm0 %xmm1 and the flags.
#          None of them is saved here, so the caller must preserve any it
#          needs. %r8 is only read.
#   Stack: 136 bytes are reserved; bytes 0..127(%rsp) hold a table of
#          sixteen 8-byte entries.
#
# Method, as implemented below:
#   * Let a1 = a with its top three bits cleared. tab[i] (i = 0..15) is the
#     XOR of a1<<k over the set bits k of i. Clearing the top bits keeps
#     every entry within 64 bits.
#   * The three dropped bits (63, 62, 61 of a) are handled separately: each
#     is turned into an all-ones/all-zeros mask, ANDed with b, and the
#     masked b is shifted into position across %rdx:%rax.
#   * b is consumed four bits at a time. Even-numbered nibbles are looked up
#     and accumulated in %xmm0 using whole-byte shifts (pslldq); odd-numbered
#     nibbles are accumulated in %rdx:%rax using shl/shr pairs. The two
#     accumulators are XORed together at the end.
#   * Instructions from the different steps are interleaved throughout.
4.type	_mul_1x1,@function
5.align	16
6_mul_1x1:
7	subq	$128+8,%rsp	# table at 0..127(%rsp); the extra 8 bytes are not accessed here
8	movq	$-1,%r9
9	leaq	(%rax,%rax,1),%rsi	# rsi = a<<1, moves bit 62 of a into the sign position
10	shrq	$3,%r9	# r9 = mask with the top three bits clear
11	leaq	(,%rax,4),%rdi	# rdi = a<<2, moves bit 61 of a into the sign position
12	andq	%rax,%r9	# r9 = a1 (a without its top three bits)
13	leaq	(,%rax,8),%r12	# r12 = a<<3 (low 64 bits), i.e. a1<<3
14	sarq	$63,%rax	# rax = all ones if bit 63 of a is set, else 0
15	leaq	(%r9,%r9,1),%r10	# r10 = a1<<1
16	sarq	$63,%rsi	# rsi = mask for bit 62 of a
17	leaq	(,%r9,4),%r11	# r11 = a1<<2
18	andq	%rbp,%rax	# rax = b if bit 63 of a is set, else 0
19	sarq	$63,%rdi	# rdi = mask for bit 61 of a
20	movq	%rax,%rdx
21	shlq	$63,%rax	# low word of (masked b)<<63
22	andq	%rbp,%rsi	# rsi = b if bit 62 of a is set, else 0
23	shrq	$1,%rdx	# high word of (masked b)<<63
24	movq	%rsi,%rcx
25	shlq	$62,%rsi	# low word of (masked b)<<62
26	andq	%rbp,%rdi	# rdi = b if bit 61 of a is set, else 0
27	shrq	$2,%rcx	# high word of (masked b)<<62
28	xorq	%rsi,%rax
29	movq	%rdi,%rbx
30	shlq	$61,%rdi	# low word of (masked b)<<61
31	xorq	%rcx,%rdx
32	shrq	$3,%rbx	# high word of (masked b)<<61
33	xorq	%rdi,%rax
34	xorq	%rbx,%rdx
35
# Build the lookup table. Below, a2 = a1<<1, a4 = a1<<2, a8 = a1<<3.
36	movq	%r9,%r13
37	movq	$0,0(%rsp)	# tab[0] = 0
38	xorq	%r10,%r13	# r13 = a1^a2
39	movq	%r9,8(%rsp)	# tab[1] = a1
40	movq	%r11,%r14
41	movq	%r10,16(%rsp)	# tab[2] = a2
42	xorq	%r12,%r14	# r14 = a4^a8
43	movq	%r13,24(%rsp)	# tab[3] = a1^a2
44
45	xorq	%r11,%r9	# r9 = a1^a4
46	movq	%r11,32(%rsp)	# tab[4] = a4
47	xorq	%r11,%r10	# r10 = a2^a4
48	movq	%r9,40(%rsp)	# tab[5] = a1^a4
49	xorq	%r11,%r13	# r13 = a1^a2^a4
50	movq	%r10,48(%rsp)	# tab[6] = a2^a4
51	xorq	%r14,%r9	# r9 = a1^a8
52	movq	%r13,56(%rsp)	# tab[7] = a1^a2^a4
53	xorq	%r14,%r10	# r10 = a2^a8
54
55	movq	%r12,64(%rsp)	# tab[8] = a8
56	xorq	%r14,%r13	# r13 = a1^a2^a8
57	movq	%r9,72(%rsp)	# tab[9] = a1^a8
58	xorq	%r11,%r9	# r9 = a1^a4^a8
59	movq	%r10,80(%rsp)	# tab[10] = a2^a8
60	xorq	%r11,%r10	# r10 = a2^a4^a8
61	movq	%r13,88(%rsp)	# tab[11] = a1^a2^a8
62
63	xorq	%r11,%r13	# r13 = a1^a2^a4^a8
64	movq	%r14,96(%rsp)	# tab[12] = a4^a8
65	movq	%r8,%rsi
66	movq	%r9,104(%rsp)	# tab[13] = a1^a4^a8
67	andq	%rbp,%rsi	# rsi = nibble 0 of b
68	movq	%r10,112(%rsp)	# tab[14] = a2^a4^a8
69	shrq	$4,%rbp
70	movq	%r13,120(%rsp)	# tab[15] = a1^a2^a4^a8
71	movq	%r8,%rdi
72	andq	%rbp,%rdi	# rdi = nibble 1 of b
73	shrq	$4,%rbp
74
# Table-driven accumulation. %rsi indexes for the even nibbles (SSE
# accumulator %xmm0), %rdi for the odd nibbles (%rdx:%rax accumulator).
75	movq	(%rsp,%rsi,8),%xmm0	# xmm0 = tab[nibble 0], upper half zero
76	movq	%r8,%rsi
77	andq	%rbp,%rsi	# rsi = nibble 2
78	shrq	$4,%rbp
79	movq	(%rsp,%rdi,8),%rcx	# rcx = tab[nibble 1]
80	movq	%r8,%rdi
81	movq	%rcx,%rbx
82	shlq	$4,%rcx	# low word of tab[nibble 1]<<4
83	andq	%rbp,%rdi	# rdi = nibble 3
84	movq	(%rsp,%rsi,8),%xmm1	# xmm1 = tab[nibble 2]
85	shrq	$60,%rbx	# high word of tab[nibble 1]<<4
86	xorq	%rcx,%rax
87	pslldq	$1,%xmm1	# tab[nibble 2] shifted left by 1 byte (8 bits)
88	movq	%r8,%rsi
89	shrq	$4,%rbp
90	xorq	%rbx,%rdx
91	andq	%rbp,%rsi	# rsi = nibble 4
92	shrq	$4,%rbp
93	pxor	%xmm1,%xmm0
94	movq	(%rsp,%rdi,8),%rcx	# rcx = tab[nibble 3]
95	movq	%r8,%rdi
96	movq	%rcx,%rbx
97	shlq	$12,%rcx	# low word of tab[nibble 3]<<12
98	andq	%rbp,%rdi	# rdi = nibble 5
99	movq	(%rsp,%rsi,8),%xmm1	# xmm1 = tab[nibble 4]
100	shrq	$52,%rbx	# high word of tab[nibble 3]<<12
101	xorq	%rcx,%rax
102	pslldq	$2,%xmm1	# tab[nibble 4] shifted left by 2 bytes (16 bits)
103	movq	%r8,%rsi
104	shrq	$4,%rbp
105	xorq	%rbx,%rdx
106	andq	%rbp,%rsi	# rsi = nibble 6
107	shrq	$4,%rbp
108	pxor	%xmm1,%xmm0
109	movq	(%rsp,%rdi,8),%rcx	# rcx = tab[nibble 5]
110	movq	%r8,%rdi
111	movq	%rcx,%rbx
112	shlq	$20,%rcx	# low word of tab[nibble 5]<<20
113	andq	%rbp,%rdi	# rdi = nibble 7
114	movq	(%rsp,%rsi,8),%xmm1	# xmm1 = tab[nibble 6]
115	shrq	$44,%rbx	# high word of tab[nibble 5]<<20
116	xorq	%rcx,%rax
117	pslldq	$3,%xmm1	# tab[nibble 6] shifted left by 3 bytes (24 bits)
118	movq	%r8,%rsi
119	shrq	$4,%rbp
120	xorq	%rbx,%rdx
121	andq	%rbp,%rsi	# rsi = nibble 8
122	shrq	$4,%rbp
123	pxor	%xmm1,%xmm0
124	movq	(%rsp,%rdi,8),%rcx	# rcx = tab[nibble 7]
125	movq	%r8,%rdi
126	movq	%rcx,%rbx
127	shlq	$28,%rcx	# low word of tab[nibble 7]<<28
128	andq	%rbp,%rdi	# rdi = nibble 9
129	movq	(%rsp,%rsi,8),%xmm1	# xmm1 = tab[nibble 8]
130	shrq	$36,%rbx	# high word of tab[nibble 7]<<28
131	xorq	%rcx,%rax
132	pslldq	$4,%xmm1	# tab[nibble 8] shifted left by 4 bytes (32 bits)
133	movq	%r8,%rsi
134	shrq	$4,%rbp
135	xorq	%rbx,%rdx
136	andq	%rbp,%rsi	# rsi = nibble 10
137	shrq	$4,%rbp
138	pxor	%xmm1,%xmm0
139	movq	(%rsp,%rdi,8),%rcx	# rcx = tab[nibble 9]
140	movq	%r8,%rdi
141	movq	%rcx,%rbx
142	shlq	$36,%rcx	# low word of tab[nibble 9]<<36
143	andq	%rbp,%rdi	# rdi = nibble 11
144	movq	(%rsp,%rsi,8),%xmm1	# xmm1 = tab[nibble 10]
145	shrq	$28,%rbx	# high word of tab[nibble 9]<<36
146	xorq	%rcx,%rax
147	pslldq	$5,%xmm1	# tab[nibble 10] shifted left by 5 bytes (40 bits)
148	movq	%r8,%rsi
149	shrq	$4,%rbp
150	xorq	%rbx,%rdx
151	andq	%rbp,%rsi	# rsi = nibble 12
152	shrq	$4,%rbp
153	pxor	%xmm1,%xmm0
154	movq	(%rsp,%rdi,8),%rcx	# rcx = tab[nibble 11]
155	movq	%r8,%rdi
156	movq	%rcx,%rbx
157	shlq	$44,%rcx	# low word of tab[nibble 11]<<44
158	andq	%rbp,%rdi	# rdi = nibble 13
159	movq	(%rsp,%rsi,8),%xmm1	# xmm1 = tab[nibble 12]
160	shrq	$20,%rbx	# high word of tab[nibble 11]<<44
161	xorq	%rcx,%rax
162	pslldq	$6,%xmm1	# tab[nibble 12] shifted left by 6 bytes (48 bits)
163	movq	%r8,%rsi
164	shrq	$4,%rbp
165	xorq	%rbx,%rdx
166	andq	%rbp,%rsi	# rsi = nibble 14
167	shrq	$4,%rbp
168	pxor	%xmm1,%xmm0
169	movq	(%rsp,%rdi,8),%rcx	# rcx = tab[nibble 13]
170	movq	%r8,%rdi
171	movq	%rcx,%rbx
172	shlq	$52,%rcx	# low word of tab[nibble 13]<<52
173	andq	%rbp,%rdi	# rdi = nibble 15
174	movq	(%rsp,%rsi,8),%xmm1	# xmm1 = tab[nibble 14]
175	shrq	$12,%rbx	# high word of tab[nibble 13]<<52
176	xorq	%rcx,%rax
177	pslldq	$7,%xmm1	# tab[nibble 14] shifted left by 7 bytes (56 bits)
178	movq	%r8,%rsi	# this %rsi value is never read: %rsi is overwritten below
179	shrq	$4,%rbp
180	xorq	%rbx,%rdx
181	andq	%rbp,%rsi
182	shrq	$4,%rbp
183	pxor	%xmm1,%xmm0
184	movq	(%rsp,%rdi,8),%rcx	# rcx = tab[nibble 15]
185	movq	%rcx,%rbx
186	shlq	$60,%rcx	# low word of tab[nibble 15]<<60
187.byte	102,72,15,126,198	# hand-encoded: movq %xmm0,%rsi (low half of SSE accumulator)
188	shrq	$4,%rbx	# high word of tab[nibble 15]<<60
189	xorq	%rcx,%rax
190	psrldq	$8,%xmm0	# bring the high half of the SSE accumulator down
191	xorq	%rbx,%rdx
192.byte	102,72,15,126,199	# hand-encoded: movq %xmm0,%rdi (high half of SSE accumulator)
193	xorq	%rsi,%rax	# fold the SSE accumulator into the result, low word
194	xorq	%rdi,%rdx	# ... and high word
195
196	addq	$128+8,%rsp	# release the table
197	.byte	0xf3,0xc3	# hand-encoded: rep ret
# .Lend_mul_1x1 is not referenced anywhere in the visible source.
198.Lend_mul_1x1:
199.size	_mul_1x1,.-_mul_1x1
200
# bn_GF2m_mul_2x2 -- 128x128 -> 256-bit carry-less multiply (global entry).
#
# Registers read on entry (the first five System V AMD64 integer-argument
# registers):
#   %rdi = pointer to four 64-bit result words, stored least significant first
#   %rsi = high word of the first operand,  %rdx = its low word
#   %rcx = high word of the second operand, %r8  = its low word
# Nothing is returned in a register; the result is written through %rdi.
#
# Both paths use the same three-multiplication (Karatsuba-style) scheme:
#   H = hi1*hi2, L = lo1*lo2, M = (hi1^lo1)*(hi2^lo2)
#   result = H<<128 ^ (M^H^L)<<64 ^ L
# where * is a 64x64 -> 128-bit carry-less multiply.
#
# Path selection: bit 33 of the 64-bit value loaded from OPENSSL_ia32cap_P.
# When set, the multiply is done with the hand-encoded pclmulqdq
# instructions below; otherwise the table-driven _mul_1x1 helper is used.
# NOTE(review): bit 33 is presumably the PCLMULQDQ capability flag --
# confirm against the definition of OPENSSL_ia32cap_P.
201.globl	bn_GF2m_mul_2x2
202.type	bn_GF2m_mul_2x2,@function
203.align	16
204bn_GF2m_mul_2x2:
205	movq	OPENSSL_ia32cap_P(%rip),%rax
206	btq	$33,%rax	# CF = capability bit 33
207	jnc	.Lvanilla_mul_2x2	# bit clear: take the table-driven path
208
# ---- pclmulqdq path (no stack frame, no callee-saved registers touched) ----
209.byte	102,72,15,110,198	# hand-encoded: movq %rsi,%xmm0 (hi1)
210.byte	102,72,15,110,201	# hand-encoded: movq %rcx,%xmm1 (hi2)
211.byte	102,72,15,110,210	# hand-encoded: movq %rdx,%xmm2 (lo1)
212.byte	102,73,15,110,216	# hand-encoded: movq %r8,%xmm3 (lo2)
213	movdqa	%xmm0,%xmm4
214	movdqa	%xmm1,%xmm5
215.byte	102,15,58,68,193,0	# hand-encoded: pclmulqdq $0,%xmm1,%xmm0 -> xmm0 = H
216	pxor	%xmm2,%xmm4	# xmm4 = hi1^lo1
217	pxor	%xmm3,%xmm5	# xmm5 = hi2^lo2
218.byte	102,15,58,68,211,0	# hand-encoded: pclmulqdq $0,%xmm3,%xmm2 -> xmm2 = L
219.byte	102,15,58,68,229,0	# hand-encoded: pclmulqdq $0,%xmm5,%xmm4 -> xmm4 = M
220	xorps	%xmm0,%xmm4
221	xorps	%xmm2,%xmm4	# xmm4 = M^H^L (middle term)
222	movdqa	%xmm4,%xmm5
223	pslldq	$8,%xmm4	# low half of the middle term, moved up 64 bits
224	psrldq	$8,%xmm5	# high half of the middle term, moved down 64 bits
225	pxor	%xmm4,%xmm2	# xmm2 = result bits 0..127
226	pxor	%xmm5,%xmm0	# xmm0 = result bits 128..255
227	movdqu	%xmm2,0(%rdi)	# unaligned stores: no alignment is required of the output
228	movdqu	%xmm0,16(%rdi)
229	.byte	0xf3,0xc3	# hand-encoded: rep ret
230
# ---- table-driven path ----
# Frame layout after the 136-byte adjustment (offsets from %rsp):
#   0,8    L (low, high)          16,24  H (low, high)
#   32     saved output pointer   40,48  first operand  (hi, lo)
#   56,64  second operand (hi, lo)
#   80..112 saved %r14 %r13 %r12 %rbp %rbx (_mul_1x1 clobbers all five)
231.align	16
232.Lvanilla_mul_2x2:
233	leaq	-136(%rsp),%rsp
234	movq	%r14,80(%rsp)
235	movq	%r13,88(%rsp)
236	movq	%r12,96(%rsp)
237	movq	%rbp,104(%rsp)
238	movq	%rbx,112(%rsp)
# .Lbody_mul_2x2 is not referenced anywhere in the visible source.
239.Lbody_mul_2x2:
240	movq	%rdi,32(%rsp)	# spill all five inputs; _mul_1x1 overwrites their registers
241	movq	%rsi,40(%rsp)
242	movq	%rdx,48(%rsp)
243	movq	%rcx,56(%rsp)
244	movq	%r8,64(%rsp)
245
246	movq	$15,%r8	# nibble mask expected by _mul_1x1; it stays in %r8 for all three calls
247	movq	%rsi,%rax
248	movq	%rcx,%rbp
249	call	_mul_1x1	# H = hi1*hi2, returned in %rdx:%rax
250	movq	%rax,16(%rsp)
251	movq	%rdx,24(%rsp)
252
253	movq	48(%rsp),%rax
254	movq	64(%rsp),%rbp
255	call	_mul_1x1	# L = lo1*lo2
256	movq	%rax,0(%rsp)
257	movq	%rdx,8(%rsp)
258
259	movq	40(%rsp),%rax
260	movq	56(%rsp),%rbp
261	xorq	48(%rsp),%rax	# rax = hi1^lo1
262	xorq	64(%rsp),%rbp	# rbp = hi2^lo2
263	call	_mul_1x1	# M = (hi1^lo1)*(hi2^lo2), left in %rdx:%rax
264	movq	0(%rsp),%rbx	# rbx = L low
265	movq	8(%rsp),%rcx	# rcx = L high
266	movq	16(%rsp),%rdi	# rdi = H low
267	movq	24(%rsp),%rsi	# rsi = H high
268	movq	32(%rsp),%rbp	# rbp = output pointer
269
# Recombine. Output words, least significant first:
#   word 0 = L low
#   word 1 = L high ^ M low  ^ H low  ^ L low
#   word 2 = H low  ^ M high ^ H high ^ L high
#   word 3 = H high
270	xorq	%rdx,%rax
271	xorq	%rcx,%rdx
272	xorq	%rbx,%rax
273	movq	%rbx,0(%rbp)	# word 0
274	xorq	%rdi,%rdx
275	movq	%rsi,24(%rbp)	# word 3
276	xorq	%rsi,%rax
277	xorq	%rsi,%rdx	# rdx = word 2
278	xorq	%rdx,%rax	# rax = word 1 (the terms shared with word 2 cancel)
279	movq	%rdx,16(%rbp)	# word 2
280	movq	%rax,8(%rbp)	# word 1
281
282	movq	80(%rsp),%r14	# restore the registers saved on entry to this path
283	movq	88(%rsp),%r13
284	movq	96(%rsp),%r12
285	movq	104(%rsp),%rbp
286	movq	112(%rsp),%rbx
287	leaq	136(%rsp),%rsp	# release the frame
288	.byte	0xf3,0xc3	# hand-encoded: rep ret
# .Lend_mul_2x2 is not referenced anywhere in the visible source.
289.Lend_mul_2x2:
290.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
# NUL-terminated ASCII identification string emitted after the code in the
# current section. The bytes decode to:
#   "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
291.byte	71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
292.align	16
293