/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
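/*
 * Note on the constraints: "i" forces the 256-byte stride below to be an
 * immediate operand, so the 32-bit build does not tie up one of its few
 * general-purpose registers; "re" on 64-bit lets the compiler pick either a
 * register or a sign-extendable 32-bit immediate, whichever it prefers.
 */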

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)
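/*
 * Each OFFS(x) slot is one 16-byte xmm register's worth of data and
 * PF_OFFS(x) points one full 256-byte loop iteration ahead; prefetchnta
 * pulls that data in without polluting the caches.  BLK64() covers 64 bytes
 * (four xmm registers) with a single prefetch per source.  For example,
 * BLK64(PF0, LD, 0) emits roughly:
 *
 *	prefetchnta 256+16*(0)(%[p1])
 *	movaps 16*(0)(%[p1]), %xmm0
 *	movaps 16*(0 + 1)(%[p1]), %xmm1
 *	movaps 16*(0 + 2)(%[p1]), %xmm2
 *	movaps 16*(0 + 3)(%[p1]), %xmm3
 */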

static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
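/*
 * For reference, a plain-C sketch of what xor_sse_2() computes (illustrative
 * only; xor_ref_2 is a made-up name and there is no prefetching or FPU
 * context handling).  As in the SSE loop above, "bytes" is assumed to be a
 * multiple of 256; the SSE version additionally needs 16-byte aligned
 * buffers because it uses movaps.
 *
 *	static void xor_ref_2(unsigned long bytes, unsigned long *p1,
 *			      const unsigned long *p2)
 *	{
 *		unsigned long i;
 *
 *		for (i = 0; i < bytes / sizeof(unsigned long); i++)
 *			p1[i] ^= p2[i];
 *	}
 */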

static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
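/*
 * The *_pf64 variants differ from the plain xor_sse_*() routines only in how
 * the prefetches are scheduled: every 64-byte chunk gets its own prefetchnta
 * for each source buffer, which is where the "prefetch64-sse" template name
 * below comes from.
 */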

static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
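/*
 * The four- and five-source routines below follow the same structure: each
 * additional source gets its own prefetch stream (PF3/PF4) and xorps stream
 * (XO3/XO4), and its pointer is advanced together with the others at the end
 * of every 256-byte iteration.
 */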

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4,
	  const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4,
	       const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
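/*
 * Hypothetical usage sketch (in the kernel these templates are normally
 * chosen by the xor benchmarking code rather than called directly).  Buffers
 * must be 16-byte aligned and the length a multiple of 256 bytes; afterwards
 * every word of dst has been XORed with the corresponding word of src:
 *
 *	static unsigned long dst[512] __aligned(16);
 *	static unsigned long src[512] __aligned(16);
 *
 *	xor_block_sse_pf64.do_2(sizeof(dst), dst, src);
 */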

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)
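/*
 * AVX_SELECT() is provided by the AVX xor header pulled in through xor_32.h /
 * xor_64.h above; the intent is that it returns the AVX-based template when
 * AVX is usable and otherwise falls back to FASTEST, the best template found
 * by the benchmark.
 */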

#endif /* _ASM_X86_XOR_H */