/*
 * include/asm-x86_64/xor.h
 *
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer yet, but there is probably
 * little to be gained from x86-64-specific tuning here anyway.
 */

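/*
 * For reference (comment and sketch not part of the original header): the
 * routines below XOR two to five equally sized buffers into the first one,
 * 256 bytes per loop iteration.  A plain C sketch of the two-source case,
 * assuming bytes is a multiple of sizeof(long), would be:
 *
 *	static void xor_ref_2(unsigned long bytes,
 *			      unsigned long *p1, unsigned long *p2)
 *	{
 *		unsigned long i;
 *
 *		for (i = 0; i < bytes / sizeof(*p1); i++)
 *			p1[i] ^= p2[i];
 *	}
 *
 * The SSE versions do the same work with 128-bit xorps operations and
 * non-temporal prefetches, so that the source data pollutes the caches as
 * little as possible.
 */
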
typedef struct { unsigned long a, b; } __attribute__((aligned(16))) xmm_store_t;

/* We don't let gcc save the XMM registers itself because there is no easy
   way to tell it to execute a clts before saving them. */
#define XMMS_SAVE				\
	asm volatile (				\
		"movq %%cr0,%0		;\n\t"	\
		"clts			;\n\t"	\
		"movups %%xmm0,(%1)	;\n\t"	\
		"movups %%xmm1,0x10(%1)	;\n\t"	\
		"movups %%xmm2,0x20(%1)	;\n\t"	\
		"movups %%xmm3,0x30(%1)	;\n\t"	\
		: "=&r" (cr0)			\
		: "r" (xmm_save)		\
		: "memory")

#define XMMS_RESTORE				\
	asm volatile (				\
		"sfence			;\n\t"	\
		"movups (%1),%%xmm0	;\n\t"	\
		"movups 0x10(%1),%%xmm1	;\n\t"	\
		"movups 0x20(%1),%%xmm2	;\n\t"	\
		"movups 0x30(%1),%%xmm3	;\n\t"	\
		"movq	%0,%%cr0	;\n\t"	\
		:				\
		: "r" (cr0), "r" (xmm_save)	\
		: "memory")
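
/*
 * Illustration (comment not part of the original header): XMMS_SAVE clears
 * CR0.TS with clts so that the SSE instructions below cannot raise a
 * device-not-available fault, and spills %xmm0-%xmm3 into a buffer supplied
 * by the caller; XMMS_RESTORE reloads the registers and writes the saved
 * CR0 value back.  Each user is expected to declare the two locals the
 * macros reference, roughly:
 *
 *	unsigned long cr0;
 *	xmm_store_t xmm_save[4];
 *
 *	XMMS_SAVE;
 *	... code clobbering %xmm0-%xmm3 ...
 *	XMMS_RESTORE;
 */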

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x,y)		"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x,y)		"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%[p6])		;\n"
#define XO1(x,y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x,y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x,y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x,y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define XO5(x,y)	"	xorps "OFFS(x)"(%[p6]), %%xmm"#y"	;\n"

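/*
 * Overview of the unrolled loops below (comment not part of the original
 * header): each BLOCK(i) handles four 16-byte chunks in %xmm0-%xmm3: load
 * from p1, prefetch the sources roughly one iteration (256 bytes) ahead
 * with prefetchnta, xorps in the matching chunks of the remaining sources
 * and store the result back into p1.  Four BLOCKs per iteration make up
 * the 256-byte stride added to every pointer at the bottom of the loop;
 * lines = bytes >> 8 counts the iterations.  The staggered indentation
 * only groups the instructions by the %xmm register they operate on.
 */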
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 8;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		LD(i,0)					\
			LD(i+1,1)			\
		PF1(i)					\
				PF1(i+2)		\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF0(i+4)				\
				PF0(i+6)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

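/* Three-source version of the same loop: p1 ^= p2 ^ p3. */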
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

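/* Four-source version: p1 ^= p2 ^ p3 ^ p4. */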
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		PF3(i)					\
				PF3(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		XO3(i,0)				\
			XO3(i+1,1)			\
				XO3(i+2,2)		\
					XO3(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	addq %[inc], %[p4]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

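/* Five-source version: p1 ^= p2 ^ p3 ^ p4 ^ p5. */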
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		PF3(i)					\
				PF3(i+2)		\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		PF4(i)					\
				PF4(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO3(i,0)				\
			XO3(i+1,1)			\
				XO3(i+2,2)		\
					XO3(i+3,3)	\
		XO4(i,0)				\
			XO4(i+1,1)			\
				XO4(i+2,2)		\
					XO4(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	addq %[inc], %[p4]	;\n"
	"	addq %[inc], %[p5]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
	do {					\
		xor_speed(&xor_block_sse);	\
	} while (0)
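
/*
 * Note (not part of the original header): XOR_TRY_TEMPLATES is run by the
 * generic xor calibration code, which benchmarks each listed template with
 * xor_speed() before one is selected.
 */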

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into L1 only, depending on how the CPU
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)