/*
 * include/asm-x86_64/xor.h
 *
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache-avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer yet, but there are likely
 * no advantages to be had from x86-64 here anyway.
 */

typedef struct { unsigned long a, b; } __attribute__((aligned(16))) xmm_store_t;

/* We don't let gcc save the XMM registers, because there is no easy way
   to tell it to do a clts before the register saving. */
#define XMMS_SAVE				\
	asm volatile(				\
		"movq %%cr0,%0		;\n\t"	\
		"clts			;\n\t"	\
		"movups %%xmm0,(%1)	;\n\t"	\
		"movups %%xmm1,0x10(%1)	;\n\t"	\
		"movups %%xmm2,0x20(%1)	;\n\t"	\
		"movups %%xmm3,0x30(%1)	;\n\t"	\
		: "=&r" (cr0)			\
		: "r" (xmm_save)		\
		: "memory")

#define XMMS_RESTORE				\
	asm volatile(				\
		"sfence			;\n\t"	\
		"movups (%1),%%xmm0	;\n\t"	\
		"movups 0x10(%1),%%xmm1	;\n\t"	\
		"movups 0x20(%1),%%xmm2	;\n\t"	\
		"movups 0x30(%1),%%xmm3	;\n\t"	\
		"movq %0,%%cr0		;\n\t"	\
		:				\
		: "r" (cr0), "r" (xmm_save)	\
		: "memory")

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x,y)		"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x,y)		"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%[p6])		;\n"
#define XO1(x,y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x,y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x,y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x,y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define XO5(x,y)	"	xorps "OFFS(x)"(%[p6]), %%xmm"#y"	;\n"


static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 8;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i)	\
	LD(i,0)		\
	LD(i+1,1)	\
	PF1(i)		\
	PF1(i+2)	\
	LD(i+2,2)	\
	LD(i+3,3)	\
	PF0(i+4)	\
	PF0(i+6)	\
	XO1(i,0)	\
	XO1(i+1,1)	\
	XO1(i+2,2)	\
	XO1(i+3,3)	\
	ST(i,0)		\
	ST(i+1,1)	\
	ST(i+2,2)	\
	ST(i+3,3)

	PF0(0)
	PF0(2)

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}
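/*
 * For reference, a minimal plain-C sketch of what xor_sse_2() above
 * computes: p1[i] ^= p2[i] over the whole buffer, with results stored
 * back to p1.  This is a hypothetical illustration only, kept out of
 * the build; the SSE loop does the real work 256 bytes at a time.
 */
#if 0
static void
xor_ref_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long i;

	for (i = 0; i < bytes / sizeof(*p1); i++)
		p1[i] ^= p2[i];
}
#endif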
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i)	\
	PF1(i)		\
	PF1(i+2)	\
	LD(i,0)		\
	LD(i+1,1)	\
	LD(i+2,2)	\
	LD(i+3,3)	\
	PF2(i)		\
	PF2(i+2)	\
	PF0(i+4)	\
	PF0(i+6)	\
	XO1(i,0)	\
	XO1(i+1,1)	\
	XO1(i+2,2)	\
	XO1(i+3,3)	\
	XO2(i,0)	\
	XO2(i+1,1)	\
	XO2(i+2,2)	\
	XO2(i+3,3)	\
	ST(i,0)		\
	ST(i+1,1)	\
	ST(i+2,2)	\
	ST(i+3,3)

	PF0(0)
	PF0(2)

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i)	\
	PF1(i)		\
	PF1(i+2)	\
	LD(i,0)		\
	LD(i+1,1)	\
	LD(i+2,2)	\
	LD(i+3,3)	\
	PF2(i)		\
	PF2(i+2)	\
	XO1(i,0)	\
	XO1(i+1,1)	\
	XO1(i+2,2)	\
	XO1(i+3,3)	\
	PF3(i)		\
	PF3(i+2)	\
	PF0(i+4)	\
	PF0(i+6)	\
	XO2(i,0)	\
	XO2(i+1,1)	\
	XO2(i+2,2)	\
	XO2(i+3,3)	\
	XO3(i,0)	\
	XO3(i+1,1)	\
	XO3(i+2,2)	\
	XO3(i+3,3)	\
	ST(i,0)		\
	ST(i+1,1)	\
	ST(i+2,2)	\
	ST(i+3,3)

	PF0(0)
	PF0(2)

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	addq %[inc], %[p4]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i)	\
	PF1(i)		\
	PF1(i+2)	\
	LD(i,0)		\
	LD(i+1,1)	\
	LD(i+2,2)	\
	LD(i+3,3)	\
	PF2(i)		\
	PF2(i+2)	\
	XO1(i,0)	\
	XO1(i+1,1)	\
	XO1(i+2,2)	\
	XO1(i+3,3)	\
	PF3(i)		\
	PF3(i+2)	\
	XO2(i,0)	\
	XO2(i+1,1)	\
	XO2(i+2,2)	\
	XO2(i+3,3)	\
	PF4(i)		\
	PF4(i+2)	\
	PF0(i+4)	\
	PF0(i+6)	\
	XO3(i,0)	\
	XO3(i+1,1)	\
	XO3(i+2,2)	\
	XO3(i+3,3)	\
	XO4(i,0)	\
	XO4(i+1,1)	\
	XO4(i+2,2)	\
	XO4(i+3,3)	\
	ST(i,0)		\
	ST(i+1,1)	\
	ST(i+2,2)	\
	ST(i+3,3)

	PF0(0)
	PF0(2)

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	addq %[inc], %[p1]	;\n"
	"	addq %[inc], %[p2]	;\n"
	"	addq %[inc], %[p3]	;\n"
	"	addq %[inc], %[p4]	;\n"
	"	addq %[inc], %[p5]	;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}
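/*
 * All of the loops above share the same geometry: each BLOCK(i) moves
 * 4 x 16 = 64 bytes through xmm0-xmm3, and four BLOCK()s per iteration
 * cover one 256-byte "line" (hence lines = bytes >> 8 and the 256-byte
 * pointer increments).  PF_OFFS() prefetches 256 bytes ahead of the
 * current position with prefetchnta to avoid polluting the caches.
 * Because movaps is used, every buffer must be 16-byte aligned and
 * bytes must be a multiple of 256; in practice the RAID code passes
 * page-aligned, page-sized buffers, which satisfies both.
 */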
static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
	do {					\
		xor_speed(&xor_block_sse);	\
	} while (0)

/* We force the use of the SSE xor block because it can write around the
   L2 cache.  We may also be able to load into the L1 cache only,
   depending on how the CPU deals with a load to a line that is being
   prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
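/*
 * Usage sketch (hypothetical, not part of this header): the generic
 * RAID xor code benchmarks templates via XOR_TRY_TEMPLATES, picks one
 * with XOR_SELECT_TEMPLATE(), and then calls the do_N hooks directly.
 * For example, folding two data blocks into a parity block would look
 * roughly like this; example_parity() is an invented name.
 */
#if 0
static void example_parity(unsigned long *parity,
			   unsigned long *d1, unsigned long *d2)
{
	/* parity ^= d1 ^ d2 over one 4096-byte page */
	xor_block_sse.do_3(4096, parity, d1, d2);
}
#endif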