arch/x86/include/asm/xor_32.h
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

#define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
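/*
 * In the helpers above, x indexes an 8-byte quadword within a buffer and y
 * names an MMX register: LD loads quadword x of the destination (operand
 * %1) into %mmY, ST stores it back, and XO1..XO4 XOR in the matching
 * quadword of the additional source buffers passed as operands %2..%5.
 */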
       

#include <asm/i387.h>

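/*
 * xor_pII_mmx_N() XORs N - 1 source buffers into p1, working on 128 bytes
 * (sixteen quadwords, staged through %mm0-%mm3) per loop iteration; only
 * whole 128-byte lines are processed, so callers are expected to pass a
 * byte count that is a multiple of 128.  kernel_fpu_begin() and
 * kernel_fpu_end() bracket the MMX register usage.
 */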
       
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
	ST(i, 0)				\
		XO1(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO1(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO1(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
	ST(i, 0)				\
		XO2(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO2(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO2(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
	ST(i, 0)				\
		XO3(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO3(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO3(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
		XO3(i + 1, 1)			\
			XO3(i + 2, 2)		\
				XO3(i + 3, 3)	\
	XO4(i, 0)				\
	ST(i, 0)				\
		XO4(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO4(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO4(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       addl $128, %5         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

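/*
 * The xor_p5_mmx_N() variants below do the same job with a fully unrolled,
 * hand-interleaved 64-byte inner loop using all of %mm0-%mm7, a schedule
 * presumably chosen to suit the dual-pipe original Pentium.
 */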
       
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32	             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor   (%5), %%mm0   ;\n"
	"       pxor  8(%5), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%5), %%mm2   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%5), %%mm3   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 32(%5), %%mm4   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       pxor 40(%5), %%mm5   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%5), %%mm6   ;\n"
	"       pxor 56(%5), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       addl $64, %5         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};
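/*
 * Each xor_block_template exposes one implementation to the xor
 * benchmarking/selection code: the do_N hook takes N buffers in total and
 * XORs the last N - 1 of them into the first.  As a usage sketch only (no
 * such call site exists in this header), with three page-sized buffers
 * a, b and c:
 *
 *	xor_block_pII_mmx.do_3(PAGE_SIZE, a, b, c);
 *
 * computes a[i] ^= b[i] ^ c[i] across the whole page.
 */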
       

/*
 * Cache avoiding checksumming functions utilizing KNI (SSE) instructions.
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

#define XMMS_SAVE				\
do {						\
	preempt_disable();			\
	cr0 = read_cr0();			\
	clts();					\
	asm volatile(				\
		"movups %%xmm0,(%0)	;\n\t"	\
		"movups %%xmm1,0x10(%0)	;\n\t"	\
		"movups %%xmm2,0x20(%0)	;\n\t"	\
		"movups %%xmm3,0x30(%0)	;\n\t"	\
		:				\
		: "r" (xmm_save)		\
		: "memory");			\
} while (0)

#define XMMS_RESTORE				\
do {						\
	asm volatile(				\
		"sfence			;\n\t"	\
		"movups (%0),%%xmm0	;\n\t"	\
		"movups 0x10(%0),%%xmm1	;\n\t"	\
		"movups 0x20(%0),%%xmm2	;\n\t"	\
		"movups 0x30(%0),%%xmm3	;\n\t"	\
		:				\
		: "r" (xmm_save)		\
		: "memory");			\
	write_cr0(cr0);				\
	preempt_enable();			\
} while (0)
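/*
 * XMMS_SAVE/XMMS_RESTORE bracket the SSE loops below: they disable
 * preemption, clear CR0.TS via clts(), and spill %xmm0-%xmm3 into the
 * 16-byte-aligned xmm_save buffer declared by each caller; on the way out
 * an sfence is issued, the registers are reloaded and CR0 is restored
 * before preemption is re-enabled.
 */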
       

#define ALIGN16 __attribute__((aligned(16)))

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%1)		;\n"
#define LD(x, y)	"       movaps   "OFFS(x)"(%1), %%xmm"#y"	;\n"
#define ST(x, y)	"       movaps %%xmm"#y",   "OFFS(x)"(%1)	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%2)		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%3)		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%4)		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%5)		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%6)		;\n"
#define XO1(x, y)	"       xorps   "OFFS(x)"(%2), %%xmm"#y"	;\n"
#define XO2(x, y)	"       xorps   "OFFS(x)"(%3), %%xmm"#y"	;\n"
#define XO3(x, y)	"       xorps   "OFFS(x)"(%4), %%xmm"#y"	;\n"
#define XO4(x, y)	"       xorps   "OFFS(x)"(%5), %%xmm"#y"	;\n"
#define XO5(x, y)	"       xorps   "OFFS(x)"(%6), %%xmm"#y"	;\n"

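/*
 * The SSE helpers mirror the MMX ones but move 16-byte words: OFFS()
 * addresses the x-th 16-byte chunk of a buffer and PF_OFFS()/PFn() issue
 * prefetchnta 256 bytes (one full loop iteration) ahead so the streamed
 * data avoids polluting the caches.  xor_sse_N() XORs N - 1 source buffers
 * into p1, 256 bytes per iteration through %xmm0-%xmm3, saving and
 * restoring those registers by hand with XMMS_SAVE/XMMS_RESTORE instead of
 * using kernel_fpu_begin().
 */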
       
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       addl $256, %4           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       addl $256, %4           ;\n"
	"       addl $256, %5           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	XMMS_RESTORE;
}

static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

/* Also try the generic routines.  */
#include <asm-generic/xor.h>
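/* The generic header supplies the plain integer implementations
   (xor_block_8regs, xor_block_32regs and their prefetching *_p variants)
   referenced below. */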
       

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	xor_speed(&xor_block_8regs);			\
	xor_speed(&xor_block_8regs_p);			\
	xor_speed(&xor_block_32regs);			\
	xor_speed(&xor_block_32regs_p);			\
	if (cpu_has_xmm)				\
		xor_speed(&xor_block_pIII_sse);		\
	if (cpu_has_mmx) {				\
		xor_speed(&xor_block_pII_mmx);		\
		xor_speed(&xor_block_p5_mmx);		\
	}						\
} while (0)
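/*
 * XOR_TRY_TEMPLATES is expanded by the generic xor calibration code, which
 * benchmarks every candidate listed here and remembers the fastest one;
 * XOR_SELECT_TEMPLATE below may then override that choice.
 */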
       

/* We force the use of the SSE xor block because it can write around the
   L2 cache.  We may also be able to load only into the L1 cache, depending
   on how the CPU handles a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST)			\
	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

#endif /* _ASM_X86_XOR_32_H */