/* x86_64-gcc.c, revision 296341 */
#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
#else
/*-
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features own ABI which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference.
 *    Win64 implements different ABI for AMD64, different from Linux.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For the reference. IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */

/* BN_ULONG is one machine word; on Win64 "long" is 32-bit, hence the split. */
# ifdef _WIN64
#  define BN_ULONG unsigned long long
# else
#  define BN_ULONG unsigned long
# endif

# undef mul
# undef mul_add
# undef sqr

/*-
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */
/*
 * mul_add(r,a,word,carry): (carry,r) = a*word + r + carry.
 * mulq leaves the 128-bit product in rdx:rax; the second asm folds the
 * incoming carry into the low half (propagating into the high half via
 * adcq), the third folds the low half into r the same way; the surviving
 * high half becomes the new carry.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)

/* mul(r,a,word,carry): (carry,r) = a*word + carry (no read of r). */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)

/* sqr(r0,r1,a): (r1,r0) = a^2, full 128-bit square of one word. */
# define sqr(r0,r1,a)                   \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");

/*
 * rp[i] += ap[i] * w for i in [0,num); returns the final carry word.
 * Main loop is unrolled four-fold; the tail handles the remaining
 * 0..3 words.  num <= 0 is a no-op returning 0.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return (c1);

    /* num & ~3 is nonzero iff num >= 4 (num is non-negative here) */
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {
        mul_add(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[2], ap[2], w, c1);
        return c1;
    }

    return (c1);
}

/*
 * rp[i] = ap[i] * w (plus propagated carry) for i in [0,num); returns
 * the final carry word.  Same unrolling scheme as bn_mul_add_words.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return (c1);

    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {
        mul(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[2], ap[2], w, c1);
    }
    return (c1);
}

/* (r[2i+1],r[2i]) = a[i]^2 for i in [0,n): full-width word squares. */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    if (n <= 0)
        return;

    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
    if (n) {
        sqr(r[0], r[1], a[0]);
        if (--n == 0)
            return;
        sqr(r[2], r[3], a[1]);
        if (--n == 0)
            return;
        sqr(r[4], r[5], a[2]);
    }
}

/*
 * Returns (h:l) / d, the 128-by-64-bit quotient.
 * NOTE(review): divq raises #DE if d == 0 or the quotient does not fit
 * in 64 bits (hardware behavior); callers are expected to guarantee
 * h < d.  The remainder is discarded via 'waste'.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    asm("divq %4":"=a"(ret), "=d"(waste)
        : "a"(l), "d"(h), "g"(d)
        : "cc");

    return ret;
}

/*
 * rp[i] = ap[i] + bp[i] with carry propagation; returns the final
 * carry (0 or 1).  'subq %2,%2' zeroes the index AND clears CF before
 * the loop; 'loop' decrements rcx without touching flags, so CF
 * survives from one adcq to the next.  'sbbq %0,%0' materializes the
 * final carry as 0/-1, masked to one bit on return.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret = 0, i = 0;

    if (n <= 0)
        return 0;

    asm volatile (" subq %2,%2 \n"
                  ".p2align 4 \n"
                  "1: movq (%4,%2,8),%0 \n"
                  " adcq (%5,%2,8),%0 \n"
                  " movq %0,(%3,%2,8) \n"
                  " leaq 1(%2),%2 \n"
                  " loop 1b \n"
                  " sbbq %0,%0 \n":"=&a" (ret), "+c"(n),
                  "=&r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}

# ifndef SIMICS
/*
 * rp[i] = ap[i] - bp[i] with borrow propagation; returns the final
 * borrow (0 or 1).  Same loop structure as bn_add_words with sbbq in
 * place of adcq.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret = 0, i = 0;

    if (n <= 0)
        return 0;

    asm volatile (" subq %2,%2 \n"
                  ".p2align 4 \n"
                  "1: movq (%4,%2,8),%0 \n"
                  " sbbq (%5,%2,8),%0 \n"
                  " movq %0,(%3,%2,8) \n"
                  " leaq 1(%2),%2 \n"
                  " loop 1b \n"
                  " sbbq %0,%0 \n":"=&a" (ret), "+c"(n),
                  "=&r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
# else
/* Simics 1.4<7 has buggy sbbq:-( */
# define BN_MASK2 0xffffffffffffffffL
/*
 * Portable C fallback for bn_sub_words (see above for the contract):
 * unrolled four-fold, tracking the borrow in 'c'.  When t1 == t2 the
 * borrow out equals the borrow in, so 'c' is deliberately left alone.
 */
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    if (n <= 0)
        return ((BN_ULONG)0);

    for (;;) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[1];
        t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[2];
        t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[3];
        t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        a += 4;
        b += 4;
        r += 4;
    }
    return (c);
}
# endif

/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
# if 0
/* original macros are kept for reference purposes */
#  define mul_add_c(a,b,c0,c1,c2)       {       \
        BN_ULONG ta=(a),tb=(b);                 \
        t1 = ta * tb;                           \
        t2 = BN_UMULT_HIGH(ta,tb);              \
        c0 += t1; t2 += (c0<t1)?1:0;            \
        c1 += t2; c2 += (c1<t2)?1:0;            \
        }

#  define mul_add_c2(a,b,c0,c1,c2)      {       \
        BN_ULONG ta=(a),tb=(b),t0;              \
        t1 = BN_UMULT_HIGH(ta,tb);              \
        t0 = ta * tb;                           \
        c0 += t0; t2 = t1+((c0<t0)?1:0);        \
        c1 += t2; c2 += (c1<t2)?1:0;            \
        c0 += t0; t1 += (c0<t0)?1:0;            \
        c1 += t1; c2 += (c1<t1)?1:0;            \
        }
# else
/*
 * Inline-asm versions of the column-accumulation primitives above.
 * t1/t2 are scratch words declared by every caller; each macro adds its
 * product into the running three-word column accumulator (c2,c1,c0).
 */
#  define mul_add_c(a,b,c0,c1,c2) do {  \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c0),"+d"(t2)     \
                : "a"(t1),"g"(0)        \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c1),"+r"(c2)     \
                : "d"(t2),"g"(0)        \
                : "cc");                \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do {  \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c0),"+d"(t2)     \
                : "a"(t1),"g"(0)        \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c1),"+r"(c2)     \
                : "d"(t2),"g"(0)        \
                : "cc");                \
        } while (0)

/* a*b is added into (c2,c1,c0) twice: two identical three-word adds. */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
# endif

# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

/*
 * r = a * b for 8-word operands, comba (column-wise) method: each
 * column k sums all a[i]*b[j] with i+j == k into the rotating
 * three-word accumulator, then emits r[k].
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/* r = a * b for 4-word operands, comba method (see bn_mul_comba8). */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

/*
 * r = a^2 for an 8-word operand, comba method: off-diagonal products
 * a[i]*a[j] (i != j) are counted twice via sqr_add_c2, diagonal squares
 * once via sqr_add_c.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/* r = a^2 for a 4-word operand, comba method (see bn_sqr_comba8). */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
#endif