1205128Ssimon#include "../bn_lcl.h" 2238405Sjkim#if !(defined(__GNUC__) && __GNUC__>=2) 3296341Sdelphij# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ 4162911Ssimon#else 5296341Sdelphij/*- 6109998Smarkm * x86_64 BIGNUM accelerator version 0.1, December 2002. 7109998Smarkm * 8109998Smarkm * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 9109998Smarkm * project. 10109998Smarkm * 11109998Smarkm * Rights for redistribution and usage in source and binary forms are 12109998Smarkm * granted according to the OpenSSL license. Warranty of any kind is 13109998Smarkm * disclaimed. 14109998Smarkm * 15109998Smarkm * Q. Version 0.1? It doesn't sound like Andy, he used to assign real 16109998Smarkm * versions, like 1.0... 17109998Smarkm * A. Well, that's because this code is basically a quick-n-dirty 18109998Smarkm * proof-of-concept hack. As you can see it's implemented with 19109998Smarkm * inline assembler, which means that you're bound to GCC and that 20160814Ssimon * there might be enough room for further improvement. 21109998Smarkm * 22109998Smarkm * Q. Why inline assembler? 23160814Ssimon * A. x86_64 features own ABI which I'm not familiar with. This is 24160814Ssimon * why I decided to let the compiler take care of subroutine 25160814Ssimon * prologue/epilogue as well as register allocation. For reference. 26160814Ssimon * Win64 implements different ABI for AMD64, different from Linux. 27109998Smarkm * 28109998Smarkm * Q. How much faster does it get? 29160814Ssimon * A. 'apps/openssl speed rsa dsa' output with no-asm: 30160814Ssimon * 31296341Sdelphij * sign verify sign/s verify/s 32296341Sdelphij * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 33296341Sdelphij * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 34296341Sdelphij * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 35296341Sdelphij * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 36296341Sdelphij * sign verify sign/s verify/s 37296341Sdelphij * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 38296341Sdelphij * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 39296341Sdelphij * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 40160814Ssimon * 41160814Ssimon * 'apps/openssl speed rsa dsa' output with this module: 42160814Ssimon * 43296341Sdelphij * sign verify sign/s verify/s 44296341Sdelphij * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 45296341Sdelphij * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 46296341Sdelphij * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 47296341Sdelphij * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 48296341Sdelphij * sign verify sign/s verify/s 49296341Sdelphij * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 50296341Sdelphij * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 51296341Sdelphij * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 52160814Ssimon * 53160814Ssimon * For the reference. IA-32 assembler implementation performs 54160814Ssimon * very much like 64-bit code compiled with no-asm on the same 55160814Ssimon * machine. 56109998Smarkm */ 57109998Smarkm 58296341Sdelphij# ifdef _WIN64 59296341Sdelphij# define BN_ULONG unsigned long long 60296341Sdelphij# else 61296341Sdelphij# define BN_ULONG unsigned long 62296341Sdelphij# endif 63109998Smarkm 64296341Sdelphij# undef mul 65296341Sdelphij# undef mul_add 66296341Sdelphij# undef sqr 67205128Ssimon 68296341Sdelphij/*- 69296341Sdelphij * "m"(a), "+m"(r) is the way to favor DirectPath �-code; 70296341Sdelphij * "g"(0) let the compiler to decide where does it 71296341Sdelphij * want to keep the value of zero; 72109998Smarkm */ 73296341Sdelphij# define mul_add(r,a,word,carry) do { \ 74296341Sdelphij register BN_ULONG high,low; \ 75296341Sdelphij asm ("mulq %3" \ 76296341Sdelphij : "=a"(low),"=d"(high) \ 77296341Sdelphij : "a"(word),"m"(a) \ 78296341Sdelphij : "cc"); \ 79296341Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 80296341Sdelphij : "+r"(carry),"+d"(high)\ 81296341Sdelphij : "a"(low),"g"(0) \ 82296341Sdelphij : "cc"); \ 83296341Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 84296341Sdelphij : "+m"(r),"+d"(high) \ 85296341Sdelphij : "r"(carry),"g"(0) \ 86296341Sdelphij : "cc"); \ 87296341Sdelphij carry=high; \ 88296341Sdelphij } while (0) 89109998Smarkm 90296341Sdelphij# define mul(r,a,word,carry) do { \ 91296341Sdelphij register BN_ULONG high,low; \ 92296341Sdelphij asm ("mulq %3" \ 93296341Sdelphij : "=a"(low),"=d"(high) \ 94296341Sdelphij : "a"(word),"g"(a) \ 95296341Sdelphij : "cc"); \ 96296341Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 97296341Sdelphij : "+r"(carry),"+d"(high)\ 98296341Sdelphij : "a"(low),"g"(0) \ 99296341Sdelphij : "cc"); \ 100296341Sdelphij (r)=carry, carry=high; \ 101296341Sdelphij } while (0) 102109998Smarkm 103296341Sdelphij# define sqr(r0,r1,a) \ 104296341Sdelphij asm ("mulq %2" \ 105296341Sdelphij : "=a"(r0),"=d"(r1) \ 106296341Sdelphij : "a"(a) \ 107296341Sdelphij : "cc"); 108109998Smarkm 109296341SdelphijBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 110296341Sdelphij BN_ULONG w) 111296341Sdelphij{ 112296341Sdelphij BN_ULONG c1 = 0; 113109998Smarkm 114296341Sdelphij if (num <= 0) 115296341Sdelphij return (c1); 116109998Smarkm 117296341Sdelphij while (num & ~3) { 118296341Sdelphij mul_add(rp[0], ap[0], w, c1); 119296341Sdelphij mul_add(rp[1], ap[1], w, c1); 120296341Sdelphij mul_add(rp[2], ap[2], w, c1); 121296341Sdelphij mul_add(rp[3], ap[3], w, c1); 122296341Sdelphij ap += 4; 123296341Sdelphij rp += 4; 124296341Sdelphij num -= 4; 125296341Sdelphij } 126296341Sdelphij if (num) { 127296341Sdelphij mul_add(rp[0], ap[0], w, c1); 128296341Sdelphij if (--num == 0) 129296341Sdelphij return c1; 130296341Sdelphij mul_add(rp[1], ap[1], w, c1); 131296341Sdelphij if (--num == 0) 132296341Sdelphij return c1; 133296341Sdelphij mul_add(rp[2], ap[2], w, c1); 134296341Sdelphij return c1; 135296341Sdelphij } 136109998Smarkm 137296341Sdelphij return (c1); 138296341Sdelphij} 139296341Sdelphij 140205128SsimonBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 141296341Sdelphij{ 142296341Sdelphij BN_ULONG c1 = 0; 143109998Smarkm 144296341Sdelphij if (num <= 0) 145296341Sdelphij return (c1); 146109998Smarkm 147296341Sdelphij while (num & ~3) { 148296341Sdelphij mul(rp[0], ap[0], w, c1); 149296341Sdelphij mul(rp[1], ap[1], w, c1); 150296341Sdelphij mul(rp[2], ap[2], w, c1); 151296341Sdelphij mul(rp[3], ap[3], w, c1); 152296341Sdelphij ap += 4; 153296341Sdelphij rp += 4; 154296341Sdelphij num -= 4; 155296341Sdelphij } 156296341Sdelphij if (num) { 157296341Sdelphij mul(rp[0], ap[0], w, c1); 158296341Sdelphij if (--num == 0) 159296341Sdelphij return c1; 160296341Sdelphij mul(rp[1], ap[1], w, c1); 161296341Sdelphij if (--num == 0) 162296341Sdelphij return c1; 163296341Sdelphij mul(rp[2], ap[2], w, c1); 164296341Sdelphij } 165296341Sdelphij return (c1); 166296341Sdelphij} 167109998Smarkm 168205128Ssimonvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 169296341Sdelphij{ 170296341Sdelphij if (n <= 0) 171296341Sdelphij return; 172109998Smarkm 173296341Sdelphij while (n & ~3) { 174296341Sdelphij sqr(r[0], r[1], a[0]); 175296341Sdelphij sqr(r[2], r[3], a[1]); 176296341Sdelphij sqr(r[4], r[5], a[2]); 177296341Sdelphij sqr(r[6], r[7], a[3]); 178296341Sdelphij a += 4; 179296341Sdelphij r += 8; 180296341Sdelphij n -= 4; 181296341Sdelphij } 182296341Sdelphij if (n) { 183296341Sdelphij sqr(r[0], r[1], a[0]); 184296341Sdelphij if (--n == 0) 185296341Sdelphij return; 186296341Sdelphij sqr(r[2], r[3], a[1]); 187296341Sdelphij if (--n == 0) 188296341Sdelphij return; 189296341Sdelphij sqr(r[4], r[5], a[2]); 190296341Sdelphij } 191296341Sdelphij} 192109998Smarkm 193109998SmarkmBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 194296341Sdelphij{ 195296341Sdelphij BN_ULONG ret, waste; 196109998Smarkm 197296341Sdelphij asm("divq %4":"=a"(ret), "=d"(waste) 198296341Sdelphij : "a"(l), "d"(h), "g"(d) 199296341Sdelphij : "cc"); 200109998Smarkm 201296341Sdelphij return ret; 202109998Smarkm} 203109998Smarkm 204296341SdelphijBN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 205296341Sdelphij int n) 206296341Sdelphij{ 207296341Sdelphij BN_ULONG ret = 0, i = 0; 208109998Smarkm 209296341Sdelphij if (n <= 0) 210296341Sdelphij return 0; 211109998Smarkm 212296341Sdelphij asm volatile (" subq %2,%2 \n" 213296341Sdelphij ".p2align 4 \n" 214296341Sdelphij "1: movq (%4,%2,8),%0 \n" 215296341Sdelphij " adcq (%5,%2,8),%0 \n" 216296341Sdelphij " movq %0,(%3,%2,8) \n" 217296341Sdelphij " leaq 1(%2),%2 \n" 218296341Sdelphij " loop 1b \n" 219296341Sdelphij " sbbq %0,%0 \n":"=&a" (ret), "+c"(n), 220296341Sdelphij "=&r"(i) 221296341Sdelphij :"r"(rp), "r"(ap), "r"(bp) 222296341Sdelphij :"cc", "memory"); 223109998Smarkm 224296341Sdelphij return ret & 1; 225109998Smarkm} 226109998Smarkm 227296341Sdelphij# ifndef SIMICS 228296341SdelphijBN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 229296341Sdelphij int n) 230296341Sdelphij{ 231296341Sdelphij BN_ULONG ret = 0, i = 0; 232109998Smarkm 233296341Sdelphij if (n <= 0) 234296341Sdelphij return 0; 235109998Smarkm 236296341Sdelphij asm volatile (" subq %2,%2 \n" 237296341Sdelphij ".p2align 4 \n" 238296341Sdelphij "1: movq (%4,%2,8),%0 \n" 239296341Sdelphij " sbbq (%5,%2,8),%0 \n" 240296341Sdelphij " movq %0,(%3,%2,8) \n" 241296341Sdelphij " leaq 1(%2),%2 \n" 242296341Sdelphij " loop 1b \n" 243296341Sdelphij " sbbq %0,%0 \n":"=&a" (ret), "+c"(n), 244296341Sdelphij "=&r"(i) 245296341Sdelphij :"r"(rp), "r"(ap), "r"(bp) 246296341Sdelphij :"cc", "memory"); 247109998Smarkm 248296341Sdelphij return ret & 1; 249109998Smarkm} 250296341Sdelphij# else 251109998Smarkm/* Simics 1.4<7 has buggy sbbq:-( */ 252296341Sdelphij# define BN_MASK2 0xffffffffffffffffL 253109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 254296341Sdelphij{ 255296341Sdelphij BN_ULONG t1, t2; 256296341Sdelphij int c = 0; 257109998Smarkm 258296341Sdelphij if (n <= 0) 259296341Sdelphij return ((BN_ULONG)0); 260109998Smarkm 261296341Sdelphij for (;;) { 262296341Sdelphij t1 = a[0]; 263296341Sdelphij t2 = b[0]; 264296341Sdelphij r[0] = (t1 - t2 - c) & BN_MASK2; 265296341Sdelphij if (t1 != t2) 266296341Sdelphij c = (t1 < t2); 267296341Sdelphij if (--n <= 0) 268296341Sdelphij break; 269109998Smarkm 270296341Sdelphij t1 = a[1]; 271296341Sdelphij t2 = b[1]; 272296341Sdelphij r[1] = (t1 - t2 - c) & BN_MASK2; 273296341Sdelphij if (t1 != t2) 274296341Sdelphij c = (t1 < t2); 275296341Sdelphij if (--n <= 0) 276296341Sdelphij break; 277109998Smarkm 278296341Sdelphij t1 = a[2]; 279296341Sdelphij t2 = b[2]; 280296341Sdelphij r[2] = (t1 - t2 - c) & BN_MASK2; 281296341Sdelphij if (t1 != t2) 282296341Sdelphij c = (t1 < t2); 283296341Sdelphij if (--n <= 0) 284296341Sdelphij break; 285109998Smarkm 286296341Sdelphij t1 = a[3]; 287296341Sdelphij t2 = b[3]; 288296341Sdelphij r[3] = (t1 - t2 - c) & BN_MASK2; 289296341Sdelphij if (t1 != t2) 290296341Sdelphij c = (t1 < t2); 291296341Sdelphij if (--n <= 0) 292296341Sdelphij break; 293109998Smarkm 294296341Sdelphij a += 4; 295296341Sdelphij b += 4; 296296341Sdelphij r += 4; 297296341Sdelphij } 298296341Sdelphij return (c); 299296341Sdelphij} 300296341Sdelphij# endif 301109998Smarkm 302109998Smarkm/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 303109998Smarkm/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 304109998Smarkm/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 305296341Sdelphij/* 306296341Sdelphij * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 307296341Sdelphij * c=(c2,c1,c0) 308296341Sdelphij */ 309109998Smarkm 310277195Sdelphij/* 311277195Sdelphij * Keep in mind that carrying into high part of multiplication result 312277195Sdelphij * can not overflow, because it cannot be all-ones. 313277195Sdelphij */ 314296341Sdelphij# if 0 315109998Smarkm/* original macros are kept for reference purposes */ 316296341Sdelphij# define mul_add_c(a,b,c0,c1,c2) { \ 317296341Sdelphij BN_ULONG ta=(a),tb=(b); \ 318296341Sdelphij t1 = ta * tb; \ 319296341Sdelphij t2 = BN_UMULT_HIGH(ta,tb); \ 320296341Sdelphij c0 += t1; t2 += (c0<t1)?1:0; \ 321296341Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 322296341Sdelphij } 323109998Smarkm 324296341Sdelphij# define mul_add_c2(a,b,c0,c1,c2) { \ 325296341Sdelphij BN_ULONG ta=(a),tb=(b),t0; \ 326296341Sdelphij t1 = BN_UMULT_HIGH(ta,tb); \ 327296341Sdelphij t0 = ta * tb; \ 328296341Sdelphij c0 += t0; t2 = t1+((c0<t0)?1:0);\ 329296341Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 330296341Sdelphij c0 += t0; t1 += (c0<t0)?1:0; \ 331296341Sdelphij c1 += t1; c2 += (c1<t1)?1:0; \ 332296341Sdelphij } 333296341Sdelphij# else 334296341Sdelphij# define mul_add_c(a,b,c0,c1,c2) do { \ 335296341Sdelphij asm ("mulq %3" \ 336296341Sdelphij : "=a"(t1),"=d"(t2) \ 337296341Sdelphij : "a"(a),"m"(b) \ 338296341Sdelphij : "cc"); \ 339296341Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 340296341Sdelphij : "+r"(c0),"+d"(t2) \ 341296341Sdelphij : "a"(t1),"g"(0) \ 342296341Sdelphij : "cc"); \ 343296341Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 344296341Sdelphij : "+r"(c1),"+r"(c2) \ 345296341Sdelphij : "d"(t2),"g"(0) \ 346296341Sdelphij : "cc"); \ 347296341Sdelphij } while (0) 348109998Smarkm 349296341Sdelphij# define sqr_add_c(a,i,c0,c1,c2) do { \ 350296341Sdelphij asm ("mulq %2" \ 351296341Sdelphij : "=a"(t1),"=d"(t2) \ 352296341Sdelphij : "a"(a[i]) \ 353296341Sdelphij : "cc"); \ 354296341Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 355296341Sdelphij : "+r"(c0),"+d"(t2) \ 356296341Sdelphij : "a"(t1),"g"(0) \ 357296341Sdelphij : "cc"); \ 358296341Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 359296341Sdelphij : "+r"(c1),"+r"(c2) \ 360296341Sdelphij : "d"(t2),"g"(0) \ 361296341Sdelphij : "cc"); \ 362296341Sdelphij } while (0) 363109998Smarkm 364296341Sdelphij# define mul_add_c2(a,b,c0,c1,c2) do { \ 365296341Sdelphij asm ("mulq %3" \ 366296341Sdelphij : "=a"(t1),"=d"(t2) \ 367296341Sdelphij : "a"(a),"m"(b) \ 368296341Sdelphij : "cc"); \ 369296341Sdelphij asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 370296341Sdelphij : "+r"(c0),"+r"(c1),"+r"(c2) \ 371296341Sdelphij : "r"(t1),"r"(t2),"g"(0) \ 372296341Sdelphij : "cc"); \ 373296341Sdelphij asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 374296341Sdelphij : "+r"(c0),"+r"(c1),"+r"(c2) \ 375296341Sdelphij : "r"(t1),"r"(t2),"g"(0) \ 376296341Sdelphij : "cc"); \ 377296341Sdelphij } while (0) 378296341Sdelphij# endif 379109998Smarkm 380296341Sdelphij# define sqr_add_c2(a,i,j,c0,c1,c2) \ 381296341Sdelphij mul_add_c2((a)[i],(a)[j],c0,c1,c2) 382109998Smarkm 383109998Smarkmvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 384296341Sdelphij{ 385296341Sdelphij BN_ULONG t1, t2; 386296341Sdelphij BN_ULONG c1, c2, c3; 387109998Smarkm 388296341Sdelphij c1 = 0; 389296341Sdelphij c2 = 0; 390296341Sdelphij c3 = 0; 391296341Sdelphij mul_add_c(a[0], b[0], c1, c2, c3); 392296341Sdelphij r[0] = c1; 393296341Sdelphij c1 = 0; 394296341Sdelphij mul_add_c(a[0], b[1], c2, c3, c1); 395296341Sdelphij mul_add_c(a[1], b[0], c2, c3, c1); 396296341Sdelphij r[1] = c2; 397296341Sdelphij c2 = 0; 398296341Sdelphij mul_add_c(a[2], b[0], c3, c1, c2); 399296341Sdelphij mul_add_c(a[1], b[1], c3, c1, c2); 400296341Sdelphij mul_add_c(a[0], b[2], c3, c1, c2); 401296341Sdelphij r[2] = c3; 402296341Sdelphij c3 = 0; 403296341Sdelphij mul_add_c(a[0], b[3], c1, c2, c3); 404296341Sdelphij mul_add_c(a[1], b[2], c1, c2, c3); 405296341Sdelphij mul_add_c(a[2], b[1], c1, c2, c3); 406296341Sdelphij mul_add_c(a[3], b[0], c1, c2, c3); 407296341Sdelphij r[3] = c1; 408296341Sdelphij c1 = 0; 409296341Sdelphij mul_add_c(a[4], b[0], c2, c3, c1); 410296341Sdelphij mul_add_c(a[3], b[1], c2, c3, c1); 411296341Sdelphij mul_add_c(a[2], b[2], c2, c3, c1); 412296341Sdelphij mul_add_c(a[1], b[3], c2, c3, c1); 413296341Sdelphij mul_add_c(a[0], b[4], c2, c3, c1); 414296341Sdelphij r[4] = c2; 415296341Sdelphij c2 = 0; 416296341Sdelphij mul_add_c(a[0], b[5], c3, c1, c2); 417296341Sdelphij mul_add_c(a[1], b[4], c3, c1, c2); 418296341Sdelphij mul_add_c(a[2], b[3], c3, c1, c2); 419296341Sdelphij mul_add_c(a[3], b[2], c3, c1, c2); 420296341Sdelphij mul_add_c(a[4], b[1], c3, c1, c2); 421296341Sdelphij mul_add_c(a[5], b[0], c3, c1, c2); 422296341Sdelphij r[5] = c3; 423296341Sdelphij c3 = 0; 424296341Sdelphij mul_add_c(a[6], b[0], c1, c2, c3); 425296341Sdelphij mul_add_c(a[5], b[1], c1, c2, c3); 426296341Sdelphij mul_add_c(a[4], b[2], c1, c2, c3); 427296341Sdelphij mul_add_c(a[3], b[3], c1, c2, c3); 428296341Sdelphij mul_add_c(a[2], b[4], c1, c2, c3); 429296341Sdelphij mul_add_c(a[1], b[5], c1, c2, c3); 430296341Sdelphij mul_add_c(a[0], b[6], c1, c2, c3); 431296341Sdelphij r[6] = c1; 432296341Sdelphij c1 = 0; 433296341Sdelphij mul_add_c(a[0], b[7], c2, c3, c1); 434296341Sdelphij mul_add_c(a[1], b[6], c2, c3, c1); 435296341Sdelphij mul_add_c(a[2], b[5], c2, c3, c1); 436296341Sdelphij mul_add_c(a[3], b[4], c2, c3, c1); 437296341Sdelphij mul_add_c(a[4], b[3], c2, c3, c1); 438296341Sdelphij mul_add_c(a[5], b[2], c2, c3, c1); 439296341Sdelphij mul_add_c(a[6], b[1], c2, c3, c1); 440296341Sdelphij mul_add_c(a[7], b[0], c2, c3, c1); 441296341Sdelphij r[7] = c2; 442296341Sdelphij c2 = 0; 443296341Sdelphij mul_add_c(a[7], b[1], c3, c1, c2); 444296341Sdelphij mul_add_c(a[6], b[2], c3, c1, c2); 445296341Sdelphij mul_add_c(a[5], b[3], c3, c1, c2); 446296341Sdelphij mul_add_c(a[4], b[4], c3, c1, c2); 447296341Sdelphij mul_add_c(a[3], b[5], c3, c1, c2); 448296341Sdelphij mul_add_c(a[2], b[6], c3, c1, c2); 449296341Sdelphij mul_add_c(a[1], b[7], c3, c1, c2); 450296341Sdelphij r[8] = c3; 451296341Sdelphij c3 = 0; 452296341Sdelphij mul_add_c(a[2], b[7], c1, c2, c3); 453296341Sdelphij mul_add_c(a[3], b[6], c1, c2, c3); 454296341Sdelphij mul_add_c(a[4], b[5], c1, c2, c3); 455296341Sdelphij mul_add_c(a[5], b[4], c1, c2, c3); 456296341Sdelphij mul_add_c(a[6], b[3], c1, c2, c3); 457296341Sdelphij mul_add_c(a[7], b[2], c1, c2, c3); 458296341Sdelphij r[9] = c1; 459296341Sdelphij c1 = 0; 460296341Sdelphij mul_add_c(a[7], b[3], c2, c3, c1); 461296341Sdelphij mul_add_c(a[6], b[4], c2, c3, c1); 462296341Sdelphij mul_add_c(a[5], b[5], c2, c3, c1); 463296341Sdelphij mul_add_c(a[4], b[6], c2, c3, c1); 464296341Sdelphij mul_add_c(a[3], b[7], c2, c3, c1); 465296341Sdelphij r[10] = c2; 466296341Sdelphij c2 = 0; 467296341Sdelphij mul_add_c(a[4], b[7], c3, c1, c2); 468296341Sdelphij mul_add_c(a[5], b[6], c3, c1, c2); 469296341Sdelphij mul_add_c(a[6], b[5], c3, c1, c2); 470296341Sdelphij mul_add_c(a[7], b[4], c3, c1, c2); 471296341Sdelphij r[11] = c3; 472296341Sdelphij c3 = 0; 473296341Sdelphij mul_add_c(a[7], b[5], c1, c2, c3); 474296341Sdelphij mul_add_c(a[6], b[6], c1, c2, c3); 475296341Sdelphij mul_add_c(a[5], b[7], c1, c2, c3); 476296341Sdelphij r[12] = c1; 477296341Sdelphij c1 = 0; 478296341Sdelphij mul_add_c(a[6], b[7], c2, c3, c1); 479296341Sdelphij mul_add_c(a[7], b[6], c2, c3, c1); 480296341Sdelphij r[13] = c2; 481296341Sdelphij c2 = 0; 482296341Sdelphij mul_add_c(a[7], b[7], c3, c1, c2); 483296341Sdelphij r[14] = c3; 484296341Sdelphij r[15] = c1; 485296341Sdelphij} 486109998Smarkm 487109998Smarkmvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 488296341Sdelphij{ 489296341Sdelphij BN_ULONG t1, t2; 490296341Sdelphij BN_ULONG c1, c2, c3; 491109998Smarkm 492296341Sdelphij c1 = 0; 493296341Sdelphij c2 = 0; 494296341Sdelphij c3 = 0; 495296341Sdelphij mul_add_c(a[0], b[0], c1, c2, c3); 496296341Sdelphij r[0] = c1; 497296341Sdelphij c1 = 0; 498296341Sdelphij mul_add_c(a[0], b[1], c2, c3, c1); 499296341Sdelphij mul_add_c(a[1], b[0], c2, c3, c1); 500296341Sdelphij r[1] = c2; 501296341Sdelphij c2 = 0; 502296341Sdelphij mul_add_c(a[2], b[0], c3, c1, c2); 503296341Sdelphij mul_add_c(a[1], b[1], c3, c1, c2); 504296341Sdelphij mul_add_c(a[0], b[2], c3, c1, c2); 505296341Sdelphij r[2] = c3; 506296341Sdelphij c3 = 0; 507296341Sdelphij mul_add_c(a[0], b[3], c1, c2, c3); 508296341Sdelphij mul_add_c(a[1], b[2], c1, c2, c3); 509296341Sdelphij mul_add_c(a[2], b[1], c1, c2, c3); 510296341Sdelphij mul_add_c(a[3], b[0], c1, c2, c3); 511296341Sdelphij r[3] = c1; 512296341Sdelphij c1 = 0; 513296341Sdelphij mul_add_c(a[3], b[1], c2, c3, c1); 514296341Sdelphij mul_add_c(a[2], b[2], c2, c3, c1); 515296341Sdelphij mul_add_c(a[1], b[3], c2, c3, c1); 516296341Sdelphij r[4] = c2; 517296341Sdelphij c2 = 0; 518296341Sdelphij mul_add_c(a[2], b[3], c3, c1, c2); 519296341Sdelphij mul_add_c(a[3], b[2], c3, c1, c2); 520296341Sdelphij r[5] = c3; 521296341Sdelphij c3 = 0; 522296341Sdelphij mul_add_c(a[3], b[3], c1, c2, c3); 523296341Sdelphij r[6] = c1; 524296341Sdelphij r[7] = c2; 525296341Sdelphij} 526109998Smarkm 527205128Ssimonvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 528296341Sdelphij{ 529296341Sdelphij BN_ULONG t1, t2; 530296341Sdelphij BN_ULONG c1, c2, c3; 531109998Smarkm 532296341Sdelphij c1 = 0; 533296341Sdelphij c2 = 0; 534296341Sdelphij c3 = 0; 535296341Sdelphij sqr_add_c(a, 0, c1, c2, c3); 536296341Sdelphij r[0] = c1; 537296341Sdelphij c1 = 0; 538296341Sdelphij sqr_add_c2(a, 1, 0, c2, c3, c1); 539296341Sdelphij r[1] = c2; 540296341Sdelphij c2 = 0; 541296341Sdelphij sqr_add_c(a, 1, c3, c1, c2); 542296341Sdelphij sqr_add_c2(a, 2, 0, c3, c1, c2); 543296341Sdelphij r[2] = c3; 544296341Sdelphij c3 = 0; 545296341Sdelphij sqr_add_c2(a, 3, 0, c1, c2, c3); 546296341Sdelphij sqr_add_c2(a, 2, 1, c1, c2, c3); 547296341Sdelphij r[3] = c1; 548296341Sdelphij c1 = 0; 549296341Sdelphij sqr_add_c(a, 2, c2, c3, c1); 550296341Sdelphij sqr_add_c2(a, 3, 1, c2, c3, c1); 551296341Sdelphij sqr_add_c2(a, 4, 0, c2, c3, c1); 552296341Sdelphij r[4] = c2; 553296341Sdelphij c2 = 0; 554296341Sdelphij sqr_add_c2(a, 5, 0, c3, c1, c2); 555296341Sdelphij sqr_add_c2(a, 4, 1, c3, c1, c2); 556296341Sdelphij sqr_add_c2(a, 3, 2, c3, c1, c2); 557296341Sdelphij r[5] = c3; 558296341Sdelphij c3 = 0; 559296341Sdelphij sqr_add_c(a, 3, c1, c2, c3); 560296341Sdelphij sqr_add_c2(a, 4, 2, c1, c2, c3); 561296341Sdelphij sqr_add_c2(a, 5, 1, c1, c2, c3); 562296341Sdelphij sqr_add_c2(a, 6, 0, c1, c2, c3); 563296341Sdelphij r[6] = c1; 564296341Sdelphij c1 = 0; 565296341Sdelphij sqr_add_c2(a, 7, 0, c2, c3, c1); 566296341Sdelphij sqr_add_c2(a, 6, 1, c2, c3, c1); 567296341Sdelphij sqr_add_c2(a, 5, 2, c2, c3, c1); 568296341Sdelphij sqr_add_c2(a, 4, 3, c2, c3, c1); 569296341Sdelphij r[7] = c2; 570296341Sdelphij c2 = 0; 571296341Sdelphij sqr_add_c(a, 4, c3, c1, c2); 572296341Sdelphij sqr_add_c2(a, 5, 3, c3, c1, c2); 573296341Sdelphij sqr_add_c2(a, 6, 2, c3, c1, c2); 574296341Sdelphij sqr_add_c2(a, 7, 1, c3, c1, c2); 575296341Sdelphij r[8] = c3; 576296341Sdelphij c3 = 0; 577296341Sdelphij sqr_add_c2(a, 7, 2, c1, c2, c3); 578296341Sdelphij sqr_add_c2(a, 6, 3, c1, c2, c3); 579296341Sdelphij sqr_add_c2(a, 5, 4, c1, c2, c3); 580296341Sdelphij r[9] = c1; 581296341Sdelphij c1 = 0; 582296341Sdelphij sqr_add_c(a, 5, c2, c3, c1); 583296341Sdelphij sqr_add_c2(a, 6, 4, c2, c3, c1); 584296341Sdelphij sqr_add_c2(a, 7, 3, c2, c3, c1); 585296341Sdelphij r[10] = c2; 586296341Sdelphij c2 = 0; 587296341Sdelphij sqr_add_c2(a, 7, 4, c3, c1, c2); 588296341Sdelphij sqr_add_c2(a, 6, 5, c3, c1, c2); 589296341Sdelphij r[11] = c3; 590296341Sdelphij c3 = 0; 591296341Sdelphij sqr_add_c(a, 6, c1, c2, c3); 592296341Sdelphij sqr_add_c2(a, 7, 5, c1, c2, c3); 593296341Sdelphij r[12] = c1; 594296341Sdelphij c1 = 0; 595296341Sdelphij sqr_add_c2(a, 7, 6, c2, c3, c1); 596296341Sdelphij r[13] = c2; 597296341Sdelphij c2 = 0; 598296341Sdelphij sqr_add_c(a, 7, c3, c1, c2); 599296341Sdelphij r[14] = c3; 600296341Sdelphij r[15] = c1; 601296341Sdelphij} 602109998Smarkm 603205128Ssimonvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 604296341Sdelphij{ 605296341Sdelphij BN_ULONG t1, t2; 606296341Sdelphij BN_ULONG c1, c2, c3; 607109998Smarkm 608296341Sdelphij c1 = 0; 609296341Sdelphij c2 = 0; 610296341Sdelphij c3 = 0; 611296341Sdelphij sqr_add_c(a, 0, c1, c2, c3); 612296341Sdelphij r[0] = c1; 613296341Sdelphij c1 = 0; 614296341Sdelphij sqr_add_c2(a, 1, 0, c2, c3, c1); 615296341Sdelphij r[1] = c2; 616296341Sdelphij c2 = 0; 617296341Sdelphij sqr_add_c(a, 1, c3, c1, c2); 618296341Sdelphij sqr_add_c2(a, 2, 0, c3, c1, c2); 619296341Sdelphij r[2] = c3; 620296341Sdelphij c3 = 0; 621296341Sdelphij sqr_add_c2(a, 3, 0, c1, c2, c3); 622296341Sdelphij sqr_add_c2(a, 2, 1, c1, c2, c3); 623296341Sdelphij r[3] = c1; 624296341Sdelphij c1 = 0; 625296341Sdelphij sqr_add_c(a, 2, c2, c3, c1); 626296341Sdelphij sqr_add_c2(a, 3, 1, c2, c3, c1); 627296341Sdelphij r[4] = c2; 628296341Sdelphij c2 = 0; 629296341Sdelphij sqr_add_c2(a, 3, 2, c3, c1, c2); 630296341Sdelphij r[5] = c3; 631296341Sdelphij c3 = 0; 632296341Sdelphij sqr_add_c(a, 3, c1, c2, c3); 633296341Sdelphij r[6] = c1; 634296341Sdelphij r[7] = c2; 635296341Sdelphij} 636162911Ssimon#endif 637