/* bn_asm.c revision 296341 */
1/* crypto/bn/bn_asm.c */ 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 3 * All rights reserved. 4 * 5 * This package is an SSL implementation written 6 * by Eric Young (eay@cryptsoft.com). 7 * The implementation was written so as to conform with Netscapes SSL. 8 * 9 * This library is free for commercial and non-commercial use as long as 10 * the following conditions are aheared to. The following conditions 11 * apply to all code found in this distribution, be it the RC4, RSA, 12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation 13 * included with this distribution is covered by the same copyright terms 14 * except that the holder is Tim Hudson (tjh@cryptsoft.com). 15 * 16 * Copyright remains Eric Young's, and as such any Copyright notices in 17 * the code are not to be removed. 18 * If this package is used in a product, Eric Young should be given attribution 19 * as the author of the parts of the library used. 20 * This can be in the form of a textual message at program startup or 21 * in documentation (online or textual) provided with the package. 22 * 23 * Redistribution and use in source and binary forms, with or without 24 * modification, are permitted provided that the following conditions 25 * are met: 26 * 1. Redistributions of source code must retain the copyright 27 * notice, this list of conditions and the following disclaimer. 28 * 2. Redistributions in binary form must reproduce the above copyright 29 * notice, this list of conditions and the following disclaimer in the 30 * documentation and/or other materials provided with the distribution. 31 * 3. All advertising materials mentioning features or use of this software 32 * must display the following acknowledgement: 33 * "This product includes cryptographic software written by 34 * Eric Young (eay@cryptsoft.com)" 35 * The word 'cryptographic' can be left out if the rouines from the library 36 * being used are not cryptographic related :-). 37 * 4. 
If you include any Windows specific code (or a derivative thereof) from 38 * the apps directory (application code) you must include an acknowledgement: 39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" 40 * 41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND 42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 51 * SUCH DAMAGE. 52 * 53 * The licence and distribution terms for any publically available version or 54 * derivative of this code cannot be changed. i.e. this code cannot simply be 55 * copied and put under another distribution licence 56 * [including the GNU Public Licence.] 
57 */ 58 59#ifndef BN_DEBUG 60# undef NDEBUG /* avoid conflicting definitions */ 61# define NDEBUG 62#endif 63 64#include <stdio.h> 65#include <assert.h> 66#include "cryptlib.h" 67#include "bn_lcl.h" 68 69#if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 70 71BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 72 BN_ULONG w) 73{ 74 BN_ULONG c1 = 0; 75 76 assert(num >= 0); 77 if (num <= 0) 78 return (c1); 79 80# ifndef OPENSSL_SMALL_FOOTPRINT 81 while (num & ~3) { 82 mul_add(rp[0], ap[0], w, c1); 83 mul_add(rp[1], ap[1], w, c1); 84 mul_add(rp[2], ap[2], w, c1); 85 mul_add(rp[3], ap[3], w, c1); 86 ap += 4; 87 rp += 4; 88 num -= 4; 89 } 90# endif 91 while (num) { 92 mul_add(rp[0], ap[0], w, c1); 93 ap++; 94 rp++; 95 num--; 96 } 97 98 return (c1); 99} 100 101BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 102{ 103 BN_ULONG c1 = 0; 104 105 assert(num >= 0); 106 if (num <= 0) 107 return (c1); 108 109# ifndef OPENSSL_SMALL_FOOTPRINT 110 while (num & ~3) { 111 mul(rp[0], ap[0], w, c1); 112 mul(rp[1], ap[1], w, c1); 113 mul(rp[2], ap[2], w, c1); 114 mul(rp[3], ap[3], w, c1); 115 ap += 4; 116 rp += 4; 117 num -= 4; 118 } 119# endif 120 while (num) { 121 mul(rp[0], ap[0], w, c1); 122 ap++; 123 rp++; 124 num--; 125 } 126 return (c1); 127} 128 129void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 130{ 131 assert(n >= 0); 132 if (n <= 0) 133 return; 134 135# ifndef OPENSSL_SMALL_FOOTPRINT 136 while (n & ~3) { 137 sqr(r[0], r[1], a[0]); 138 sqr(r[2], r[3], a[1]); 139 sqr(r[4], r[5], a[2]); 140 sqr(r[6], r[7], a[3]); 141 a += 4; 142 r += 8; 143 n -= 4; 144 } 145# endif 146 while (n) { 147 sqr(r[0], r[1], a[0]); 148 a++; 149 r += 2; 150 n--; 151 } 152} 153 154#else /* !(defined(BN_LLONG) || 155 * defined(BN_UMULT_HIGH)) */ 156 157BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 158 BN_ULONG w) 159{ 160 BN_ULONG c = 0; 161 BN_ULONG bl, bh; 162 163 assert(num >= 0); 164 if (num <= 0) 165 return ((BN_ULONG)0); 166 
167 bl = LBITS(w); 168 bh = HBITS(w); 169 170# ifndef OPENSSL_SMALL_FOOTPRINT 171 while (num & ~3) { 172 mul_add(rp[0], ap[0], bl, bh, c); 173 mul_add(rp[1], ap[1], bl, bh, c); 174 mul_add(rp[2], ap[2], bl, bh, c); 175 mul_add(rp[3], ap[3], bl, bh, c); 176 ap += 4; 177 rp += 4; 178 num -= 4; 179 } 180# endif 181 while (num) { 182 mul_add(rp[0], ap[0], bl, bh, c); 183 ap++; 184 rp++; 185 num--; 186 } 187 return (c); 188} 189 190BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 191{ 192 BN_ULONG carry = 0; 193 BN_ULONG bl, bh; 194 195 assert(num >= 0); 196 if (num <= 0) 197 return ((BN_ULONG)0); 198 199 bl = LBITS(w); 200 bh = HBITS(w); 201 202# ifndef OPENSSL_SMALL_FOOTPRINT 203 while (num & ~3) { 204 mul(rp[0], ap[0], bl, bh, carry); 205 mul(rp[1], ap[1], bl, bh, carry); 206 mul(rp[2], ap[2], bl, bh, carry); 207 mul(rp[3], ap[3], bl, bh, carry); 208 ap += 4; 209 rp += 4; 210 num -= 4; 211 } 212# endif 213 while (num) { 214 mul(rp[0], ap[0], bl, bh, carry); 215 ap++; 216 rp++; 217 num--; 218 } 219 return (carry); 220} 221 222void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 223{ 224 assert(n >= 0); 225 if (n <= 0) 226 return; 227 228# ifndef OPENSSL_SMALL_FOOTPRINT 229 while (n & ~3) { 230 sqr64(r[0], r[1], a[0]); 231 sqr64(r[2], r[3], a[1]); 232 sqr64(r[4], r[5], a[2]); 233 sqr64(r[6], r[7], a[3]); 234 a += 4; 235 r += 8; 236 n -= 4; 237 } 238# endif 239 while (n) { 240 sqr64(r[0], r[1], a[0]); 241 a++; 242 r += 2; 243 n--; 244 } 245} 246 247#endif /* !(defined(BN_LLONG) || 248 * defined(BN_UMULT_HIGH)) */ 249 250#if defined(BN_LLONG) && defined(BN_DIV2W) 251 252BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 253{ 254 return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); 255} 256 257#else 258 259/* Divide h,l by d and return the result. 
*/ 260/* I need to test this some more :-( */ 261BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 262{ 263 BN_ULONG dh, dl, q, ret = 0, th, tl, t; 264 int i, count = 2; 265 266 if (d == 0) 267 return (BN_MASK2); 268 269 i = BN_num_bits_word(d); 270 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); 271 272 i = BN_BITS2 - i; 273 if (h >= d) 274 h -= d; 275 276 if (i) { 277 d <<= i; 278 h = (h << i) | (l >> (BN_BITS2 - i)); 279 l <<= i; 280 } 281 dh = (d & BN_MASK2h) >> BN_BITS4; 282 dl = (d & BN_MASK2l); 283 for (;;) { 284 if ((h >> BN_BITS4) == dh) 285 q = BN_MASK2l; 286 else 287 q = h / dh; 288 289 th = q * dh; 290 tl = dl * q; 291 for (;;) { 292 t = h - th; 293 if ((t & BN_MASK2h) || 294 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) 295 break; 296 q--; 297 th -= dh; 298 tl -= dl; 299 } 300 t = (tl >> BN_BITS4); 301 tl = (tl << BN_BITS4) & BN_MASK2h; 302 th += t; 303 304 if (l < tl) 305 th++; 306 l -= tl; 307 if (h < th) { 308 h += d; 309 q--; 310 } 311 h -= th; 312 313 if (--count == 0) 314 break; 315 316 ret = q << BN_BITS4; 317 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; 318 l = (l & BN_MASK2l) << BN_BITS4; 319 } 320 ret |= q; 321 return (ret); 322} 323#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 324 325#ifdef BN_LLONG 326BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 327 int n) 328{ 329 BN_ULLONG ll = 0; 330 331 assert(n >= 0); 332 if (n <= 0) 333 return ((BN_ULONG)0); 334 335# ifndef OPENSSL_SMALL_FOOTPRINT 336 while (n & ~3) { 337 ll += (BN_ULLONG) a[0] + b[0]; 338 r[0] = (BN_ULONG)ll & BN_MASK2; 339 ll >>= BN_BITS2; 340 ll += (BN_ULLONG) a[1] + b[1]; 341 r[1] = (BN_ULONG)ll & BN_MASK2; 342 ll >>= BN_BITS2; 343 ll += (BN_ULLONG) a[2] + b[2]; 344 r[2] = (BN_ULONG)ll & BN_MASK2; 345 ll >>= BN_BITS2; 346 ll += (BN_ULLONG) a[3] + b[3]; 347 r[3] = (BN_ULONG)ll & BN_MASK2; 348 ll >>= BN_BITS2; 349 a += 4; 350 b += 4; 351 r += 4; 352 n -= 4; 353 } 354# endif 355 while (n) { 356 ll += (BN_ULLONG) 
a[0] + b[0]; 357 r[0] = (BN_ULONG)ll & BN_MASK2; 358 ll >>= BN_BITS2; 359 a++; 360 b++; 361 r++; 362 n--; 363 } 364 return ((BN_ULONG)ll); 365} 366#else /* !BN_LLONG */ 367BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 368 int n) 369{ 370 BN_ULONG c, l, t; 371 372 assert(n >= 0); 373 if (n <= 0) 374 return ((BN_ULONG)0); 375 376 c = 0; 377# ifndef OPENSSL_SMALL_FOOTPRINT 378 while (n & ~3) { 379 t = a[0]; 380 t = (t + c) & BN_MASK2; 381 c = (t < c); 382 l = (t + b[0]) & BN_MASK2; 383 c += (l < t); 384 r[0] = l; 385 t = a[1]; 386 t = (t + c) & BN_MASK2; 387 c = (t < c); 388 l = (t + b[1]) & BN_MASK2; 389 c += (l < t); 390 r[1] = l; 391 t = a[2]; 392 t = (t + c) & BN_MASK2; 393 c = (t < c); 394 l = (t + b[2]) & BN_MASK2; 395 c += (l < t); 396 r[2] = l; 397 t = a[3]; 398 t = (t + c) & BN_MASK2; 399 c = (t < c); 400 l = (t + b[3]) & BN_MASK2; 401 c += (l < t); 402 r[3] = l; 403 a += 4; 404 b += 4; 405 r += 4; 406 n -= 4; 407 } 408# endif 409 while (n) { 410 t = a[0]; 411 t = (t + c) & BN_MASK2; 412 c = (t < c); 413 l = (t + b[0]) & BN_MASK2; 414 c += (l < t); 415 r[0] = l; 416 a++; 417 b++; 418 r++; 419 n--; 420 } 421 return ((BN_ULONG)c); 422} 423#endif /* !BN_LLONG */ 424 425BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 426 int n) 427{ 428 BN_ULONG t1, t2; 429 int c = 0; 430 431 assert(n >= 0); 432 if (n <= 0) 433 return ((BN_ULONG)0); 434 435#ifndef OPENSSL_SMALL_FOOTPRINT 436 while (n & ~3) { 437 t1 = a[0]; 438 t2 = b[0]; 439 r[0] = (t1 - t2 - c) & BN_MASK2; 440 if (t1 != t2) 441 c = (t1 < t2); 442 t1 = a[1]; 443 t2 = b[1]; 444 r[1] = (t1 - t2 - c) & BN_MASK2; 445 if (t1 != t2) 446 c = (t1 < t2); 447 t1 = a[2]; 448 t2 = b[2]; 449 r[2] = (t1 - t2 - c) & BN_MASK2; 450 if (t1 != t2) 451 c = (t1 < t2); 452 t1 = a[3]; 453 t2 = b[3]; 454 r[3] = (t1 - t2 - c) & BN_MASK2; 455 if (t1 != t2) 456 c = (t1 < t2); 457 a += 4; 458 b += 4; 459 r += 4; 460 n -= 4; 461 } 462#endif 463 while (n) { 464 t1 = a[0]; 465 t2 = b[0]; 466 
r[0] = (t1 - t2 - c) & BN_MASK2; 467 if (t1 != t2) 468 c = (t1 < t2); 469 a++; 470 b++; 471 r++; 472 n--; 473 } 474 return (c); 475} 476 477#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 478 479# undef bn_mul_comba8 480# undef bn_mul_comba4 481# undef bn_sqr_comba8 482# undef bn_sqr_comba4 483 484/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 485/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 486/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 487/* 488 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 489 * c=(c2,c1,c0) 490 */ 491 492/* 493 * Keep in mind that carrying into high part of multiplication result 494 * can not overflow, because it cannot be all-ones. 495 */ 496# ifdef BN_LLONG 497# define mul_add_c(a,b,c0,c1,c2) \ 498 t=(BN_ULLONG)a*b; \ 499 t1=(BN_ULONG)Lw(t); \ 500 t2=(BN_ULONG)Hw(t); \ 501 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 502 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 503 504# define mul_add_c2(a,b,c0,c1,c2) \ 505 t=(BN_ULLONG)a*b; \ 506 tt=(t+t)&BN_MASK; \ 507 if (tt < t) c2++; \ 508 t1=(BN_ULONG)Lw(tt); \ 509 t2=(BN_ULONG)Hw(tt); \ 510 c0=(c0+t1)&BN_MASK2; \ 511 if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ 512 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 513 514# define sqr_add_c(a,i,c0,c1,c2) \ 515 t=(BN_ULLONG)a[i]*a[i]; \ 516 t1=(BN_ULONG)Lw(t); \ 517 t2=(BN_ULONG)Hw(t); \ 518 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 519 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 520 521# define sqr_add_c2(a,i,j,c0,c1,c2) \ 522 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 523 524# elif defined(BN_UMULT_LOHI) 525 526# define mul_add_c(a,b,c0,c1,c2) { \ 527 BN_ULONG ta=(a),tb=(b); \ 528 BN_UMULT_LOHI(t1,t2,ta,tb); \ 529 c0 += t1; t2 += (c0<t1)?1:0; \ 530 c1 += t2; c2 += (c1<t2)?1:0; \ 531 } 532 533# define mul_add_c2(a,b,c0,c1,c2) { \ 534 BN_ULONG ta=(a),tb=(b),t0; \ 535 BN_UMULT_LOHI(t0,t1,ta,tb); \ 536 c0 += t0; t2 = t1+((c0<t0)?1:0);\ 
537 c1 += t2; c2 += (c1<t2)?1:0; \ 538 c0 += t0; t1 += (c0<t0)?1:0; \ 539 c1 += t1; c2 += (c1<t1)?1:0; \ 540 } 541 542# define sqr_add_c(a,i,c0,c1,c2) { \ 543 BN_ULONG ta=(a)[i]; \ 544 BN_UMULT_LOHI(t1,t2,ta,ta); \ 545 c0 += t1; t2 += (c0<t1)?1:0; \ 546 c1 += t2; c2 += (c1<t2)?1:0; \ 547 } 548 549# define sqr_add_c2(a,i,j,c0,c1,c2) \ 550 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 551 552# elif defined(BN_UMULT_HIGH) 553 554# define mul_add_c(a,b,c0,c1,c2) { \ 555 BN_ULONG ta=(a),tb=(b); \ 556 t1 = ta * tb; \ 557 t2 = BN_UMULT_HIGH(ta,tb); \ 558 c0 += t1; t2 += (c0<t1)?1:0; \ 559 c1 += t2; c2 += (c1<t2)?1:0; \ 560 } 561 562# define mul_add_c2(a,b,c0,c1,c2) { \ 563 BN_ULONG ta=(a),tb=(b),t0; \ 564 t1 = BN_UMULT_HIGH(ta,tb); \ 565 t0 = ta * tb; \ 566 c0 += t0; t2 = t1+((c0<t0)?1:0);\ 567 c1 += t2; c2 += (c1<t2)?1:0; \ 568 c0 += t0; t1 += (c0<t0)?1:0; \ 569 c1 += t1; c2 += (c1<t1)?1:0; \ 570 } 571 572# define sqr_add_c(a,i,c0,c1,c2) { \ 573 BN_ULONG ta=(a)[i]; \ 574 t1 = ta * ta; \ 575 t2 = BN_UMULT_HIGH(ta,ta); \ 576 c0 += t1; t2 += (c0<t1)?1:0; \ 577 c1 += t2; c2 += (c1<t2)?1:0; \ 578 } 579 580# define sqr_add_c2(a,i,j,c0,c1,c2) \ 581 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 582 583# else /* !BN_LLONG */ 584# define mul_add_c(a,b,c0,c1,c2) \ 585 t1=LBITS(a); t2=HBITS(a); \ 586 bl=LBITS(b); bh=HBITS(b); \ 587 mul64(t1,t2,bl,bh); \ 588 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 589 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 590 591# define mul_add_c2(a,b,c0,c1,c2) \ 592 t1=LBITS(a); t2=HBITS(a); \ 593 bl=LBITS(b); bh=HBITS(b); \ 594 mul64(t1,t2,bl,bh); \ 595 if (t2 & BN_TBIT) c2++; \ 596 t2=(t2+t2)&BN_MASK2; \ 597 if (t1 & BN_TBIT) t2++; \ 598 t1=(t1+t1)&BN_MASK2; \ 599 c0=(c0+t1)&BN_MASK2; \ 600 if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ 601 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 602 603# define sqr_add_c(a,i,c0,c1,c2) \ 604 sqr64(t1,t2,(a)[i]); \ 605 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 606 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 607 608# define 
sqr_add_c2(a,i,j,c0,c1,c2) \ 609 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 610# endif /* !BN_LLONG */ 611 612void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 613{ 614# ifdef BN_LLONG 615 BN_ULLONG t; 616# else 617 BN_ULONG bl, bh; 618# endif 619 BN_ULONG t1, t2; 620 BN_ULONG c1, c2, c3; 621 622 c1 = 0; 623 c2 = 0; 624 c3 = 0; 625 mul_add_c(a[0], b[0], c1, c2, c3); 626 r[0] = c1; 627 c1 = 0; 628 mul_add_c(a[0], b[1], c2, c3, c1); 629 mul_add_c(a[1], b[0], c2, c3, c1); 630 r[1] = c2; 631 c2 = 0; 632 mul_add_c(a[2], b[0], c3, c1, c2); 633 mul_add_c(a[1], b[1], c3, c1, c2); 634 mul_add_c(a[0], b[2], c3, c1, c2); 635 r[2] = c3; 636 c3 = 0; 637 mul_add_c(a[0], b[3], c1, c2, c3); 638 mul_add_c(a[1], b[2], c1, c2, c3); 639 mul_add_c(a[2], b[1], c1, c2, c3); 640 mul_add_c(a[3], b[0], c1, c2, c3); 641 r[3] = c1; 642 c1 = 0; 643 mul_add_c(a[4], b[0], c2, c3, c1); 644 mul_add_c(a[3], b[1], c2, c3, c1); 645 mul_add_c(a[2], b[2], c2, c3, c1); 646 mul_add_c(a[1], b[3], c2, c3, c1); 647 mul_add_c(a[0], b[4], c2, c3, c1); 648 r[4] = c2; 649 c2 = 0; 650 mul_add_c(a[0], b[5], c3, c1, c2); 651 mul_add_c(a[1], b[4], c3, c1, c2); 652 mul_add_c(a[2], b[3], c3, c1, c2); 653 mul_add_c(a[3], b[2], c3, c1, c2); 654 mul_add_c(a[4], b[1], c3, c1, c2); 655 mul_add_c(a[5], b[0], c3, c1, c2); 656 r[5] = c3; 657 c3 = 0; 658 mul_add_c(a[6], b[0], c1, c2, c3); 659 mul_add_c(a[5], b[1], c1, c2, c3); 660 mul_add_c(a[4], b[2], c1, c2, c3); 661 mul_add_c(a[3], b[3], c1, c2, c3); 662 mul_add_c(a[2], b[4], c1, c2, c3); 663 mul_add_c(a[1], b[5], c1, c2, c3); 664 mul_add_c(a[0], b[6], c1, c2, c3); 665 r[6] = c1; 666 c1 = 0; 667 mul_add_c(a[0], b[7], c2, c3, c1); 668 mul_add_c(a[1], b[6], c2, c3, c1); 669 mul_add_c(a[2], b[5], c2, c3, c1); 670 mul_add_c(a[3], b[4], c2, c3, c1); 671 mul_add_c(a[4], b[3], c2, c3, c1); 672 mul_add_c(a[5], b[2], c2, c3, c1); 673 mul_add_c(a[6], b[1], c2, c3, c1); 674 mul_add_c(a[7], b[0], c2, c3, c1); 675 r[7] = c2; 676 c2 = 0; 677 mul_add_c(a[7], b[1], c3, c1, c2); 678 
mul_add_c(a[6], b[2], c3, c1, c2); 679 mul_add_c(a[5], b[3], c3, c1, c2); 680 mul_add_c(a[4], b[4], c3, c1, c2); 681 mul_add_c(a[3], b[5], c3, c1, c2); 682 mul_add_c(a[2], b[6], c3, c1, c2); 683 mul_add_c(a[1], b[7], c3, c1, c2); 684 r[8] = c3; 685 c3 = 0; 686 mul_add_c(a[2], b[7], c1, c2, c3); 687 mul_add_c(a[3], b[6], c1, c2, c3); 688 mul_add_c(a[4], b[5], c1, c2, c3); 689 mul_add_c(a[5], b[4], c1, c2, c3); 690 mul_add_c(a[6], b[3], c1, c2, c3); 691 mul_add_c(a[7], b[2], c1, c2, c3); 692 r[9] = c1; 693 c1 = 0; 694 mul_add_c(a[7], b[3], c2, c3, c1); 695 mul_add_c(a[6], b[4], c2, c3, c1); 696 mul_add_c(a[5], b[5], c2, c3, c1); 697 mul_add_c(a[4], b[6], c2, c3, c1); 698 mul_add_c(a[3], b[7], c2, c3, c1); 699 r[10] = c2; 700 c2 = 0; 701 mul_add_c(a[4], b[7], c3, c1, c2); 702 mul_add_c(a[5], b[6], c3, c1, c2); 703 mul_add_c(a[6], b[5], c3, c1, c2); 704 mul_add_c(a[7], b[4], c3, c1, c2); 705 r[11] = c3; 706 c3 = 0; 707 mul_add_c(a[7], b[5], c1, c2, c3); 708 mul_add_c(a[6], b[6], c1, c2, c3); 709 mul_add_c(a[5], b[7], c1, c2, c3); 710 r[12] = c1; 711 c1 = 0; 712 mul_add_c(a[6], b[7], c2, c3, c1); 713 mul_add_c(a[7], b[6], c2, c3, c1); 714 r[13] = c2; 715 c2 = 0; 716 mul_add_c(a[7], b[7], c3, c1, c2); 717 r[14] = c3; 718 r[15] = c1; 719} 720 721void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 722{ 723# ifdef BN_LLONG 724 BN_ULLONG t; 725# else 726 BN_ULONG bl, bh; 727# endif 728 BN_ULONG t1, t2; 729 BN_ULONG c1, c2, c3; 730 731 c1 = 0; 732 c2 = 0; 733 c3 = 0; 734 mul_add_c(a[0], b[0], c1, c2, c3); 735 r[0] = c1; 736 c1 = 0; 737 mul_add_c(a[0], b[1], c2, c3, c1); 738 mul_add_c(a[1], b[0], c2, c3, c1); 739 r[1] = c2; 740 c2 = 0; 741 mul_add_c(a[2], b[0], c3, c1, c2); 742 mul_add_c(a[1], b[1], c3, c1, c2); 743 mul_add_c(a[0], b[2], c3, c1, c2); 744 r[2] = c3; 745 c3 = 0; 746 mul_add_c(a[0], b[3], c1, c2, c3); 747 mul_add_c(a[1], b[2], c1, c2, c3); 748 mul_add_c(a[2], b[1], c1, c2, c3); 749 mul_add_c(a[3], b[0], c1, c2, c3); 750 r[3] = c1; 751 c1 = 0; 752 
mul_add_c(a[3], b[1], c2, c3, c1); 753 mul_add_c(a[2], b[2], c2, c3, c1); 754 mul_add_c(a[1], b[3], c2, c3, c1); 755 r[4] = c2; 756 c2 = 0; 757 mul_add_c(a[2], b[3], c3, c1, c2); 758 mul_add_c(a[3], b[2], c3, c1, c2); 759 r[5] = c3; 760 c3 = 0; 761 mul_add_c(a[3], b[3], c1, c2, c3); 762 r[6] = c1; 763 r[7] = c2; 764} 765 766void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 767{ 768# ifdef BN_LLONG 769 BN_ULLONG t, tt; 770# else 771 BN_ULONG bl, bh; 772# endif 773 BN_ULONG t1, t2; 774 BN_ULONG c1, c2, c3; 775 776 c1 = 0; 777 c2 = 0; 778 c3 = 0; 779 sqr_add_c(a, 0, c1, c2, c3); 780 r[0] = c1; 781 c1 = 0; 782 sqr_add_c2(a, 1, 0, c2, c3, c1); 783 r[1] = c2; 784 c2 = 0; 785 sqr_add_c(a, 1, c3, c1, c2); 786 sqr_add_c2(a, 2, 0, c3, c1, c2); 787 r[2] = c3; 788 c3 = 0; 789 sqr_add_c2(a, 3, 0, c1, c2, c3); 790 sqr_add_c2(a, 2, 1, c1, c2, c3); 791 r[3] = c1; 792 c1 = 0; 793 sqr_add_c(a, 2, c2, c3, c1); 794 sqr_add_c2(a, 3, 1, c2, c3, c1); 795 sqr_add_c2(a, 4, 0, c2, c3, c1); 796 r[4] = c2; 797 c2 = 0; 798 sqr_add_c2(a, 5, 0, c3, c1, c2); 799 sqr_add_c2(a, 4, 1, c3, c1, c2); 800 sqr_add_c2(a, 3, 2, c3, c1, c2); 801 r[5] = c3; 802 c3 = 0; 803 sqr_add_c(a, 3, c1, c2, c3); 804 sqr_add_c2(a, 4, 2, c1, c2, c3); 805 sqr_add_c2(a, 5, 1, c1, c2, c3); 806 sqr_add_c2(a, 6, 0, c1, c2, c3); 807 r[6] = c1; 808 c1 = 0; 809 sqr_add_c2(a, 7, 0, c2, c3, c1); 810 sqr_add_c2(a, 6, 1, c2, c3, c1); 811 sqr_add_c2(a, 5, 2, c2, c3, c1); 812 sqr_add_c2(a, 4, 3, c2, c3, c1); 813 r[7] = c2; 814 c2 = 0; 815 sqr_add_c(a, 4, c3, c1, c2); 816 sqr_add_c2(a, 5, 3, c3, c1, c2); 817 sqr_add_c2(a, 6, 2, c3, c1, c2); 818 sqr_add_c2(a, 7, 1, c3, c1, c2); 819 r[8] = c3; 820 c3 = 0; 821 sqr_add_c2(a, 7, 2, c1, c2, c3); 822 sqr_add_c2(a, 6, 3, c1, c2, c3); 823 sqr_add_c2(a, 5, 4, c1, c2, c3); 824 r[9] = c1; 825 c1 = 0; 826 sqr_add_c(a, 5, c2, c3, c1); 827 sqr_add_c2(a, 6, 4, c2, c3, c1); 828 sqr_add_c2(a, 7, 3, c2, c3, c1); 829 r[10] = c2; 830 c2 = 0; 831 sqr_add_c2(a, 7, 4, c3, c1, c2); 832 sqr_add_c2(a, 6, 5, c3, 
c1, c2); 833 r[11] = c3; 834 c3 = 0; 835 sqr_add_c(a, 6, c1, c2, c3); 836 sqr_add_c2(a, 7, 5, c1, c2, c3); 837 r[12] = c1; 838 c1 = 0; 839 sqr_add_c2(a, 7, 6, c2, c3, c1); 840 r[13] = c2; 841 c2 = 0; 842 sqr_add_c(a, 7, c3, c1, c2); 843 r[14] = c3; 844 r[15] = c1; 845} 846 847void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 848{ 849# ifdef BN_LLONG 850 BN_ULLONG t, tt; 851# else 852 BN_ULONG bl, bh; 853# endif 854 BN_ULONG t1, t2; 855 BN_ULONG c1, c2, c3; 856 857 c1 = 0; 858 c2 = 0; 859 c3 = 0; 860 sqr_add_c(a, 0, c1, c2, c3); 861 r[0] = c1; 862 c1 = 0; 863 sqr_add_c2(a, 1, 0, c2, c3, c1); 864 r[1] = c2; 865 c2 = 0; 866 sqr_add_c(a, 1, c3, c1, c2); 867 sqr_add_c2(a, 2, 0, c3, c1, c2); 868 r[2] = c3; 869 c3 = 0; 870 sqr_add_c2(a, 3, 0, c1, c2, c3); 871 sqr_add_c2(a, 2, 1, c1, c2, c3); 872 r[3] = c1; 873 c1 = 0; 874 sqr_add_c(a, 2, c2, c3, c1); 875 sqr_add_c2(a, 3, 1, c2, c3, c1); 876 r[4] = c2; 877 c2 = 0; 878 sqr_add_c2(a, 3, 2, c3, c1, c2); 879 r[5] = c3; 880 c3 = 0; 881 sqr_add_c(a, 3, c1, c2, c3); 882 r[6] = c1; 883 r[7] = c2; 884} 885 886# ifdef OPENSSL_NO_ASM 887# ifdef OPENSSL_BN_ASM_MONT 888# include <alloca.h> 889/* 890 * This is essentially reference implementation, which may or may not 891 * result in performance improvement. E.g. on IA-32 this routine was 892 * observed to give 40% faster rsa1024 private key operations and 10% 893 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only 894 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a 895 * reference implementation, one to be used as starting point for 896 * platform-specific assembler. Mentioned numbers apply to compiler 897 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and 898 * can vary not only from platform to platform, but even for compiler 899 * versions. Assembler vs. assembler improvement coefficients can 900 * [and are known to] differ and are to be documented elsewhere. 
901 */ 902int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 903 const BN_ULONG *np, const BN_ULONG *n0p, int num) 904{ 905 BN_ULONG c0, c1, ml, *tp, n0; 906# ifdef mul64 907 BN_ULONG mh; 908# endif 909 volatile BN_ULONG *vp; 910 int i = 0, j; 911 912# if 0 /* template for platform-specific 913 * implementation */ 914 if (ap == bp) 915 return bn_sqr_mont(rp, ap, np, n0p, num); 916# endif 917 vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 918 919 n0 = *n0p; 920 921 c0 = 0; 922 ml = bp[0]; 923# ifdef mul64 924 mh = HBITS(ml); 925 ml = LBITS(ml); 926 for (j = 0; j < num; ++j) 927 mul(tp[j], ap[j], ml, mh, c0); 928# else 929 for (j = 0; j < num; ++j) 930 mul(tp[j], ap[j], ml, c0); 931# endif 932 933 tp[num] = c0; 934 tp[num + 1] = 0; 935 goto enter; 936 937 for (i = 0; i < num; i++) { 938 c0 = 0; 939 ml = bp[i]; 940# ifdef mul64 941 mh = HBITS(ml); 942 ml = LBITS(ml); 943 for (j = 0; j < num; ++j) 944 mul_add(tp[j], ap[j], ml, mh, c0); 945# else 946 for (j = 0; j < num; ++j) 947 mul_add(tp[j], ap[j], ml, c0); 948# endif 949 c1 = (tp[num] + c0) & BN_MASK2; 950 tp[num] = c1; 951 tp[num + 1] = (c1 < c0 ? 1 : 0); 952 enter: 953 c1 = tp[0]; 954 ml = (c1 * n0) & BN_MASK2; 955 c0 = 0; 956# ifdef mul64 957 mh = HBITS(ml); 958 ml = LBITS(ml); 959 mul_add(c1, np[0], ml, mh, c0); 960# else 961 mul_add(c1, ml, np[0], c0); 962# endif 963 for (j = 1; j < num; j++) { 964 c1 = tp[j]; 965# ifdef mul64 966 mul_add(c1, np[j], ml, mh, c0); 967# else 968 mul_add(c1, ml, np[j], c0); 969# endif 970 tp[j - 1] = c1 & BN_MASK2; 971 } 972 c1 = (tp[num] + c0) & BN_MASK2; 973 tp[num - 1] = c1; 974 tp[num] = tp[num + 1] + (c1 < c0 ? 
1 : 0); 975 } 976 977 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 978 c0 = bn_sub_words(rp, tp, np, num); 979 if (tp[num] != 0 || c0 == 0) { 980 for (i = 0; i < num + 2; i++) 981 vp[i] = 0; 982 return 1; 983 } 984 } 985 for (i = 0; i < num; i++) 986 rp[i] = tp[i], vp[i] = 0; 987 vp[num] = 0; 988 vp[num + 1] = 0; 989 return 1; 990} 991# else 992/* 993 * Return value of 0 indicates that multiplication/convolution was not 994 * performed to signal the caller to fall down to alternative/original 995 * code-path. 996 */ 997int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 998 const BN_ULONG *np, const BN_ULONG *n0, int num) 999{ 1000 return 0; 1001} 1002# endif /* OPENSSL_BN_ASM_MONT */ 1003# endif 1004 1005#else /* !BN_MUL_COMBA */ 1006 1007/* hmm... is it faster just to do a multiply? */ 1008# undef bn_sqr_comba4 1009void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 1010{ 1011 BN_ULONG t[8]; 1012 bn_sqr_normal(r, a, 4, t); 1013} 1014 1015# undef bn_sqr_comba8 1016void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 1017{ 1018 BN_ULONG t[16]; 1019 bn_sqr_normal(r, a, 8, t); 1020} 1021 1022void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1023{ 1024 r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); 1025 r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); 1026 r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); 1027 r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); 1028} 1029 1030void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1031{ 1032 r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); 1033 r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); 1034 r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); 1035 r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); 1036 r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); 1037 r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); 1038 r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); 1039 r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); 1040} 1041 1042# ifdef OPENSSL_NO_ASM 1043# ifdef OPENSSL_BN_ASM_MONT 1044# include <alloca.h> 1045int 
bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1046 const BN_ULONG *np, const BN_ULONG *n0p, int num) 1047{ 1048 BN_ULONG c0, c1, *tp, n0 = *n0p; 1049 volatile BN_ULONG *vp; 1050 int i = 0, j; 1051 1052 vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 1053 1054 for (i = 0; i <= num; i++) 1055 tp[i] = 0; 1056 1057 for (i = 0; i < num; i++) { 1058 c0 = bn_mul_add_words(tp, ap, num, bp[i]); 1059 c1 = (tp[num] + c0) & BN_MASK2; 1060 tp[num] = c1; 1061 tp[num + 1] = (c1 < c0 ? 1 : 0); 1062 1063 c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); 1064 c1 = (tp[num] + c0) & BN_MASK2; 1065 tp[num] = c1; 1066 tp[num + 1] += (c1 < c0 ? 1 : 0); 1067 for (j = 0; j <= num; j++) 1068 tp[j] = tp[j + 1]; 1069 } 1070 1071 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 1072 c0 = bn_sub_words(rp, tp, np, num); 1073 if (tp[num] != 0 || c0 == 0) { 1074 for (i = 0; i < num + 2; i++) 1075 vp[i] = 0; 1076 return 1; 1077 } 1078 } 1079 for (i = 0; i < num; i++) 1080 rp[i] = tp[i], vp[i] = 0; 1081 vp[num] = 0; 1082 vp[num + 1] = 0; 1083 return 1; 1084} 1085# else 1086int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1087 const BN_ULONG *np, const BN_ULONG *n0, int num) 1088{ 1089 return 0; 1090} 1091# endif /* OPENSSL_BN_ASM_MONT */ 1092# endif 1093 1094#endif /* !BN_MUL_COMBA */ 1095