/* x86_64-gcc.c, revision 296341 */
1#include "../bn_lcl.h"
2#if !(defined(__GNUC__) && __GNUC__>=2)
3# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
4#else
5/*-
6 * x86_64 BIGNUM accelerator version 0.1, December 2002.
7 *
8 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * project.
10 *
11 * Rights for redistribution and usage in source and binary forms are
12 * granted according to the OpenSSL license. Warranty of any kind is
13 * disclaimed.
14 *
15 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16 *    versions, like 1.0...
17 * A. Well, that's because this code is basically a quick-n-dirty
18 *    proof-of-concept hack. As you can see it's implemented with
19 *    inline assembler, which means that you're bound to GCC and that
20 *    there might be enough room for further improvement.
21 *
22 * Q. Why inline assembler?
23 * A. x86_64 features own ABI which I'm not familiar with. This is
24 *    why I decided to let the compiler take care of subroutine
25 *    prologue/epilogue as well as register allocation. For reference.
26 *    Win64 implements different ABI for AMD64, different from Linux.
27 *
28 * Q. How much faster does it get?
29 * A. 'apps/openssl speed rsa dsa' output with no-asm:
30 *
31 *                        sign    verify    sign/s verify/s
32 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
33 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
34 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
35 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
36 *                        sign    verify    sign/s verify/s
37 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
38 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
39 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
40 *
41 *    'apps/openssl speed rsa dsa' output with this module:
42 *
43 *                        sign    verify    sign/s verify/s
44 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
45 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
46 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
47 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
48 *                        sign    verify    sign/s verify/s
49 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
50 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
51 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
52 *
53 *    For the reference. IA-32 assembler implementation performs
54 *    very much like 64-bit code compiled with no-asm on the same
55 *    machine.
56 */
57
/*
 * BN_ULONG must be a 64-bit type matching the x86_64 general-purpose
 * registers: 'long' is only 32 bits on Win64 (LLP64), hence the
 * 'long long' variant there.
 */
# ifdef _WIN64
#  define BN_ULONG unsigned long long
# else
#  define BN_ULONG unsigned long
# endif

/* Drop any earlier definitions so the inline-asm versions below win. */
# undef mul
# undef mul_add
# undef sqr
67
68/*-
 * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
70 * "g"(0)               let the compiler to decide where does it
71 *                      want to keep the value of zero;
72 */
/*
 * mul_add(r,a,word,carry): one word of r[] += a[] * w with carry
 * propagation.  First mulq computes high:low = a * word; the second
 * asm does carry += low (any overflow captured into high via adcq);
 * the third adds that sum into r in memory, again folding the carry
 * into high; finally high becomes the carry passed to the next word.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)
89
/*
 * mul(r,a,word,carry): one word of r[] = a[] * w + carry.  high:low =
 * a * word; then carry += low with the overflow folded into high via
 * adcq; r receives the low word of the sum and high becomes the carry
 * for the next word.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
102
/*
 * sqr(r0,r1,a): r1:r0 = a * a.  A single mulq with 'a' in %rax as
 * both multiplicand and (implicit) multiplier; the low word lands in
 * r0 (%rax), the high word in r1 (%rdx).
 */
# define sqr(r0,r1,a)                    \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");
108
109BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
110                          BN_ULONG w)
111{
112    BN_ULONG c1 = 0;
113
114    if (num <= 0)
115        return (c1);
116
117    while (num & ~3) {
118        mul_add(rp[0], ap[0], w, c1);
119        mul_add(rp[1], ap[1], w, c1);
120        mul_add(rp[2], ap[2], w, c1);
121        mul_add(rp[3], ap[3], w, c1);
122        ap += 4;
123        rp += 4;
124        num -= 4;
125    }
126    if (num) {
127        mul_add(rp[0], ap[0], w, c1);
128        if (--num == 0)
129            return c1;
130        mul_add(rp[1], ap[1], w, c1);
131        if (--num == 0)
132            return c1;
133        mul_add(rp[2], ap[2], w, c1);
134        return c1;
135    }
136
137    return (c1);
138}
139
140BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
141{
142    BN_ULONG c1 = 0;
143
144    if (num <= 0)
145        return (c1);
146
147    while (num & ~3) {
148        mul(rp[0], ap[0], w, c1);
149        mul(rp[1], ap[1], w, c1);
150        mul(rp[2], ap[2], w, c1);
151        mul(rp[3], ap[3], w, c1);
152        ap += 4;
153        rp += 4;
154        num -= 4;
155    }
156    if (num) {
157        mul(rp[0], ap[0], w, c1);
158        if (--num == 0)
159            return c1;
160        mul(rp[1], ap[1], w, c1);
161        if (--num == 0)
162            return c1;
163        mul(rp[2], ap[2], w, c1);
164    }
165    return (c1);
166}
167
168void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
169{
170    if (n <= 0)
171        return;
172
173    while (n & ~3) {
174        sqr(r[0], r[1], a[0]);
175        sqr(r[2], r[3], a[1]);
176        sqr(r[4], r[5], a[2]);
177        sqr(r[6], r[7], a[3]);
178        a += 4;
179        r += 8;
180        n -= 4;
181    }
182    if (n) {
183        sqr(r[0], r[1], a[0]);
184        if (--n == 0)
185            return;
186        sqr(r[2], r[3], a[1]);
187        if (--n == 0)
188            return;
189        sqr(r[4], r[5], a[2]);
190    }
191}
192
193BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
194{
195    BN_ULONG ret, waste;
196
197 asm("divq      %4":"=a"(ret), "=d"(waste)
198 :     "a"(l), "d"(h), "g"(d)
199 :     "cc");
200
201    return ret;
202}
203
/*
 * rp[i] = ap[i] + bp[i] with carry propagation for i in [0,n);
 * returns the final carry (0 or 1).
 *
 * The opening subq zeroes the index register (%2) and clears CF so
 * the first adcq is a plain add.  n is tied to the count register
 * ("+c") for the 'loop' instruction; leaq bumps the index without
 * touching the carry flag.  The closing sbbq turns the carry flag
 * into 0/-1 in ret, which is masked to a single bit on return.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret = 0, i = 0;

    if (n <= 0)
        return 0;

    asm volatile ("       subq    %2,%2           \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       adcq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
                  "       leaq    1(%2),%2        \n"
                  "       loop    1b              \n"
                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
                  "=&r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
226
227# ifndef SIMICS
/*
 * rp[i] = ap[i] - bp[i] with borrow propagation for i in [0,n);
 * returns the final borrow (0 or 1).
 *
 * Identical structure to bn_add_words but with sbbq: subq zeroes the
 * index and clears CF, n is tied to the count register for 'loop',
 * leaq advances the index without disturbing the borrow flag, and
 * the final sbbq materialises the borrow as 0/-1 in ret, masked to
 * one bit on return.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret = 0, i = 0;

    if (n <= 0)
        return 0;

    asm volatile ("       subq    %2,%2           \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       sbbq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
                  "       leaq    1(%2),%2        \n"
                  "       loop    1b              \n"
                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
                  "=&r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
250# else
251/* Simics 1.4<7 has buggy sbbq:-( */
252#  define BN_MASK2 0xffffffffffffffffL
253BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
254{
255    BN_ULONG t1, t2;
256    int c = 0;
257
258    if (n <= 0)
259        return ((BN_ULONG)0);
260
261    for (;;) {
262        t1 = a[0];
263        t2 = b[0];
264        r[0] = (t1 - t2 - c) & BN_MASK2;
265        if (t1 != t2)
266            c = (t1 < t2);
267        if (--n <= 0)
268            break;
269
270        t1 = a[1];
271        t2 = b[1];
272        r[1] = (t1 - t2 - c) & BN_MASK2;
273        if (t1 != t2)
274            c = (t1 < t2);
275        if (--n <= 0)
276            break;
277
278        t1 = a[2];
279        t2 = b[2];
280        r[2] = (t1 - t2 - c) & BN_MASK2;
281        if (t1 != t2)
282            c = (t1 < t2);
283        if (--n <= 0)
284            break;
285
286        t1 = a[3];
287        t2 = b[3];
288        r[3] = (t1 - t2 - c) & BN_MASK2;
289        if (t1 != t2)
290            c = (t1 < t2);
291        if (--n <= 0)
292            break;
293
294        a += 4;
295        b += 4;
296        r += 4;
297    }
298    return (c);
299}
300# endif
301
302/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
303/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
304/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
305/*
306 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
307 * c=(c2,c1,c0)
308 */
309
310/*
311 * Keep in mind that carrying into high part of multiplication result
312 * can not overflow, because it cannot be all-ones.
313 */
314# if 0
315/* original macros are kept for reference purposes */
/*
 * Portable version: (c2,c1,c0) += a*b using BN_UMULT_HIGH for the
 * upper product word; the (x<y)?1:0 tests recover the carries.
 */
#  define mul_add_c(a,b,c0,c1,c2) {       \
        BN_ULONG ta=(a),tb=(b);         \
        t1 = ta * tb;                   \
        t2 = BN_UMULT_HIGH(ta,tb);      \
        c0 += t1; t2 += (c0<t1)?1:0;    \
        c1 += t2; c2 += (c1<t2)?1:0;    \
        }

/*
 * Portable version: (c2,c1,c0) += 2*a*b, added as two separate
 * accumulations of a*b so each carry chain stays single-word.
 */
#  define mul_add_c2(a,b,c0,c1,c2) {      \
        BN_ULONG ta=(a),tb=(b),t0;      \
        t1 = BN_UMULT_HIGH(ta,tb);      \
        t0 = ta * tb;                   \
        c0 += t0; t2 = t1+((c0<t0)?1:0);\
        c1 += t2; c2 += (c1<t2)?1:0;    \
        c0 += t0; t1 += (c0<t0)?1:0;    \
        c1 += t1; c2 += (c1<t1)?1:0;    \
        }
333# else
/*
 * mul_add_c(a,b,c0,c1,c2): (c2,c1,c0) += a*b.  t2:t1 = a*b; then
 * c0 += t1 with the carry folded into t2; finally c1 += t2 with the
 * carry folded into c2.  Uses the function-local scratch words t1,t2.
 */
#  define mul_add_c(a,b,c0,c1,c2) do {    \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c0),"+d"(t2)     \
                : "a"(t1),"g"(0)        \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c1),"+r"(c2)     \
                : "d"(t2),"g"(0)        \
                : "cc");                \
        } while (0)
348
/*
 * sqr_add_c(a,i,c0,c1,c2): (c2,c1,c0) += a[i]^2.  Same carry chain
 * as mul_add_c but with a single squared operand in %rax.
 */
#  define sqr_add_c(a,i,c0,c1,c2) do {    \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c0),"+d"(t2)     \
                : "a"(t1),"g"(0)        \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c1),"+r"(c2)     \
                : "d"(t2),"g"(0)        \
                : "cc");                \
        } while (0)
363
/*
 * mul_add_c2(a,b,c0,c1,c2): (c2,c1,c0) += 2*a*b.  t2:t1 = a*b is
 * accumulated twice with full three-word carry propagation each time,
 * rather than doubling t2:t1 first (which could itself overflow).
 */
#  define mul_add_c2(a,b,c0,c1,c2) do {   \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
378# endif
379
/*
 * sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2*a[i]*a[j] -- the
 * doubled cross term of squaring, delegated to mul_add_c2.
 */
# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
382
/*
 * r[0..15] = a[0..7] * b[0..7], comba (column-wise) multiplication.
 * Each group below accumulates every product a[i]*b[j] with i+j == k
 * into a three-word accumulator and stores the low word in r[k]; the
 * roles of (c1,c2,c3) rotate from one column to the next, with the
 * word just stored reset to 0 to become the new top word.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;            /* scratch words used by mul_add_c */
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the top word of the accumulator is r[15] */
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
486
/*
 * r[0..7] = a[0..3] * b[0..3], comba (column-wise) multiplication.
 * Each group accumulates the products a[i]*b[j] with i+j == k into a
 * rotating three-word accumulator (c1,c2,c3) and stores the low word
 * in r[k]; see bn_mul_comba8 for the pattern at width 8.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;            /* scratch words used by mul_add_c */
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the top word of the accumulator is r[7] */
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
526
/*
 * r[0..15] = a[0..7]^2, comba (column-wise) squaring.  Column k sums
 * a[i]*a[j] with i+j == k: the diagonal term a[i]^2 (when k is even)
 * via sqr_add_c, and each distinct cross pair once but doubled via
 * sqr_add_c2.  (c1,c2,c3) is the rotating three-word accumulator,
 * exactly as in bn_mul_comba8.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t1, t2;            /* scratch words used by the macros */
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the top word of the accumulator is r[15] */
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
602
/*
 * r[0..7] = a[0..3]^2, comba (column-wise) squaring.  Column k sums
 * a[i]*a[j] with i+j == k: diagonal terms via sqr_add_c, doubled
 * cross terms via sqr_add_c2; see bn_sqr_comba8 for width 8.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t1, t2;            /* scratch words used by the macros */
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the top word of the accumulator is r[7] */
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
636#endif
637