1205128Ssimon#include "../bn_lcl.h"
2238405Sjkim#if !(defined(__GNUC__) && __GNUC__>=2)
3296341Sdelphij# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
4162911Ssimon#else
5296341Sdelphij/*-
6109998Smarkm * x86_64 BIGNUM accelerator version 0.1, December 2002.
7109998Smarkm *
8109998Smarkm * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9109998Smarkm * project.
10109998Smarkm *
11109998Smarkm * Rights for redistribution and usage in source and binary forms are
12109998Smarkm * granted according to the OpenSSL license. Warranty of any kind is
13109998Smarkm * disclaimed.
14109998Smarkm *
15109998Smarkm * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16109998Smarkm *    versions, like 1.0...
17109998Smarkm * A. Well, that's because this code is basically a quick-n-dirty
18109998Smarkm *    proof-of-concept hack. As you can see it's implemented with
19109998Smarkm *    inline assembler, which means that you're bound to GCC and that
20160814Ssimon *    there might be enough room for further improvement.
21109998Smarkm *
 * Q. Why inline assembler?
 * A. x86_64 has its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of the subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
27109998Smarkm *
28109998Smarkm * Q. How much faster does it get?
29160814Ssimon * A. 'apps/openssl speed rsa dsa' output with no-asm:
30160814Ssimon *
31296341Sdelphij *                        sign    verify    sign/s verify/s
32296341Sdelphij *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
33296341Sdelphij *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
34296341Sdelphij *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
35296341Sdelphij *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
36296341Sdelphij *                        sign    verify    sign/s verify/s
37296341Sdelphij *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
38296341Sdelphij *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
39296341Sdelphij *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
40160814Ssimon *
41160814Ssimon *    'apps/openssl speed rsa dsa' output with this module:
42160814Ssimon *
43296341Sdelphij *                        sign    verify    sign/s verify/s
44296341Sdelphij *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
45296341Sdelphij *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
46296341Sdelphij *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
47296341Sdelphij *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
48296341Sdelphij *                        sign    verify    sign/s verify/s
49296341Sdelphij *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
50296341Sdelphij *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
51296341Sdelphij *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
52160814Ssimon *
53160814Ssimon *    For the reference. IA-32 assembler implementation performs
54160814Ssimon *    very much like 64-bit code compiled with no-asm on the same
55160814Ssimon *    machine.
56109998Smarkm */
57109998Smarkm
/*
 * Word type used by every routine in this file: unsigned long long when
 * _WIN64 is defined, unsigned long otherwise.
 */
# if defined(_WIN64)
#  define BN_ULONG unsigned long long
# else
#  define BN_ULONG unsigned long
# endif

/*
 * Discard any earlier definitions of these helper macros (presumably the
 * generic ones pulled in through bn_lcl.h) before the inline-assembler
 * versions are defined below.
 */
# undef mul
# undef mul_add
# undef sqr
67205128Ssimon
/*-
 * Notes on the operand constraints used by the macros below:
 * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
 * "g"(0)               lets the compiler decide where it wants to
 *                      keep the value of zero;
 */
/*
 * mul_add(r, a, word, carry):
 *      r     += low64(a * word) + carry   (modulo 2^64)
 *      carry  = high64(a * word) plus the carries out of both additions
 * r and a are accessed as memory operands, so both must be lvalues.
 */
# define mul_add(r,a,word,carry) do {           \
        BN_ULONG prod_hi, prod_lo;              \
        /* prod_hi:prod_lo = word * a */        \
        asm ("mulq %3"                          \
                : "=a"(prod_lo), "=d"(prod_hi)  \
                : "a"(word), "m"(a)             \
                : "cc");                        \
        /* carry += prod_lo, ripple into hi */  \
        asm ("addq %2,%0\n\tadcq %3,%1"         \
                : "+r"(carry), "+d"(prod_hi)    \
                : "a"(prod_lo), "g"(0)          \
                : "cc");                        \
        /* r += carry, ripple into hi */        \
        asm ("addq %2,%0\n\tadcq %3,%1"         \
                : "+m"(r), "+d"(prod_hi)        \
                : "r"(carry), "g"(0)            \
                : "cc");                        \
        carry = prod_hi;                        \
        } while (0)
89109998Smarkm
/*
 * mul(r, a, word, carry):
 *      r      = low64(a * word) + carry   (modulo 2^64)
 *      carry  = high64(a * word) plus the carry out of that addition
 *
 * The multiplicand uses the "rm" constraint rather than "g": "g" would
 * also permit an immediate operand, and mulq has no immediate form, so a
 * constant-folded argument would produce unassemblable code.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"rm"(a)     \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
102109998Smarkm
/*
 * sqr(r0, r1, a): r1:r0 = a * a  (r0 receives the low word, r1 the high).
 *
 * Wrapped in do { } while (0), like mul() and mul_add(), so that
 * "sqr(...);" is exactly one statement and stays safe inside an unbraced
 * if/else.
 */
# define sqr(r0,r1,a) do {              \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");                \
        } while (0)
108109998Smarkm
109296341SdelphijBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
110296341Sdelphij                          BN_ULONG w)
111296341Sdelphij{
112296341Sdelphij    BN_ULONG c1 = 0;
113109998Smarkm
114296341Sdelphij    if (num <= 0)
115296341Sdelphij        return (c1);
116109998Smarkm
117296341Sdelphij    while (num & ~3) {
118296341Sdelphij        mul_add(rp[0], ap[0], w, c1);
119296341Sdelphij        mul_add(rp[1], ap[1], w, c1);
120296341Sdelphij        mul_add(rp[2], ap[2], w, c1);
121296341Sdelphij        mul_add(rp[3], ap[3], w, c1);
122296341Sdelphij        ap += 4;
123296341Sdelphij        rp += 4;
124296341Sdelphij        num -= 4;
125296341Sdelphij    }
126296341Sdelphij    if (num) {
127296341Sdelphij        mul_add(rp[0], ap[0], w, c1);
128296341Sdelphij        if (--num == 0)
129296341Sdelphij            return c1;
130296341Sdelphij        mul_add(rp[1], ap[1], w, c1);
131296341Sdelphij        if (--num == 0)
132296341Sdelphij            return c1;
133296341Sdelphij        mul_add(rp[2], ap[2], w, c1);
134296341Sdelphij        return c1;
135296341Sdelphij    }
136109998Smarkm
137296341Sdelphij    return (c1);
138296341Sdelphij}
139296341Sdelphij
140205128SsimonBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
141296341Sdelphij{
142296341Sdelphij    BN_ULONG c1 = 0;
143109998Smarkm
144296341Sdelphij    if (num <= 0)
145296341Sdelphij        return (c1);
146109998Smarkm
147296341Sdelphij    while (num & ~3) {
148296341Sdelphij        mul(rp[0], ap[0], w, c1);
149296341Sdelphij        mul(rp[1], ap[1], w, c1);
150296341Sdelphij        mul(rp[2], ap[2], w, c1);
151296341Sdelphij        mul(rp[3], ap[3], w, c1);
152296341Sdelphij        ap += 4;
153296341Sdelphij        rp += 4;
154296341Sdelphij        num -= 4;
155296341Sdelphij    }
156296341Sdelphij    if (num) {
157296341Sdelphij        mul(rp[0], ap[0], w, c1);
158296341Sdelphij        if (--num == 0)
159296341Sdelphij            return c1;
160296341Sdelphij        mul(rp[1], ap[1], w, c1);
161296341Sdelphij        if (--num == 0)
162296341Sdelphij            return c1;
163296341Sdelphij        mul(rp[2], ap[2], w, c1);
164296341Sdelphij    }
165296341Sdelphij    return (c1);
166296341Sdelphij}
167109998Smarkm
168205128Ssimonvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
169296341Sdelphij{
170296341Sdelphij    if (n <= 0)
171296341Sdelphij        return;
172109998Smarkm
173296341Sdelphij    while (n & ~3) {
174296341Sdelphij        sqr(r[0], r[1], a[0]);
175296341Sdelphij        sqr(r[2], r[3], a[1]);
176296341Sdelphij        sqr(r[4], r[5], a[2]);
177296341Sdelphij        sqr(r[6], r[7], a[3]);
178296341Sdelphij        a += 4;
179296341Sdelphij        r += 8;
180296341Sdelphij        n -= 4;
181296341Sdelphij    }
182296341Sdelphij    if (n) {
183296341Sdelphij        sqr(r[0], r[1], a[0]);
184296341Sdelphij        if (--n == 0)
185296341Sdelphij            return;
186296341Sdelphij        sqr(r[2], r[3], a[1]);
187296341Sdelphij        if (--n == 0)
188296341Sdelphij            return;
189296341Sdelphij        sqr(r[4], r[5], a[2]);
190296341Sdelphij    }
191296341Sdelphij}
192109998Smarkm
193109998SmarkmBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
194296341Sdelphij{
195296341Sdelphij    BN_ULONG ret, waste;
196109998Smarkm
197296341Sdelphij asm("divq      %4":"=a"(ret), "=d"(waste)
198296341Sdelphij :     "a"(l), "d"(h), "g"(d)
199296341Sdelphij :     "cc");
200109998Smarkm
201296341Sdelphij    return ret;
202109998Smarkm}
203109998Smarkm
204296341SdelphijBN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
205296341Sdelphij                      int n)
206296341Sdelphij{
207296341Sdelphij    BN_ULONG ret = 0, i = 0;
208109998Smarkm
209296341Sdelphij    if (n <= 0)
210296341Sdelphij        return 0;
211109998Smarkm
212296341Sdelphij    asm volatile ("       subq    %2,%2           \n"
213296341Sdelphij                  ".p2align 4                     \n"
214296341Sdelphij                  "1:     movq    (%4,%2,8),%0    \n"
215296341Sdelphij                  "       adcq    (%5,%2,8),%0    \n"
216296341Sdelphij                  "       movq    %0,(%3,%2,8)    \n"
217296341Sdelphij                  "       leaq    1(%2),%2        \n"
218296341Sdelphij                  "       loop    1b              \n"
219296341Sdelphij                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
220296341Sdelphij                  "=&r"(i)
221296341Sdelphij                  :"r"(rp), "r"(ap), "r"(bp)
222296341Sdelphij                  :"cc", "memory");
223109998Smarkm
224296341Sdelphij    return ret & 1;
225109998Smarkm}
226109998Smarkm
227296341Sdelphij# ifndef SIMICS
228296341SdelphijBN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
229296341Sdelphij                      int n)
230296341Sdelphij{
231296341Sdelphij    BN_ULONG ret = 0, i = 0;
232109998Smarkm
233296341Sdelphij    if (n <= 0)
234296341Sdelphij        return 0;
235109998Smarkm
236296341Sdelphij    asm volatile ("       subq    %2,%2           \n"
237296341Sdelphij                  ".p2align 4                     \n"
238296341Sdelphij                  "1:     movq    (%4,%2,8),%0    \n"
239296341Sdelphij                  "       sbbq    (%5,%2,8),%0    \n"
240296341Sdelphij                  "       movq    %0,(%3,%2,8)    \n"
241296341Sdelphij                  "       leaq    1(%2),%2        \n"
242296341Sdelphij                  "       loop    1b              \n"
243296341Sdelphij                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
244296341Sdelphij                  "=&r"(i)
245296341Sdelphij                  :"r"(rp), "r"(ap), "r"(bp)
246296341Sdelphij                  :"cc", "memory");
247109998Smarkm
248296341Sdelphij    return ret & 1;
249109998Smarkm}
250296341Sdelphij# else
251109998Smarkm/* Simics 1.4<7 has buggy sbbq:-( */
252296341Sdelphij#  define BN_MASK2 0xffffffffffffffffL
253109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
254296341Sdelphij{
255296341Sdelphij    BN_ULONG t1, t2;
256296341Sdelphij    int c = 0;
257109998Smarkm
258296341Sdelphij    if (n <= 0)
259296341Sdelphij        return ((BN_ULONG)0);
260109998Smarkm
261296341Sdelphij    for (;;) {
262296341Sdelphij        t1 = a[0];
263296341Sdelphij        t2 = b[0];
264296341Sdelphij        r[0] = (t1 - t2 - c) & BN_MASK2;
265296341Sdelphij        if (t1 != t2)
266296341Sdelphij            c = (t1 < t2);
267296341Sdelphij        if (--n <= 0)
268296341Sdelphij            break;
269109998Smarkm
270296341Sdelphij        t1 = a[1];
271296341Sdelphij        t2 = b[1];
272296341Sdelphij        r[1] = (t1 - t2 - c) & BN_MASK2;
273296341Sdelphij        if (t1 != t2)
274296341Sdelphij            c = (t1 < t2);
275296341Sdelphij        if (--n <= 0)
276296341Sdelphij            break;
277109998Smarkm
278296341Sdelphij        t1 = a[2];
279296341Sdelphij        t2 = b[2];
280296341Sdelphij        r[2] = (t1 - t2 - c) & BN_MASK2;
281296341Sdelphij        if (t1 != t2)
282296341Sdelphij            c = (t1 < t2);
283296341Sdelphij        if (--n <= 0)
284296341Sdelphij            break;
285109998Smarkm
286296341Sdelphij        t1 = a[3];
287296341Sdelphij        t2 = b[3];
288296341Sdelphij        r[3] = (t1 - t2 - c) & BN_MASK2;
289296341Sdelphij        if (t1 != t2)
290296341Sdelphij            c = (t1 < t2);
291296341Sdelphij        if (--n <= 0)
292296341Sdelphij            break;
293109998Smarkm
294296341Sdelphij        a += 4;
295296341Sdelphij        b += 4;
296296341Sdelphij        r += 4;
297296341Sdelphij    }
298296341Sdelphij    return (c);
299296341Sdelphij}
300296341Sdelphij# endif
301109998Smarkm
/* mul_add_c(a,b,c0,c1,c2)      -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)     -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)      -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2)   -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
309109998Smarkm
310277195Sdelphij/*
311277195Sdelphij * Keep in mind that carrying into high part of multiplication result
312277195Sdelphij * can not overflow, because it cannot be all-ones.
313277195Sdelphij */
# if 0
/* original macros are kept for reference purposes */
#  define mul_add_c(a,b,c0,c1,c2) {       \
        BN_ULONG ta=(a),tb=(b);         \
        t1 = ta * tb;                   \
        t2 = BN_UMULT_HIGH(ta,tb);      \
        c0 += t1; t2 += (c0<t1)?1:0;    \
        c1 += t2; c2 += (c1<t2)?1:0;    \
        }

#  define mul_add_c2(a,b,c0,c1,c2) {      \
        BN_ULONG ta=(a),tb=(b),t0;      \
        t1 = BN_UMULT_HIGH(ta,tb);      \
        t0 = ta * tb;                   \
        c0 += t0; t2 = t1+((c0<t0)?1:0);\
        c1 += t2; c2 += (c1<t2)?1:0;    \
        c0 += t0; t1 += (c0<t0)?1:0;    \
        c1 += t1; c2 += (c1<t1)?1:0;    \
        }
# else
/*
 * All three macros below use t1 (low product word) and t2 (high product
 * word) as scratch; the enclosing scope must declare both as BN_ULONG.
 */

/* (c2,c1,c0) += a * b; b is read as a memory operand. */
#  define mul_add_c(a,b,c0,c1,c2) do {            \
        /* t2:t1 = a * b */                     \
        asm ("mulq %3"                          \
                : "=a"(t1), "=d"(t2)            \
                : "a"(a), "m"(b)                \
                : "cc");                        \
        /* c0 += t1, carry folded into t2 */    \
        asm ("addq %2,%0\n\tadcq %3,%1"         \
                : "+r"(c0), "+d"(t2)            \
                : "a"(t1), "g"(0)               \
                : "cc");                        \
        /* c1 += t2, carry folded into c2 */    \
        asm ("addq %2,%0\n\tadcq %3,%1"         \
                : "+r"(c1), "+r"(c2)            \
                : "d"(t2), "g"(0)               \
                : "cc");                        \
        } while (0)

/* (c2,c1,c0) += a[i] * a[i]. */
#  define sqr_add_c(a,i,c0,c1,c2) do {            \
        /* t2:t1 = a[i] * a[i] */               \
        asm ("mulq %2"                          \
                : "=a"(t1), "=d"(t2)            \
                : "a"(a[i])                     \
                : "cc");                        \
        /* c0 += t1, carry folded into t2 */    \
        asm ("addq %2,%0\n\tadcq %3,%1"         \
                : "+r"(c0), "+d"(t2)            \
                : "a"(t1), "g"(0)               \
                : "cc");                        \
        /* c1 += t2, carry folded into c2 */    \
        asm ("addq %2,%0\n\tadcq %3,%1"         \
                : "+r"(c1), "+r"(c2)            \
                : "d"(t2), "g"(0)               \
                : "cc");                        \
        } while (0)

/* (c2,c1,c0) += 2 * a * b, done by adding the product twice. */
#  define mul_add_c2(a,b,c0,c1,c2) do {                   \
        /* t2:t1 = a * b */                             \
        asm ("mulq %3"                                  \
                : "=a"(t1), "=d"(t2)                    \
                : "a"(a), "m"(b)                        \
                : "cc");                                \
        /* first addition of t2:t1 */                   \
        asm ("addq %3,%0\n\tadcq %4,%1\n\tadcq %5,%2"   \
                : "+r"(c0), "+r"(c1), "+r"(c2)          \
                : "r"(t1), "r"(t2), "g"(0)              \
                : "cc");                                \
        /* second addition of t2:t1 */                  \
        asm ("addq %3,%0\n\tadcq %4,%1\n\tadcq %5,%2"   \
                : "+r"(c0), "+r"(c1), "+r"(c2)          \
                : "r"(t1), "r"(t2), "g"(0)              \
                : "cc");                                \
        } while (0)
# endif

/* (c2,c1,c0) += 2 * a[i] * a[j]. */
# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
382109998Smarkm
383109998Smarkmvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
384296341Sdelphij{
385296341Sdelphij    BN_ULONG t1, t2;
386296341Sdelphij    BN_ULONG c1, c2, c3;
387109998Smarkm
388296341Sdelphij    c1 = 0;
389296341Sdelphij    c2 = 0;
390296341Sdelphij    c3 = 0;
391296341Sdelphij    mul_add_c(a[0], b[0], c1, c2, c3);
392296341Sdelphij    r[0] = c1;
393296341Sdelphij    c1 = 0;
394296341Sdelphij    mul_add_c(a[0], b[1], c2, c3, c1);
395296341Sdelphij    mul_add_c(a[1], b[0], c2, c3, c1);
396296341Sdelphij    r[1] = c2;
397296341Sdelphij    c2 = 0;
398296341Sdelphij    mul_add_c(a[2], b[0], c3, c1, c2);
399296341Sdelphij    mul_add_c(a[1], b[1], c3, c1, c2);
400296341Sdelphij    mul_add_c(a[0], b[2], c3, c1, c2);
401296341Sdelphij    r[2] = c3;
402296341Sdelphij    c3 = 0;
403296341Sdelphij    mul_add_c(a[0], b[3], c1, c2, c3);
404296341Sdelphij    mul_add_c(a[1], b[2], c1, c2, c3);
405296341Sdelphij    mul_add_c(a[2], b[1], c1, c2, c3);
406296341Sdelphij    mul_add_c(a[3], b[0], c1, c2, c3);
407296341Sdelphij    r[3] = c1;
408296341Sdelphij    c1 = 0;
409296341Sdelphij    mul_add_c(a[4], b[0], c2, c3, c1);
410296341Sdelphij    mul_add_c(a[3], b[1], c2, c3, c1);
411296341Sdelphij    mul_add_c(a[2], b[2], c2, c3, c1);
412296341Sdelphij    mul_add_c(a[1], b[3], c2, c3, c1);
413296341Sdelphij    mul_add_c(a[0], b[4], c2, c3, c1);
414296341Sdelphij    r[4] = c2;
415296341Sdelphij    c2 = 0;
416296341Sdelphij    mul_add_c(a[0], b[5], c3, c1, c2);
417296341Sdelphij    mul_add_c(a[1], b[4], c3, c1, c2);
418296341Sdelphij    mul_add_c(a[2], b[3], c3, c1, c2);
419296341Sdelphij    mul_add_c(a[3], b[2], c3, c1, c2);
420296341Sdelphij    mul_add_c(a[4], b[1], c3, c1, c2);
421296341Sdelphij    mul_add_c(a[5], b[0], c3, c1, c2);
422296341Sdelphij    r[5] = c3;
423296341Sdelphij    c3 = 0;
424296341Sdelphij    mul_add_c(a[6], b[0], c1, c2, c3);
425296341Sdelphij    mul_add_c(a[5], b[1], c1, c2, c3);
426296341Sdelphij    mul_add_c(a[4], b[2], c1, c2, c3);
427296341Sdelphij    mul_add_c(a[3], b[3], c1, c2, c3);
428296341Sdelphij    mul_add_c(a[2], b[4], c1, c2, c3);
429296341Sdelphij    mul_add_c(a[1], b[5], c1, c2, c3);
430296341Sdelphij    mul_add_c(a[0], b[6], c1, c2, c3);
431296341Sdelphij    r[6] = c1;
432296341Sdelphij    c1 = 0;
433296341Sdelphij    mul_add_c(a[0], b[7], c2, c3, c1);
434296341Sdelphij    mul_add_c(a[1], b[6], c2, c3, c1);
435296341Sdelphij    mul_add_c(a[2], b[5], c2, c3, c1);
436296341Sdelphij    mul_add_c(a[3], b[4], c2, c3, c1);
437296341Sdelphij    mul_add_c(a[4], b[3], c2, c3, c1);
438296341Sdelphij    mul_add_c(a[5], b[2], c2, c3, c1);
439296341Sdelphij    mul_add_c(a[6], b[1], c2, c3, c1);
440296341Sdelphij    mul_add_c(a[7], b[0], c2, c3, c1);
441296341Sdelphij    r[7] = c2;
442296341Sdelphij    c2 = 0;
443296341Sdelphij    mul_add_c(a[7], b[1], c3, c1, c2);
444296341Sdelphij    mul_add_c(a[6], b[2], c3, c1, c2);
445296341Sdelphij    mul_add_c(a[5], b[3], c3, c1, c2);
446296341Sdelphij    mul_add_c(a[4], b[4], c3, c1, c2);
447296341Sdelphij    mul_add_c(a[3], b[5], c3, c1, c2);
448296341Sdelphij    mul_add_c(a[2], b[6], c3, c1, c2);
449296341Sdelphij    mul_add_c(a[1], b[7], c3, c1, c2);
450296341Sdelphij    r[8] = c3;
451296341Sdelphij    c3 = 0;
452296341Sdelphij    mul_add_c(a[2], b[7], c1, c2, c3);
453296341Sdelphij    mul_add_c(a[3], b[6], c1, c2, c3);
454296341Sdelphij    mul_add_c(a[4], b[5], c1, c2, c3);
455296341Sdelphij    mul_add_c(a[5], b[4], c1, c2, c3);
456296341Sdelphij    mul_add_c(a[6], b[3], c1, c2, c3);
457296341Sdelphij    mul_add_c(a[7], b[2], c1, c2, c3);
458296341Sdelphij    r[9] = c1;
459296341Sdelphij    c1 = 0;
460296341Sdelphij    mul_add_c(a[7], b[3], c2, c3, c1);
461296341Sdelphij    mul_add_c(a[6], b[4], c2, c3, c1);
462296341Sdelphij    mul_add_c(a[5], b[5], c2, c3, c1);
463296341Sdelphij    mul_add_c(a[4], b[6], c2, c3, c1);
464296341Sdelphij    mul_add_c(a[3], b[7], c2, c3, c1);
465296341Sdelphij    r[10] = c2;
466296341Sdelphij    c2 = 0;
467296341Sdelphij    mul_add_c(a[4], b[7], c3, c1, c2);
468296341Sdelphij    mul_add_c(a[5], b[6], c3, c1, c2);
469296341Sdelphij    mul_add_c(a[6], b[5], c3, c1, c2);
470296341Sdelphij    mul_add_c(a[7], b[4], c3, c1, c2);
471296341Sdelphij    r[11] = c3;
472296341Sdelphij    c3 = 0;
473296341Sdelphij    mul_add_c(a[7], b[5], c1, c2, c3);
474296341Sdelphij    mul_add_c(a[6], b[6], c1, c2, c3);
475296341Sdelphij    mul_add_c(a[5], b[7], c1, c2, c3);
476296341Sdelphij    r[12] = c1;
477296341Sdelphij    c1 = 0;
478296341Sdelphij    mul_add_c(a[6], b[7], c2, c3, c1);
479296341Sdelphij    mul_add_c(a[7], b[6], c2, c3, c1);
480296341Sdelphij    r[13] = c2;
481296341Sdelphij    c2 = 0;
482296341Sdelphij    mul_add_c(a[7], b[7], c3, c1, c2);
483296341Sdelphij    r[14] = c3;
484296341Sdelphij    r[15] = c1;
485296341Sdelphij}
486109998Smarkm
487109998Smarkmvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
488296341Sdelphij{
489296341Sdelphij    BN_ULONG t1, t2;
490296341Sdelphij    BN_ULONG c1, c2, c3;
491109998Smarkm
492296341Sdelphij    c1 = 0;
493296341Sdelphij    c2 = 0;
494296341Sdelphij    c3 = 0;
495296341Sdelphij    mul_add_c(a[0], b[0], c1, c2, c3);
496296341Sdelphij    r[0] = c1;
497296341Sdelphij    c1 = 0;
498296341Sdelphij    mul_add_c(a[0], b[1], c2, c3, c1);
499296341Sdelphij    mul_add_c(a[1], b[0], c2, c3, c1);
500296341Sdelphij    r[1] = c2;
501296341Sdelphij    c2 = 0;
502296341Sdelphij    mul_add_c(a[2], b[0], c3, c1, c2);
503296341Sdelphij    mul_add_c(a[1], b[1], c3, c1, c2);
504296341Sdelphij    mul_add_c(a[0], b[2], c3, c1, c2);
505296341Sdelphij    r[2] = c3;
506296341Sdelphij    c3 = 0;
507296341Sdelphij    mul_add_c(a[0], b[3], c1, c2, c3);
508296341Sdelphij    mul_add_c(a[1], b[2], c1, c2, c3);
509296341Sdelphij    mul_add_c(a[2], b[1], c1, c2, c3);
510296341Sdelphij    mul_add_c(a[3], b[0], c1, c2, c3);
511296341Sdelphij    r[3] = c1;
512296341Sdelphij    c1 = 0;
513296341Sdelphij    mul_add_c(a[3], b[1], c2, c3, c1);
514296341Sdelphij    mul_add_c(a[2], b[2], c2, c3, c1);
515296341Sdelphij    mul_add_c(a[1], b[3], c2, c3, c1);
516296341Sdelphij    r[4] = c2;
517296341Sdelphij    c2 = 0;
518296341Sdelphij    mul_add_c(a[2], b[3], c3, c1, c2);
519296341Sdelphij    mul_add_c(a[3], b[2], c3, c1, c2);
520296341Sdelphij    r[5] = c3;
521296341Sdelphij    c3 = 0;
522296341Sdelphij    mul_add_c(a[3], b[3], c1, c2, c3);
523296341Sdelphij    r[6] = c1;
524296341Sdelphij    r[7] = c2;
525296341Sdelphij}
526109998Smarkm
527205128Ssimonvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
528296341Sdelphij{
529296341Sdelphij    BN_ULONG t1, t2;
530296341Sdelphij    BN_ULONG c1, c2, c3;
531109998Smarkm
532296341Sdelphij    c1 = 0;
533296341Sdelphij    c2 = 0;
534296341Sdelphij    c3 = 0;
535296341Sdelphij    sqr_add_c(a, 0, c1, c2, c3);
536296341Sdelphij    r[0] = c1;
537296341Sdelphij    c1 = 0;
538296341Sdelphij    sqr_add_c2(a, 1, 0, c2, c3, c1);
539296341Sdelphij    r[1] = c2;
540296341Sdelphij    c2 = 0;
541296341Sdelphij    sqr_add_c(a, 1, c3, c1, c2);
542296341Sdelphij    sqr_add_c2(a, 2, 0, c3, c1, c2);
543296341Sdelphij    r[2] = c3;
544296341Sdelphij    c3 = 0;
545296341Sdelphij    sqr_add_c2(a, 3, 0, c1, c2, c3);
546296341Sdelphij    sqr_add_c2(a, 2, 1, c1, c2, c3);
547296341Sdelphij    r[3] = c1;
548296341Sdelphij    c1 = 0;
549296341Sdelphij    sqr_add_c(a, 2, c2, c3, c1);
550296341Sdelphij    sqr_add_c2(a, 3, 1, c2, c3, c1);
551296341Sdelphij    sqr_add_c2(a, 4, 0, c2, c3, c1);
552296341Sdelphij    r[4] = c2;
553296341Sdelphij    c2 = 0;
554296341Sdelphij    sqr_add_c2(a, 5, 0, c3, c1, c2);
555296341Sdelphij    sqr_add_c2(a, 4, 1, c3, c1, c2);
556296341Sdelphij    sqr_add_c2(a, 3, 2, c3, c1, c2);
557296341Sdelphij    r[5] = c3;
558296341Sdelphij    c3 = 0;
559296341Sdelphij    sqr_add_c(a, 3, c1, c2, c3);
560296341Sdelphij    sqr_add_c2(a, 4, 2, c1, c2, c3);
561296341Sdelphij    sqr_add_c2(a, 5, 1, c1, c2, c3);
562296341Sdelphij    sqr_add_c2(a, 6, 0, c1, c2, c3);
563296341Sdelphij    r[6] = c1;
564296341Sdelphij    c1 = 0;
565296341Sdelphij    sqr_add_c2(a, 7, 0, c2, c3, c1);
566296341Sdelphij    sqr_add_c2(a, 6, 1, c2, c3, c1);
567296341Sdelphij    sqr_add_c2(a, 5, 2, c2, c3, c1);
568296341Sdelphij    sqr_add_c2(a, 4, 3, c2, c3, c1);
569296341Sdelphij    r[7] = c2;
570296341Sdelphij    c2 = 0;
571296341Sdelphij    sqr_add_c(a, 4, c3, c1, c2);
572296341Sdelphij    sqr_add_c2(a, 5, 3, c3, c1, c2);
573296341Sdelphij    sqr_add_c2(a, 6, 2, c3, c1, c2);
574296341Sdelphij    sqr_add_c2(a, 7, 1, c3, c1, c2);
575296341Sdelphij    r[8] = c3;
576296341Sdelphij    c3 = 0;
577296341Sdelphij    sqr_add_c2(a, 7, 2, c1, c2, c3);
578296341Sdelphij    sqr_add_c2(a, 6, 3, c1, c2, c3);
579296341Sdelphij    sqr_add_c2(a, 5, 4, c1, c2, c3);
580296341Sdelphij    r[9] = c1;
581296341Sdelphij    c1 = 0;
582296341Sdelphij    sqr_add_c(a, 5, c2, c3, c1);
583296341Sdelphij    sqr_add_c2(a, 6, 4, c2, c3, c1);
584296341Sdelphij    sqr_add_c2(a, 7, 3, c2, c3, c1);
585296341Sdelphij    r[10] = c2;
586296341Sdelphij    c2 = 0;
587296341Sdelphij    sqr_add_c2(a, 7, 4, c3, c1, c2);
588296341Sdelphij    sqr_add_c2(a, 6, 5, c3, c1, c2);
589296341Sdelphij    r[11] = c3;
590296341Sdelphij    c3 = 0;
591296341Sdelphij    sqr_add_c(a, 6, c1, c2, c3);
592296341Sdelphij    sqr_add_c2(a, 7, 5, c1, c2, c3);
593296341Sdelphij    r[12] = c1;
594296341Sdelphij    c1 = 0;
595296341Sdelphij    sqr_add_c2(a, 7, 6, c2, c3, c1);
596296341Sdelphij    r[13] = c2;
597296341Sdelphij    c2 = 0;
598296341Sdelphij    sqr_add_c(a, 7, c3, c1, c2);
599296341Sdelphij    r[14] = c3;
600296341Sdelphij    r[15] = c1;
601296341Sdelphij}
602109998Smarkm
603205128Ssimonvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
604296341Sdelphij{
605296341Sdelphij    BN_ULONG t1, t2;
606296341Sdelphij    BN_ULONG c1, c2, c3;
607109998Smarkm
608296341Sdelphij    c1 = 0;
609296341Sdelphij    c2 = 0;
610296341Sdelphij    c3 = 0;
611296341Sdelphij    sqr_add_c(a, 0, c1, c2, c3);
612296341Sdelphij    r[0] = c1;
613296341Sdelphij    c1 = 0;
614296341Sdelphij    sqr_add_c2(a, 1, 0, c2, c3, c1);
615296341Sdelphij    r[1] = c2;
616296341Sdelphij    c2 = 0;
617296341Sdelphij    sqr_add_c(a, 1, c3, c1, c2);
618296341Sdelphij    sqr_add_c2(a, 2, 0, c3, c1, c2);
619296341Sdelphij    r[2] = c3;
620296341Sdelphij    c3 = 0;
621296341Sdelphij    sqr_add_c2(a, 3, 0, c1, c2, c3);
622296341Sdelphij    sqr_add_c2(a, 2, 1, c1, c2, c3);
623296341Sdelphij    r[3] = c1;
624296341Sdelphij    c1 = 0;
625296341Sdelphij    sqr_add_c(a, 2, c2, c3, c1);
626296341Sdelphij    sqr_add_c2(a, 3, 1, c2, c3, c1);
627296341Sdelphij    r[4] = c2;
628296341Sdelphij    c2 = 0;
629296341Sdelphij    sqr_add_c2(a, 3, 2, c3, c1, c2);
630296341Sdelphij    r[5] = c3;
631296341Sdelphij    c3 = 0;
632296341Sdelphij    sqr_add_c(a, 3, c1, c2, c3);
633296341Sdelphij    r[6] = c1;
634296341Sdelphij    r[7] = c2;
635296341Sdelphij}
636162911Ssimon#endif
637