1238384Sjkim/* crypto/ec/ecp_nistp256.c */
2238384Sjkim/*
3238384Sjkim * Written by Adam Langley (Google) for the OpenSSL project
4238384Sjkim */
5238384Sjkim/* Copyright 2011 Google Inc.
6238384Sjkim *
7238384Sjkim * Licensed under the Apache License, Version 2.0 (the "License");
8238384Sjkim *
9238384Sjkim * you may not use this file except in compliance with the License.
10238384Sjkim * You may obtain a copy of the License at
11238384Sjkim *
12238384Sjkim *     http://www.apache.org/licenses/LICENSE-2.0
13238384Sjkim *
14238384Sjkim *  Unless required by applicable law or agreed to in writing, software
15238384Sjkim *  distributed under the License is distributed on an "AS IS" BASIS,
16238384Sjkim *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17238384Sjkim *  See the License for the specific language governing permissions and
18238384Sjkim *  limitations under the License.
19238384Sjkim */
20238384Sjkim
21238384Sjkim/*
22238384Sjkim * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
23238384Sjkim *
24238384Sjkim * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25238384Sjkim * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26238384Sjkim * work which got its smarts from Daniel J. Bernstein's work on the same.
27238384Sjkim */
28238384Sjkim
29238384Sjkim#include <openssl/opensslconf.h>
30238384Sjkim#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31238384Sjkim
32238384Sjkim#ifndef OPENSSL_SYS_VMS
33238384Sjkim#include <stdint.h>
34238384Sjkim#else
35238384Sjkim#include <inttypes.h>
36238384Sjkim#endif
37238384Sjkim
38238384Sjkim#include <string.h>
39238384Sjkim#include <openssl/err.h>
40238384Sjkim#include "ec_lcl.h"
41238384Sjkim
42238384Sjkim#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43238384Sjkim  /* even with gcc, the typedef won't work for 32-bit platforms */
44238384Sjkim  typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
45238384Sjkim  typedef __int128_t int128_t;
46238384Sjkim#else
47238384Sjkim  #error "Need GCC 3.1 or later to define type uint128_t"
48238384Sjkim#endif
49238384Sjkim
50238384Sjkimtypedef uint8_t u8;
51238384Sjkimtypedef uint32_t u32;
52238384Sjkimtypedef uint64_t u64;
53238384Sjkimtypedef int64_t s64;
54238384Sjkim
55238384Sjkim/* The underlying field.
56238384Sjkim *
57238384Sjkim * P256 operates over GF(2^256-2^224+2^192+2^96-1). We can serialise an element
58238384Sjkim * of this field into 32 bytes. We call this an felem_bytearray. */
59238384Sjkim
60238384Sjkimtypedef u8 felem_bytearray[32];
61238384Sjkim
62238384Sjkim/* These are the parameters of P256, taken from FIPS 186-3, page 86. These
63238384Sjkim * values are big-endian. */
64238384Sjkimstatic const felem_bytearray nistp256_curve_params[5] = {
65238384Sjkim	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,       /* p */
66238384Sjkim	 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
67238384Sjkim	 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
68238384Sjkim	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
69238384Sjkim	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,       /* a = -3 */
70238384Sjkim	 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
71238384Sjkim	 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
72238384Sjkim	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},      /* b */
73238384Sjkim	{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
74238384Sjkim	 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
75238384Sjkim	 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
76238384Sjkim	 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
77238384Sjkim	{0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47,       /* x */
78238384Sjkim	 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
79238384Sjkim	 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
80238384Sjkim	 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
81238384Sjkim	{0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b,       /* y */
82238384Sjkim	 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
83238384Sjkim	 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
84238384Sjkim	 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
85238384Sjkim};
86238384Sjkim
87238384Sjkim/* The representation of field elements.
88238384Sjkim * ------------------------------------
89238384Sjkim *
90238384Sjkim * We represent field elements with either four 128-bit values, eight 128-bit
91238384Sjkim * values, or four 64-bit values. The field element represented is:
92238384Sjkim *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
93238384Sjkim * or:
94238384Sjkim *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
95238384Sjkim *
96238384Sjkim * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
97238384Sjkim * apart, but are 128-bits wide, the most significant bits of each limb overlap
98238384Sjkim * with the least significant bits of the next.
99238384Sjkim *
100238384Sjkim * A field element with four limbs is an 'felem'. One with eight limbs is a
101238384Sjkim * 'longfelem'
102238384Sjkim *
103238384Sjkim * A field element with four, 64-bit values is called a 'smallfelem'. Small
104238384Sjkim * values are used as intermediate values before multiplication.
105238384Sjkim */
106238384Sjkim
107238384Sjkim#define NLIMBS 4
108238384Sjkim
109238384Sjkimtypedef uint128_t limb;
110238384Sjkimtypedef limb felem[NLIMBS];
111238384Sjkimtypedef limb longfelem[NLIMBS * 2];
112238384Sjkimtypedef u64 smallfelem[NLIMBS];
113238384Sjkim
114238384Sjkim/* This is the value of the prime as four 64-bit words, little-endian. */
115238384Sjkimstatic const u64 kPrime[4] = { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
116238384Sjkimstatic const u64 bottom63bits = 0x7ffffffffffffffful;
117238384Sjkim
118238384Sjkim/* bin32_to_felem takes a little-endian byte array and converts it into felem
119238384Sjkim * form. This assumes that the CPU is little-endian. */
120238384Sjkimstatic void bin32_to_felem(felem out, const u8 in[32])
121238384Sjkim	{
122238384Sjkim	out[0] = *((u64*) &in[0]);
123238384Sjkim	out[1] = *((u64*) &in[8]);
124238384Sjkim	out[2] = *((u64*) &in[16]);
125238384Sjkim	out[3] = *((u64*) &in[24]);
126238384Sjkim	}
127238384Sjkim
128238384Sjkim/* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian,
129238384Sjkim * 32 byte array. This assumes that the CPU is little-endian. */
130238384Sjkimstatic void smallfelem_to_bin32(u8 out[32], const smallfelem in)
131238384Sjkim	{
132238384Sjkim	*((u64*) &out[0]) = in[0];
133238384Sjkim	*((u64*) &out[8]) = in[1];
134238384Sjkim	*((u64*) &out[16]) = in[2];
135238384Sjkim	*((u64*) &out[24]) = in[3];
136238384Sjkim	}
137238384Sjkim
138238384Sjkim/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
139238384Sjkimstatic void flip_endian(u8 *out, const u8 *in, unsigned len)
140238384Sjkim	{
141238384Sjkim	unsigned i;
142238384Sjkim	for (i = 0; i < len; ++i)
143238384Sjkim		out[i] = in[len-1-i];
144238384Sjkim	}
145238384Sjkim
146238384Sjkim/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
147238384Sjkimstatic int BN_to_felem(felem out, const BIGNUM *bn)
148238384Sjkim	{
149238384Sjkim	felem_bytearray b_in;
150238384Sjkim	felem_bytearray b_out;
151238384Sjkim	unsigned num_bytes;
152238384Sjkim
153238384Sjkim	/* BN_bn2bin eats leading zeroes */
154238384Sjkim	memset(b_out, 0, sizeof b_out);
155238384Sjkim	num_bytes = BN_num_bytes(bn);
156238384Sjkim	if (num_bytes > sizeof b_out)
157238384Sjkim		{
158238384Sjkim		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
159238384Sjkim		return 0;
160238384Sjkim		}
161238384Sjkim	if (BN_is_negative(bn))
162238384Sjkim		{
163238384Sjkim		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
164238384Sjkim		return 0;
165238384Sjkim		}
166238384Sjkim	num_bytes = BN_bn2bin(bn, b_in);
167238384Sjkim	flip_endian(b_out, b_in, num_bytes);
168238384Sjkim	bin32_to_felem(out, b_out);
169238384Sjkim	return 1;
170238384Sjkim	}
171238384Sjkim
172238384Sjkim/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
173238384Sjkimstatic BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
174238384Sjkim	{
175238384Sjkim	felem_bytearray b_in, b_out;
176238384Sjkim	smallfelem_to_bin32(b_in, in);
177238384Sjkim	flip_endian(b_out, b_in, sizeof b_out);
178238384Sjkim	return BN_bin2bn(b_out, sizeof b_out, out);
179238384Sjkim	}
180238384Sjkim
181238384Sjkim
182238384Sjkim/* Field operations
183238384Sjkim * ---------------- */
184238384Sjkim
185238384Sjkimstatic void smallfelem_one(smallfelem out)
186238384Sjkim	{
187238384Sjkim	out[0] = 1;
188238384Sjkim	out[1] = 0;
189238384Sjkim	out[2] = 0;
190238384Sjkim	out[3] = 0;
191238384Sjkim	}
192238384Sjkim
193238384Sjkimstatic void smallfelem_assign(smallfelem out, const smallfelem in)
194238384Sjkim	{
195238384Sjkim	out[0] = in[0];
196238384Sjkim	out[1] = in[1];
197238384Sjkim	out[2] = in[2];
198238384Sjkim	out[3] = in[3];
199238384Sjkim	}
200238384Sjkim
201238384Sjkimstatic void felem_assign(felem out, const felem in)
202238384Sjkim	{
203238384Sjkim	out[0] = in[0];
204238384Sjkim	out[1] = in[1];
205238384Sjkim	out[2] = in[2];
206238384Sjkim	out[3] = in[3];
207238384Sjkim	}
208238384Sjkim
209238384Sjkim/* felem_sum sets out = out + in. */
210238384Sjkimstatic void felem_sum(felem out, const felem in)
211238384Sjkim	{
212238384Sjkim	out[0] += in[0];
213238384Sjkim	out[1] += in[1];
214238384Sjkim	out[2] += in[2];
215238384Sjkim	out[3] += in[3];
216238384Sjkim	}
217238384Sjkim
218238384Sjkim/* felem_small_sum sets out = out + in. */
219238384Sjkimstatic void felem_small_sum(felem out, const smallfelem in)
220238384Sjkim	{
221238384Sjkim	out[0] += in[0];
222238384Sjkim	out[1] += in[1];
223238384Sjkim	out[2] += in[2];
224238384Sjkim	out[3] += in[3];
225238384Sjkim	}
226238384Sjkim
227238384Sjkim/* felem_scalar sets out = out * scalar */
228238384Sjkimstatic void felem_scalar(felem out, const u64 scalar)
229238384Sjkim	{
230238384Sjkim	out[0] *= scalar;
231238384Sjkim	out[1] *= scalar;
232238384Sjkim	out[2] *= scalar;
233238384Sjkim	out[3] *= scalar;
234238384Sjkim	}
235238384Sjkim
236238384Sjkim/* longfelem_scalar sets out = out * scalar */
237238384Sjkimstatic void longfelem_scalar(longfelem out, const u64 scalar)
238238384Sjkim	{
239238384Sjkim	out[0] *= scalar;
240238384Sjkim	out[1] *= scalar;
241238384Sjkim	out[2] *= scalar;
242238384Sjkim	out[3] *= scalar;
243238384Sjkim	out[4] *= scalar;
244238384Sjkim	out[5] *= scalar;
245238384Sjkim	out[6] *= scalar;
246238384Sjkim	out[7] *= scalar;
247238384Sjkim	}
248238384Sjkim
249238384Sjkim#define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
250238384Sjkim#define two105 (((limb)1) << 105)
251238384Sjkim#define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
252238384Sjkim
253238384Sjkim/* zero105 is 0 mod p */
254238384Sjkimstatic const felem zero105 = { two105m41m9, two105, two105m41p9, two105m41p9 };
255238384Sjkim
256238384Sjkim/* smallfelem_neg sets |out| to |-small|
257238384Sjkim * On exit:
258238384Sjkim *   out[i] < out[i] + 2^105
259238384Sjkim */
260238384Sjkimstatic void smallfelem_neg(felem out, const smallfelem small)
261238384Sjkim	{
262238384Sjkim	/* In order to prevent underflow, we subtract from 0 mod p. */
263238384Sjkim	out[0] = zero105[0] - small[0];
264238384Sjkim	out[1] = zero105[1] - small[1];
265238384Sjkim	out[2] = zero105[2] - small[2];
266238384Sjkim	out[3] = zero105[3] - small[3];
267238384Sjkim	}
268238384Sjkim
269238384Sjkim/* felem_diff subtracts |in| from |out|
270238384Sjkim * On entry:
271238384Sjkim *   in[i] < 2^104
272238384Sjkim * On exit:
273238384Sjkim *   out[i] < out[i] + 2^105
274238384Sjkim */
275238384Sjkimstatic void felem_diff(felem out, const felem in)
276238384Sjkim	{
277238384Sjkim	/* In order to prevent underflow, we add 0 mod p before subtracting. */
278238384Sjkim	out[0] += zero105[0];
279238384Sjkim	out[1] += zero105[1];
280238384Sjkim	out[2] += zero105[2];
281238384Sjkim	out[3] += zero105[3];
282238384Sjkim
283238384Sjkim	out[0] -= in[0];
284238384Sjkim	out[1] -= in[1];
285238384Sjkim	out[2] -= in[2];
286238384Sjkim	out[3] -= in[3];
287238384Sjkim	}
288238384Sjkim
289238384Sjkim#define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
290238384Sjkim#define two107 (((limb)1) << 107)
291238384Sjkim#define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
292238384Sjkim
293238384Sjkim/* zero107 is 0 mod p */
294238384Sjkimstatic const felem zero107 = { two107m43m11, two107, two107m43p11, two107m43p11 };
295238384Sjkim
296238384Sjkim/* An alternative felem_diff for larger inputs |in|
297238384Sjkim * felem_diff_zero107 subtracts |in| from |out|
298238384Sjkim * On entry:
299238384Sjkim *   in[i] < 2^106
300238384Sjkim * On exit:
301238384Sjkim *   out[i] < out[i] + 2^107
302238384Sjkim */
303238384Sjkimstatic void felem_diff_zero107(felem out, const felem in)
304238384Sjkim	{
305238384Sjkim	/* In order to prevent underflow, we add 0 mod p before subtracting. */
306238384Sjkim	out[0] += zero107[0];
307238384Sjkim	out[1] += zero107[1];
308238384Sjkim	out[2] += zero107[2];
309238384Sjkim	out[3] += zero107[3];
310238384Sjkim
311238384Sjkim	out[0] -= in[0];
312238384Sjkim	out[1] -= in[1];
313238384Sjkim	out[2] -= in[2];
314238384Sjkim	out[3] -= in[3];
315238384Sjkim	}
316238384Sjkim
317238384Sjkim/* longfelem_diff subtracts |in| from |out|
318238384Sjkim * On entry:
319238384Sjkim *   in[i] < 7*2^67
320238384Sjkim * On exit:
321238384Sjkim *   out[i] < out[i] + 2^70 + 2^40
322238384Sjkim */
323238384Sjkimstatic void longfelem_diff(longfelem out, const longfelem in)
324238384Sjkim	{
325238384Sjkim	static const limb two70m8p6 = (((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6);
326238384Sjkim	static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40);
327238384Sjkim	static const limb two70 = (((limb)1) << 70);
328238384Sjkim	static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) - (((limb)1) << 38) + (((limb)1) << 6);
329238384Sjkim	static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6);
330238384Sjkim
331238384Sjkim	/* add 0 mod p to avoid underflow */
332238384Sjkim	out[0] += two70m8p6;
333238384Sjkim	out[1] += two70p40;
334238384Sjkim	out[2] += two70;
335238384Sjkim	out[3] += two70m40m38p6;
336238384Sjkim	out[4] += two70m6;
337238384Sjkim	out[5] += two70m6;
338238384Sjkim	out[6] += two70m6;
339238384Sjkim	out[7] += two70m6;
340238384Sjkim
341238384Sjkim	/* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
342238384Sjkim	out[0] -= in[0];
343238384Sjkim	out[1] -= in[1];
344238384Sjkim	out[2] -= in[2];
345238384Sjkim	out[3] -= in[3];
346238384Sjkim	out[4] -= in[4];
347238384Sjkim	out[5] -= in[5];
348238384Sjkim	out[6] -= in[6];
349238384Sjkim	out[7] -= in[7];
350238384Sjkim	}
351238384Sjkim
352238384Sjkim#define two64m0 (((limb)1) << 64) - 1
353238384Sjkim#define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
354238384Sjkim#define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
355238384Sjkim#define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
356238384Sjkim
357238384Sjkim/* zero110 is 0 mod p */
358238384Sjkimstatic const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
359238384Sjkim
360238384Sjkim/* felem_shrink converts an felem into a smallfelem. The result isn't quite
361238384Sjkim * minimal as the value may be greater than p.
362238384Sjkim *
363238384Sjkim * On entry:
364238384Sjkim *   in[i] < 2^109
365238384Sjkim * On exit:
366238384Sjkim *   out[i] < 2^64
367238384Sjkim */
368238384Sjkimstatic void felem_shrink(smallfelem out, const felem in)
369238384Sjkim	{
370238384Sjkim	felem tmp;
371238384Sjkim	u64 a, b, mask;
372238384Sjkim	s64 high, low;
373238384Sjkim	static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
374238384Sjkim
375238384Sjkim	/* Carry 2->3 */
376238384Sjkim	tmp[3] = zero110[3] + in[3] + ((u64) (in[2] >> 64));
377238384Sjkim	/* tmp[3] < 2^110 */
378238384Sjkim
379238384Sjkim	tmp[2] = zero110[2] + (u64) in[2];
380238384Sjkim	tmp[0] = zero110[0] + in[0];
381238384Sjkim	tmp[1] = zero110[1] + in[1];
382238384Sjkim	/* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
383238384Sjkim
384238384Sjkim	/* We perform two partial reductions where we eliminate the
385238384Sjkim	 * high-word of tmp[3]. We don't update the other words till the end.
386238384Sjkim	 */
387238384Sjkim	a = tmp[3] >> 64; /* a < 2^46 */
388238384Sjkim	tmp[3] = (u64) tmp[3];
389238384Sjkim	tmp[3] -= a;
390238384Sjkim	tmp[3] += ((limb)a) << 32;
391238384Sjkim	/* tmp[3] < 2^79 */
392238384Sjkim
393238384Sjkim	b = a;
394238384Sjkim	a = tmp[3] >> 64; /* a < 2^15 */
395238384Sjkim	b += a; /* b < 2^46 + 2^15 < 2^47 */
396238384Sjkim	tmp[3] = (u64) tmp[3];
397238384Sjkim	tmp[3] -= a;
398238384Sjkim	tmp[3] += ((limb)a) << 32;
399238384Sjkim	/* tmp[3] < 2^64 + 2^47 */
400238384Sjkim
401238384Sjkim	/* This adjusts the other two words to complete the two partial
402238384Sjkim	 * reductions. */
403238384Sjkim	tmp[0] += b;
404238384Sjkim	tmp[1] -= (((limb)b) << 32);
405238384Sjkim
406238384Sjkim	/* In order to make space in tmp[3] for the carry from 2 -> 3, we
407238384Sjkim	 * conditionally subtract kPrime if tmp[3] is large enough. */
408238384Sjkim	high = tmp[3] >> 64;
409238384Sjkim	/* As tmp[3] < 2^65, high is either 1 or 0 */
410238384Sjkim	high <<= 63;
411238384Sjkim	high >>= 63;
412238384Sjkim	/* high is:
413238384Sjkim	 *   all ones   if the high word of tmp[3] is 1
414238384Sjkim	 *   all zeros  if the high word of tmp[3] if 0 */
415238384Sjkim	low = tmp[3];
416238384Sjkim	mask = low >> 63;
417238384Sjkim	/* mask is:
418238384Sjkim	 *   all ones   if the MSB of low is 1
419238384Sjkim	 *   all zeros  if the MSB of low if 0 */
420238384Sjkim	low &= bottom63bits;
421238384Sjkim	low -= kPrime3Test;
422238384Sjkim	/* if low was greater than kPrime3Test then the MSB is zero */
423238384Sjkim	low = ~low;
424238384Sjkim	low >>= 63;
425238384Sjkim	/* low is:
426238384Sjkim	 *   all ones   if low was > kPrime3Test
427238384Sjkim	 *   all zeros  if low was <= kPrime3Test */
428238384Sjkim	mask = (mask & low) | high;
429238384Sjkim	tmp[0] -= mask & kPrime[0];
430238384Sjkim	tmp[1] -= mask & kPrime[1];
431238384Sjkim	/* kPrime[2] is zero, so omitted */
432238384Sjkim	tmp[3] -= mask & kPrime[3];
433238384Sjkim	/* tmp[3] < 2**64 - 2**32 + 1 */
434238384Sjkim
435238384Sjkim	tmp[1] += ((u64) (tmp[0] >> 64)); tmp[0] = (u64) tmp[0];
436238384Sjkim	tmp[2] += ((u64) (tmp[1] >> 64)); tmp[1] = (u64) tmp[1];
437238384Sjkim	tmp[3] += ((u64) (tmp[2] >> 64)); tmp[2] = (u64) tmp[2];
438238384Sjkim	/* tmp[i] < 2^64 */
439238384Sjkim
440238384Sjkim	out[0] = tmp[0];
441238384Sjkim	out[1] = tmp[1];
442238384Sjkim	out[2] = tmp[2];
443238384Sjkim	out[3] = tmp[3];
444238384Sjkim	}
445238384Sjkim
446238384Sjkim/* smallfelem_expand converts a smallfelem to an felem */
447238384Sjkimstatic void smallfelem_expand(felem out, const smallfelem in)
448238384Sjkim	{
449238384Sjkim	out[0] = in[0];
450238384Sjkim	out[1] = in[1];
451238384Sjkim	out[2] = in[2];
452238384Sjkim	out[3] = in[3];
453238384Sjkim	}
454238384Sjkim
455238384Sjkim/* smallfelem_square sets |out| = |small|^2
456238384Sjkim * On entry:
457238384Sjkim *   small[i] < 2^64
458238384Sjkim * On exit:
459238384Sjkim *   out[i] < 7 * 2^64 < 2^67
460238384Sjkim */
461238384Sjkimstatic void smallfelem_square(longfelem out, const smallfelem small)
462238384Sjkim	{
463238384Sjkim	limb a;
464238384Sjkim	u64 high, low;
465238384Sjkim
466238384Sjkim	a = ((uint128_t) small[0]) * small[0];
467238384Sjkim	low = a;
468238384Sjkim	high = a >> 64;
469238384Sjkim	out[0] = low;
470238384Sjkim	out[1] = high;
471238384Sjkim
472238384Sjkim	a = ((uint128_t) small[0]) * small[1];
473238384Sjkim	low = a;
474238384Sjkim	high = a >> 64;
475238384Sjkim	out[1] += low;
476238384Sjkim	out[1] += low;
477238384Sjkim	out[2] = high;
478238384Sjkim
479238384Sjkim	a = ((uint128_t) small[0]) * small[2];
480238384Sjkim	low = a;
481238384Sjkim	high = a >> 64;
482238384Sjkim	out[2] += low;
483238384Sjkim	out[2] *= 2;
484238384Sjkim	out[3] = high;
485238384Sjkim
486238384Sjkim	a = ((uint128_t) small[0]) * small[3];
487238384Sjkim	low = a;
488238384Sjkim	high = a >> 64;
489238384Sjkim	out[3] += low;
490238384Sjkim	out[4] = high;
491238384Sjkim
492238384Sjkim	a = ((uint128_t) small[1]) * small[2];
493238384Sjkim	low = a;
494238384Sjkim	high = a >> 64;
495238384Sjkim	out[3] += low;
496238384Sjkim	out[3] *= 2;
497238384Sjkim	out[4] += high;
498238384Sjkim
499238384Sjkim	a = ((uint128_t) small[1]) * small[1];
500238384Sjkim	low = a;
501238384Sjkim	high = a >> 64;
502238384Sjkim	out[2] += low;
503238384Sjkim	out[3] += high;
504238384Sjkim
505238384Sjkim	a = ((uint128_t) small[1]) * small[3];
506238384Sjkim	low = a;
507238384Sjkim	high = a >> 64;
508238384Sjkim	out[4] += low;
509238384Sjkim	out[4] *= 2;
510238384Sjkim	out[5] = high;
511238384Sjkim
512238384Sjkim	a = ((uint128_t) small[2]) * small[3];
513238384Sjkim	low = a;
514238384Sjkim	high = a >> 64;
515238384Sjkim	out[5] += low;
516238384Sjkim	out[5] *= 2;
517238384Sjkim	out[6] = high;
518238384Sjkim	out[6] += high;
519238384Sjkim
520238384Sjkim	a = ((uint128_t) small[2]) * small[2];
521238384Sjkim	low = a;
522238384Sjkim	high = a >> 64;
523238384Sjkim	out[4] += low;
524238384Sjkim	out[5] += high;
525238384Sjkim
526238384Sjkim	a = ((uint128_t) small[3]) * small[3];
527238384Sjkim	low = a;
528238384Sjkim	high = a >> 64;
529238384Sjkim	out[6] += low;
530238384Sjkim	out[7] = high;
531238384Sjkim	}
532238384Sjkim
533238384Sjkim/* felem_square sets |out| = |in|^2
534238384Sjkim * On entry:
535238384Sjkim *   in[i] < 2^109
536238384Sjkim * On exit:
537238384Sjkim *   out[i] < 7 * 2^64 < 2^67
538238384Sjkim */
539238384Sjkimstatic void felem_square(longfelem out, const felem in)
540238384Sjkim	{
541238384Sjkim	u64 small[4];
542238384Sjkim	felem_shrink(small, in);
543238384Sjkim	smallfelem_square(out, small);
544238384Sjkim	}
545238384Sjkim
546238384Sjkim/* smallfelem_mul sets |out| = |small1| * |small2|
547238384Sjkim * On entry:
548238384Sjkim *   small1[i] < 2^64
549238384Sjkim *   small2[i] < 2^64
550238384Sjkim * On exit:
551238384Sjkim *   out[i] < 7 * 2^64 < 2^67
552238384Sjkim */
553238384Sjkimstatic void smallfelem_mul(longfelem out, const smallfelem small1, const smallfelem small2)
554238384Sjkim	{
555238384Sjkim	limb a;
556238384Sjkim	u64 high, low;
557238384Sjkim
558238384Sjkim	a = ((uint128_t) small1[0]) * small2[0];
559238384Sjkim	low = a;
560238384Sjkim	high = a >> 64;
561238384Sjkim	out[0] = low;
562238384Sjkim	out[1] = high;
563238384Sjkim
564238384Sjkim
565238384Sjkim	a = ((uint128_t) small1[0]) * small2[1];
566238384Sjkim	low = a;
567238384Sjkim	high = a >> 64;
568238384Sjkim	out[1] += low;
569238384Sjkim	out[2] = high;
570238384Sjkim
571238384Sjkim	a = ((uint128_t) small1[1]) * small2[0];
572238384Sjkim	low = a;
573238384Sjkim	high = a >> 64;
574238384Sjkim	out[1] += low;
575238384Sjkim	out[2] += high;
576238384Sjkim
577238384Sjkim
578238384Sjkim	a = ((uint128_t) small1[0]) * small2[2];
579238384Sjkim	low = a;
580238384Sjkim	high = a >> 64;
581238384Sjkim	out[2] += low;
582238384Sjkim	out[3] = high;
583238384Sjkim
584238384Sjkim	a = ((uint128_t) small1[1]) * small2[1];
585238384Sjkim	low = a;
586238384Sjkim	high = a >> 64;
587238384Sjkim	out[2] += low;
588238384Sjkim	out[3] += high;
589238384Sjkim
590238384Sjkim	a = ((uint128_t) small1[2]) * small2[0];
591238384Sjkim	low = a;
592238384Sjkim	high = a >> 64;
593238384Sjkim	out[2] += low;
594238384Sjkim	out[3] += high;
595238384Sjkim
596238384Sjkim
597238384Sjkim	a = ((uint128_t) small1[0]) * small2[3];
598238384Sjkim	low = a;
599238384Sjkim	high = a >> 64;
600238384Sjkim	out[3] += low;
601238384Sjkim	out[4] = high;
602238384Sjkim
603238384Sjkim	a = ((uint128_t) small1[1]) * small2[2];
604238384Sjkim	low = a;
605238384Sjkim	high = a >> 64;
606238384Sjkim	out[3] += low;
607238384Sjkim	out[4] += high;
608238384Sjkim
609238384Sjkim	a = ((uint128_t) small1[2]) * small2[1];
610238384Sjkim	low = a;
611238384Sjkim	high = a >> 64;
612238384Sjkim	out[3] += low;
613238384Sjkim	out[4] += high;
614238384Sjkim
615238384Sjkim	a = ((uint128_t) small1[3]) * small2[0];
616238384Sjkim	low = a;
617238384Sjkim	high = a >> 64;
618238384Sjkim	out[3] += low;
619238384Sjkim	out[4] += high;
620238384Sjkim
621238384Sjkim
622238384Sjkim	a = ((uint128_t) small1[1]) * small2[3];
623238384Sjkim	low = a;
624238384Sjkim	high = a >> 64;
625238384Sjkim	out[4] += low;
626238384Sjkim	out[5] = high;
627238384Sjkim
628238384Sjkim	a = ((uint128_t) small1[2]) * small2[2];
629238384Sjkim	low = a;
630238384Sjkim	high = a >> 64;
631238384Sjkim	out[4] += low;
632238384Sjkim	out[5] += high;
633238384Sjkim
634238384Sjkim	a = ((uint128_t) small1[3]) * small2[1];
635238384Sjkim	low = a;
636238384Sjkim	high = a >> 64;
637238384Sjkim	out[4] += low;
638238384Sjkim	out[5] += high;
639238384Sjkim
640238384Sjkim
641238384Sjkim	a = ((uint128_t) small1[2]) * small2[3];
642238384Sjkim	low = a;
643238384Sjkim	high = a >> 64;
644238384Sjkim	out[5] += low;
645238384Sjkim	out[6] = high;
646238384Sjkim
647238384Sjkim	a = ((uint128_t) small1[3]) * small2[2];
648238384Sjkim	low = a;
649238384Sjkim	high = a >> 64;
650238384Sjkim	out[5] += low;
651238384Sjkim	out[6] += high;
652238384Sjkim
653238384Sjkim
654238384Sjkim	a = ((uint128_t) small1[3]) * small2[3];
655238384Sjkim	low = a;
656238384Sjkim	high = a >> 64;
657238384Sjkim	out[6] += low;
658238384Sjkim	out[7] = high;
659238384Sjkim	}
660238384Sjkim
661238384Sjkim/* felem_mul sets |out| = |in1| * |in2|
662238384Sjkim * On entry:
663238384Sjkim *   in1[i] < 2^109
664238384Sjkim *   in2[i] < 2^109
665238384Sjkim * On exit:
666238384Sjkim *   out[i] < 7 * 2^64 < 2^67
667238384Sjkim */
668238384Sjkimstatic void felem_mul(longfelem out, const felem in1, const felem in2)
669238384Sjkim	{
670238384Sjkim	smallfelem small1, small2;
671238384Sjkim	felem_shrink(small1, in1);
672238384Sjkim	felem_shrink(small2, in2);
673238384Sjkim	smallfelem_mul(out, small1, small2);
674238384Sjkim	}
675238384Sjkim
676238384Sjkim/* felem_small_mul sets |out| = |small1| * |in2|
677238384Sjkim * On entry:
678238384Sjkim *   small1[i] < 2^64
679238384Sjkim *   in2[i] < 2^109
680238384Sjkim * On exit:
681238384Sjkim *   out[i] < 7 * 2^64 < 2^67
682238384Sjkim */
683238384Sjkimstatic void felem_small_mul(longfelem out, const smallfelem small1, const felem in2)
684238384Sjkim	{
685238384Sjkim	smallfelem small2;
686238384Sjkim	felem_shrink(small2, in2);
687238384Sjkim	smallfelem_mul(out, small1, small2);
688238384Sjkim	}
689238384Sjkim
690238384Sjkim#define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
691238384Sjkim#define two100 (((limb)1) << 100)
692238384Sjkim#define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
693238384Sjkim/* zero100 is 0 mod p */
694238384Sjkimstatic const felem zero100 = { two100m36m4, two100, two100m36p4, two100m36p4 };
695238384Sjkim
696238384Sjkim/* Internal function for the different flavours of felem_reduce.
697238384Sjkim * felem_reduce_ reduces the higher coefficients in[4]-in[7].
698238384Sjkim * On entry:
699238384Sjkim *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
700238384Sjkim *   out[1] >= in[7] + 2^32*in[4]
701238384Sjkim *   out[2] >= in[5] + 2^32*in[5]
702238384Sjkim *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
703238384Sjkim * On exit:
704238384Sjkim *   out[0] <= out[0] + in[4] + 2^32*in[5]
705238384Sjkim *   out[1] <= out[1] + in[5] + 2^33*in[6]
706238384Sjkim *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
707238384Sjkim *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
708238384Sjkim */
709238384Sjkimstatic void felem_reduce_(felem out, const longfelem in)
710238384Sjkim	{
711238384Sjkim	int128_t c;
712238384Sjkim	/* combine common terms from below */
713238384Sjkim	c = in[4] + (in[5] << 32);
714238384Sjkim	out[0] += c;
715238384Sjkim	out[3] -= c;
716238384Sjkim
717238384Sjkim	c = in[5] - in[7];
718238384Sjkim	out[1] += c;
719238384Sjkim	out[2] -= c;
720238384Sjkim
721238384Sjkim	/* the remaining terms */
722238384Sjkim	/* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
723238384Sjkim	out[1] -= (in[4] << 32);
724238384Sjkim	out[3] += (in[4] << 32);
725238384Sjkim
726238384Sjkim	/* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
727238384Sjkim	out[2] -= (in[5] << 32);
728238384Sjkim
729238384Sjkim	/* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
730238384Sjkim	out[0] -= in[6];
731238384Sjkim	out[0] -= (in[6] << 32);
732238384Sjkim	out[1] += (in[6] << 33);
733238384Sjkim	out[2] += (in[6] * 2);
734238384Sjkim	out[3] -= (in[6] << 32);
735238384Sjkim
736238384Sjkim	/* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
737238384Sjkim	out[0] -= in[7];
738238384Sjkim	out[0] -= (in[7] << 32);
739238384Sjkim	out[2] += (in[7] << 33);
740238384Sjkim	out[3] += (in[7] * 3);
741238384Sjkim	}
742238384Sjkim
743238384Sjkim/* felem_reduce converts a longfelem into an felem.
744238384Sjkim * To be called directly after felem_square or felem_mul.
745238384Sjkim * On entry:
746238384Sjkim *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
747238384Sjkim *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
748238384Sjkim * On exit:
749238384Sjkim *   out[i] < 2^101
750238384Sjkim */
751238384Sjkimstatic void felem_reduce(felem out, const longfelem in)
752238384Sjkim	{
753238384Sjkim	out[0] = zero100[0] + in[0];
754238384Sjkim	out[1] = zero100[1] + in[1];
755238384Sjkim	out[2] = zero100[2] + in[2];
756238384Sjkim	out[3] = zero100[3] + in[3];
757238384Sjkim
758238384Sjkim	felem_reduce_(out, in);
759238384Sjkim
760238384Sjkim	/* out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
761238384Sjkim	 * out[1] > 2^100 - 2^64 - 7*2^96 > 0
762238384Sjkim	 * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
763238384Sjkim	 * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
764238384Sjkim	 *
765238384Sjkim	 * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
766238384Sjkim	 * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
767238384Sjkim	 * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
768238384Sjkim	 * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
769238384Sjkim	 */
770238384Sjkim	}
771238384Sjkim
772238384Sjkim/* felem_reduce_zero105 converts a larger longfelem into an felem.
773238384Sjkim * On entry:
774238384Sjkim *   in[0] < 2^71
775238384Sjkim * On exit:
776238384Sjkim *   out[i] < 2^106
777238384Sjkim */
778238384Sjkimstatic void felem_reduce_zero105(felem out, const longfelem in)
779238384Sjkim	{
780238384Sjkim	out[0] = zero105[0] + in[0];
781238384Sjkim	out[1] = zero105[1] + in[1];
782238384Sjkim	out[2] = zero105[2] + in[2];
783238384Sjkim	out[3] = zero105[3] + in[3];
784238384Sjkim
785238384Sjkim	felem_reduce_(out, in);
786238384Sjkim
787238384Sjkim	/* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
788238384Sjkim	 * out[1] > 2^105 - 2^71 - 2^103 > 0
789238384Sjkim	 * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
790238384Sjkim	 * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
791238384Sjkim	 *
792238384Sjkim	 * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
793238384Sjkim	 * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
794238384Sjkim	 * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
795238384Sjkim	 * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
796238384Sjkim	 */
797238384Sjkim	}
798238384Sjkim
799238384Sjkim/* subtract_u64 sets *result = *result - v and *carry to one if the subtraction
800238384Sjkim * underflowed. */
801238384Sjkimstatic void subtract_u64(u64* result, u64* carry, u64 v)
802238384Sjkim	{
803238384Sjkim	uint128_t r = *result;
804238384Sjkim	r -= v;
805238384Sjkim	*carry = (r >> 64) & 1;
806238384Sjkim	*result = (u64) r;
807238384Sjkim	}
808238384Sjkim
809238384Sjkim/* felem_contract converts |in| to its unique, minimal representation.
810238384Sjkim * On entry:
811238384Sjkim *   in[i] < 2^109
812238384Sjkim */
813238384Sjkimstatic void felem_contract(smallfelem out, const felem in)
814238384Sjkim	{
815238384Sjkim	unsigned i;
816238384Sjkim	u64 all_equal_so_far = 0, result = 0, carry;
817238384Sjkim
818238384Sjkim	felem_shrink(out, in);
819238384Sjkim	/* small is minimal except that the value might be > p */
820238384Sjkim
821238384Sjkim	all_equal_so_far--;
822238384Sjkim	/* We are doing a constant time test if out >= kPrime. We need to
823238384Sjkim	 * compare each u64, from most-significant to least significant. For
824238384Sjkim	 * each one, if all words so far have been equal (m is all ones) then a
825238384Sjkim	 * non-equal result is the answer. Otherwise we continue. */
826238384Sjkim	for (i = 3; i < 4; i--)
827238384Sjkim		{
828238384Sjkim		u64 equal;
829238384Sjkim		uint128_t a = ((uint128_t) kPrime[i]) - out[i];
830238384Sjkim		/* if out[i] > kPrime[i] then a will underflow and the high
831238384Sjkim		 * 64-bits will all be set. */
832238384Sjkim		result |= all_equal_so_far & ((u64) (a >> 64));
833238384Sjkim
834238384Sjkim		/* if kPrime[i] == out[i] then |equal| will be all zeros and
835238384Sjkim		 * the decrement will make it all ones. */
836238384Sjkim		equal = kPrime[i] ^ out[i];
837238384Sjkim		equal--;
838238384Sjkim		equal &= equal << 32;
839238384Sjkim		equal &= equal << 16;
840238384Sjkim		equal &= equal << 8;
841238384Sjkim		equal &= equal << 4;
842238384Sjkim		equal &= equal << 2;
843238384Sjkim		equal &= equal << 1;
844238384Sjkim		equal = ((s64) equal) >> 63;
845238384Sjkim
846238384Sjkim		all_equal_so_far &= equal;
847238384Sjkim		}
848238384Sjkim
849238384Sjkim	/* if all_equal_so_far is still all ones then the two values are equal
850238384Sjkim	 * and so out >= kPrime is true. */
851238384Sjkim	result |= all_equal_so_far;
852238384Sjkim
853238384Sjkim	/* if out >= kPrime then we subtract kPrime. */
854238384Sjkim	subtract_u64(&out[0], &carry, result & kPrime[0]);
855238384Sjkim	subtract_u64(&out[1], &carry, carry);
856238384Sjkim	subtract_u64(&out[2], &carry, carry);
857238384Sjkim	subtract_u64(&out[3], &carry, carry);
858238384Sjkim
859238384Sjkim	subtract_u64(&out[1], &carry, result & kPrime[1]);
860238384Sjkim	subtract_u64(&out[2], &carry, carry);
861238384Sjkim	subtract_u64(&out[3], &carry, carry);
862238384Sjkim
863238384Sjkim	subtract_u64(&out[2], &carry, result & kPrime[2]);
864238384Sjkim	subtract_u64(&out[3], &carry, carry);
865238384Sjkim
866238384Sjkim	subtract_u64(&out[3], &carry, result & kPrime[3]);
867238384Sjkim	}
868238384Sjkim
869238384Sjkimstatic void smallfelem_square_contract(smallfelem out, const smallfelem in)
870238384Sjkim	{
871238384Sjkim	longfelem longtmp;
872238384Sjkim	felem tmp;
873238384Sjkim
874238384Sjkim	smallfelem_square(longtmp, in);
875238384Sjkim	felem_reduce(tmp, longtmp);
876238384Sjkim	felem_contract(out, tmp);
877238384Sjkim	}
878238384Sjkim
879238384Sjkimstatic void smallfelem_mul_contract(smallfelem out, const smallfelem in1, const smallfelem in2)
880238384Sjkim	{
881238384Sjkim	longfelem longtmp;
882238384Sjkim	felem tmp;
883238384Sjkim
884238384Sjkim	smallfelem_mul(longtmp, in1, in2);
885238384Sjkim	felem_reduce(tmp, longtmp);
886238384Sjkim	felem_contract(out, tmp);
887238384Sjkim	}
888238384Sjkim
889238384Sjkim/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
890238384Sjkim * otherwise.
891238384Sjkim * On entry:
892238384Sjkim *   small[i] < 2^64
893238384Sjkim */
894238384Sjkimstatic limb smallfelem_is_zero(const smallfelem small)
895238384Sjkim	{
896238384Sjkim	limb result;
897238384Sjkim	u64 is_p;
898238384Sjkim
899238384Sjkim	u64 is_zero = small[0] | small[1] | small[2] | small[3];
900238384Sjkim	is_zero--;
901238384Sjkim	is_zero &= is_zero << 32;
902238384Sjkim	is_zero &= is_zero << 16;
903238384Sjkim	is_zero &= is_zero << 8;
904238384Sjkim	is_zero &= is_zero << 4;
905238384Sjkim	is_zero &= is_zero << 2;
906238384Sjkim	is_zero &= is_zero << 1;
907238384Sjkim	is_zero = ((s64) is_zero) >> 63;
908238384Sjkim
909238384Sjkim	is_p = (small[0] ^ kPrime[0]) |
910238384Sjkim	       (small[1] ^ kPrime[1]) |
911238384Sjkim	       (small[2] ^ kPrime[2]) |
912238384Sjkim	       (small[3] ^ kPrime[3]);
913238384Sjkim	is_p--;
914238384Sjkim	is_p &= is_p << 32;
915238384Sjkim	is_p &= is_p << 16;
916238384Sjkim	is_p &= is_p << 8;
917238384Sjkim	is_p &= is_p << 4;
918238384Sjkim	is_p &= is_p << 2;
919238384Sjkim	is_p &= is_p << 1;
920238384Sjkim	is_p = ((s64) is_p) >> 63;
921238384Sjkim
922238384Sjkim	is_zero |= is_p;
923238384Sjkim
924238384Sjkim	result = is_zero;
925238384Sjkim	result |= ((limb) is_zero) << 64;
926238384Sjkim	return result;
927238384Sjkim	}
928238384Sjkim
929238384Sjkimstatic int smallfelem_is_zero_int(const smallfelem small)
930238384Sjkim	{
931238384Sjkim	return (int) (smallfelem_is_zero(small) & ((limb)1));
932238384Sjkim	}
933238384Sjkim
934238384Sjkim/* felem_inv calculates |out| = |in|^{-1}
935238384Sjkim *
936238384Sjkim * Based on Fermat's Little Theorem:
937238384Sjkim *   a^p = a (mod p)
938238384Sjkim *   a^{p-1} = 1 (mod p)
939238384Sjkim *   a^{p-2} = a^{-1} (mod p)
940238384Sjkim */
941238384Sjkimstatic void felem_inv(felem out, const felem in)
942238384Sjkim	{
943238384Sjkim	felem ftmp, ftmp2;
944238384Sjkim	/* each e_I will hold |in|^{2^I - 1} */
945238384Sjkim	felem e2, e4, e8, e16, e32, e64;
946238384Sjkim	longfelem tmp;
947238384Sjkim	unsigned i;
948238384Sjkim
949238384Sjkim	felem_square(tmp, in); felem_reduce(ftmp, tmp);			/* 2^1 */
950238384Sjkim	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);		/* 2^2 - 2^0 */
951238384Sjkim	felem_assign(e2, ftmp);
952238384Sjkim	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^3 - 2^1 */
953238384Sjkim	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^4 - 2^2 */
954238384Sjkim	felem_mul(tmp, ftmp, e2); felem_reduce(ftmp, tmp);		/* 2^4 - 2^0 */
955238384Sjkim	felem_assign(e4, ftmp);
956238384Sjkim	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^5 - 2^1 */
957238384Sjkim	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^6 - 2^2 */
958238384Sjkim	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^7 - 2^3 */
959238384Sjkim	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^8 - 2^4 */
960238384Sjkim	felem_mul(tmp, ftmp, e4); felem_reduce(ftmp, tmp);		/* 2^8 - 2^0 */
961238384Sjkim	felem_assign(e8, ftmp);
962238384Sjkim	for (i = 0; i < 8; i++) {
963238384Sjkim		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
964238384Sjkim	}								/* 2^16 - 2^8 */
965238384Sjkim	felem_mul(tmp, ftmp, e8); felem_reduce(ftmp, tmp);		/* 2^16 - 2^0 */
966238384Sjkim	felem_assign(e16, ftmp);
967238384Sjkim	for (i = 0; i < 16; i++) {
968238384Sjkim		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
969238384Sjkim	}								/* 2^32 - 2^16 */
970238384Sjkim	felem_mul(tmp, ftmp, e16); felem_reduce(ftmp, tmp);		/* 2^32 - 2^0 */
971238384Sjkim	felem_assign(e32, ftmp);
972238384Sjkim	for (i = 0; i < 32; i++) {
973238384Sjkim		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
974238384Sjkim	}								/* 2^64 - 2^32 */
975238384Sjkim	felem_assign(e64, ftmp);
976238384Sjkim	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);		/* 2^64 - 2^32 + 2^0 */
977238384Sjkim	for (i = 0; i < 192; i++) {
978238384Sjkim		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
979238384Sjkim	}								/* 2^256 - 2^224 + 2^192 */
980238384Sjkim
981238384Sjkim	felem_mul(tmp, e64, e32); felem_reduce(ftmp2, tmp);		/* 2^64 - 2^0 */
982238384Sjkim	for (i = 0; i < 16; i++) {
983238384Sjkim		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
984238384Sjkim	}								/* 2^80 - 2^16 */
985238384Sjkim	felem_mul(tmp, ftmp2, e16); felem_reduce(ftmp2, tmp);		/* 2^80 - 2^0 */
986238384Sjkim	for (i = 0; i < 8; i++) {
987238384Sjkim		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
988238384Sjkim	}								/* 2^88 - 2^8 */
989238384Sjkim	felem_mul(tmp, ftmp2, e8); felem_reduce(ftmp2, tmp);		/* 2^88 - 2^0 */
990238384Sjkim	for (i = 0; i < 4; i++) {
991238384Sjkim		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
992238384Sjkim	}								/* 2^92 - 2^4 */
993238384Sjkim	felem_mul(tmp, ftmp2, e4); felem_reduce(ftmp2, tmp);		/* 2^92 - 2^0 */
994238384Sjkim	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^93 - 2^1 */
995238384Sjkim	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^2 */
996238384Sjkim	felem_mul(tmp, ftmp2, e2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^0 */
997238384Sjkim	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^95 - 2^1 */
998238384Sjkim	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^96 - 2^2 */
999238384Sjkim	felem_mul(tmp, ftmp2, in); felem_reduce(ftmp2, tmp);		/* 2^96 - 3 */
1000238384Sjkim
1001238384Sjkim	felem_mul(tmp, ftmp2, ftmp); felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1002238384Sjkim	}
1003238384Sjkim
1004238384Sjkimstatic void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1005238384Sjkim	{
1006238384Sjkim	felem tmp;
1007238384Sjkim
1008238384Sjkim	smallfelem_expand(tmp, in);
1009238384Sjkim	felem_inv(tmp, tmp);
1010238384Sjkim	felem_contract(out, tmp);
1011238384Sjkim	}
1012238384Sjkim
1013238384Sjkim/* Group operations
1014238384Sjkim * ----------------
1015238384Sjkim *
1016238384Sjkim * Building on top of the field operations we have the operations on the
1017238384Sjkim * elliptic curve group itself. Points on the curve are represented in Jacobian
1018238384Sjkim * coordinates */
1019238384Sjkim
1020238384Sjkim/* point_double calculates 2*(x_in, y_in, z_in)
1021238384Sjkim *
1022238384Sjkim * The method is taken from:
1023238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1024238384Sjkim *
1025238384Sjkim * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1026238384Sjkim * while x_out == y_in is not (maybe this works, but it's not tested). */
1027238384Sjkimstatic void
1028238384Sjkimpoint_double(felem x_out, felem y_out, felem z_out,
1029238384Sjkim	     const felem x_in, const felem y_in, const felem z_in)
1030238384Sjkim	{
1031238384Sjkim	longfelem tmp, tmp2;
1032238384Sjkim	felem delta, gamma, beta, alpha, ftmp, ftmp2;
1033238384Sjkim	smallfelem small1, small2;
1034238384Sjkim
1035238384Sjkim	felem_assign(ftmp, x_in);
1036238384Sjkim	/* ftmp[i] < 2^106 */
1037238384Sjkim	felem_assign(ftmp2, x_in);
1038238384Sjkim	/* ftmp2[i] < 2^106 */
1039238384Sjkim
1040238384Sjkim	/* delta = z^2 */
1041238384Sjkim	felem_square(tmp, z_in);
1042238384Sjkim	felem_reduce(delta, tmp);
1043238384Sjkim	/* delta[i] < 2^101 */
1044238384Sjkim
1045238384Sjkim	/* gamma = y^2 */
1046238384Sjkim	felem_square(tmp, y_in);
1047238384Sjkim	felem_reduce(gamma, tmp);
1048238384Sjkim	/* gamma[i] < 2^101 */
1049238384Sjkim	felem_shrink(small1, gamma);
1050238384Sjkim
1051238384Sjkim	/* beta = x*gamma */
1052238384Sjkim	felem_small_mul(tmp, small1, x_in);
1053238384Sjkim	felem_reduce(beta, tmp);
1054238384Sjkim	/* beta[i] < 2^101 */
1055238384Sjkim
1056238384Sjkim	/* alpha = 3*(x-delta)*(x+delta) */
1057238384Sjkim	felem_diff(ftmp, delta);
1058238384Sjkim	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
1059238384Sjkim	felem_sum(ftmp2, delta);
1060238384Sjkim	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1061238384Sjkim	felem_scalar(ftmp2, 3);
1062238384Sjkim	/* ftmp2[i] < 3 * 2^107 < 2^109 */
1063238384Sjkim	felem_mul(tmp, ftmp, ftmp2);
1064238384Sjkim	felem_reduce(alpha, tmp);
1065238384Sjkim	/* alpha[i] < 2^101 */
1066238384Sjkim	felem_shrink(small2, alpha);
1067238384Sjkim
1068238384Sjkim	/* x' = alpha^2 - 8*beta */
1069238384Sjkim	smallfelem_square(tmp, small2);
1070238384Sjkim	felem_reduce(x_out, tmp);
1071238384Sjkim	felem_assign(ftmp, beta);
1072238384Sjkim	felem_scalar(ftmp, 8);
1073238384Sjkim	/* ftmp[i] < 8 * 2^101 = 2^104 */
1074238384Sjkim	felem_diff(x_out, ftmp);
1075238384Sjkim	/* x_out[i] < 2^105 + 2^101 < 2^106 */
1076238384Sjkim
1077238384Sjkim	/* z' = (y + z)^2 - gamma - delta */
1078238384Sjkim	felem_sum(delta, gamma);
1079238384Sjkim	/* delta[i] < 2^101 + 2^101 = 2^102 */
1080238384Sjkim	felem_assign(ftmp, y_in);
1081238384Sjkim	felem_sum(ftmp, z_in);
1082238384Sjkim	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
1083238384Sjkim	felem_square(tmp, ftmp);
1084238384Sjkim	felem_reduce(z_out, tmp);
1085238384Sjkim	felem_diff(z_out, delta);
1086238384Sjkim	/* z_out[i] < 2^105 + 2^101 < 2^106 */
1087238384Sjkim
1088238384Sjkim	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
1089238384Sjkim	felem_scalar(beta, 4);
1090238384Sjkim	/* beta[i] < 4 * 2^101 = 2^103 */
1091238384Sjkim	felem_diff_zero107(beta, x_out);
1092238384Sjkim	/* beta[i] < 2^107 + 2^103 < 2^108 */
1093238384Sjkim	felem_small_mul(tmp, small2, beta);
1094238384Sjkim	/* tmp[i] < 7 * 2^64 < 2^67 */
1095238384Sjkim	smallfelem_square(tmp2, small1);
1096238384Sjkim	/* tmp2[i] < 7 * 2^64 */
1097238384Sjkim	longfelem_scalar(tmp2, 8);
1098238384Sjkim	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1099238384Sjkim	longfelem_diff(tmp, tmp2);
1100238384Sjkim	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1101238384Sjkim	felem_reduce_zero105(y_out, tmp);
1102238384Sjkim	/* y_out[i] < 2^106 */
1103238384Sjkim	}
1104238384Sjkim
1105238384Sjkim/* point_double_small is the same as point_double, except that it operates on
1106238384Sjkim * smallfelems */
1107238384Sjkimstatic void
1108238384Sjkimpoint_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1109238384Sjkim		   const smallfelem x_in, const smallfelem y_in, const smallfelem z_in)
1110238384Sjkim	{
1111238384Sjkim	felem felem_x_out, felem_y_out, felem_z_out;
1112238384Sjkim	felem felem_x_in, felem_y_in, felem_z_in;
1113238384Sjkim
1114238384Sjkim	smallfelem_expand(felem_x_in, x_in);
1115238384Sjkim	smallfelem_expand(felem_y_in, y_in);
1116238384Sjkim	smallfelem_expand(felem_z_in, z_in);
1117238384Sjkim	point_double(felem_x_out, felem_y_out, felem_z_out,
1118238384Sjkim		     felem_x_in, felem_y_in, felem_z_in);
1119238384Sjkim	felem_shrink(x_out, felem_x_out);
1120238384Sjkim	felem_shrink(y_out, felem_y_out);
1121238384Sjkim	felem_shrink(z_out, felem_z_out);
1122238384Sjkim	}
1123238384Sjkim
1124238384Sjkim/* copy_conditional copies in to out iff mask is all ones. */
1125238384Sjkimstatic void
1126238384Sjkimcopy_conditional(felem out, const felem in, limb mask)
1127238384Sjkim	{
1128238384Sjkim	unsigned i;
1129238384Sjkim	for (i = 0; i < NLIMBS; ++i)
1130238384Sjkim		{
1131238384Sjkim		const limb tmp = mask & (in[i] ^ out[i]);
1132238384Sjkim		out[i] ^= tmp;
1133238384Sjkim		}
1134238384Sjkim	}
1135238384Sjkim
1136238384Sjkim/* copy_small_conditional copies in to out iff mask is all ones. */
1137238384Sjkimstatic void
1138238384Sjkimcopy_small_conditional(felem out, const smallfelem in, limb mask)
1139238384Sjkim	{
1140238384Sjkim	unsigned i;
1141238384Sjkim	const u64 mask64 = mask;
1142238384Sjkim	for (i = 0; i < NLIMBS; ++i)
1143238384Sjkim		{
1144238384Sjkim		out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1145238384Sjkim		}
1146238384Sjkim	}
1147238384Sjkim
1148238384Sjkim/* point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1149238384Sjkim *
1150238384Sjkim * The method is taken from:
1151238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1152238384Sjkim * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1153238384Sjkim *
1154238384Sjkim * This function includes a branch for checking whether the two input points
1155238384Sjkim * are equal, (while not equal to the point at infinity). This case never
1156238384Sjkim * happens during single point multiplication, so there is no timing leak for
1157238384Sjkim * ECDH or ECDSA signing. */
1158238384Sjkimstatic void point_add(felem x3, felem y3, felem z3,
1159238384Sjkim	const felem x1, const felem y1, const felem z1,
1160238384Sjkim	const int mixed, const smallfelem x2, const smallfelem y2, const smallfelem z2)
1161238384Sjkim	{
1162238384Sjkim	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1163238384Sjkim	longfelem tmp, tmp2;
1164238384Sjkim	smallfelem small1, small2, small3, small4, small5;
1165238384Sjkim	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1166238384Sjkim
1167238384Sjkim	felem_shrink(small3, z1);
1168238384Sjkim
1169238384Sjkim	z1_is_zero = smallfelem_is_zero(small3);
1170238384Sjkim	z2_is_zero = smallfelem_is_zero(z2);
1171238384Sjkim
1172238384Sjkim	/* ftmp = z1z1 = z1**2 */
1173238384Sjkim	smallfelem_square(tmp, small3);
1174238384Sjkim	felem_reduce(ftmp, tmp);
1175238384Sjkim	/* ftmp[i] < 2^101 */
1176238384Sjkim	felem_shrink(small1, ftmp);
1177238384Sjkim
1178238384Sjkim	if(!mixed)
1179238384Sjkim		{
1180238384Sjkim		/* ftmp2 = z2z2 = z2**2 */
1181238384Sjkim		smallfelem_square(tmp, z2);
1182238384Sjkim		felem_reduce(ftmp2, tmp);
1183238384Sjkim		/* ftmp2[i] < 2^101 */
1184238384Sjkim		felem_shrink(small2, ftmp2);
1185238384Sjkim
1186238384Sjkim		felem_shrink(small5, x1);
1187238384Sjkim
1188238384Sjkim		/* u1 = ftmp3 = x1*z2z2 */
1189238384Sjkim		smallfelem_mul(tmp, small5, small2);
1190238384Sjkim		felem_reduce(ftmp3, tmp);
1191238384Sjkim		/* ftmp3[i] < 2^101 */
1192238384Sjkim
1193238384Sjkim		/* ftmp5 = z1 + z2 */
1194238384Sjkim		felem_assign(ftmp5, z1);
1195238384Sjkim		felem_small_sum(ftmp5, z2);
1196238384Sjkim		/* ftmp5[i] < 2^107 */
1197238384Sjkim
1198238384Sjkim		/* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1199238384Sjkim		felem_square(tmp, ftmp5);
1200238384Sjkim		felem_reduce(ftmp5, tmp);
1201238384Sjkim		/* ftmp2 = z2z2 + z1z1 */
1202238384Sjkim		felem_sum(ftmp2, ftmp);
1203238384Sjkim		/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1204238384Sjkim		felem_diff(ftmp5, ftmp2);
1205238384Sjkim		/* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1206238384Sjkim
1207238384Sjkim		/* ftmp2 = z2 * z2z2 */
1208238384Sjkim		smallfelem_mul(tmp, small2, z2);
1209238384Sjkim		felem_reduce(ftmp2, tmp);
1210238384Sjkim
1211238384Sjkim		/* s1 = ftmp2 = y1 * z2**3 */
1212238384Sjkim		felem_mul(tmp, y1, ftmp2);
1213238384Sjkim		felem_reduce(ftmp6, tmp);
1214238384Sjkim		/* ftmp6[i] < 2^101 */
1215238384Sjkim		}
1216238384Sjkim	else
1217238384Sjkim		{
1218238384Sjkim		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1219238384Sjkim
1220238384Sjkim		/* u1 = ftmp3 = x1*z2z2 */
1221238384Sjkim		felem_assign(ftmp3, x1);
1222238384Sjkim		/* ftmp3[i] < 2^106 */
1223238384Sjkim
1224238384Sjkim		/* ftmp5 = 2z1z2 */
1225238384Sjkim		felem_assign(ftmp5, z1);
1226238384Sjkim		felem_scalar(ftmp5, 2);
1227238384Sjkim		/* ftmp5[i] < 2*2^106 = 2^107 */
1228238384Sjkim
1229238384Sjkim		/* s1 = ftmp2 = y1 * z2**3 */
1230238384Sjkim		felem_assign(ftmp6, y1);
1231238384Sjkim		/* ftmp6[i] < 2^106 */
1232238384Sjkim		}
1233238384Sjkim
1234238384Sjkim	/* u2 = x2*z1z1 */
1235238384Sjkim	smallfelem_mul(tmp, x2, small1);
1236238384Sjkim	felem_reduce(ftmp4, tmp);
1237238384Sjkim
1238238384Sjkim	/* h = ftmp4 = u2 - u1 */
1239238384Sjkim	felem_diff_zero107(ftmp4, ftmp3);
1240238384Sjkim	/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1241238384Sjkim	felem_shrink(small4, ftmp4);
1242238384Sjkim
1243238384Sjkim	x_equal = smallfelem_is_zero(small4);
1244238384Sjkim
1245238384Sjkim	/* z_out = ftmp5 * h */
1246238384Sjkim	felem_small_mul(tmp, small4, ftmp5);
1247238384Sjkim	felem_reduce(z_out, tmp);
1248238384Sjkim	/* z_out[i] < 2^101 */
1249238384Sjkim
1250238384Sjkim	/* ftmp = z1 * z1z1 */
1251238384Sjkim	smallfelem_mul(tmp, small1, small3);
1252238384Sjkim	felem_reduce(ftmp, tmp);
1253238384Sjkim
1254238384Sjkim	/* s2 = tmp = y2 * z1**3 */
1255238384Sjkim	felem_small_mul(tmp, y2, ftmp);
1256238384Sjkim	felem_reduce(ftmp5, tmp);
1257238384Sjkim
1258238384Sjkim	/* r = ftmp5 = (s2 - s1)*2 */
1259238384Sjkim	felem_diff_zero107(ftmp5, ftmp6);
1260238384Sjkim	/* ftmp5[i] < 2^107 + 2^107 = 2^108*/
1261238384Sjkim	felem_scalar(ftmp5, 2);
1262238384Sjkim	/* ftmp5[i] < 2^109 */
1263238384Sjkim	felem_shrink(small1, ftmp5);
1264238384Sjkim	y_equal = smallfelem_is_zero(small1);
1265238384Sjkim
1266238384Sjkim	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
1267238384Sjkim		{
1268238384Sjkim		point_double(x3, y3, z3, x1, y1, z1);
1269238384Sjkim		return;
1270238384Sjkim		}
1271238384Sjkim
1272238384Sjkim	/* I = ftmp = (2h)**2 */
1273238384Sjkim	felem_assign(ftmp, ftmp4);
1274238384Sjkim	felem_scalar(ftmp, 2);
1275238384Sjkim	/* ftmp[i] < 2*2^108 = 2^109 */
1276238384Sjkim	felem_square(tmp, ftmp);
1277238384Sjkim	felem_reduce(ftmp, tmp);
1278238384Sjkim
1279238384Sjkim	/* J = ftmp2 = h * I */
1280238384Sjkim	felem_mul(tmp, ftmp4, ftmp);
1281238384Sjkim	felem_reduce(ftmp2, tmp);
1282238384Sjkim
1283238384Sjkim	/* V = ftmp4 = U1 * I */
1284238384Sjkim	felem_mul(tmp, ftmp3, ftmp);
1285238384Sjkim	felem_reduce(ftmp4, tmp);
1286238384Sjkim
1287238384Sjkim	/* x_out = r**2 - J - 2V */
1288238384Sjkim	smallfelem_square(tmp, small1);
1289238384Sjkim	felem_reduce(x_out, tmp);
1290238384Sjkim	felem_assign(ftmp3, ftmp4);
1291238384Sjkim	felem_scalar(ftmp4, 2);
1292238384Sjkim	felem_sum(ftmp4, ftmp2);
1293238384Sjkim	/* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1294238384Sjkim	felem_diff(x_out, ftmp4);
1295238384Sjkim	/* x_out[i] < 2^105 + 2^101 */
1296238384Sjkim
1297238384Sjkim	/* y_out = r(V-x_out) - 2 * s1 * J */
1298238384Sjkim	felem_diff_zero107(ftmp3, x_out);
1299238384Sjkim	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1300238384Sjkim	felem_small_mul(tmp, small1, ftmp3);
1301238384Sjkim	felem_mul(tmp2, ftmp6, ftmp2);
1302238384Sjkim	longfelem_scalar(tmp2, 2);
1303238384Sjkim	/* tmp2[i] < 2*2^67 = 2^68 */
1304238384Sjkim	longfelem_diff(tmp, tmp2);
1305238384Sjkim	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1306238384Sjkim	felem_reduce_zero105(y_out, tmp);
1307238384Sjkim	/* y_out[i] < 2^106 */
1308238384Sjkim
1309238384Sjkim	copy_small_conditional(x_out, x2, z1_is_zero);
1310238384Sjkim	copy_conditional(x_out, x1, z2_is_zero);
1311238384Sjkim	copy_small_conditional(y_out, y2, z1_is_zero);
1312238384Sjkim	copy_conditional(y_out, y1, z2_is_zero);
1313238384Sjkim	copy_small_conditional(z_out, z2, z1_is_zero);
1314238384Sjkim	copy_conditional(z_out, z1, z2_is_zero);
1315238384Sjkim	felem_assign(x3, x_out);
1316238384Sjkim	felem_assign(y3, y_out);
1317238384Sjkim	felem_assign(z3, z_out);
1318238384Sjkim	}
1319238384Sjkim
1320238384Sjkim/* point_add_small is the same as point_add, except that it operates on
1321238384Sjkim * smallfelems */
1322238384Sjkimstatic void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1323238384Sjkim			    smallfelem x1, smallfelem y1, smallfelem z1,
1324238384Sjkim			    smallfelem x2, smallfelem y2, smallfelem z2)
1325238384Sjkim	{
1326238384Sjkim	felem felem_x3, felem_y3, felem_z3;
1327238384Sjkim	felem felem_x1, felem_y1, felem_z1;
1328238384Sjkim	smallfelem_expand(felem_x1, x1);
1329238384Sjkim	smallfelem_expand(felem_y1, y1);
1330238384Sjkim	smallfelem_expand(felem_z1, z1);
1331238384Sjkim	point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2, y2, z2);
1332238384Sjkim	felem_shrink(x3, felem_x3);
1333238384Sjkim	felem_shrink(y3, felem_y3);
1334238384Sjkim	felem_shrink(z3, felem_z3);
1335238384Sjkim	}
1336238384Sjkim
1337238384Sjkim/* Base point pre computation
1338238384Sjkim * --------------------------
1339238384Sjkim *
1340238384Sjkim * Two different sorts of precomputed tables are used in the following code.
1341238384Sjkim * Each contain various points on the curve, where each point is three field
1342238384Sjkim * elements (x, y, z).
1343238384Sjkim *
1344238384Sjkim * For the base point table, z is usually 1 (0 for the point at infinity).
1345238384Sjkim * This table has 2 * 16 elements, starting with the following:
1346238384Sjkim * index | bits    | point
1347238384Sjkim * ------+---------+------------------------------
1348238384Sjkim *     0 | 0 0 0 0 | 0G
1349238384Sjkim *     1 | 0 0 0 1 | 1G
1350238384Sjkim *     2 | 0 0 1 0 | 2^64G
1351238384Sjkim *     3 | 0 0 1 1 | (2^64 + 1)G
1352238384Sjkim *     4 | 0 1 0 0 | 2^128G
1353238384Sjkim *     5 | 0 1 0 1 | (2^128 + 1)G
1354238384Sjkim *     6 | 0 1 1 0 | (2^128 + 2^64)G
1355238384Sjkim *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1356238384Sjkim *     8 | 1 0 0 0 | 2^192G
1357238384Sjkim *     9 | 1 0 0 1 | (2^192 + 1)G
1358238384Sjkim *    10 | 1 0 1 0 | (2^192 + 2^64)G
1359238384Sjkim *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1360238384Sjkim *    12 | 1 1 0 0 | (2^192 + 2^128)G
1361238384Sjkim *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1362238384Sjkim *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1363238384Sjkim *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1364238384Sjkim * followed by a copy of this with each element multiplied by 2^32.
1365238384Sjkim *
1366238384Sjkim * The reason for this is so that we can clock bits into four different
1367238384Sjkim * locations when doing simple scalar multiplies against the base point,
1368238384Sjkim * and then another four locations using the second 16 elements.
1369238384Sjkim *
1370238384Sjkim * Tables for other points have table[i] = iG for i in 0 .. 16. */
1371238384Sjkim
1372238384Sjkim/* gmul is the table of precomputed base points */
1373238384Sjkimstatic const smallfelem gmul[2][16][3] =
1374238384Sjkim{{{{0, 0, 0, 0},
1375238384Sjkim   {0, 0, 0, 0},
1376238384Sjkim   {0, 0, 0, 0}},
1377238384Sjkim  {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247},
1378238384Sjkim   {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b},
1379238384Sjkim   {1, 0, 0, 0}},
1380238384Sjkim  {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5},
1381238384Sjkim   {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d},
1382238384Sjkim   {1, 0, 0, 0}},
1383238384Sjkim  {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f},
1384238384Sjkim   {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644},
1385238384Sjkim   {1, 0, 0, 0}},
1386238384Sjkim  {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67},
1387238384Sjkim   {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee},
1388238384Sjkim   {1, 0, 0, 0}},
1389238384Sjkim  {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff},
1390238384Sjkim   {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b},
1391238384Sjkim   {1, 0, 0, 0}},
1392238384Sjkim  {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8},
1393238384Sjkim   {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851},
1394238384Sjkim   {1, 0, 0, 0}},
1395238384Sjkim  {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea},
1396238384Sjkim   {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b},
1397238384Sjkim   {1, 0, 0, 0}},
1398238384Sjkim  {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276},
1399238384Sjkim   {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816},
1400238384Sjkim   {1, 0, 0, 0}},
1401238384Sjkim  {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad},
1402238384Sjkim   {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663},
1403238384Sjkim   {1, 0, 0, 0}},
1404238384Sjkim  {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d},
1405238384Sjkim   {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321},
1406238384Sjkim   {1, 0, 0, 0}},
1407238384Sjkim  {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287},
1408238384Sjkim   {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6},
1409238384Sjkim   {1, 0, 0, 0}},
1410238384Sjkim  {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466},
1411238384Sjkim   {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20},
1412238384Sjkim   {1, 0, 0, 0}},
1413238384Sjkim  {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9},
1414238384Sjkim   {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 0xfecf4d5190b0fc61},
1415238384Sjkim   {1, 0, 0, 0}},
1416238384Sjkim  {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a},
1417238384Sjkim   {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc},
1418238384Sjkim   {1, 0, 0, 0}},
1419238384Sjkim  {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c},
1420238384Sjkim   {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab},
1421238384Sjkim   {1, 0, 0, 0}}},
1422238384Sjkim {{{0, 0, 0, 0},
1423238384Sjkim   {0, 0, 0, 0},
1424238384Sjkim   {0, 0, 0, 0}},
1425238384Sjkim  {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89},
1426238384Sjkim   {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624},
1427238384Sjkim   {1, 0, 0, 0}},
1428238384Sjkim  {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6},
1429238384Sjkim   {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1},
1430238384Sjkim   {1, 0, 0, 0}},
1431238384Sjkim  {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a},
1432238384Sjkim   {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593},
1433238384Sjkim   {1, 0, 0, 0}},
1434238384Sjkim  {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617},
1435238384Sjkim   {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7},
1436238384Sjkim   {1, 0, 0, 0}},
1437238384Sjkim  {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276},
1438238384Sjkim   {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a},
1439238384Sjkim   {1, 0, 0, 0}},
1440238384Sjkim  {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908},
1441238384Sjkim   {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e},
1442238384Sjkim   {1, 0, 0, 0}},
1443238384Sjkim  {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7},
1444238384Sjkim   {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec},
1445238384Sjkim   {1, 0, 0, 0}},
1446238384Sjkim  {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee},
1447238384Sjkim   {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6},
1448238384Sjkim   {1, 0, 0, 0}},
1449238384Sjkim  {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109},
1450238384Sjkim   {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5},
1451238384Sjkim   {1, 0, 0, 0}},
1452238384Sjkim  {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba},
1453238384Sjkim   {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44},
1454238384Sjkim   {1, 0, 0, 0}},
1455238384Sjkim  {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b},
1456238384Sjkim   {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc},
1457238384Sjkim   {1, 0, 0, 0}},
1458238384Sjkim  {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107},
1459238384Sjkim   {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387},
1460238384Sjkim   {1, 0, 0, 0}},
1461238384Sjkim  {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503},
1462238384Sjkim   {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be},
1463238384Sjkim   {1, 0, 0, 0}},
1464238384Sjkim  {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9},
1465238384Sjkim   {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a},
1466238384Sjkim   {1, 0, 0, 0}},
1467238384Sjkim  {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6},
1468238384Sjkim   {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81},
1469238384Sjkim   {1, 0, 0, 0}}}};
1470238384Sjkim
1471238384Sjkim/* select_point selects the |idx|th point from a precomputation table and
1472238384Sjkim * copies it to out. */
1473238384Sjkimstatic void select_point(const u64 idx, unsigned int size, const smallfelem pre_comp[16][3], smallfelem out[3])
1474238384Sjkim	{
1475238384Sjkim	unsigned i, j;
1476238384Sjkim	u64 *outlimbs = &out[0][0];
1477238384Sjkim	memset(outlimbs, 0, 3 * sizeof(smallfelem));
1478238384Sjkim
1479238384Sjkim	for (i = 0; i < size; i++)
1480238384Sjkim		{
1481238384Sjkim		const u64 *inlimbs = (u64*) &pre_comp[i][0][0];
1482238384Sjkim		u64 mask = i ^ idx;
1483238384Sjkim		mask |= mask >> 4;
1484238384Sjkim		mask |= mask >> 2;
1485238384Sjkim		mask |= mask >> 1;
1486238384Sjkim		mask &= 1;
1487238384Sjkim		mask--;
1488238384Sjkim		for (j = 0; j < NLIMBS * 3; j++)
1489238384Sjkim			outlimbs[j] |= inlimbs[j] & mask;
1490238384Sjkim		}
1491238384Sjkim	}
1492238384Sjkim
1493238384Sjkim/* get_bit returns the |i|th bit in |in| */
1494238384Sjkimstatic char get_bit(const felem_bytearray in, int i)
1495238384Sjkim	{
1496238384Sjkim	if ((i < 0) || (i >= 256))
1497238384Sjkim		return 0;
1498238384Sjkim	return (in[i >> 3] >> (i & 7)) & 1;
1499238384Sjkim	}
1500238384Sjkim
1501238384Sjkim/* Interleaved point multiplication using precomputed point multiples:
1502238384Sjkim * The small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[],
1503238384Sjkim * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1504238384Sjkim * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1505238384Sjkim * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
1506238384Sjkimstatic void batch_mul(felem x_out, felem y_out, felem z_out,
1507238384Sjkim	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
1508238384Sjkim	const int mixed, const smallfelem pre_comp[][17][3], const smallfelem g_pre_comp[2][16][3])
1509238384Sjkim	{
1510238384Sjkim	int i, skip;
1511238384Sjkim	unsigned num, gen_mul = (g_scalar != NULL);
1512238384Sjkim	felem nq[3], ftmp;
1513238384Sjkim	smallfelem tmp[3];
1514238384Sjkim	u64 bits;
1515238384Sjkim	u8 sign, digit;
1516238384Sjkim
1517238384Sjkim	/* set nq to the point at infinity */
1518238384Sjkim	memset(nq, 0, 3 * sizeof(felem));
1519238384Sjkim
1520238384Sjkim	/* Loop over all scalars msb-to-lsb, interleaving additions
1521238384Sjkim	 * of multiples of the generator (two in each of the last 32 rounds)
1522238384Sjkim	 * and additions of other points multiples (every 5th round).
1523238384Sjkim	 */
1524238384Sjkim	skip = 1; /* save two point operations in the first round */
1525238384Sjkim	for (i = (num_points ? 255 : 31); i >= 0; --i)
1526238384Sjkim		{
1527238384Sjkim		/* double */
1528238384Sjkim		if (!skip)
1529238384Sjkim			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1530238384Sjkim
1531238384Sjkim		/* add multiples of the generator */
1532238384Sjkim		if (gen_mul && (i <= 31))
1533238384Sjkim			{
1534238384Sjkim			/* first, look 32 bits upwards */
1535238384Sjkim			bits = get_bit(g_scalar, i + 224) << 3;
1536238384Sjkim			bits |= get_bit(g_scalar, i + 160) << 2;
1537238384Sjkim			bits |= get_bit(g_scalar, i + 96) << 1;
1538238384Sjkim			bits |= get_bit(g_scalar, i + 32);
1539238384Sjkim			/* select the point to add, in constant time */
1540238384Sjkim			select_point(bits, 16, g_pre_comp[1], tmp);
1541238384Sjkim
1542238384Sjkim			if (!skip)
1543238384Sjkim				{
1544238384Sjkim				point_add(nq[0], nq[1], nq[2],
1545238384Sjkim					nq[0], nq[1], nq[2],
1546238384Sjkim					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
1547238384Sjkim				}
1548238384Sjkim			else
1549238384Sjkim				{
1550238384Sjkim				smallfelem_expand(nq[0], tmp[0]);
1551238384Sjkim				smallfelem_expand(nq[1], tmp[1]);
1552238384Sjkim				smallfelem_expand(nq[2], tmp[2]);
1553238384Sjkim				skip = 0;
1554238384Sjkim				}
1555238384Sjkim
1556238384Sjkim			/* second, look at the current position */
1557238384Sjkim			bits = get_bit(g_scalar, i + 192) << 3;
1558238384Sjkim			bits |= get_bit(g_scalar, i + 128) << 2;
1559238384Sjkim			bits |= get_bit(g_scalar, i + 64) << 1;
1560238384Sjkim			bits |= get_bit(g_scalar, i);
1561238384Sjkim			/* select the point to add, in constant time */
1562238384Sjkim			select_point(bits, 16, g_pre_comp[0], tmp);
1563238384Sjkim			point_add(nq[0], nq[1], nq[2],
1564238384Sjkim				nq[0], nq[1], nq[2],
1565238384Sjkim				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
1566238384Sjkim			}
1567238384Sjkim
1568238384Sjkim		/* do other additions every 5 doublings */
1569238384Sjkim		if (num_points && (i % 5 == 0))
1570238384Sjkim			{
1571238384Sjkim			/* loop over all scalars */
1572238384Sjkim			for (num = 0; num < num_points; ++num)
1573238384Sjkim				{
1574238384Sjkim				bits = get_bit(scalars[num], i + 4) << 5;
1575238384Sjkim				bits |= get_bit(scalars[num], i + 3) << 4;
1576238384Sjkim				bits |= get_bit(scalars[num], i + 2) << 3;
1577238384Sjkim				bits |= get_bit(scalars[num], i + 1) << 2;
1578238384Sjkim				bits |= get_bit(scalars[num], i) << 1;
1579238384Sjkim				bits |= get_bit(scalars[num], i - 1);
1580238384Sjkim				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1581238384Sjkim
1582238384Sjkim				/* select the point to add or subtract, in constant time */
1583238384Sjkim				select_point(digit, 17, pre_comp[num], tmp);
1584238384Sjkim				smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative point */
1585238384Sjkim				copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1586238384Sjkim				felem_contract(tmp[1], ftmp);
1587238384Sjkim
1588238384Sjkim				if (!skip)
1589238384Sjkim					{
1590238384Sjkim					point_add(nq[0], nq[1], nq[2],
1591238384Sjkim						nq[0], nq[1], nq[2],
1592238384Sjkim						mixed, tmp[0], tmp[1], tmp[2]);
1593238384Sjkim					}
1594238384Sjkim				else
1595238384Sjkim					{
1596238384Sjkim					smallfelem_expand(nq[0], tmp[0]);
1597238384Sjkim					smallfelem_expand(nq[1], tmp[1]);
1598238384Sjkim					smallfelem_expand(nq[2], tmp[2]);
1599238384Sjkim					skip = 0;
1600238384Sjkim					}
1601238384Sjkim				}
1602238384Sjkim			}
1603238384Sjkim		}
1604238384Sjkim	felem_assign(x_out, nq[0]);
1605238384Sjkim	felem_assign(y_out, nq[1]);
1606238384Sjkim	felem_assign(z_out, nq[2]);
1607238384Sjkim	}
1608238384Sjkim
1609238384Sjkim/* Precomputation for the group generator. */
1610238384Sjkimtypedef struct {
1611238384Sjkim	smallfelem g_pre_comp[2][16][3];
1612238384Sjkim	int references;
1613238384Sjkim} NISTP256_PRE_COMP;
1614238384Sjkim
1615238384Sjkimconst EC_METHOD *EC_GFp_nistp256_method(void)
1616238384Sjkim	{
1617238384Sjkim	static const EC_METHOD ret = {
1618238384Sjkim		EC_FLAGS_DEFAULT_OCT,
1619238384Sjkim		NID_X9_62_prime_field,
1620238384Sjkim		ec_GFp_nistp256_group_init,
1621238384Sjkim		ec_GFp_simple_group_finish,
1622238384Sjkim		ec_GFp_simple_group_clear_finish,
1623238384Sjkim		ec_GFp_nist_group_copy,
1624238384Sjkim		ec_GFp_nistp256_group_set_curve,
1625238384Sjkim		ec_GFp_simple_group_get_curve,
1626238384Sjkim		ec_GFp_simple_group_get_degree,
1627238384Sjkim		ec_GFp_simple_group_check_discriminant,
1628238384Sjkim		ec_GFp_simple_point_init,
1629238384Sjkim		ec_GFp_simple_point_finish,
1630238384Sjkim		ec_GFp_simple_point_clear_finish,
1631238384Sjkim		ec_GFp_simple_point_copy,
1632238384Sjkim		ec_GFp_simple_point_set_to_infinity,
1633238384Sjkim		ec_GFp_simple_set_Jprojective_coordinates_GFp,
1634238384Sjkim		ec_GFp_simple_get_Jprojective_coordinates_GFp,
1635238384Sjkim		ec_GFp_simple_point_set_affine_coordinates,
1636238384Sjkim		ec_GFp_nistp256_point_get_affine_coordinates,
1637238384Sjkim		0 /* point_set_compressed_coordinates */,
1638238384Sjkim		0 /* point2oct */,
1639238384Sjkim		0 /* oct2point */,
1640238384Sjkim		ec_GFp_simple_add,
1641238384Sjkim		ec_GFp_simple_dbl,
1642238384Sjkim		ec_GFp_simple_invert,
1643238384Sjkim		ec_GFp_simple_is_at_infinity,
1644238384Sjkim		ec_GFp_simple_is_on_curve,
1645238384Sjkim		ec_GFp_simple_cmp,
1646238384Sjkim		ec_GFp_simple_make_affine,
1647238384Sjkim		ec_GFp_simple_points_make_affine,
1648238384Sjkim		ec_GFp_nistp256_points_mul,
1649238384Sjkim		ec_GFp_nistp256_precompute_mult,
1650238384Sjkim		ec_GFp_nistp256_have_precompute_mult,
1651238384Sjkim		ec_GFp_nist_field_mul,
1652238384Sjkim		ec_GFp_nist_field_sqr,
1653238384Sjkim		0 /* field_div */,
1654238384Sjkim		0 /* field_encode */,
1655238384Sjkim		0 /* field_decode */,
1656238384Sjkim		0 /* field_set_to_one */ };
1657238384Sjkim
1658238384Sjkim	return &ret;
1659238384Sjkim	}
1660238384Sjkim
1661238384Sjkim/******************************************************************************/
1662238384Sjkim/*		       FUNCTIONS TO MANAGE PRECOMPUTATION
1663238384Sjkim */
1664238384Sjkim
1665238384Sjkimstatic NISTP256_PRE_COMP *nistp256_pre_comp_new()
1666238384Sjkim	{
1667238384Sjkim	NISTP256_PRE_COMP *ret = NULL;
1668238384Sjkim	ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
1669238384Sjkim	if (!ret)
1670238384Sjkim		{
1671238384Sjkim		ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1672238384Sjkim		return ret;
1673238384Sjkim		}
1674238384Sjkim	memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1675238384Sjkim	ret->references = 1;
1676238384Sjkim	return ret;
1677238384Sjkim	}
1678238384Sjkim
1679238384Sjkimstatic void *nistp256_pre_comp_dup(void *src_)
1680238384Sjkim	{
1681238384Sjkim	NISTP256_PRE_COMP *src = src_;
1682238384Sjkim
1683238384Sjkim	/* no need to actually copy, these objects never change! */
1684238384Sjkim	CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1685238384Sjkim
1686238384Sjkim	return src_;
1687238384Sjkim	}
1688238384Sjkim
1689238384Sjkimstatic void nistp256_pre_comp_free(void *pre_)
1690238384Sjkim	{
1691238384Sjkim	int i;
1692238384Sjkim	NISTP256_PRE_COMP *pre = pre_;
1693238384Sjkim
1694238384Sjkim	if (!pre)
1695238384Sjkim		return;
1696238384Sjkim
1697238384Sjkim	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1698238384Sjkim	if (i > 0)
1699238384Sjkim		return;
1700238384Sjkim
1701238384Sjkim	OPENSSL_free(pre);
1702238384Sjkim	}
1703238384Sjkim
1704238384Sjkimstatic void nistp256_pre_comp_clear_free(void *pre_)
1705238384Sjkim	{
1706238384Sjkim	int i;
1707238384Sjkim	NISTP256_PRE_COMP *pre = pre_;
1708238384Sjkim
1709238384Sjkim	if (!pre)
1710238384Sjkim		return;
1711238384Sjkim
1712238384Sjkim	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1713238384Sjkim	if (i > 0)
1714238384Sjkim		return;
1715238384Sjkim
1716238384Sjkim	OPENSSL_cleanse(pre, sizeof *pre);
1717238384Sjkim	OPENSSL_free(pre);
1718238384Sjkim	}
1719238384Sjkim
1720238384Sjkim/******************************************************************************/
1721238384Sjkim/*			   OPENSSL EC_METHOD FUNCTIONS
1722238384Sjkim */
1723238384Sjkim
1724238384Sjkimint ec_GFp_nistp256_group_init(EC_GROUP *group)
1725238384Sjkim	{
1726238384Sjkim	int ret;
1727238384Sjkim	ret = ec_GFp_simple_group_init(group);
1728238384Sjkim	group->a_is_minus3 = 1;
1729238384Sjkim	return ret;
1730238384Sjkim	}
1731238384Sjkim
1732238384Sjkimint ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1733238384Sjkim	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1734238384Sjkim	{
1735238384Sjkim	int ret = 0;
1736238384Sjkim	BN_CTX *new_ctx = NULL;
1737238384Sjkim	BIGNUM *curve_p, *curve_a, *curve_b;
1738238384Sjkim
1739238384Sjkim	if (ctx == NULL)
1740238384Sjkim		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1741238384Sjkim	BN_CTX_start(ctx);
1742238384Sjkim	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1743238384Sjkim		((curve_a = BN_CTX_get(ctx)) == NULL) ||
1744238384Sjkim		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1745238384Sjkim	BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1746238384Sjkim	BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1747238384Sjkim	BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1748238384Sjkim	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1749238384Sjkim		(BN_cmp(curve_b, b)))
1750238384Sjkim		{
1751238384Sjkim		ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1752238384Sjkim			EC_R_WRONG_CURVE_PARAMETERS);
1753238384Sjkim		goto err;
1754238384Sjkim		}
1755238384Sjkim	group->field_mod_func = BN_nist_mod_256;
1756238384Sjkim	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1757238384Sjkimerr:
1758238384Sjkim	BN_CTX_end(ctx);
1759238384Sjkim	if (new_ctx != NULL)
1760238384Sjkim		BN_CTX_free(new_ctx);
1761238384Sjkim	return ret;
1762238384Sjkim	}
1763238384Sjkim
1764238384Sjkim/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1765238384Sjkim * (X', Y') = (X/Z^2, Y/Z^3) */
1766238384Sjkimint ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1767238384Sjkim	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1768238384Sjkim	{
1769238384Sjkim	felem z1, z2, x_in, y_in;
1770238384Sjkim	smallfelem x_out, y_out;
1771238384Sjkim	longfelem tmp;
1772238384Sjkim
1773238384Sjkim	if (EC_POINT_is_at_infinity(group, point))
1774238384Sjkim		{
1775238384Sjkim		ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1776238384Sjkim			EC_R_POINT_AT_INFINITY);
1777238384Sjkim		return 0;
1778238384Sjkim		}
1779238384Sjkim	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1780238384Sjkim		(!BN_to_felem(z1, &point->Z))) return 0;
1781238384Sjkim	felem_inv(z2, z1);
1782238384Sjkim	felem_square(tmp, z2); felem_reduce(z1, tmp);
1783238384Sjkim	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1784238384Sjkim	felem_contract(x_out, x_in);
1785238384Sjkim	if (x != NULL)
1786238384Sjkim		{
1787238384Sjkim		if (!smallfelem_to_BN(x, x_out)) {
1788238384Sjkim		ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1789238384Sjkim			ERR_R_BN_LIB);
1790238384Sjkim		return 0;
1791238384Sjkim		}
1792238384Sjkim		}
1793238384Sjkim	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1794238384Sjkim	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1795238384Sjkim	felem_contract(y_out, y_in);
1796238384Sjkim	if (y != NULL)
1797238384Sjkim		{
1798238384Sjkim		if (!smallfelem_to_BN(y, y_out))
1799238384Sjkim			{
1800238384Sjkim			ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1801238384Sjkim				ERR_R_BN_LIB);
1802238384Sjkim			return 0;
1803238384Sjkim			}
1804238384Sjkim		}
1805238384Sjkim	return 1;
1806238384Sjkim	}
1807238384Sjkim
1808238384Sjkimstatic void make_points_affine(size_t num, smallfelem points[/* num */][3], smallfelem tmp_smallfelems[/* num+1 */])
1809238384Sjkim	{
1810238384Sjkim	/* Runs in constant time, unless an input is the point at infinity
1811238384Sjkim	 * (which normally shouldn't happen). */
1812238384Sjkim	ec_GFp_nistp_points_make_affine_internal(
1813238384Sjkim		num,
1814238384Sjkim		points,
1815238384Sjkim		sizeof(smallfelem),
1816238384Sjkim		tmp_smallfelems,
1817238384Sjkim		(void (*)(void *)) smallfelem_one,
1818238384Sjkim		(int (*)(const void *)) smallfelem_is_zero_int,
1819238384Sjkim		(void (*)(void *, const void *)) smallfelem_assign,
1820238384Sjkim		(void (*)(void *, const void *)) smallfelem_square_contract,
1821238384Sjkim		(void (*)(void *, const void *, const void *)) smallfelem_mul_contract,
1822238384Sjkim		(void (*)(void *, const void *)) smallfelem_inv_contract,
1823238384Sjkim		(void (*)(void *, const void *)) smallfelem_assign /* nothing to contract */);
1824238384Sjkim	}
1825238384Sjkim
1826238384Sjkim/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
1827238384Sjkim * Result is stored in r (r can equal one of the inputs). */
1828238384Sjkimint ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1829238384Sjkim	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
1830238384Sjkim	const BIGNUM *scalars[], BN_CTX *ctx)
1831238384Sjkim	{
1832238384Sjkim	int ret = 0;
1833238384Sjkim	int j;
1834238384Sjkim	int mixed = 0;
1835238384Sjkim	BN_CTX *new_ctx = NULL;
1836238384Sjkim	BIGNUM *x, *y, *z, *tmp_scalar;
1837238384Sjkim	felem_bytearray g_secret;
1838238384Sjkim	felem_bytearray *secrets = NULL;
1839238384Sjkim	smallfelem (*pre_comp)[17][3] = NULL;
1840238384Sjkim	smallfelem *tmp_smallfelems = NULL;
1841238384Sjkim	felem_bytearray tmp;
1842238384Sjkim	unsigned i, num_bytes;
1843238384Sjkim	int have_pre_comp = 0;
1844238384Sjkim	size_t num_points = num;
1845238384Sjkim	smallfelem x_in, y_in, z_in;
1846238384Sjkim	felem x_out, y_out, z_out;
1847238384Sjkim	NISTP256_PRE_COMP *pre = NULL;
1848238384Sjkim	const smallfelem (*g_pre_comp)[16][3] = NULL;
1849238384Sjkim	EC_POINT *generator = NULL;
1850238384Sjkim	const EC_POINT *p = NULL;
1851238384Sjkim	const BIGNUM *p_scalar = NULL;
1852238384Sjkim
1853238384Sjkim	if (ctx == NULL)
1854238384Sjkim		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1855238384Sjkim	BN_CTX_start(ctx);
1856238384Sjkim	if (((x = BN_CTX_get(ctx)) == NULL) ||
1857238384Sjkim		((y = BN_CTX_get(ctx)) == NULL) ||
1858238384Sjkim		((z = BN_CTX_get(ctx)) == NULL) ||
1859238384Sjkim		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1860238384Sjkim		goto err;
1861238384Sjkim
1862238384Sjkim	if (scalar != NULL)
1863238384Sjkim		{
1864238384Sjkim		pre = EC_EX_DATA_get_data(group->extra_data,
1865238384Sjkim			nistp256_pre_comp_dup, nistp256_pre_comp_free,
1866238384Sjkim			nistp256_pre_comp_clear_free);
1867238384Sjkim		if (pre)
1868238384Sjkim			/* we have precomputation, try to use it */
1869238384Sjkim			g_pre_comp = (const smallfelem (*)[16][3]) pre->g_pre_comp;
1870238384Sjkim		else
1871238384Sjkim			/* try to use the standard precomputation */
1872238384Sjkim			g_pre_comp = &gmul[0];
1873238384Sjkim		generator = EC_POINT_new(group);
1874238384Sjkim		if (generator == NULL)
1875238384Sjkim			goto err;
1876238384Sjkim		/* get the generator from precomputation */
1877238384Sjkim		if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
1878238384Sjkim			!smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
1879238384Sjkim			!smallfelem_to_BN(z, g_pre_comp[0][1][2]))
1880238384Sjkim			{
1881238384Sjkim			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
1882238384Sjkim			goto err;
1883238384Sjkim			}
1884238384Sjkim		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1885238384Sjkim				generator, x, y, z, ctx))
1886238384Sjkim			goto err;
1887238384Sjkim		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1888238384Sjkim			/* precomputation matches generator */
1889238384Sjkim			have_pre_comp = 1;
1890238384Sjkim		else
1891238384Sjkim			/* we don't have valid precomputation:
1892238384Sjkim			 * treat the generator as a random point */
1893238384Sjkim			num_points++;
1894238384Sjkim		}
1895238384Sjkim	if (num_points > 0)
1896238384Sjkim		{
1897238384Sjkim		if (num_points >= 3)
1898238384Sjkim			{
1899238384Sjkim			/* unless we precompute multiples for just one or two points,
1900238384Sjkim			 * converting those into affine form is time well spent  */
1901238384Sjkim			mixed = 1;
1902238384Sjkim			}
1903238384Sjkim		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1904238384Sjkim		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
1905238384Sjkim		if (mixed)
1906238384Sjkim			tmp_smallfelems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
1907238384Sjkim		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_smallfelems == NULL)))
1908238384Sjkim			{
1909238384Sjkim			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1910238384Sjkim			goto err;
1911238384Sjkim			}
1912238384Sjkim
1913238384Sjkim		/* we treat NULL scalars as 0, and NULL points as points at infinity,
1914238384Sjkim		 * i.e., they contribute nothing to the linear combination */
1915238384Sjkim		memset(secrets, 0, num_points * sizeof(felem_bytearray));
1916238384Sjkim		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
1917238384Sjkim		for (i = 0; i < num_points; ++i)
1918238384Sjkim			{
1919238384Sjkim			if (i == num)
1920238384Sjkim				/* we didn't have a valid precomputation, so we pick
1921238384Sjkim				 * the generator */
1922238384Sjkim				{
1923238384Sjkim				p = EC_GROUP_get0_generator(group);
1924238384Sjkim				p_scalar = scalar;
1925238384Sjkim				}
1926238384Sjkim			else
1927238384Sjkim				/* the i^th point */
1928238384Sjkim				{
1929238384Sjkim				p = points[i];
1930238384Sjkim				p_scalar = scalars[i];
1931238384Sjkim				}
1932238384Sjkim			if ((p_scalar != NULL) && (p != NULL))
1933238384Sjkim				{
1934238384Sjkim				/* reduce scalar to 0 <= scalar < 2^256 */
1935238384Sjkim				if ((BN_num_bits(p_scalar) > 256) || (BN_is_negative(p_scalar)))
1936238384Sjkim					{
1937238384Sjkim					/* this is an unusual input, and we don't guarantee
1938238384Sjkim					 * constant-timeness */
1939238384Sjkim					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
1940238384Sjkim						{
1941238384Sjkim						ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
1942238384Sjkim						goto err;
1943238384Sjkim						}
1944238384Sjkim					num_bytes = BN_bn2bin(tmp_scalar, tmp);
1945238384Sjkim					}
1946238384Sjkim				else
1947238384Sjkim					num_bytes = BN_bn2bin(p_scalar, tmp);
1948238384Sjkim				flip_endian(secrets[i], tmp, num_bytes);
1949238384Sjkim				/* precompute multiples */
1950238384Sjkim				if ((!BN_to_felem(x_out, &p->X)) ||
1951238384Sjkim					(!BN_to_felem(y_out, &p->Y)) ||
1952238384Sjkim					(!BN_to_felem(z_out, &p->Z))) goto err;
1953238384Sjkim				felem_shrink(pre_comp[i][1][0], x_out);
1954238384Sjkim				felem_shrink(pre_comp[i][1][1], y_out);
1955238384Sjkim				felem_shrink(pre_comp[i][1][2], z_out);
1956238384Sjkim				for (j = 2; j <= 16; ++j)
1957238384Sjkim					{
1958238384Sjkim					if (j & 1)
1959238384Sjkim						{
1960238384Sjkim						point_add_small(
1961238384Sjkim							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1962238384Sjkim							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
1963238384Sjkim							pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
1964238384Sjkim						}
1965238384Sjkim					else
1966238384Sjkim						{
1967238384Sjkim						point_double_small(
1968238384Sjkim							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1969238384Sjkim							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
1970238384Sjkim						}
1971238384Sjkim					}
1972238384Sjkim				}
1973238384Sjkim			}
1974238384Sjkim		if (mixed)
1975238384Sjkim			make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
1976238384Sjkim		}
1977238384Sjkim
1978238384Sjkim	/* the scalar for the generator */
1979238384Sjkim	if ((scalar != NULL) && (have_pre_comp))
1980238384Sjkim		{
1981238384Sjkim		memset(g_secret, 0, sizeof(g_secret));
1982238384Sjkim		/* reduce scalar to 0 <= scalar < 2^256 */
1983238384Sjkim		if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar)))
1984238384Sjkim			{
1985238384Sjkim			/* this is an unusual input, and we don't guarantee
1986238384Sjkim			 * constant-timeness */
1987238384Sjkim			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
1988238384Sjkim				{
1989238384Sjkim				ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
1990238384Sjkim				goto err;
1991238384Sjkim				}
1992238384Sjkim			num_bytes = BN_bn2bin(tmp_scalar, tmp);
1993238384Sjkim			}
1994238384Sjkim		else
1995238384Sjkim			num_bytes = BN_bn2bin(scalar, tmp);
1996238384Sjkim		flip_endian(g_secret, tmp, num_bytes);
1997238384Sjkim		/* do the multiplication with generator precomputation*/
1998238384Sjkim		batch_mul(x_out, y_out, z_out,
1999238384Sjkim			(const felem_bytearray (*)) secrets, num_points,
2000238384Sjkim			g_secret,
2001238384Sjkim			mixed, (const smallfelem (*)[17][3]) pre_comp,
2002238384Sjkim			g_pre_comp);
2003238384Sjkim		}
2004238384Sjkim	else
2005238384Sjkim		/* do the multiplication without generator precomputation */
2006238384Sjkim		batch_mul(x_out, y_out, z_out,
2007238384Sjkim			(const felem_bytearray (*)) secrets, num_points,
2008238384Sjkim			NULL, mixed, (const smallfelem (*)[17][3]) pre_comp, NULL);
2009238384Sjkim	/* reduce the output to its unique minimal representation */
2010238384Sjkim	felem_contract(x_in, x_out);
2011238384Sjkim	felem_contract(y_in, y_out);
2012238384Sjkim	felem_contract(z_in, z_out);
2013238384Sjkim	if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2014238384Sjkim		(!smallfelem_to_BN(z, z_in)))
2015238384Sjkim		{
2016238384Sjkim		ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2017238384Sjkim		goto err;
2018238384Sjkim		}
2019238384Sjkim	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2020238384Sjkim
2021238384Sjkimerr:
2022238384Sjkim	BN_CTX_end(ctx);
2023238384Sjkim	if (generator != NULL)
2024238384Sjkim		EC_POINT_free(generator);
2025238384Sjkim	if (new_ctx != NULL)
2026238384Sjkim		BN_CTX_free(new_ctx);
2027238384Sjkim	if (secrets != NULL)
2028238384Sjkim		OPENSSL_free(secrets);
2029238384Sjkim	if (pre_comp != NULL)
2030238384Sjkim		OPENSSL_free(pre_comp);
2031238384Sjkim	if (tmp_smallfelems != NULL)
2032238384Sjkim		OPENSSL_free(tmp_smallfelems);
2033238384Sjkim	return ret;
2034238384Sjkim	}
2035238384Sjkim
2036238384Sjkimint ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2037238384Sjkim	{
2038238384Sjkim	int ret = 0;
2039238384Sjkim	NISTP256_PRE_COMP *pre = NULL;
2040238384Sjkim	int i, j;
2041238384Sjkim	BN_CTX *new_ctx = NULL;
2042238384Sjkim	BIGNUM *x, *y;
2043238384Sjkim	EC_POINT *generator = NULL;
2044238384Sjkim	smallfelem tmp_smallfelems[32];
2045238384Sjkim	felem x_tmp, y_tmp, z_tmp;
2046238384Sjkim
2047238384Sjkim	/* throw away old precomputation */
2048238384Sjkim	EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2049238384Sjkim		nistp256_pre_comp_free, nistp256_pre_comp_clear_free);
2050238384Sjkim	if (ctx == NULL)
2051238384Sjkim		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
2052238384Sjkim	BN_CTX_start(ctx);
2053238384Sjkim	if (((x = BN_CTX_get(ctx)) == NULL) ||
2054238384Sjkim		((y = BN_CTX_get(ctx)) == NULL))
2055238384Sjkim		goto err;
2056238384Sjkim	/* get the generator */
2057238384Sjkim	if (group->generator == NULL) goto err;
2058238384Sjkim	generator = EC_POINT_new(group);
2059238384Sjkim	if (generator == NULL)
2060238384Sjkim		goto err;
2061238384Sjkim	BN_bin2bn(nistp256_curve_params[3], sizeof (felem_bytearray), x);
2062238384Sjkim	BN_bin2bn(nistp256_curve_params[4], sizeof (felem_bytearray), y);
2063238384Sjkim	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2064238384Sjkim		goto err;
2065238384Sjkim	if ((pre = nistp256_pre_comp_new()) == NULL)
2066238384Sjkim		goto err;
2067238384Sjkim	/* if the generator is the standard one, use built-in precomputation */
2068238384Sjkim	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2069238384Sjkim		{
2070238384Sjkim		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2071238384Sjkim		ret = 1;
2072238384Sjkim		goto err;
2073238384Sjkim		}
2074238384Sjkim	if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2075238384Sjkim		(!BN_to_felem(y_tmp, &group->generator->Y)) ||
2076238384Sjkim		(!BN_to_felem(z_tmp, &group->generator->Z)))
2077238384Sjkim		goto err;
2078238384Sjkim	felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2079238384Sjkim	felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2080238384Sjkim	felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2081238384Sjkim	/* compute 2^64*G, 2^128*G, 2^192*G for the first table,
2082238384Sjkim	 * 2^32*G, 2^96*G, 2^160*G, 2^224*G for the second one
2083238384Sjkim	 */
2084238384Sjkim	for (i = 1; i <= 8; i <<= 1)
2085238384Sjkim		{
2086238384Sjkim		point_double_small(
2087238384Sjkim			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
2088238384Sjkim			pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
2089238384Sjkim		for (j = 0; j < 31; ++j)
2090238384Sjkim			{
2091238384Sjkim			point_double_small(
2092238384Sjkim				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
2093238384Sjkim				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
2094238384Sjkim			}
2095238384Sjkim		if (i == 8)
2096238384Sjkim			break;
2097238384Sjkim		point_double_small(
2098238384Sjkim			pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
2099238384Sjkim			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
2100238384Sjkim		for (j = 0; j < 31; ++j)
2101238384Sjkim			{
2102238384Sjkim			point_double_small(
2103238384Sjkim				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
2104238384Sjkim				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]);
2105238384Sjkim			}
2106238384Sjkim		}
2107238384Sjkim	for (i = 0; i < 2; i++)
2108238384Sjkim		{
2109238384Sjkim		/* g_pre_comp[i][0] is the point at infinity */
2110238384Sjkim		memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2111238384Sjkim		/* the remaining multiples */
2112238384Sjkim		/* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2113238384Sjkim		point_add_small(
2114238384Sjkim			pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], pre->g_pre_comp[i][6][2],
2115238384Sjkim			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2116238384Sjkim			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
2117238384Sjkim		/* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2118238384Sjkim		point_add_small(
2119238384Sjkim			pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], pre->g_pre_comp[i][10][2],
2120238384Sjkim			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2121238384Sjkim			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
2122238384Sjkim		/* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2123238384Sjkim		point_add_small(
2124238384Sjkim			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2125238384Sjkim			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2126238384Sjkim			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2]);
2127238384Sjkim		/* 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G */
2128238384Sjkim		point_add_small(
2129238384Sjkim			pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], pre->g_pre_comp[i][14][2],
2130238384Sjkim			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2131238384Sjkim			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
2132238384Sjkim		for (j = 1; j < 8; ++j)
2133238384Sjkim			{
2134238384Sjkim			/* odd multiples: add G resp. 2^32*G */
2135238384Sjkim			point_add_small(
2136238384Sjkim				pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], pre->g_pre_comp[i][2*j+1][2],
2137238384Sjkim				pre->g_pre_comp[i][2*j][0], pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2],
2138238384Sjkim				pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], pre->g_pre_comp[i][1][2]);
2139238384Sjkim			}
2140238384Sjkim		}
2141238384Sjkim	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2142238384Sjkim
2143238384Sjkim	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2144238384Sjkim			nistp256_pre_comp_free, nistp256_pre_comp_clear_free))
2145238384Sjkim		goto err;
2146238384Sjkim	ret = 1;
2147238384Sjkim	pre = NULL;
2148238384Sjkim err:
2149238384Sjkim	BN_CTX_end(ctx);
2150238384Sjkim	if (generator != NULL)
2151238384Sjkim		EC_POINT_free(generator);
2152238384Sjkim	if (new_ctx != NULL)
2153238384Sjkim		BN_CTX_free(new_ctx);
2154238384Sjkim	if (pre)
2155238384Sjkim		nistp256_pre_comp_free(pre);
2156238384Sjkim	return ret;
2157238384Sjkim	}
2158238384Sjkim
2159238384Sjkimint ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2160238384Sjkim	{
2161238384Sjkim	if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2162238384Sjkim			nistp256_pre_comp_free, nistp256_pre_comp_clear_free)
2163238384Sjkim		!= NULL)
2164238384Sjkim		return 1;
2165238384Sjkim	else
2166238384Sjkim		return 0;
2167238384Sjkim	}
2168238384Sjkim#else
2169238384Sjkimstatic void *dummy=&dummy;
2170238384Sjkim#endif
2171