/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25#include "inner.h"
26
/* obsolete
#include <stdio.h>
#include <stdlib.h>
static void
print_int(const char *name, const uint32_t *x)
{
	size_t u;
	unsigned char tmp[40];

	printf("%s = ", name);
	for (u = 0; u < 9; u ++) {
		if (x[u] > 0x3FFFFFFF) {
			printf("INVALID:");
			for (u = 0; u < 9; u ++) {
				printf(" %08X", x[u]);
			}
			printf("\n");
			return;
		}
	}
	memset(tmp, 0, sizeof tmp);
	for (u = 0; u < 9; u ++) {
		uint64_t w;
		int j, k;

		w = x[u];
		j = 30 * (int)u;
		k = j & 7;
		if (k != 0) {
			w <<= k;
			j -= k;
		}
		k = j >> 3;
		for (j = 0; j < 8; j ++) {
			tmp[39 - k - j] |= (unsigned char)w;
			w >>= 8;
		}
	}
	for (u = 8; u < 40; u ++) {
		printf("%02X", tmp[u]);
	}
	printf("\n");
}
*/
71
/*
 * ARSH(x, n): arithmetic (sign-extending) right shift of the 32-bit
 * value x by n bits.
 *
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
 * that right-shifting a signed negative integer copies the sign bit
 * (arithmetic right-shift). This is "implementation-defined behaviour",
 * i.e. it is not undefined, but it may differ between compilers. Each
 * compiler is supposed to document its behaviour in that respect. GCC
 * explicitly defines that an arithmetic right shift is used. We expect
 * all other compilers to do the same, because the underlying CPUs offer
 * an arithmetic right shift opcode that could not be used otherwise.
 * In that variant, x must be an lvalue (its address is taken so that it
 * can be reinterpreted in place as an int32_t).
 *
 * If BR_NO_ARITH_SHIFT is non-zero, the shift is emulated with unsigned
 * operations only: the logical shift result is completed with copies of
 * the top bit of x in the n upper bits. In that variant, x is evaluated
 * twice, and n must be in the 1..31 range (n = 0 would yield a shift
 * count of 32).
 */
#if BR_NO_ARITH_SHIFT
#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
#else
#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
#endif
88
/*
 * Convert an integer from unsigned little-endian encoding to a sequence
 * of 30-bit words in little-endian order (least significant word first).
 *
 *   dst   receives the complete words; one word is written each time
 *         30 bits have been gathered (8 words for a 32-byte input)
 *   src   source bytes, least significant byte first
 *   len   number of source bytes
 *
 * The final "partial" word (the remaining high bits that did not fill a
 * complete 30-bit word) is returned; it is NOT written into dst, so the
 * caller must store it if needed.
 */
static uint32_t
le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
{
	uint32_t acc;
	int acc_len;

	/*
	 * acc holds the bits not yet written out; acc_len is the number
	 * of valid bits in acc (always less than 30).
	 */
	acc = 0;
	acc_len = 0;
	while (len -- > 0) {
		uint32_t b;

		b = *src ++;
		if (acc_len < 22) {
			/* Still room for a full byte below bit 30. */
			acc |= b << acc_len;
			acc_len += 8;
		} else {
			/*
			 * This byte completes a 30-bit word: its low
			 * (30 - acc_len) bits go into the word, the
			 * remaining high bits start the next one.
			 */
			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
			acc = b >> (30 - acc_len);
			acc_len -= 22;
		}
	}
	return acc;
}
117
/*
 * Convert an integer (30-bit words, little-endian order) to unsigned
 * little-endian encoding.
 *
 *   dst   destination buffer
 *   len   total encoding length, in bytes; exactly len bytes are written
 *   src   source words, least significant word first; a new word is
 *         fetched whenever fewer than 8 bits are pending (9 words are
 *         read for a 32-byte output)
 */
static void
le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
{
	uint32_t acc;
	int acc_len;

	/*
	 * acc holds the bits not yet emitted; acc_len is the number of
	 * valid bits in acc.
	 */
	acc = 0;
	acc_len = 0;
	while (len -- > 0) {
		if (acc_len < 8) {
			uint32_t w;

			/*
			 * Not enough pending bits for a byte: complete
			 * them with the low bits of the next word, and
			 * keep the rest of that word for later.
			 */
			w = *src ++;
			*dst ++ = (unsigned char)(acc | (w << acc_len));
			acc = w >> (8 - acc_len);
			acc_len += 22;
		} else {
			*dst ++ = (unsigned char)acc;
			acc >>= 8;
			acc_len -= 8;
		}
	}
}
146
/*
 * Multiply two integers. Source integers are represented as arrays of
 * nine 30-bit words (little-endian order), for values up to 2^270-1.
 *
 *   d   receives the product, over 18 words of 30 bits each
 *   a   first operand (9 words)
 *   b   second operand (9 words)
 *
 * All the partial products are computed before the first word of d is
 * written. The products are fully unrolled, one output column per
 * statement.
 */
static void
mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	/*
	 * Maximum intermediate result is no more than
	 * 10376293531797946367, which fits in 64 bits. Reason:
	 *
	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
	 *   10376293531797946367 < 9663676407 * 2^30
	 *
	 * Thus, adding together 9 products of 30-bit integers, with
	 * a carry of at most 9663676406, yields an integer that fits
	 * on 64 bits and generates a carry of at most 9663676406.
	 */
	uint64_t t[17];
	uint64_t cc;
	int i;

	/*
	 * Column k gathers all the products a[i]*b[j] such that i+j = k.
	 */
	t[ 0] = MUL31(a[0], b[0]);
	t[ 1] = MUL31(a[0], b[1])
		+ MUL31(a[1], b[0]);
	t[ 2] = MUL31(a[0], b[2])
		+ MUL31(a[1], b[1])
		+ MUL31(a[2], b[0]);
	t[ 3] = MUL31(a[0], b[3])
		+ MUL31(a[1], b[2])
		+ MUL31(a[2], b[1])
		+ MUL31(a[3], b[0]);
	t[ 4] = MUL31(a[0], b[4])
		+ MUL31(a[1], b[3])
		+ MUL31(a[2], b[2])
		+ MUL31(a[3], b[1])
		+ MUL31(a[4], b[0]);
	t[ 5] = MUL31(a[0], b[5])
		+ MUL31(a[1], b[4])
		+ MUL31(a[2], b[3])
		+ MUL31(a[3], b[2])
		+ MUL31(a[4], b[1])
		+ MUL31(a[5], b[0]);
	t[ 6] = MUL31(a[0], b[6])
		+ MUL31(a[1], b[5])
		+ MUL31(a[2], b[4])
		+ MUL31(a[3], b[3])
		+ MUL31(a[4], b[2])
		+ MUL31(a[5], b[1])
		+ MUL31(a[6], b[0]);
	t[ 7] = MUL31(a[0], b[7])
		+ MUL31(a[1], b[6])
		+ MUL31(a[2], b[5])
		+ MUL31(a[3], b[4])
		+ MUL31(a[4], b[3])
		+ MUL31(a[5], b[2])
		+ MUL31(a[6], b[1])
		+ MUL31(a[7], b[0]);
	t[ 8] = MUL31(a[0], b[8])
		+ MUL31(a[1], b[7])
		+ MUL31(a[2], b[6])
		+ MUL31(a[3], b[5])
		+ MUL31(a[4], b[4])
		+ MUL31(a[5], b[3])
		+ MUL31(a[6], b[2])
		+ MUL31(a[7], b[1])
		+ MUL31(a[8], b[0]);
	t[ 9] = MUL31(a[1], b[8])
		+ MUL31(a[2], b[7])
		+ MUL31(a[3], b[6])
		+ MUL31(a[4], b[5])
		+ MUL31(a[5], b[4])
		+ MUL31(a[6], b[3])
		+ MUL31(a[7], b[2])
		+ MUL31(a[8], b[1]);
	t[10] = MUL31(a[2], b[8])
		+ MUL31(a[3], b[7])
		+ MUL31(a[4], b[6])
		+ MUL31(a[5], b[5])
		+ MUL31(a[6], b[4])
		+ MUL31(a[7], b[3])
		+ MUL31(a[8], b[2]);
	t[11] = MUL31(a[3], b[8])
		+ MUL31(a[4], b[7])
		+ MUL31(a[5], b[6])
		+ MUL31(a[6], b[5])
		+ MUL31(a[7], b[4])
		+ MUL31(a[8], b[3]);
	t[12] = MUL31(a[4], b[8])
		+ MUL31(a[5], b[7])
		+ MUL31(a[6], b[6])
		+ MUL31(a[7], b[5])
		+ MUL31(a[8], b[4]);
	t[13] = MUL31(a[5], b[8])
		+ MUL31(a[6], b[7])
		+ MUL31(a[7], b[6])
		+ MUL31(a[8], b[5]);
	t[14] = MUL31(a[6], b[8])
		+ MUL31(a[7], b[7])
		+ MUL31(a[8], b[6]);
	t[15] = MUL31(a[7], b[8])
		+ MUL31(a[8], b[7]);
	t[16] = MUL31(a[8], b[8]);

	/*
	 * Propagate carries: each output word keeps the low 30 bits of
	 * its column, the rest is added to the next column. The last
	 * carry becomes the top word.
	 */
	cc = 0;
	for (i = 0; i < 17; i ++) {
		uint64_t w;

		w = t[i] + cc;
		d[i] = (uint32_t)w & 0x3FFFFFFF;
		cc = w >> 30;
	}
	d[17] = (uint32_t)cc;
}
265
/*
 * Square a 270-bit integer, represented as an array of nine 30-bit
 * words (little-endian order).
 *
 *   d   receives the square, over 18 words of 30 bits each
 *   a   operand (9 words)
 *
 * Each cross product a[i]*a[j] (i != j) appears twice in the square, so
 * it is computed once and doubled with a left shift; the "diagonal"
 * products a[i]*a[i] are added undoubled. All the products are computed
 * before the first word of d is written.
 */
static void
square9(uint32_t *d, const uint32_t *a)
{
	uint64_t t[17];
	uint64_t cc;
	int i;

	/*
	 * Column k gathers all the products a[i]*a[j] such that i+j = k.
	 */
	t[ 0] = MUL31(a[0], a[0]);
	t[ 1] = ((MUL31(a[0], a[1])) << 1);
	t[ 2] = MUL31(a[1], a[1])
		+ ((MUL31(a[0], a[2])) << 1);
	t[ 3] = ((MUL31(a[0], a[3])
		+ MUL31(a[1], a[2])) << 1);
	t[ 4] = MUL31(a[2], a[2])
		+ ((MUL31(a[0], a[4])
		+ MUL31(a[1], a[3])) << 1);
	t[ 5] = ((MUL31(a[0], a[5])
		+ MUL31(a[1], a[4])
		+ MUL31(a[2], a[3])) << 1);
	t[ 6] = MUL31(a[3], a[3])
		+ ((MUL31(a[0], a[6])
		+ MUL31(a[1], a[5])
		+ MUL31(a[2], a[4])) << 1);
	t[ 7] = ((MUL31(a[0], a[7])
		+ MUL31(a[1], a[6])
		+ MUL31(a[2], a[5])
		+ MUL31(a[3], a[4])) << 1);
	t[ 8] = MUL31(a[4], a[4])
		+ ((MUL31(a[0], a[8])
		+ MUL31(a[1], a[7])
		+ MUL31(a[2], a[6])
		+ MUL31(a[3], a[5])) << 1);
	t[ 9] = ((MUL31(a[1], a[8])
		+ MUL31(a[2], a[7])
		+ MUL31(a[3], a[6])
		+ MUL31(a[4], a[5])) << 1);
	t[10] = MUL31(a[5], a[5])
		+ ((MUL31(a[2], a[8])
		+ MUL31(a[3], a[7])
		+ MUL31(a[4], a[6])) << 1);
	t[11] = ((MUL31(a[3], a[8])
		+ MUL31(a[4], a[7])
		+ MUL31(a[5], a[6])) << 1);
	t[12] = MUL31(a[6], a[6])
		+ ((MUL31(a[4], a[8])
		+ MUL31(a[5], a[7])) << 1);
	t[13] = ((MUL31(a[5], a[8])
		+ MUL31(a[6], a[7])) << 1);
	t[14] = MUL31(a[7], a[7])
		+ ((MUL31(a[6], a[8])) << 1);
	t[15] = ((MUL31(a[7], a[8])) << 1);
	t[16] = MUL31(a[8], a[8]);

	/*
	 * Propagate carries: each output word keeps the low 30 bits of
	 * its column, the rest is added to the next column. The last
	 * carry becomes the top word.
	 */
	cc = 0;
	for (i = 0; i < 17; i ++) {
		uint64_t w;

		w = t[i] + cc;
		d[i] = (uint32_t)w & 0x3FFFFFFF;
		cc = w >> 30;
	}
	d[17] = (uint32_t)cc;
}
336
/*
 * Perform a "final reduction" in field F255 (field for Curve25519,
 * modulus p = 2^255-19).
 *
 *   d   value to reduce, in place (9 words of 30 bits, little-endian);
 *       it must be less than twice the modulus
 *
 * If the value is not lower than the modulus, then the modulus is
 * subtracted and this function returns 1; otherwise, the value is left
 * untouched and this function returns 0. The choice between the two
 * outcomes is delegated to CCOPY(), applied to the complete array.
 */
static uint32_t
reduce_final_f255(uint32_t *d)
{
	uint32_t t[9];
	uint32_t cc;
	int i;

	/*
	 * Compute t = d + 19. Then d >= p exactly when t >= 2^255, in
	 * which case d - p is t with bit 255 removed.
	 */
	memcpy(t, d, sizeof t);
	cc = 19;
	for (i = 0; i < 9; i ++) {
		uint32_t w;

		w = t[i] + cc;
		cc = w >> 30;
		t[i] = w & 0x3FFFFFFF;
	}

	/*
	 * Word 8 starts at bit 240, hence bit 255 of the value is bit 15
	 * of t[8].
	 */
	cc = t[8] >> 15;
	t[8] &= 0x7FFF;
	CCOPY(cc, d, t, sizeof t);
	return cc;
}
365
/*
 * Perform a multiplication of two integers modulo 2^255-19.
 *
 *   d   receives the result (9 words of 30 bits, little-endian)
 *   a   first operand (9 words); value may be up to 2^256-1
 *   b   second operand (9 words); value may be up to 2^256-1
 *
 * On output, the value fits on 256 bits and is lower than twice the
 * modulus (partial reduction only). d is written only after the whole
 * product has been computed in a local buffer, so d may be the same
 * array as a and/or b.
 */
static void
f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	uint32_t t[18], cc;
	int i;

	/*
	 * Compute raw multiplication. All result words fit in 30 bits
	 * each; upper word (t[17]) must fit on 2 bits, since the product
	 * of two 256-bit integers must fit on 512 bits.
	 */
	mul9(t, a, b);

	/*
	 * Modular reduction: each high word is added where necessary.
	 * Since the modulus is 2^255-19 and word 9 corresponds to
	 * offset 9*30 = 270, word 9+k must be added to word k with
	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
	 * added that way.
	 *
	 * Keeping the carry on 32 bits helps with 32-bit architectures,
	 * and does not noticeably impact performance on 64-bit systems.
	 */
	cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
	t[8] &= 0x7FFF;
	for (i = 0; i < 9; i ++) {
		uint64_t w;

		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
		t[i] = (uint32_t)w & 0x3FFFFFFF;
		cc = (uint32_t)(w >> 30);  /* at most 622592 */
	}

	/*
	 * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
	 * This was split into two parts (upper of 257 bits, lower of 255
	 * bits), and the upper was added to the lower with a factor 19,
	 * which means that the intermediate value is less than 77*2^255
	 * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
	 * less than 77, and the initial carry cc is at most 76*19 = 1444.
	 */
	cc = MUL15(t[8] >> 15, 19);
	t[8] &= 0x7FFF;
	for (i = 0; i < 9; i ++) {
		uint32_t z;

		z = t[i] + cc;
		d[i] = z & 0x3FFFFFFF;
		cc = z >> 30;
	}

	/*
	 * Final result is at most 2^255 + 1443. In particular, the last
	 * carry is necessarily 0, since t[8] was truncated to 15 bits.
	 */
}
428
/*
 * Perform a squaring of an integer modulo 2^255-19.
 *
 *   d   receives the result (9 words of 30 bits, little-endian)
 *   a   operand (9 words); value may be up to 2^256-1
 *
 * On output, the value fits on 256 bits and is lower than twice the
 * modulus (partial reduction only). d is written only after the whole
 * square has been computed in a local buffer, so d may be the same
 * array as a.
 */
static void
f255_square(uint32_t *d, const uint32_t *a)
{
	uint32_t t[18], cc;
	int i;

	/*
	 * Compute raw squaring. All result words fit in 30 bits
	 * each; upper word (t[17]) must fit on 2 bits, since the square
	 * of a 256-bit integer must fit on 512 bits.
	 */
	square9(t, a);

	/*
	 * Modular reduction, first pass: since 2^255 = 19 modulo the
	 * modulus, and word 9 corresponds to offset 9*30 = 270, word 9+k
	 * is added to word k with a factor of 19*2^15 = 622592. The bits
	 * of word 8 beyond bit 255 (t[8] >> 15) are folded back into
	 * word 0 with a factor of 19, as the initial carry.
	 */
	cc = MUL15(t[8] >> 15, 19);
	t[8] &= 0x7FFF;
	for (i = 0; i < 9; i ++) {
		uint64_t w;

		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
		t[i] = (uint32_t)w & 0x3FFFFFFF;
		cc = (uint32_t)(w >> 30);
	}

	/*
	 * Second pass: the first pass may have left a few bits beyond
	 * bit 255; fold them again with a factor of 19. This time only
	 * small carries can occur, so 32-bit arithmetic is enough.
	 */
	cc = MUL15(t[8] >> 15, 19);
	t[8] &= 0x7FFF;
	for (i = 0; i < 9; i ++) {
		uint32_t z;

		z = t[i] + cc;
		d[i] = z & 0x3FFFFFFF;
		cc = z >> 30;
	}
}
471
/*
 * Add two values in F255.
 *
 *   d   receives the sum (9 words of 30 bits, little-endian)
 *   a   first operand (9 words)
 *   b   second operand (9 words)
 *
 * Partial reduction is performed (down to less than twice the modulus).
 * Each word of a and b is read before the corresponding word of d is
 * written, so d may be the same array as a or b.
 */
static void
f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	/*
	 * Since operand words fit on 30 bits, we can use 32-bit
	 * variables throughout.
	 */
	int i;
	uint32_t cc, w;

	/*
	 * Raw addition with carry propagation.
	 */
	cc = 0;
	for (i = 0; i < 9; i ++) {
		w = a[i] + b[i] + cc;
		d[i] = w & 0x3FFFFFFF;
		cc = w >> 30;
	}

	/*
	 * At that point, w is the unmasked top word (word 8). Its bits
	 * beyond bit 15 represent multiples of 2^255, which are folded
	 * back into the low words with a factor of 19.
	 */
	cc = MUL15(w >> 15, 19);
	d[8] &= 0x7FFF;
	for (i = 0; i < 9; i ++) {
		w = d[i] + cc;
		d[i] = w & 0x3FFFFFFF;
		cc = w >> 30;
	}
}
500
/*
 * Subtract one value from another in F255.
 *
 *   d   receives a - b (9 words of 30 bits, little-endian)
 *   a   first operand (9 words)
 *   b   second operand (9 words)
 *
 * Partial reduction is performed (down to less than twice the modulus).
 * Each word of a and b is read before the corresponding word of d is
 * written, so d may be the same array as a or b.
 */
static void
f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	/*
	 * We actually compute a - b + 2*p, so that the final value is
	 * necessarily positive. Since 2*p = 2^256 - 38, this is done in
	 * two steps: the -38 is injected as the initial carry, and the
	 * 2^256 is added to the top word afterwards (word 8 starts at
	 * bit 240, hence 2^256 is 0x10000 in that word).
	 */
	int i;
	uint32_t cc, w;

	cc = (uint32_t)-38;
	for (i = 0; i < 9; i ++) {
		w = a[i] - b[i] + cc;
		d[i] = w & 0x3FFFFFFF;
		/*
		 * w may be "negative" (in two's complement); the
		 * arithmetic shift yields the signed borrow.
		 */
		cc = ARSH(w, 30);
	}

	/*
	 * w is the unmasked top word. After adding 2^256, its bits
	 * beyond bit 15 represent multiples of 2^255, which are folded
	 * back into the low words with a factor of 19.
	 */
	cc = MUL15((w + 0x10000) >> 15, 19);
	d[8] &= 0x7FFF;
	for (i = 0; i < 9; i ++) {
		w = d[i] + cc;
		d[i] = w & 0x3FFFFFFF;
		cc = w >> 30;
	}
}
529
/*
 * Multiply an integer by the 'A24' constant (121665).
 *
 *   d   receives the result (9 words of 30 bits, little-endian)
 *   a   operand (9 words); value must fit on 256 bits
 *
 * Partial reduction is performed (down to less than twice the modulus).
 */
static void
f255_mul_a24(uint32_t *d, const uint32_t *a)
{
	int i;
	uint64_t w;
	uint32_t cc;

	/*
	 * a[] is over 256 bits, thus a[8] has length at most 16 bits.
	 * We single out the processing of the last word: intermediate
	 * value w is up to 121665*2^16, yielding a carry for the next
	 * loop of at most 19*(121665*2^16/2^15) = 4623289.
	 */
	cc = 0;
	for (i = 0; i < 8; i ++) {
		w = MUL31(a[i], 121665) + (uint64_t)cc;
		d[i] = (uint32_t)w & 0x3FFFFFFF;
		cc = (uint32_t)(w >> 30);
	}

	/*
	 * Top word: only the 15 low bits are kept (up to bit 254 of the
	 * value); everything above is folded back with a factor of 19.
	 */
	w = MUL31(a[8], 121665) + (uint64_t)cc;
	d[8] = (uint32_t)w & 0x7FFF;
	cc = MUL15((uint32_t)(w >> 15), 19);

	for (i = 0; i < 9; i ++) {
		uint32_t z;

		z = d[i] + cc;
		d[i] = z & 0x3FFFFFFF;
		cc = z >> 30;
	}
}
565
/*
 * Conventional generator, as returned by api_generator() and used by
 * api_mulgen(): a 32-byte point encoding whose first byte is 9 and all
 * other bytes are zero. Since api_mul() decodes points in little-endian
 * order, this is the value 9.
 */
static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
572
/*
 * Value returned by api_order(): 32 bytes, 0x7F followed by 31 bytes of
 * value 0xFF. Read as an unsigned big-endian integer (the convention
 * used for multipliers in api_mul()), this is 2^255-1.
 * NOTE(review): this is not the order of the subgroup generated by GEN;
 * presumably it is a conventional upper bound for multipliers -- confirm
 * against the br_ec_impl documentation.
 */
static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};
579
580static const unsigned char *
581api_generator(int curve, size_t *len)
582{
583	(void)curve;
584	*len = 32;
585	return GEN;
586}
587
588static const unsigned char *
589api_order(int curve, size_t *len)
590{
591	(void)curve;
592	*len = 32;
593	return ORDER;
594}
595
/*
 * Get the offset and length of the X coordinate within an encoded
 * point.
 *
 *   curve   ignored (this implementation handles a single curve)
 *   len     receives the coordinate length, in bytes (always 32)
 *
 * Returned value is the offset, which is always 0: the X coordinate
 * spans the complete 32-byte encoding.
 */
static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}
603
/*
 * Conditionally swap two field elements (9 words each).
 *
 *   a, b   the two arrays
 *   ctl    control value: 1 to swap the contents of a and b, 0 to
 *          leave them unchanged (other values are not supported)
 *
 * There is no branch on ctl: it is expanded into an all-zeros or
 * all-ones mask, and both arrays are read and written in all cases.
 */
static void
cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
{
	int i;

	/* 0 -> 0x00000000, 1 -> 0xFFFFFFFF */
	ctl = -ctl;
	for (i = 0; i < 9; i ++) {
		uint32_t aw, bw, tw;

		aw = a[i];
		bw = b[i];
		/* tw is aw^bw if swapping, 0 otherwise. */
		tw = ctl & (aw ^ bw);
		a[i] = aw ^ tw;
		b[i] = bw ^ tw;
	}
}
620
/*
 * Multiply a curve point by a scalar (X25519 function, computed with a
 * Montgomery ladder).
 *
 *   G       point to multiply, 32 bytes, little-endian; it is replaced
 *           with the result (also 32 bytes, little-endian)
 *   Glen    length of G, in bytes; must be exactly 32
 *   kb      multiplier, in unsigned big-endian notation
 *   kblen   length of kb, in bytes; must be at most 32 (shorter values
 *           are padded on the left with zeros)
 *   curve   ignored (this implementation handles a single curve)
 *
 * Returned value is 1 on success, 0 if one of the lengths is invalid (in
 * which case G is not modified). The multiplier is "clamped" before use:
 * its three low bits and bit 255 are cleared, and bit 254 is set.
 */
static uint32_t
api_mul(unsigned char *G, size_t Glen,
	const unsigned char *kb, size_t kblen, int curve)
{
	uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
	uint32_t a[9], aa[9], b[9], bb[9];
	uint32_t c[9], d[9], e[9], da[9], cb[9];
	unsigned char k[32];
	uint32_t swap;
	int i;

	(void)curve;

	/*
	 * Points are encoded over exactly 32 bytes. Multipliers must fit
	 * in 32 bytes as well.
	 * RFC 7748 mandates that the high bit of the last point byte must
	 * be ignored/cleared.
	 */
	if (Glen != 32 || kblen > 32) {
		return 0;
	}
	G[31] &= 0x7F;

	/*
	 * Initialise variables x1, x2, z2, x3 and z3: x1 and x3 are the
	 * decoded input point; (x2:z2) starts as (1:0) and (x3:z3) as
	 * (x1:1). le8_to_le30() writes the eight complete words and
	 * returns the top (partial) one.
	 */
	x1[8] = le8_to_le30(x1, G, 32);
	memcpy(x3, x1, sizeof x1);
	memset(z2, 0, sizeof z2);
	memset(x2, 0, sizeof x2);
	x2[0] = 1;
	memset(z3, 0, sizeof z3);
	z3[0] = 1;

	/*
	 * The multiplier is copied into k[], right-aligned (big-endian,
	 * padded with zeros on the left), then clamped: k[31] contains
	 * the least significant bits, k[0] the most significant ones.
	 */
	memset(k, 0, (sizeof k) - kblen);
	memcpy(k + (sizeof k) - kblen, kb, kblen);
	k[31] &= 0xF8;
	k[0] &= 0x7F;
	k[0] |= 0x40;

	/*
	 * Montgomery ladder, from bit 254 down to bit 0. The conditional
	 * swaps are lazy: 'swap' remembers the previous multiplier bit,
	 * so that a single swap is done per iteration (controlled by the
	 * XOR of the previous and current bits).
	 */
	swap = 0;
	for (i = 254; i >= 0; i --) {
		uint32_t kt;

		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
		swap ^= kt;
		cswap(x2, x3, swap);
		cswap(z2, z3, swap);
		swap = kt;

		/*
		 * Ladder step; variable names follow RFC 7748.
		 */
		f255_add(a, x2, z2);
		f255_square(aa, a);
		f255_sub(b, x2, z2);
		f255_square(bb, b);
		f255_sub(e, aa, bb);
		f255_add(c, x3, z3);
		f255_sub(d, x3, z3);
		f255_mul(da, d, a);
		f255_mul(cb, c, b);

		f255_add(x3, da, cb);
		f255_square(x3, x3);
		f255_sub(z3, da, cb);
		f255_square(z3, z3);
		f255_mul(z3, z3, x1);
		f255_mul(x2, aa, bb);
		f255_mul_a24(z2, e);
		f255_add(z2, z2, aa);
		f255_mul(z2, e, z2);
	}
	cswap(x2, x3, swap);
	cswap(z2, z3, swap);

	/*
	 * Inverse z2 with a modular exponentiation (exponent is p-2,
	 * i.e. 2^255-21). This is a simple square-and-multiply algorithm;
	 * we mutualise most non-squarings since the exponent contains
	 * almost only ones.
	 *
	 * First step: a = z2^(2^16-1).
	 */
	memcpy(a, z2, sizeof z2);
	for (i = 0; i < 15; i ++) {
		f255_square(a, a);
		f255_mul(a, a, z2);
	}

	/*
	 * Second step: b = z2^(2^240-1), by appending 14 more chunks of
	 * sixteen 1 bits each to the exponent.
	 */
	memcpy(b, a, sizeof a);
	for (i = 0; i < 14; i ++) {
		int j;

		for (j = 0; j < 16; j ++) {
			f255_square(b, b);
		}
		f255_mul(b, b, a);
	}

	/*
	 * Last step: the 15 low bits of the exponent are the 15 low bits
	 * of 0xFFEB.
	 */
	for (i = 14; i >= 0; i --) {
		f255_square(b, b);
		if ((0xFFEB >> i) & 1) {
			f255_mul(b, z2, b);
		}
	}

	/*
	 * Result is x2/z2, fully reduced, then encoded back into G.
	 */
	f255_mul(x2, x2, b);
	reduce_final_f255(x2);
	le30_to_le8(G, 32, x2);
	return 1;
}
756
/*
 * Multiply the conventional generator by a scalar.
 *
 *   R       receives the resulting point (32 bytes)
 *   x       multiplier, in unsigned big-endian notation
 *   xlen    length of x, in bytes
 *   curve   passed through to api_generator() and api_mul(), which
 *           both ignore it
 *
 * Returned value is the length of the point written into R (32). The
 * status returned by api_mul() is not checked: if api_mul() rejects the
 * multiplier (xlen greater than 32), R contains the unmodified
 * generator.
 */
static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	const unsigned char *G;
	size_t Glen;

	/* api_mul() works in place: start from a copy of the generator. */
	G = api_generator(curve, &Glen);
	memcpy(R, G, Glen);
	api_mul(R, Glen, x, xlen, curve);
	return Glen;
}
769
/*
 * Placeholder for the "multiply and add" operation (x*A + y*B).
 *
 * All the parameters are ignored, nothing is written, and the returned
 * value is always 0.
 */
static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	/*
	 * We don't implement this method, since it is used for ECDSA
	 * only, and there is no ECDSA over Curve25519 (which instead
	 * uses EdDSA).
	 */
	(void)A;
	(void)B;
	(void)len;
	(void)x;
	(void)xlen;
	(void)y;
	(void)ylen;
	(void)curve;
	return 0;
}
790
791/* see bearssl_ec.h */
792const br_ec_impl br_ec_c25519_m31 = {
793	(uint32_t)0x20000000,
794	&api_generator,
795	&api_order,
796	&api_xoff,
797	&api_mul,
798	&api_mulgen,
799	&api_muladd
800};
801