// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
 * Copyright (C) 2018-2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * This is a machine-generated formally verified implementation of Curve25519
 * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
 * generated, it has been tweaked to be suitable for use in the kernel. It is
 * optimized for 64-bit machines that can efficiently work with 128-bit
 * integer types.
 */

typedef __uint128_t u128;

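/*
 * Constant-time comparison helpers: u64_eq_mask() returns all-ones when
 * a == b and zero otherwise; u64_gte_mask() returns all-ones when a >= b.
 * Both are branch-free, so they leak nothing about their operands.
 */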
static __always_inline u64 u64_eq_mask(u64 a, u64 b)
{
	u64 x = a ^ b;
	u64 minus_x = ~x + (u64)1U;
	u64 x_or_minus_x = x | minus_x;
	u64 xnx = x_or_minus_x >> (u32)63U;
	u64 c = xnx - (u64)1U;
	return c;
}

static __always_inline u64 u64_gte_mask(u64 a, u64 b)
{
	u64 x = a;
	u64 y = b;
	u64 x_xor_y = x ^ y;
	u64 x_sub_y = x - y;
	u64 x_sub_y_xor_y = x_sub_y ^ y;
	u64 q = x_xor_y | x_sub_y_xor_y;
	u64 x_xor_q = x ^ q;
	u64 x_xor_q_ = x_xor_q >> (u32)63U;
	u64 c = x_xor_q_ - (u64)1U;
	return c;
}

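/*
 * Field elements are held as five 64-bit limbs in radix 2^51, reduced
 * modulo p = 2^255 - 19. modulo_carry_top() folds the bits above 2^255
 * back into the lowest limb, using 2^255 == 19 (mod p).
 */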
static __always_inline void modulo_carry_top(u64 *b)
{
	u64 b4 = b[4];
	u64 b0 = b[0];
	u64 b4_ = b4 & 0x7ffffffffffffLLU;
	u64 b0_ = b0 + 19 * (b4 >> 51);
	b[4] = b4_;
	b[0] = b0_;
}

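/* Narrow each 128-bit accumulator limb to its low 64 bits. */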
static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
{
	{
		u128 xi = input[0];
		output[0] = ((u64)(xi));
	}
	{
		u128 xi = input[1];
		output[1] = ((u64)(xi));
	}
	{
		u128 xi = input[2];
		output[2] = ((u64)(xi));
	}
	{
		u128 xi = input[3];
		output[3] = ((u64)(xi));
	}
	{
		u128 xi = input[4];
		output[4] = ((u64)(xi));
	}
}

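/* output += input * s, accumulated limb-wise into 128-bit limbs. */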
static __always_inline void
fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
{
	output[0] += (u128)input[0] * s;
	output[1] += (u128)input[1] * s;
	output[2] += (u128)input[2] * s;
	output[3] += (u128)input[3] * s;
	output[4] += (u128)input[4] * s;
}

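/*
 * Propagate carries through the wide accumulator: keep the low 51 bits of
 * limbs 0..3 and push the excess into the next limb.
 */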
static __always_inline void fproduct_carry_wide_(u128 *tmp)
{
	{
		u32 ctr = 0;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 1;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 2;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 3;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
}

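/*
 * Multiply the operand by 2^51 by shifting the limbs up one position,
 * reducing on the fly: the limb that wraps around the top is scaled by 19,
 * since 2^255 == 19 (mod p).
 */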
static __always_inline void fmul_shift_reduce(u64 *output)
{
	u64 tmp = output[4];
	u64 b0;
	{
		u32 ctr = 5 - 0 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 1 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 2 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 3 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	output[0] = tmp;
	b0 = output[0];
	output[0] = 19 * b0;
}

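/*
 * Schoolbook multiplication with interleaved reduction: accumulate
 * input * input21[i] for each limb of input21, shifting input between
 * steps via fmul_shift_reduce().
 */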
static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
						   u64 *input21)
{
	u32 i;
	u64 input2i;
	{
		u64 input2i = input21[0];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[1];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[2];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[3];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	i = 4;
	input2i = input21[i];
	fproduct_sum_scalar_multiplication_(output, input, input2i);
}

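/* Full field multiplication: output = input * input21 mod p. */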
static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
{
	u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
	{
		u128 b4;
		u128 b0;
		u128 b4_;
		u128 b0_;
		u64 i0;
		u64 i1;
		u64 i0_;
		u64 i1_;
		u128 t[5] = { 0 };
		fmul_mul_shift_reduce_(t, tmp, input21);
		fproduct_carry_wide_(t);
		b4 = t[4];
		b0 = t[0];
		b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
		b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
		t[4] = b4_;
		t[0] = b0_;
		fproduct_copy_from_wide_(output, t);
		i0 = output[0];
		i1 = output[1];
		i0_ = i0 & 0x7ffffffffffffLLU;
		i1_ = i1 + (i0 >> 51);
		output[0] = i0_;
		output[1] = i1_;
	}
}

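/*
 * Dedicated squaring: exploits the symmetry of the partial products,
 * doubling the cross terms and pre-scaling the high limbs by 19 so the
 * reduction is folded into the accumulation.
 */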
static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
{
	u64 r0 = output[0];
	u64 r1 = output[1];
	u64 r2 = output[2];
	u64 r3 = output[3];
	u64 r4 = output[4];
	u64 d0 = r0 * 2;
	u64 d1 = r1 * 2;
	u64 d2 = r2 * 2 * 19;
	u64 d419 = r4 * 19;
	u64 d4 = d419 * 2;
	u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
		   (((u128)(d2) * (r3))));
	u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
		   (((u128)(r3 * 19) * (r3))));
	u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
		   (((u128)(d4) * (r3))));
	u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
		   (((u128)(r4) * (d419))));
	u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
		   (((u128)(r2) * (r2))));
	tmp[0] = s0;
	tmp[1] = s1;
	tmp[2] = s2;
	tmp[3] = s3;
	tmp[4] = s4;
}

static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
{
	u128 b4;
	u128 b0;
	u128 b4_;
	u128 b0_;
	u64 i0;
	u64 i1;
	u64 i0_;
	u64 i1_;
	fsquare_fsquare__(tmp, output);
	fproduct_carry_wide_(tmp);
	b4 = tmp[4];
	b0 = tmp[0];
	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
	tmp[4] = b4_;
	tmp[0] = b0_;
	fproduct_copy_from_wide_(output, tmp);
	i0 = output[0];
	i1 = output[1];
	i0_ = i0 & 0x7ffffffffffffLLU;
	i1_ = i1 + (i0 >> 51);
	output[0] = i0_;
	output[1] = i1_;
}

static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
						   u32 count1)
{
	u32 i;
	fsquare_fsquare_(tmp, output);
	for (i = 1; i < count1; ++i)
		fsquare_fsquare_(tmp, output);
}

static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
						  u32 count1)
{
	u128 t[5];
	memcpy(output, input, 5 * sizeof(*input));
	fsquare_fsquare_times_(output, t, count1);
}

static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
							  u32 count1)
{
	u128 t[5];
	fsquare_fsquare_times_(output, t, count1);
}

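/*
 * Field inversion via Fermat's little theorem: out = z^(p - 2) mod p,
 * computed with the usual fixed addition chain of repeated squarings
 * (1, 2, 5, 10, 20, 50, 100, ...) and multiplications, so it is
 * constant-time.
 */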
static __always_inline void crecip_crecip(u64 *out, u64 *z)
{
	u64 buf[20] = { 0 };
	u64 *a0 = buf;
	u64 *t00 = buf + 5;
	u64 *b0 = buf + 10;
	u64 *t01;
	u64 *b1;
	u64 *c0;
	u64 *a;
	u64 *t0;
	u64 *b;
	u64 *c;
	fsquare_fsquare_times(a0, z, 1);
	fsquare_fsquare_times(t00, a0, 2);
	fmul_fmul(b0, t00, z);
	fmul_fmul(a0, b0, a0);
	fsquare_fsquare_times(t00, a0, 1);
	fmul_fmul(b0, t00, b0);
	fsquare_fsquare_times(t00, b0, 5);
	t01 = buf + 5;
	b1 = buf + 10;
	c0 = buf + 15;
	fmul_fmul(b1, t01, b1);
	fsquare_fsquare_times(t01, b1, 10);
	fmul_fmul(c0, t01, b1);
	fsquare_fsquare_times(t01, c0, 20);
	fmul_fmul(t01, t01, c0);
	fsquare_fsquare_times_inplace(t01, 10);
	fmul_fmul(b1, t01, b1);
	fsquare_fsquare_times(t01, b1, 50);
	a = buf;
	t0 = buf + 5;
	b = buf + 10;
	c = buf + 15;
	fmul_fmul(c, t0, b);
	fsquare_fsquare_times(t0, c, 100);
	fmul_fmul(t0, t0, c);
	fsquare_fsquare_times_inplace(t0, 50);
	fmul_fmul(t0, t0, b);
	fsquare_fsquare_times_inplace(t0, 5);
	fmul_fmul(out, t0, a);
}

static __always_inline void fsum(u64 *a, u64 *b)
{
	a[0] += b[0];
	a[1] += b[1];
	a[2] += b[2];
	a[3] += b[3];
	a[4] += b[4];
}

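/*
 * a := b - a. A multiple of p (8*p, spread across the limbs) is added to b
 * first so that the per-limb subtractions cannot underflow.
 */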
static __always_inline void fdifference(u64 *a, u64 *b)
{
	u64 tmp[5] = { 0 };
	u64 b0;
	u64 b1;
	u64 b2;
	u64 b3;
	u64 b4;
	memcpy(tmp, b, 5 * sizeof(*b));
	b0 = tmp[0];
	b1 = tmp[1];
	b2 = tmp[2];
	b3 = tmp[3];
	b4 = tmp[4];
	tmp[0] = b0 + 0x3fffffffffff68LLU;
	tmp[1] = b1 + 0x3ffffffffffff8LLU;
	tmp[2] = b2 + 0x3ffffffffffff8LLU;
	tmp[3] = b3 + 0x3ffffffffffff8LLU;
	tmp[4] = b4 + 0x3ffffffffffff8LLU;
	{
		u64 xi = a[0];
		u64 yi = tmp[0];
		a[0] = yi - xi;
	}
	{
		u64 xi = a[1];
		u64 yi = tmp[1];
		a[1] = yi - xi;
	}
	{
		u64 xi = a[2];
		u64 yi = tmp[2];
		a[2] = yi - xi;
	}
	{
		u64 xi = a[3];
		u64 yi = tmp[3];
		a[3] = yi - xi;
	}
	{
		u64 xi = a[4];
		u64 yi = tmp[4];
		a[4] = yi - xi;
	}
}

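/* output = b * s for a small scalar s, followed by a carry pass and a fold
 * of the top limb back into the bottom.
 */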
static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
{
	u128 tmp[5];
	u128 b4;
	u128 b0;
	u128 b4_;
	u128 b0_;
	{
		u64 xi = b[0];
		tmp[0] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[1];
		tmp[1] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[2];
		tmp[2] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[3];
		tmp[3] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[4];
		tmp[4] = ((u128)(xi) * (s));
	}
	fproduct_carry_wide_(tmp);
	b4 = tmp[4];
	b0 = tmp[0];
	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
	tmp[4] = b4_;
	tmp[0] = b0_;
	fproduct_copy_from_wide_(output, tmp);
}

static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
{
	fmul_fmul(output, a, b);
}

static __always_inline void crecip(u64 *output, u64 *input)
{
	crecip_crecip(output, input);
}

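/*
 * Constant-time conditional swap: iswap is 0 or 1 and is turned into an
 * all-zeros or all-ones mask (0 - iswap), and the two points (ten limbs
 * each) are exchanged limb by limb without branching on secret data.
 */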
static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
							u64 swap1, u32 ctr)
{
	u32 i = ctr - 1;
	u64 ai = a[i];
	u64 bi = b[i];
	u64 x = swap1 & (ai ^ bi);
	u64 ai1 = ai ^ x;
	u64 bi1 = bi ^ x;
	a[i] = ai1;
	b[i] = bi1;
}

static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
{
	point_swap_conditional_step(a, b, swap1, 5);
	point_swap_conditional_step(a, b, swap1, 4);
	point_swap_conditional_step(a, b, swap1, 3);
	point_swap_conditional_step(a, b, swap1, 2);
	point_swap_conditional_step(a, b, swap1, 1);
}

static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
{
	u64 swap1 = 0 - iswap;
	point_swap_conditional5(a, b, swap1);
	point_swap_conditional5(a + 5, b + 5, swap1);
}

static __always_inline void point_copy(u64 *output, u64 *input)
{
	memcpy(output, input, 5 * sizeof(*input));
	memcpy(output + 5, input + 5, 5 * sizeof(*input));
}

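/*
 * One Montgomery ladder step in projective x/z coordinates: given P, Q and
 * the x-coordinate of Q - P (qmqp, the base point), compute 2P into pp and
 * P + Q into ppq. The constant 121665 is (A - 2) / 4 for A = 486662.
 */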
static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
						u64 *pq, u64 *qmqp)
{
	u64 *qx = qmqp;
	u64 *x2 = pp;
	u64 *z2 = pp + 5;
	u64 *x3 = ppq;
	u64 *z3 = ppq + 5;
	u64 *x = p;
	u64 *z = p + 5;
	u64 *xprime = pq;
	u64 *zprime = pq + 5;
	u64 buf[40] = { 0 };
	u64 *origx = buf;
	u64 *origxprime0 = buf + 5;
	u64 *xxprime0;
	u64 *zzprime0;
	u64 *origxprime;
	xxprime0 = buf + 25;
	zzprime0 = buf + 30;
	memcpy(origx, x, 5 * sizeof(*x));
	fsum(x, z);
	fdifference(z, origx);
	memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
	fsum(xprime, zprime);
	fdifference(zprime, origxprime0);
	fmul(xxprime0, xprime, z);
	fmul(zzprime0, x, zprime);
	origxprime = buf + 5;
	{
		u64 *xx0;
		u64 *zz0;
		u64 *xxprime;
		u64 *zzprime;
		u64 *zzzprime;
		xx0 = buf + 15;
		zz0 = buf + 20;
		xxprime = buf + 25;
		zzprime = buf + 30;
		zzzprime = buf + 35;
		memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
		fsum(xxprime, zzprime);
		fdifference(zzprime, origxprime);
		fsquare_fsquare_times(x3, xxprime, 1);
		fsquare_fsquare_times(zzzprime, zzprime, 1);
		fmul(z3, zzzprime, qx);
		fsquare_fsquare_times(xx0, x, 1);
		fsquare_fsquare_times(zz0, z, 1);
		{
			u64 *zzz;
			u64 *xx;
			u64 *zz;
			u64 scalar;
			zzz = buf + 10;
			xx = buf + 15;
			zz = buf + 20;
			fmul(x2, xx, zz);
			fdifference(zz, xx);
			scalar = 121665;
			fscalar(zzz, zz, scalar);
			fsum(zzz, xx);
			fmul(z2, zzz, zz);
		}
	}
}

static __always_inline void
ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				       u64 *q, u8 byt)
{
	u64 bit0 = (u64)(byt >> 7);
	u64 bit;
	point_swap_conditional(nq, nqpq, bit0);
	addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
	bit = (u64)(byt >> 7);
	point_swap_conditional(nq2, nqpq2, bit);
}

static __always_inline void
ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
					      u64 *nqpq2, u64 *q, u8 byt)
{
	u8 byt1;
	ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
	byt1 = byt << 1;
	ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
}

static __always_inline void
ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				  u64 *q, u8 byt, u32 i)
{
	while (i--) {
		ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
							      nqpq2, q, byt);
		byt <<= 2;
	}
}

static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
							  u64 *nqpq, u64 *nq2,
							  u64 *nqpq2, u64 *q,
							  u32 i)
{
	while (i--) {
		u8 byte = n1[i];
		ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
						  byte, 4);
	}
}

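/*
 * Montgomery ladder: walk the 32 scalar bytes from the most significant
 * byte down, consuming 8 bits per byte (four double-steps), with a
 * constant-time conditional swap around every ladder step.
 */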
static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
{
	u64 point_buf[40] = { 0 };
	u64 *nq = point_buf;
	u64 *nqpq = point_buf + 10;
	u64 *nq2 = point_buf + 20;
	u64 *nqpq2 = point_buf + 30;
	point_copy(nqpq, q);
	nq[0] = 1;
	ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
	point_copy(result, nq);
}

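/*
 * Load a 32-byte little-endian value into five 51-bit limbs. The unaligned
 * 64-bit loads at byte offsets 0, 6, 12, 19 and 24 overlap, so each limb is
 * extracted with a shift and a 51-bit mask.
 */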
static __always_inline void format_fexpand(u64 *output, const u8 *input)
{
	const u8 *x00 = input + 6;
	const u8 *x01 = input + 12;
	const u8 *x02 = input + 19;
	const u8 *x0 = input + 24;
	u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
	i0 = get_unaligned_le64(input);
	i1 = get_unaligned_le64(x00);
	i2 = get_unaligned_le64(x01);
	i3 = get_unaligned_le64(x02);
	i4 = get_unaligned_le64(x0);
	output0 = i0 & 0x7ffffffffffffLLU;
	output1 = i1 >> 3 & 0x7ffffffffffffLLU;
	output2 = i2 >> 6 & 0x7ffffffffffffLLU;
	output3 = i3 >> 1 & 0x7ffffffffffffLLU;
	output4 = i4 >> 12 & 0x7ffffffffffffLLU;
	output[0] = output0;
	output[1] = output1;
	output[2] = output2;
	output[3] = output3;
	output[4] = output4;
}

static __always_inline void format_fcontract_first_carry_pass(u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 t1_ = t1 + (t0 >> 51);
	u64 t0_ = t0 & 0x7ffffffffffffLLU;
	u64 t2_ = t2 + (t1_ >> 51);
	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
	u64 t3_ = t3 + (t2_ >> 51);
	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
	u64 t4_ = t4 + (t3_ >> 51);
	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
	input[0] = t0_;
	input[1] = t1__;
	input[2] = t2__;
	input[3] = t3__;
	input[4] = t4_;
}

static __always_inline void format_fcontract_first_carry_full(u64 *input)
{
	format_fcontract_first_carry_pass(input);
	modulo_carry_top(input);
}

static __always_inline void format_fcontract_second_carry_pass(u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 t1_ = t1 + (t0 >> 51);
	u64 t0_ = t0 & 0x7ffffffffffffLLU;
	u64 t2_ = t2 + (t1_ >> 51);
	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
	u64 t3_ = t3 + (t2_ >> 51);
	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
	u64 t4_ = t4 + (t3_ >> 51);
	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
	input[0] = t0_;
	input[1] = t1__;
	input[2] = t2__;
	input[3] = t3__;
	input[4] = t4_;
}

static __always_inline void format_fcontract_second_carry_full(u64 *input)
{
	u64 i0;
	u64 i1;
	u64 i0_;
	u64 i1_;
	format_fcontract_second_carry_pass(input);
	modulo_carry_top(input);
	i0 = input[0];
	i1 = input[1];
	i0_ = i0 & 0x7ffffffffffffLLU;
	i1_ = i1 + (i0 >> 51);
	input[0] = i0_;
	input[1] = i1_;
}

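/*
 * Final reduction: if the value is still >= p, subtract p once, with the
 * decision taken through constant-time masks rather than a branch.
 */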
static __always_inline void format_fcontract_trim(u64 *input)
{
	u64 a0 = input[0];
	u64 a1 = input[1];
	u64 a2 = input[2];
	u64 a3 = input[3];
	u64 a4 = input[4];
	u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
	u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
	u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
	u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
	u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
	u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
	u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
	u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
	u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
	u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
	u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
	input[0] = a0_;
	input[1] = a1_;
	input[2] = a2_;
	input[3] = a3_;
	input[4] = a4_;
}

static __always_inline void format_fcontract_store(u8 *output, u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 o0 = t1 << 51 | t0;
	u64 o1 = t2 << 38 | t1 >> 13;
	u64 o2 = t3 << 25 | t2 >> 26;
	u64 o3 = t4 << 12 | t3 >> 39;
	u8 *b0 = output;
	u8 *b1 = output + 8;
	u8 *b2 = output + 16;
	u8 *b3 = output + 24;
	put_unaligned_le64(o0, b0);
	put_unaligned_le64(o1, b1);
	put_unaligned_le64(o2, b2);
	put_unaligned_le64(o3, b3);
}

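/* Fully reduce mod p and serialize to 32 little-endian bytes. */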
static __always_inline void format_fcontract(u8 *output, u64 *input)
{
	format_fcontract_first_carry_full(input);
	format_fcontract_second_carry_full(input);
	format_fcontract_trim(input);
	format_fcontract_store(output, input);
}

static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
{
	u64 *x = point;
	u64 *z = point + 5;
	u64 buf[10] __aligned(32) = { 0 };
	u64 *zmone = buf;
	u64 *sc = buf + 5;
	crecip(zmone, z);
	fmul(sc, x, zmone);
	format_fcontract(scalar, sc);
}

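/*
 * Top-level X25519 scalar multiplication: mypublic = secret * basepoint on
 * the Montgomery curve. The secret is clamped first and every intermediate
 * buffer is wiped before returning. As a rough usage sketch (buffer names
 * here are illustrative; callers normally reach this through the kernel's
 * curve25519() wrappers), all three arguments are 32-byte buffers:
 *
 *	u8 pub[CURVE25519_KEY_SIZE];
 *	curve25519_generic(pub, my_secret, basepoint_bytes);
 */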
static void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
			       const u8 secret[CURVE25519_KEY_SIZE],
			       const u8 basepoint[CURVE25519_KEY_SIZE])
{
	u64 buf0[10] __aligned(32) = { 0 };
	u64 *x0 = buf0;
	u64 *z = buf0 + 5;
	u64 *q;
	format_fexpand(x0, basepoint);
	z[0] = 1;
	q = buf0;
	{
		u8 e[32] __aligned(32) = { 0 };
		u8 *scalar;
		memcpy(e, secret, 32);
		curve25519_clamp_secret(e);
		scalar = e;
		{
			u64 buf[15] = { 0 };
			u64 *nq = buf;
			u64 *x = nq;
			x[0] = 1;
			ladder_cmult(nq, scalar, q);
			format_scalar_of_point(mypublic, nq);
			memzero_explicit(buf, sizeof(buf));
		}
		memzero_explicit(e, sizeof(e));
	}
	memzero_explicit(buf0, sizeof(buf0));
}