1/* mulmod_bnm1.c -- multiplication mod B^n-1.
2
   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
4   Marco Bodrato.
5
6   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
7   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10Copyright 2009, 2010 Free Software Foundation, Inc.
11
12This file is part of the GNU MP Library.
13
14The GNU MP Library is free software; you can redistribute it and/or modify
15it under the terms of the GNU Lesser General Public License as published by
16the Free Software Foundation; either version 3 of the License, or (at your
17option) any later version.
18
19The GNU MP Library is distributed in the hope that it will be useful, but
20WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
22License for more details.
23
24You should have received a copy of the GNU Lesser General Public License
25along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
26
27
28#include "gmp.h"
29#include "gmp-impl.h"
30#include "longlong.h"
31
32/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
33   mod B^rn - 1, and values are semi-normalised; zero is represented
34   as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
35   tp==rp is allowed. */
36void
37mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
38		    mp_ptr tp)
39{
40  mp_limb_t cy;
41
42  ASSERT (0 < rn);
43
44  mpn_mul_n (tp, ap, bp, rn);
45  cy = mpn_add_n (rp, tp, tp + rn, rn);
46  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
47   * be no overflow when adding in the carry. */
48  MPN_INCR_U (rp, rn, cy);
49}
50
51
52/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
53   semi-normalised representation, computation is mod B^rn + 1. Needs
54   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
55   Output is normalised. */
56static void
57mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
58		    mp_ptr tp)
59{
60  mp_limb_t cy;
61
62  ASSERT (0 < rn);
63
64  mpn_mul_n (tp, ap, bp, rn + 1);
65  ASSERT (tp[2*rn+1] == 0);
66  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
67  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
68  rp[rn] = 0;
69  MPN_INCR_U (rp, rn+1, cy );
70}
71
72
73/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
74 *
75 * The result is expected to be ZERO if and only if one of the operand
76 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
77 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
78 * combine results and obtain a natural number when one knows in
79 * advance that the final value is less than (B^rn-1).
80 * Moreover it should not be a problem if mulmod_bnm1 is used to
81 * compute the full product with an+bn <= rn, because this condition
82 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
83 *
84 * Requires 0 < bn <= an <= rn and an + bn > rn/2
85 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
86 *
87 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
88 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  /* Odd rn cannot be split in two halves, and below the threshold the
     splitting overhead does not pay off: use the basecase.  */
  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      /* The full product already fits in rn limbs; no
		 reduction needed.  */
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      /* Wrap the an + bn - rn high limbs of the full product
		 around onto the low limbs, since B^rn == 1.  */
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the code slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      {
	mp_srcptr am1, bm1;
	mp_size_t anm, bnm;
	mp_ptr so;

	/* Reduce a and b mod B^n - 1, by folding the high limbs onto
	   the low ones, unless they already fit in n limbs; "so" is
	   set to the scratch area still free past the reduced
	   operands.  */
	if (LIKELY (an > n))
	  {
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    if (LIKELY (bn > n))
	      {
		bm1 = xp + n;
		cy = mpn_add (xp + n, b0, n, b1, bn - n);
		MPN_INCR_U (xp + n, n, cy);
		bnm = n;
		so = xp + 2*n;
	      }
	    else
	      {
		so = xp + n;
		bm1 = b0;
		bnm = bn;
	      }
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	    bm1 = b0;
	    bnm = bn;
	  }

	/* Recurse: xm = am1 * bm1 mod (B^n - 1), stored at rp.  */
	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      {
	int       k;
	mp_srcptr ap1, bp1;
	mp_size_t anp, bnp;

	/* Reduce a and b mod B^n + 1, by subtracting the high limbs
	   from the low ones, unless they already fit in n limbs.  The
	   reduced values occupy n + 1 limbs, with the top limb 0 or 1.  */
	if (LIKELY (an > n)) {
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (LIKELY (bn > n)) {
	  bp1 = sp1 + n + 1;
	  cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	  sp1[2*n+1] = 0;
	  MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	  bnp = n + bp1[n];
	} else {
	  bp1 = b0;
	  bnp = bn;
	}

	/* Pick an FFT transform order k; it must divide n (we step k
	   down until 2^k | n).  k = 0 selects a non-FFT product below.  */
	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) -1;
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    /* b (and possibly a) was not reduced; multiply the short
	       operands in full, then reduce mod B^n + 1 by subtracting
	       the high part of the product.  */
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp-= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
332
333mp_size_t
334mpn_mulmod_bnm1_next_size (mp_size_t n)
335{
336  mp_size_t nh;
337
338  if (BELOW_THRESHOLD (n,     MULMOD_BNM1_THRESHOLD))
339    return n;
340  if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
341    return (n + (2-1)) & (-2);
342  if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
343    return (n + (4-1)) & (-4);
344
345  nh = (n + 1) >> 1;
346
347  if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
348    return (n + (8-1)) & (-8);
349
350  return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
351}
352