/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
2   size.  Or more accurately, bn <= an < (3/2)bn.
3
4   Contributed to the GNU project by Torbjorn Granlund.
5   Additional improvements by Marco Bodrato.
6
7   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
8   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
9   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10
11Copyright 2006, 2007, 2008, 2010 Free Software Foundation, Inc.
12
13This file is part of the GNU MP Library.
14
15The GNU MP Library is free software; you can redistribute it and/or modify
16it under the terms of the GNU Lesser General Public License as published by
17the Free Software Foundation; either version 3 of the License, or (at your
18option) any later version.
19
20The GNU MP Library is distributed in the hope that it will be useful, but
21WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
22or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
23License for more details.
24
25You should have received a copy of the GNU Lesser General Public License
26along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
27
28
29#include "gmp.h"
30#include "gmp-impl.h"
31
32/* Evaluate in: -1, 0, +1, +2, +inf
33
34  <-s--><--n--><--n--><--n-->
35   ____ ______ ______ ______
36  |_a3_|___a2_|___a1_|___a0_|
37   |b3_|___b2_|___b1_|___b0_|
38   <-t-><--n--><--n--><--n-->
39
40  v0  =  a0         * b0          #   A(0)*B(0)
41  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
42  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
43  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
44  vinf=          a2 *         b2  # A(inf)*B(inf)
45*/
46
/* In a tuning build the thresholds are variables, so every recursion path
   must be compiled in.  Otherwise decide at compile time which recursive
   multiplication routines are reachable: the recursive size n is about a
   third of an, so a path is reachable only if the corresponding threshold
   window, scaled by 3, is non-empty.  */
#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point. We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Balanced n x n recursive multiply used for the point products: picks
   basecase, toom22 or toom33 according to n and the reachability macros
   above.  ws is scratch space for the recursive call.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)
75
/* Toom-3 multiplication: pp[0..an+bn-1] = {ap,an} * {bp,bn}.  The operands
   are split into three coefficients of n limbs (the top ones s and t limbs),
   the five point values listed above are computed and multiplied
   recursively, and mpn_toom_interpolate_5pts recombines them.  The
   evaluation values are staged partly in scratch and partly in not-yet-used
   parts of pp.  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;			/* nonzero iff vm1 = A(-1)*B(-1) is negative */
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;	/* A(1), |A(-1)|, A(2); n+1 limbs each */
  mp_ptr bs1, bsm1, bs2;	/* B(1), |B(-1)|, B(2); n+1 limbs each */

  /* Operand split: A = a2*x^2 + a1*x + a0, B = b2*x^2 + b1*x + b0,
     where x = 2^(n*GMP_NUMB_BITS).  a2 has s limbs, b2 has t limbs.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;	/* n = ceil(an/3) */

  s = an - 2 * n;		/* limbs in a2 */
  t = bn - 2 * n;		/* limbs in b2 */

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* Evaluation buffers: as1/asm1/bsm1 in scratch, as2/bs1/bs2 in parts of
     pp not yet holding results.  */
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;			/* n+1 limb temporary for a0+a2 resp. b0+b2 */

  vm1_neg = 0;

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);	/* gp = a0 + a2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1, so A(-1) < 0: form a1-(a0+a2) and record the sign.
	 The return value packs the add carry above the subtract borrow,
	 as the cy >> 1 / cy & 1 uses below show.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);		/* as1 = a0+a1+a2 */
      asm1[n] = cy - (cy2 & 1);		/* asm1 = a0-a1+a2 */
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);	/* as1 = a0+a1+a2 */
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1, so A(-1) < 0: store |A(-1)| and record the sign.  */
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);	/* asm1 = a0-a1+a2 */
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = A(2) = a0 + 2*a1 + 4*a2, either as 2*(as1 + a2) - a0
     or in Horner form a0 + 2*(a1 + 2*a2), depending on which native
     primitives are available.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);		/* as2 = as1 + a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* as2 = 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);		/* as2 = a1 + 2*a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* as2 = a0 + 2*as2 */
#else
  cy = mpn_add_n (as2, a2, as1, s);		/* as2 = as1 + a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);	/* as2 = 2*as2 ... */
  cy -= mpn_sub_n (as2, as2, a0, n);		/* ... - a0 */
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 and bsm1, exactly as for the A operand above.  */
  cy = mpn_add (gp, b0, n, b2, t);	/* gp = b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      /* b0+b2 < b1, so B(-1) < 0: flip the accumulated sign of vm1.  */
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);		/* bs1 = b0+b1+b2 */
      bsm1[n] = cy - (cy2 & 1);		/* bsm1 = b0-b1+b2 */
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);	/* bs1 = b0+b1+b2 */
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);	/* bsm1 = b0-b1+b2 */
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = B(2) = b0 + 2*b1 + 4*b2, same schemes as for as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* High limbs are bounded as in the evaluation table at the top.  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Multiply only the n low limbs, then add in the cross terms for the
     0/1-valued high limbs of asm1 and bsm1 by hand.  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  /* Add as1[n]*bs1 and bs1[n]*as1 into the high half; the high limbs are
     at most 2 (ASSERTed above), so the multipliers are 0, 1 or 2.  */
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
      /* as1[n] == 2 */
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
      /* bs1[n] == 2 */
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* The (n+1)x(n+1) product below writes 2n+2 limbs at pp+2n, clobbering
     vinf[1] = pp[4n+1]; save and restore it.  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  /* Recombine the five point values into the 3n+s+t limb product.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
305