/* hgcd.c.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* For input of size n, matrix elements are of size at most ceil(n/2)
   - 1, but we need two limbs extra. */
void
mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p)
{
  mp_size_t s = (n+1)/2 + 1;
  M->alloc = s;
  M->n = 1;
  MPN_ZERO (p, 4 * s);
  M->p[0][0] = p;
  M->p[0][1] = p + s;
  M->p[1][0] = p + 2 * s;
  M->p[1][1] = p + 3 * s;

  M->p[0][0][0] = M->p[1][1][0] = 1;
}
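
/* A minimal setup sketch for a hypothetical caller (not part of this file).
   MPN_HGCD_MATRIX_INIT_ITCH (n), used by mpn_hgcd below, is assumed here to
   evaluate to the 4 * ((n+1)/2 + 1) limbs this function zeroes:

     struct hgcd_matrix M;
     mp_ptr mp;
     TMP_DECL;

     TMP_MARK;
     mp = TMP_ALLOC_LIMBS (MPN_HGCD_MATRIX_INIT_ITCH (n));
     mpn_hgcd_matrix_init (&M, n, mp);
     ...
     TMP_FREE;

   After the call, M is the 2x2 identity matrix, with M.n = 1.  */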

/* Update column COL, adding in column (1-COL). */
static void
hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col)
{
  mp_limb_t c0, c1;
  ASSERT (col < 2);

  c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n);
  c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n);

  M->p[0][col][M->n] = c0;
  M->p[1][col][M->n] = c1;

  M->n += (c0 | c1) != 0;
  ASSERT (M->n < M->alloc);
}
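
/* Concretely, adding column (1-COL) into column COL is a right
   multiplication by an elementary matrix; with M = (m00, m01; m10, m11)
   and COL = 1,

     (m00 m01) (1 1)   (m00 m00+m01)
     (m10 m11) (0 1) = (m10 m10+m11)

   which is the quotient-1 special case of hgcd_matrix_update_q below. */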

/* Update column COL, adding in Q * column (1-COL). Temporary
 * storage: qn + n <= M->alloc, where n is the size of the largest
 * element in column 1 - COL. */
static void
hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
		      unsigned col, mp_ptr tp)
{
  ASSERT (col < 2);

  if (qn == 1)
    {
      mp_limb_t q = qp[0];
      mp_limb_t c0, c1;

      c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
      c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);

      M->p[0][col][M->n] = c0;
      M->p[1][col][M->n] = c1;

      M->n += (c0 | c1) != 0;
    }
  else
    {
      unsigned row;

      /* Carries for the unlikely case that we get both high words
	 from the multiplication and carries from the addition. */
      mp_limb_t c[2];
      mp_size_t n;

      /* The matrix will not necessarily grow in size by qn, so we
	 need normalization in order not to overflow M. */

      for (n = M->n; n + qn > M->n; n--)
	{
	  ASSERT (n > 0);
	  if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
	    break;
	}

      ASSERT (qn + n <= M->alloc);

      for (row = 0; row < 2; row++)
	{
	  if (qn <= n)
	    mpn_mul (tp, M->p[row][1-col], n, qp, qn);
	  else
	    mpn_mul (tp, qp, qn, M->p[row][1-col], n);

	  ASSERT (n + qn >= M->n);
	  c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
	}
      if (c[0] | c[1])
	{
	  M->n = n + qn + 1;
	  M->p[0][col][M->n - 1] = c[0];
	  M->p[1][col][M->n - 1] = c[1];
	}
      else
	{
	  n += qn;
	  n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0;
	  if (n > M->n)
	    M->n = n;
	}
    }

  ASSERT (M->n < M->alloc);
}
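
/* A worked instance of the normalization above: if M->n = 5, qn = 3, and
   both elements in column 1-col happen to have two high zero limbs, the
   loop reduces n to 3, so the products and sums touch at most n + qn = 6
   limbs (plus a possible carry limb) instead of the pessimistic
   M->n + qn = 8. */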

/* Multiply M by M1 from the right. Since the M1 elements fit in
   GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs
   temporary space of M->n limbs. */
static void
hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
		   mp_ptr tp)
{
  mp_size_t n0, n1;

  /* Could avoid copy by some swapping of pointers. */
  MPN_COPY (tp, M->p[0][0], M->n);
  n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n);
  MPN_COPY (tp, M->p[1][0], M->n);
  n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n);

  /* Depends on zero initialization */
  M->n = MAX(n0, n1);
  ASSERT (M->n < M->alloc);
}

/* Perform a few steps, using some of mpn_hgcd2, subtraction and
   division. Reduces the size by almost one limb or more, but never
   below the given size s. Return new size for a and b, or 0 if no
   more steps are possible.

   If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
   limbs, and for hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
   fails, needs space for the quotient, qn <= n - s + 1 limbs, and for
   hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
   resulting size of M.

   If N is the input size to the calling hgcd, then s = floor(N/2) +
   1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
   < N, so N is sufficient.
*/

static mp_size_t
hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
	   struct hgcd_matrix *M, mp_ptr tp)
{
  struct hgcd_matrix1 M1;
  mp_limb_t mask;
  mp_limb_t ah, al, bh, bl;
  mp_size_t an, bn, qn;
  int col;

  ASSERT (n > s);

  mask = ap[n-1] | bp[n-1];
  ASSERT (mask > 0);

  if (n == s + 1)
    {
      if (mask < 4)
	goto subtract;

      ah = ap[n-1]; al = ap[n-2];
      bh = bp[n-1]; bl = bp[n-2];
    }
  else if (mask & GMP_NUMB_HIGHBIT)
    {
      ah = ap[n-1]; al = ap[n-2];
      bh = bp[n-1]; bl = bp[n-2];
    }
  else
    {
      int shift;

      count_leading_zeros (shift, mask);
      ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
      al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
      bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
      bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
    }

  /* Try an mpn_hgcd2 step */
  if (mpn_hgcd2 (ah, al, bh, bl, &M1))
    {
      /* Multiply M <- M * M1 */
      hgcd_matrix_mul_1 (M, &M1, tp);

      /* Can't swap inputs, so we need to copy. */
      MPN_COPY (tp, ap, n);
      /* Multiply M1^{-1} (a;b) */
      return mpn_hgcd_mul_matrix1_inverse_vector (&M1, ap, tp, bp, n);
    }

 subtract:
  /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and
     bh was too small, or ah, bh were (almost) equal. Perform one
     subtraction step (for possible cancellation of high limbs),
     followed by one division. */

  /* Since we must ensure that #(a-b) > s, we handle cancellation of
     high limbs explicitly up front. (FIXME: Or is it better to just
     subtract, normalize, and use an addition to undo if it turns out
     the difference is too small?) */
  for (an = n; an > s; an--)
    if (ap[an-1] != bp[an-1])
      break;

  if (an == s)
    return 0;

  /* Maintain a > b. When needed, swap a and b, and let col keep track
     of how to update M. */
  if (ap[an-1] > bp[an-1])
    {
      /* a is largest. In the subtraction step, we need to update
	 column 1 of M */
      col = 1;
    }
  else
    {
      MP_PTR_SWAP (ap, bp);
      col = 0;
    }

  bn = n;
  MPN_NORMALIZE (bp, bn);
  if (bn <= s)
    return 0;

  /* We have #a, #b > s. When is it possible that #(a-b) < s? For
     cancellation to happen, the numbers must be of the form

       a = x + 1, 0,            ..., 0,            al
       b = x    , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl

     where al, bl denote the least significant k limbs. If al < bl,
     then #(a-b) <= k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX,
     then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */
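
  /* A toy instance of the above, using decimal digits as limbs (so
     GMP_NUMB_MAX corresponds to 9) and k = 1: a = 5,0,3 and b = 4,9,7 give
     a - b = 503 - 497 = 6, which fits in k = 1 limbs, while a = 5,0,8 and
     b = 4,9,3 give 508 - 493 = 15, i.e. k + 1 = 2 limbs. */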

  if (ap[an-1] == bp[an-1] + 1)
    {
      mp_size_t k;
      int c;
      for (k = an-1; k > s; k--)
	if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX)
	  break;

      MPN_CMP (c, ap, bp, k);
      if (c < 0)
	{
	  mp_limb_t cy;

	  /* The limbs from k and up are cancelled. */
	  if (k == s)
	    return 0;
	  cy = mpn_sub_n (ap, ap, bp, k);
	  ASSERT (cy == 1);
	  an = k;
	}
      else
	{
	  ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k));
	  ap[k] = 1;
	  an = k + 1;
	}
    }
  else
    ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an));

  ASSERT (an > s);
  ASSERT (ap[an-1] > 0);
  ASSERT (bn > s);
  ASSERT (bp[bn-1] > 0);

  hgcd_matrix_update_1 (M, col);

  if (an < bn)
    {
      MPN_PTR_SWAP (ap, an, bp, bn);
      col ^= 1;
    }
  else if (an == bn)
    {
      int c;
      MPN_CMP (c, ap, bp, an);
      if (c < 0)
	{
	  MP_PTR_SWAP (ap, bp);
	  col ^= 1;
	}
    }

  /* Divide a / b. */
  qn = an + 1 - bn;

  /* FIXME: We could use an approximate division, that may return a
     too small quotient, and only guarantee that the size of r is
     almost the size of b. FIXME: Let ap and remainder overlap. */
  mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
  qn -= (tp[qn - 1] == 0);

  /* Normalize remainder */
  an = bn;
  for ( ; an > s; an--)
    if (ap[an-1] > 0)
      break;

  if (an <= s)
    {
      /* Quotient is too large */
      mp_limb_t cy;

      cy = mpn_add (ap, bp, bn, ap, an);

      if (cy > 0)
	{
	  ASSERT (bn < n);
	  ap[bn] = cy;
	  bp[bn] = 0;
	  bn++;
	}

      MPN_DECR_U (tp, qn, 1);
      qn -= (tp[qn-1] == 0);
    }

  if (qn > 0)
    hgcd_matrix_update_q (M, tp, qn, col, tp + qn);

  return bn;
}

/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
   with elements of size at most (n+1)/2 - 1. Returns new size of a,
   b, or zero if no reduction is possible. */
mp_size_t
mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n,
		 struct hgcd_matrix *M, mp_ptr tp)
{
  mp_size_t s = n/2 + 1;
  mp_size_t nn;

  ASSERT (n > s);
  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);

  nn = hgcd_step (n, ap, bp, s, M, tp);
  if (!nn)
    return 0;

  for (;;)
    {
      n = nn;
      ASSERT (n > s);
      nn = hgcd_step (n, ap, bp, s, M, tp);
      if (!nn)
	return n;
    }
}

/* Multiply M by M1 from the right. Needs 3*(M->n + M1->n) + 5 limbs
   of temporary storage (see mpn_matrix22_mul_itch). */
void
mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
		     mp_ptr tp)
{
  mp_size_t n;

  /* About the new size of M's elements. Since M1's diagonal elements
     are > 0, no element can decrease. The new elements are of size
     M->n + M1->n, one limb more or less. The computation of the
     matrix product produces elements of size M->n + M1->n + 1. But
     the true size, after normalization, may be three limbs smaller.

     The reason that the product has normalized size >= M->n + M1->n -
     2 is subtle. It depends on the fact that M and M1 can be factored
     as products of (1,1; 0,1) and (1,0; 1,1), and that we can't have
     M ending with a large power and M1 starting with a large power of
     the same matrix. */

  /* FIXME: Strassen multiplication gives only a small speedup. In FFT
     multiplication range, this function could be sped up quite a lot
     using invariance. */
  ASSERT (M->n + M1->n < M->alloc);

  ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
	   | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);

  ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
	   | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);

  mpn_matrix22_mul (M->p[0][0], M->p[0][1],
		    M->p[1][0], M->p[1][1], M->n,
		    M1->p[0][0], M1->p[0][1],
		    M1->p[1][0], M1->p[1][1], M1->n, tp);

  /* Index of last potentially non-zero limb, size is one greater. */
  n = M->n + M1->n;

  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);

  ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0);

  M->n = n + 1;
}

/* Multiplies the least significant p limbs of (a;b) by M^-1.
   Temporary space needed: 2 * (p + M->n) limbs. */
mp_size_t
mpn_hgcd_matrix_adjust (struct hgcd_matrix *M,
			mp_size_t n, mp_ptr ap, mp_ptr bp,
			mp_size_t p, mp_ptr tp)
{
  /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a; b)
     = (r11 a - r01 b; -r10 a + r00 b) */
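
  /* Since M is a product of the elementary matrices (1,1; 0,1) and
     (1,0; 1,1) (see the comment in mpn_hgcd_matrix_mul above), det M =
     r00 r11 - r01 r10 = 1, so the inverse needs no division:

       M^-1 = ( r11 -r01)
	      (-r10  r00)
  */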

  mp_ptr t0 = tp;
  mp_ptr t1 = tp + p + M->n;
  mp_limb_t ah, bh;
  mp_limb_t cy;

  ASSERT (p + M->n < n);

  /* First compute the two values depending on a, before overwriting a */

  if (M->n >= p)
    {
      mpn_mul (t0, M->p[1][1], M->n, ap, p);
      mpn_mul (t1, M->p[1][0], M->n, ap, p);
    }
  else
    {
      mpn_mul (t0, ap, p, M->p[1][1], M->n);
      mpn_mul (t1, ap, p, M->p[1][0], M->n);
    }

  /* Update a */
  MPN_COPY (ap, t0, p);
  ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);

  if (M->n >= p)
    mpn_mul (t0, M->p[0][1], M->n, bp, p);
  else
    mpn_mul (t0, bp, p, M->p[0][1], M->n);

  cy = mpn_sub (ap, ap, n, t0, p + M->n);
  ASSERT (cy <= ah);
  ah -= cy;

  /* Update b */
  if (M->n >= p)
    mpn_mul (t0, M->p[0][0], M->n, bp, p);
  else
    mpn_mul (t0, bp, p, M->p[0][0], M->n);

  MPN_COPY (bp, t0, p);
  bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
  cy = mpn_sub (bp, bp, n, t1, p + M->n);
  ASSERT (cy <= bh);
  bh -= cy;

  if (ah > 0 || bh > 0)
    {
      ap[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* The subtraction can reduce the size by at most one limb. */
      if (ap[n-1] == 0 && bp[n-1] == 0)
	n--;
    }
  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
  return n;
}
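
/* In mpn_hgcd below, this is used after a recursive call has already
   replaced the high parts (a1; b1) of (a; b) = B^p (a1; b1) + (a0; b0) by
   M^-1 (a1; b1), where B = 2^GMP_NUMB_BITS.  By linearity, adding in
   M^-1 (a0; b0) as above gives

     B^p M^-1 (a1; b1) + M^-1 (a0; b0) = M^-1 (a; b)

   for the full numbers. */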

/* Size analysis for hgcd:

   For the recursive calls, we have n1 <= ceil(n / 2). Then the
   storage need is determined by the storage for the recursive call
   computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1
   (after this, the storage needed for M1 can be recycled).

   Let S(n) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1)
   = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2,
   and for the hgcd_matrix_mul, we may need 3 ceil(n/2) + 8. In total,
   4 * ceil(n/4) + 3 ceil(n/2) + 12 <= 10 ceil(n/4) + 12.

   For the recursive call, we need S(n1) = S(ceil(n/2)).

   S(n) <= 10*ceil(n/4) + 12 + S(ceil(n/2))
	<= 10*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 12k + S(ceil(n/2^k))
	<= 10*(2 ceil(n/4) + k) + 12k + S(ceil(n/2^k))
	<= 20 ceil(n/4) + 22k + S(ceil(n/2^k))
*/

mp_size_t
mpn_hgcd_itch (mp_size_t n)
{
  unsigned k;
  int count;
  mp_size_t nscaled;

  if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
    return MPN_HGCD_LEHMER_ITCH (n);

  /* Get the recursion depth. */
  nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
  count_leading_zeros (count, nscaled);
  k = GMP_LIMB_BITS - count;

  return 20 * ((n+3) / 4) + 22 * k
    + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD);
}
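
/* A worked instance of the size computation (HGCD_THRESHOLD is a tuned,
   CPU-dependent constant; 100 is used here only for illustration): with
   n = 1000, nscaled = 999/99 = 10, whose bit length gives k = 4, matching
   the four halvings 1000 -> 500 -> 250 -> 125 -> 63 needed to get below
   the threshold, and the estimate becomes

     20 * ((1000+3)/4) + 22 * 4 + MPN_HGCD_LEHMER_ITCH (100)
       = 5000 + 88 + MPN_HGCD_LEHMER_ITCH (100). */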

/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
   with elements of size at most (n+1)/2 - 1. Returns new size of a,
   b, or zero if no reduction is possible. */

mp_size_t
mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
	  struct hgcd_matrix *M, mp_ptr tp)
{
  mp_size_t s = n/2 + 1;
  mp_size_t n2 = (3*n)/4 + 1;

  mp_size_t p, nn;
  int success = 0;

  if (n <= s)
    /* Happens when n <= 2, a fairly uninteresting case but exercised
       by the random inputs of the testsuite. */
    return 0;

  ASSERT ((ap[n-1] | bp[n-1]) > 0);

  ASSERT ((n+1)/2 - 1 < M->alloc);

  if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
    return mpn_hgcd_lehmer (ap, bp, n, M, tp);

  p = n/2;
  nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
  if (nn > 0)
    {
      /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
	 = 2 (n - 1) */
      n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
      success = 1;
    }
  while (n > n2)
    {
      /* Needs n + 1 storage */
      nn = hgcd_step (n, ap, bp, s, M, tp);
      if (!nn)
	return success ? n : 0;
      n = nn;
      success = 1;
    }

  if (n > s + 2)
    {
      struct hgcd_matrix M1;
      mp_size_t scratch;

      p = 2*s - n + 1;
      scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);

      mpn_hgcd_matrix_init (&M1, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
      if (nn > 0)
	{
	  /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
	  ASSERT (M->n + 2 >= M1.n);

	  /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
	     then either q or q + 1 is a correct quotient, and M1 will
	     start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
	     rules out the case that the size of M * M1 is much
	     smaller than the expected M->n + M1->n. */

	  ASSERT (M->n + M1.n < M->alloc);

	  /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
	     = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
	  n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);

	  /* We need a bound for M->n + M1.n. Let n be the original
	     input size. Then

	       ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2

	     and it follows that

	       M.n + M1.n <= ceil(n/2) + 1

	     Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
	     amount of needed scratch space. */
	  mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
	  success = 1;
	}
    }

  /* This really is the base case */
  for (;;)
    {
      /* Needs s+3 < n */
      nn = hgcd_step (n, ap, bp, s, M, tp);
      if (!nn)
	return success ? n : 0;

      n = nn;
      success = 1;
    }
}
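
/* A schematic, hypothetical caller (not part of this file) showing how the
   pieces above fit together when reducing the high n - p limbs of a pair
   (a; b); the scratch layout mirrors the recursive calls inside mpn_hgcd:

     struct hgcd_matrix M;
     mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
     mp_ptr tp = TMP_ALLOC_LIMBS (matrix_scratch + mpn_hgcd_itch (n - p));
     mp_size_t nn;

     mpn_hgcd_matrix_init (&M, n - p, tp);
     nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);

   A nonzero return nn means the high parts were reduced and M records the
   reduction; the caller would then typically apply mpn_hgcd_matrix_adjust
   to propagate M^-1 to the full numbers, providing 2 * (p + M.n) limbs of
   scratch for that call.  A return of 0 means no reduction was possible. */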