/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009
Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r; b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
                        mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */
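
  /* Illustrative example (single limbs, not from the library): with
     a = 5, b = 3 and M = [[2, 1], [1, 1]] (u00 = 2, u01 = 1, u10 = 1,
     u11 = 1), the sequence above gives t = 2 * 5 = 10,
     r = 1 * 3 + 10 = 13, then t = 1 * 3 = 3, b = 1 * 5 + 3 = 8,
     i.e. (r; b) = (13; 8) = (5; 3) M. */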

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
        n--;
    }

  return n;
}

#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
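/* Example (illustrative only): a = 4, b = 6, g = 2, u = -1
   (usize = -1) gives v = (g + |u| a) / b = (2 + 4) / 6 = 1, of sign
   opposite to u; indeed -1 * 4 + 1 * 6 = 2. The function then
   returns vn = 1 with vp[0] = 1. */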
static mp_size_t
compute_v (mp_ptr vp,
           mp_srcptr ap, mp_srcptr bp, mp_size_t n,
           mp_srcptr gp, mp_size_t gn,
           mp_srcptr up, mp_size_t usize,
           mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;
  size -= tp[size - 1] == 0;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
        return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
        tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder. */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4).

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 limbs of scratch space. Next, for compute_v,
   we need T for u, T+1 for v and 2T+1 limbs of scratch space. In
   all, 7T+3 is sufficient for both operations.
*/
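
/* Example of the final bound (illustrative; the threshold value is
   hypothetical): with T = GCDEXT_DC_THRESHOLD = 32, gcdext_lehmer
   needs 3 * 32 = 96 limbs for u, a and b plus 4 * 32 + 3 = 131 limbs
   of scratch, and compute_v needs 32 + 33 + 65 = 130 limbs; both fit
   in 7 * 32 + 3 = 227 limbs. */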

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the cofactors. It may be desirable to use a
 * different choice of p in each iteration. Also the input size seems
 * to matter; choosing p = n / 3 in the first iteration seems to
 * improve performance slightly for input sizes just above the
 * threshold, but degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
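
/* For example, with n = 90 the first iteration splits at
   p = CHOOSE_P_1 (90) = 45 and later iterations at
   p = CHOOSE_P_2 (90) = 30, so mpn_hgcd works on the high
   n - p = 45 resp. 60 limbs. */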

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
        talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
         copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
        + 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
        talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
        {
          MPN_COPY (gp, bp, n);
          *usizep = 0;
          TMP_FREE;
          return n;
        }
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
        ASSERT (M.n <= (n - p - 1)/2);
        ASSERT (M.n + p <= (p + n - 1) / 2);

        /* Temporary storage 2 (p + M->n) <= p + n - 1 */
        n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

        MPN_COPY (u0, M.p[1][0], M.n);
        MPN_COPY (u1, M.p[1][1], M.n);
        un = M.n;
        while ( (u0[un-1] | u1[un-1]) == 0)
          un--;
      }
    else
      {
        /* mpn_hgcd has failed. Then either one of a or b is very
           small, or the difference is very small. Perform one
           subtraction followed by one division. */
        mp_size_t gn;
        mp_size_t updated_un = 1;

        u1[0] = 1;

        /* Temporary storage 2n + 1 */
        n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
                                    u0, u1, &updated_un, tp, tp + n);
        if (n == 0)
          {
            TMP_FREE;
            return gn;
          }

        un = updated_un;
        ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
        {
          mp_ptr t0;

          t0 = tp + matrix_scratch;
          ASSERT (M.n <= (n - p - 1)/2);
          ASSERT (M.n + p <= (p + n - 1) / 2);

          /* Temporary storage 2 (p + M->n) <= p + n - 1 */
          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

          /* By the same analysis as for mpn_hgcd_matrix_mul */
          ASSERT (M.n + un <= ualloc);

          /* FIXME: This copying could be avoided by some swapping of
           * pointers. May need more temporary storage, though. */
          MPN_COPY (t0, u0, un);

          /* Temporary storage ualloc */
          un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

          ASSERT (un < ualloc);
          ASSERT ( (u0[un-1] | u1[un-1]) > 0);
        }
      else
        {
          /* mpn_hgcd has failed. Then either one of a or b is very
             small, or the difference is very small. Perform one
             subtraction followed by one division. */
          mp_size_t gn;
          mp_size_t updated_un = un;

          /* Temporary storage 2n + 1 */
          n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
                                      u0, u1, &updated_un, tp, tp + n);
          if (n == 0)
            {
              TMP_FREE;
              return gn;
            }

          un = updated_un;
          ASSERT (un < ualloc);
        }
    }

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
         this case we choose the cofactor + 1, corresponding to G = A
         - k B, rather than -1, corresponding to G = - A + (k+1) B. */
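      /* Concrete instance (illustrative): with A = 3 G, B = 2 G
         (k = 1), the reductions end with a = b = G and u0 = u1 = 1,
         and we return cofactor +1, i.e. G = A - B, rather than -1,
         i.e. G = -A + 2 B. */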
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
        {
          MPN_NORMALIZE (u0, un);
          MPN_COPY (up, u0, un);
          *usizep = -un;
        }
      else
        {
          MPN_NORMALIZE_NOT_ZERO (u1, un);
          MPN_COPY (up, u1, un);
          *usizep = un;
        }

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
                 B =  u0 a +  u1 b

                 a = u1  A + ... B
                 b = -u0 A + ... B

         with bounds

           |u0|, |u1| <= B / min(a, b)

         Compute g = u a + v b = (u u1 - v u0) A + (...) B
         Here, u, v are bounded by

         |u| <= b,
         |v| <= a
      */
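
      /* Illustrative single-limb instance: A = 7, B = 5 reduce to
         a = 2, b = 1 with u0 = 2, u1 = 1 (B = 2 a + b, a = A - B,
         b = -2 A + 3 B). The final gcdext on (a, b) gives
         g = 1 = 0 a + 1 b, i.e. u = 0, v = 1, so the cofactor of A
         is u u1 - v u0 = -2; indeed (-2) 7 + 3 * 5 = 1. */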

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
        {
          /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
          MPN_COPY (up, u0, u0n);
          *usizep = -u0n;

          TMP_FREE;
          return gn;
        }

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
                             ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
        negate = 0;
      else
        {
          lehmer_un = -lehmer_un;
          negate = 1;
        }

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
        {
          ASSERT (un == 1);
          ASSERT (u0[0] == 1);

          /* u1 == 0 ==> u u1 + v u0 = v */
          MPN_COPY (up, lehmer_vp, lehmer_vn);
          *usizep = negate ? lehmer_vn : - lehmer_vn;

          TMP_FREE;
          return gn;
        }

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */
      /* Compute u u1 */
      if (lehmer_un <= u1n)
        /* Should be the common case */
        mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
        mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
        {
          mp_limb_t cy;

          /* Overwrites old u1 value */
          if (lehmer_vn <= u0n)
            /* Should be the common case */
            mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
          else
            mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

          u1n = u0n + lehmer_vn;
          u1n -= (u1[u1n - 1] == 0);

          if (u1n <= un)
            {
              cy = mpn_add (up, up, un, u1, u1n);
            }
          else
            {
              cy = mpn_add (up, u1, u1n, up, un);
              un = u1n;
            }
          up[un] = cy;
          un += (cy != 0);

          ASSERT (un < ualloc);
        }
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
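
/* Usage sketch (illustrative only, not part of this file). It relies
   on properties visible above: an >= n > 0, both source operands are
   clobbered, gp needs room for n limbs, up for n + 1 limbs:

     mp_limb_t a[2] = { 14, 3 };    A = 3 * 2^GMP_NUMB_BITS + 14
     mp_limb_t b[1] = { 10 };       B = 10
     mp_limb_t g[1], u[2];
     mp_size_t usize;
     mp_size_t gn = mpn_gcdext (g, u, &usize, a, 2, b, 1);

   On return {g, gn} = gcd (A, B) and {u, ABS (usize)} is a cofactor
   with u A + v B = G for some v; usize < 0 means u is negative. */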