1/* Generate perfect square testing data.
2
3Copyright 2002, 2003, 2004 Free Software Foundation, Inc.
4
5This file is part of the GNU MP Library.
6
7The GNU MP Library is free software; you can redistribute it and/or modify
8it under the terms of the GNU Lesser General Public License as published by
9the Free Software Foundation; either version 3 of the License, or (at your
10option) any later version.
11
12The GNU MP Library is distributed in the hope that it will be useful, but
13WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15License for more details.
16
17You should have received a copy of the GNU Lesser General Public License
18along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
19
20#include <stdio.h>
21#include <stdlib.h>
22
23#include "dumbmp.c"
24
25
26/* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
27   (plus a PERFSQR_PP modulus), and generate tables indicating quadratic
28   residues and non-residues modulo small factors of that modulus.
29
30   For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used.  That
31   function exists specifically because 2^24-1 and 2^48-1 have nice sets of
32   prime factors.  For other limb sizes it's considered, but if it doesn't
33   have good factors then mpn_mod_1 will be used instead.
34
35   When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
36   selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
37   with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
38   GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
39   calculation within a single limb.
40
41   In either case primes can be combined to make divisors.  The table data
42   then effectively indicates remainders which are quadratic residues mod
43   all the primes.  This sort of combining reduces the number of steps
44   needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
45   Nothing is gained or lost in terms of detections, the same total fraction
46   of non-residues will be identified.
47
48   Nothing particularly sophisticated is attempted for combining factors to
49   make divisors.  This is probably a kind of knapsack problem so it'd be
50   too hard to attempt anything completely general.  For the usual 32 and 64
51   bit limbs we get a good enough result just pairing the biggest and
52   smallest which fit together, repeatedly.
53
54   Another aim is to get powerful combinations, ie. divisors which identify
55   biggest fraction of non-residues, and have those run first.  Again for
56   the usual 32 and 64 bits it seems good enough just to pair for big
57   divisors then sort according to the resulting fraction of non-residues
58   identified.
59
60   Also in this program, a table sq_res_0x100 of residues modulo 256 is
61   generated.  This simply fills bits into limbs of the appropriate
62   build-time GMP_LIMB_BITS each.
63
64*/
65
66
67/* Normally we aren't using const in gen*.c programs, so as not to have to
68   bother figuring out if it works, but using it with f_cmp_divisor and
69   f_cmp_fraction avoids warnings from the qsort calls. */
70
71/* Same tests as gmp.h. */
72#if  defined (__STDC__)                                 \
73  || defined (__cplusplus)                              \
74  || defined (_AIX)                                     \
75  || defined (__DECC)                                   \
76  || (defined (__mips) && defined (_SYSTYPE_SVR4))      \
77  || defined (_MSC_VER)                                 \
78  || defined (_WIN32)
79#define HAVE_CONST        1
80#endif
81
82#if ! HAVE_CONST
83#define const
84#endif
85
86
87mpz_t  *sq_res_0x100;          /* table of limbs */
88int    nsq_res_0x100;          /* elements in sq_res_0x100 array */
89int    sq_res_0x100_num;       /* squares in sq_res_0x100 */
90double sq_res_0x100_fraction;  /* sq_res_0x100_num / 256 */
91
92int     mod34_bits;        /* 3*GMP_NUMB_BITS/4 */
93int     mod_bits;          /* bits from PERFSQR_MOD_34 or MOD_PP */
94int     max_divisor;       /* all divisors <= max_divisor */
95int     max_divisor_bits;  /* ceil(log2(max_divisor)) */
96double  total_fraction;    /* of squares */
97mpz_t   pp;                /* product of primes, or 0 if mod_34lsub1 used */
98mpz_t   pp_norm;           /* pp shifted so NUMB high bit set */
99mpz_t   pp_inverted;       /* invert_limb style inverse */
100mpz_t   mod_mask;          /* 2^mod_bits-1 */
101char    mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
102
103/* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
104struct rawfactor_t {
105  int     divisor;
106  int     multiplicity;
107};
108struct rawfactor_t  *rawfactor;
109int                 nrawfactor;
110
111/* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
112struct factor_t {
113  int     divisor;
114  mpz_t   inverse;   /* 1/divisor mod 2^mod_bits */
115  mpz_t   mask;      /* indicating squares mod divisor */
116  double  fraction;  /* squares/total */
117};
118struct factor_t  *factor;
119int              nfactor;       /* entries in use in factor array */
120int              factor_alloc;  /* entries allocated to factor array */
121
122
123int
124f_cmp_divisor (const void *parg, const void *qarg)
125{
126  const struct factor_t *p, *q;
127  p = parg;
128  q = qarg;
129  if (p->divisor > q->divisor)
130    return 1;
131  else if (p->divisor < q->divisor)
132    return -1;
133  else
134    return 0;
135}
136
137int
138f_cmp_fraction (const void *parg, const void *qarg)
139{
140  const struct factor_t *p, *q;
141  p = parg;
142  q = qarg;
143  if (p->fraction > q->fraction)
144    return 1;
145  else if (p->fraction < q->fraction)
146    return -1;
147  else
148    return 0;
149}
150
151/* Remove array[idx] by copying the remainder down, and adjust narray
152   accordingly.  */
153#define COLLAPSE_ELEMENT(array, idx, narray)                    \
154  do {                                                          \
155    mem_copyi ((char *) &(array)[idx],                          \
156               (char *) &(array)[idx+1],                        \
157               ((narray)-((idx)+1)) * sizeof (array[0]));       \
158    (narray)--;                                                 \
159  } while (0)
160
161
162/* return n*2^p mod m */
163int
164mul_2exp_mod (int n, int p, int m)
165{
166  int  i;
167  for (i = 0; i < p; i++)
168    n = (2 * n) % m;
169  return n;
170}
171
172/* return -n mod m */
173int
174neg_mod (int n, int m)
175{
176  ASSERT (n >= 0 && n < m);
177  return (n == 0 ? 0 : m-n);
178}
179
180/* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
181   "-(idx<<mod_bits)" can be a square modulo m.  */
182void
183square_mask (mpz_t mask, int m)
184{
185  int    p, i, r, idx;
186
187  p = mul_2exp_mod (1, mod_bits, m);
188  p = neg_mod (p, m);
189
190  mpz_set_ui (mask, 0L);
191  for (i = 0; i < m; i++)
192    {
193      r = (i * i) % m;
194      idx = (r * p) % m;
195      mpz_setbit (mask, (unsigned long) idx);
196    }
197}
198
199void
200generate_sq_res_0x100 (int limb_bits)
201{
202  int  i, res;
203
204  nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
205  sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
206
207  for (i = 0; i < nsq_res_0x100; i++)
208    mpz_init_set_ui (sq_res_0x100[i], 0L);
209
210  for (i = 0; i < 0x100; i++)
211    {
212      res = (i * i) % 0x100;
213      mpz_setbit (sq_res_0x100[res / limb_bits],
214                  (unsigned long) (res % limb_bits));
215    }
216
217  sq_res_0x100_num = 0;
218  for (i = 0; i < nsq_res_0x100; i++)
219    sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
220  sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
221}
222
223void
224generate_mod (int limb_bits, int nail_bits)
225{
226  int    numb_bits = limb_bits - nail_bits;
227  int    i, divisor;
228
229  mpz_init_set_ui (pp, 0L);
230  mpz_init_set_ui (pp_norm, 0L);
231  mpz_init_set_ui (pp_inverted, 0L);
232
233  /* no more than limb_bits many factors in a one limb modulus (and of
234     course in reality nothing like that many) */
235  factor_alloc = limb_bits;
236  factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
237  rawfactor = (struct rawfactor_t *)
238    xmalloc (factor_alloc * sizeof (*rawfactor));
239
240  if (numb_bits % 4 != 0)
241    {
242      strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
243      goto use_pp;
244    }
245
246  max_divisor = 2*limb_bits;
247  max_divisor_bits = log2_ceil (max_divisor);
248
249  if (numb_bits / 4 < max_divisor_bits)
250    {
251      /* Wind back to one limb worth of max_divisor, if that will let us use
252         mpn_mod_34lsub1.  */
253      max_divisor = limb_bits;
254      max_divisor_bits = log2_ceil (max_divisor);
255
256      if (numb_bits / 4 < max_divisor_bits)
257        {
258          strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
259          goto use_pp;
260        }
261    }
262
263  {
264    /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
265    mpz_t  m, q, r;
266    int    multiplicity;
267
268    mod34_bits = (numb_bits / 4) * 3;
269
270    /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
271       the mod34_bits mark, adding the two halves for a remainder of at most
272       mod34_bits+1 many bits */
273    mod_bits = mod34_bits + 1;
274
275    mpz_init_set_ui (m, 1L);
276    mpz_mul_2exp (m, m, mod34_bits);
277    mpz_sub_ui (m, m, 1L);
278
279    mpz_init (q);
280    mpz_init (r);
281
282    for (i = 3; i <= max_divisor; i++)
283      {
284        if (! isprime (i))
285          continue;
286
287        mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
288        if (mpz_sgn (r) != 0)
289          continue;
290
291        /* if a repeated prime is found it's used as an i^n in one factor */
292        divisor = 1;
293        multiplicity = 0;
294        do
295          {
296            if (divisor > max_divisor / i)
297              break;
298            multiplicity++;
299            mpz_set (m, q);
300            mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
301          }
302        while (mpz_sgn (r) == 0);
303
304        ASSERT (nrawfactor < factor_alloc);
305        rawfactor[nrawfactor].divisor = i;
306        rawfactor[nrawfactor].multiplicity = multiplicity;
307        nrawfactor++;
308      }
309
310    mpz_clear (m);
311    mpz_clear (q);
312    mpz_clear (r);
313  }
314
315  if (nrawfactor <= 2)
316    {
317      mpz_t  new_pp;
318
319      sprintf (mod34_excuse, "only %d small factor%s",
320               nrawfactor, nrawfactor == 1 ? "" : "s");
321
322    use_pp:
323      /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
324         tried with just one */
325      max_divisor = 2*limb_bits;
326      max_divisor_bits = log2_ceil (max_divisor);
327
328      mpz_init (new_pp);
329      nrawfactor = 0;
330      mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
331
332      /* one copy of each small prime */
333      mpz_set_ui (pp, 1L);
334      for (i = 3; i <= max_divisor; i++)
335        {
336          if (! isprime (i))
337            continue;
338
339          mpz_mul_ui (new_pp, pp, (unsigned long) i);
340          if (mpz_sizeinbase (new_pp, 2) > mod_bits)
341            break;
342          mpz_set (pp, new_pp);
343
344          ASSERT (nrawfactor < factor_alloc);
345          rawfactor[nrawfactor].divisor = i;
346          rawfactor[nrawfactor].multiplicity = 1;
347          nrawfactor++;
348        }
349
350      /* Plus an extra copy of one or more of the primes selected, if that
351         still fits in max_divisor and the total in mod_bits.  Usually only
352         3 or 5 will be candidates */
353      for (i = nrawfactor-1; i >= 0; i--)
354        {
355          if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
356            continue;
357          mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
358          if (mpz_sizeinbase (new_pp, 2) > mod_bits)
359            continue;
360          mpz_set (pp, new_pp);
361
362          rawfactor[i].multiplicity++;
363        }
364
365      mod_bits = mpz_sizeinbase (pp, 2);
366
367      mpz_set (pp_norm, pp);
368      while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
369        mpz_add (pp_norm, pp_norm, pp_norm);
370
371      mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
372
373      mpz_clear (new_pp);
374    }
375
376  /* start the factor array */
377  for (i = 0; i < nrawfactor; i++)
378    {
379      int  j;
380      ASSERT (nfactor < factor_alloc);
381      factor[nfactor].divisor = 1;
382      for (j = 0; j < rawfactor[i].multiplicity; j++)
383        factor[nfactor].divisor *= rawfactor[i].divisor;
384      nfactor++;
385    }
386
387 combine:
388  /* Combine entries in the factor array.  Combine the smallest entry with
389     the biggest one that will fit with it (ie. under max_divisor), then
390     repeat that with the new smallest entry. */
391  qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
392  for (i = nfactor-1; i >= 1; i--)
393    {
394      if (factor[i].divisor <= max_divisor / factor[0].divisor)
395        {
396          factor[0].divisor *= factor[i].divisor;
397          COLLAPSE_ELEMENT (factor, i, nfactor);
398          goto combine;
399        }
400    }
401
402  total_fraction = 1.0;
403  for (i = 0; i < nfactor; i++)
404    {
405      mpz_init (factor[i].inverse);
406      mpz_invert_ui_2exp (factor[i].inverse,
407                          (unsigned long) factor[i].divisor,
408                          (unsigned long) mod_bits);
409
410      mpz_init (factor[i].mask);
411      square_mask (factor[i].mask, factor[i].divisor);
412
413      /* fraction of possible squares */
414      factor[i].fraction = (double) mpz_popcount (factor[i].mask)
415        / factor[i].divisor;
416
417      /* total fraction of possible squares */
418      total_fraction *= factor[i].fraction;
419    }
420
421  /* best tests first (ie. smallest fraction) */
422  qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
423}
424
425void
426print (int limb_bits, int nail_bits)
427{
428  int    i;
429  mpz_t  mhi, mlo;
430
431  printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
432  printf ("\n");
433
434  printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
435          limb_bits, nail_bits);
436  printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
437          limb_bits, nail_bits);
438  printf ("#endif\n");
439  printf ("\n");
440
441  printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
442  printf ("   This test identifies %.2f%% as non-squares (%d/256). */\n",
443          (1.0 - sq_res_0x100_fraction) * 100.0,
444          0x100 - sq_res_0x100_num);
445  printf ("static const mp_limb_t\n");
446  printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
447  for (i = 0; i < nsq_res_0x100; i++)
448    {
449      printf ("  CNST_LIMB(0x");
450      mpz_out_str (stdout, 16, sq_res_0x100[i]);
451      printf ("),\n");
452    }
453  printf ("};\n");
454  printf ("\n");
455
456  if (mpz_sgn (pp) != 0)
457    {
458      printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
459      printf ("/* PERFSQR_PP = ");
460    }
461  else
462    printf ("/* 2^%d-1 = ", mod34_bits);
463  for (i = 0; i < nrawfactor; i++)
464    {
465      if (i != 0)
466        printf (" * ");
467      printf ("%d", rawfactor[i].divisor);
468      if (rawfactor[i].multiplicity != 1)
469        printf ("^%d", rawfactor[i].multiplicity);
470    }
471  printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
472
473  printf ("#define PERFSQR_MOD_BITS  %d\n", mod_bits);
474  if (mpz_sgn (pp) != 0)
475    {
476      printf ("#define PERFSQR_PP            CNST_LIMB(0x");
477      mpz_out_str (stdout, 16, pp);
478      printf (")\n");
479      printf ("#define PERFSQR_PP_NORM       CNST_LIMB(0x");
480      mpz_out_str (stdout, 16, pp_norm);
481      printf (")\n");
482      printf ("#define PERFSQR_PP_INVERTED   CNST_LIMB(0x");
483      mpz_out_str (stdout, 16, pp_inverted);
484      printf (")\n");
485    }
486  printf ("\n");
487
488  mpz_init (mhi);
489  mpz_init (mlo);
490
491  printf ("/* This test identifies %.2f%% as non-squares. */\n",
492          (1.0 - total_fraction) * 100.0);
493  printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
494  printf ("  do {                              \\\n");
495  printf ("    mp_limb_t  r;                   \\\n");
496  if (mpz_sgn (pp) != 0)
497    printf ("    PERFSQR_MOD_PP (r, up, usize);  \\\n");
498  else
499    printf ("    PERFSQR_MOD_34 (r, up, usize);  \\\n");
500
501  for (i = 0; i < nfactor; i++)
502    {
503      printf ("                                    \\\n");
504      printf ("    /* %5.2f%% */                    \\\n",
505              (1.0 - factor[i].fraction) * 100.0);
506
507      printf ("    PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
508              factor[i].divisor <= limb_bits ? 1 : 2,
509              factor[i].divisor);
510      mpz_out_str (stdout, 16, factor[i].inverse);
511      printf ("), \\\n");
512      printf ("                   CNST_LIMB(0x");
513
514      if ( factor[i].divisor <= limb_bits)
515        {
516          mpz_out_str (stdout, 16, factor[i].mask);
517        }
518      else
519        {
520          mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
521          mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
522          mpz_out_str (stdout, 16, mhi);
523          printf ("), CNST_LIMB(0x");
524          mpz_out_str (stdout, 16, mlo);
525        }
526      printf (")); \\\n");
527    }
528
529  printf ("  } while (0)\n");
530  printf ("\n");
531
532  printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
533          (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
534  printf ("\n");
535
536  printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
537  printf ("#define PERFSQR_DIVISORS  { 256,");
538  for (i = 0; i < nfactor; i++)
539      printf (" %d,", factor[i].divisor);
540  printf (" }\n");
541
542
543  mpz_clear (mhi);
544  mpz_clear (mlo);
545}
546
547int
548main (int argc, char *argv[])
549{
550  int  limb_bits, nail_bits;
551
552  if (argc != 3)
553    {
554      fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
555      exit (1);
556    }
557
558  limb_bits = atoi (argv[1]);
559  nail_bits = atoi (argv[2]);
560
561  if (limb_bits <= 0
562      || nail_bits < 0
563      || nail_bits >= limb_bits)
564    {
565      fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
566               limb_bits, nail_bits);
567      exit (1);
568    }
569
570  generate_sq_res_0x100 (limb_bits);
571  generate_mod (limb_bits, nail_bits);
572
573  print (limb_bits, nail_bits);
574
575  return 0;
576}
577