1/* Search algorithm.
2   Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
3   Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
4   and Bruno Haible <bruno@clisp.org>.
5
6   This file is part of GNU GPERF.
7
8   GNU GPERF is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GNU GPERF is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with this program; see the file COPYING.
20   If not, write to the Free Software Foundation, Inc.,
21   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
22
23/* Specification. */
24#include "search.h"
25
26#include <stdio.h>
27#include <stdlib.h> /* declares exit(), rand(), srand() */
28#include <string.h> /* declares memset(), memcmp() */
29#include <time.h> /* declares time() */
30#include <math.h> /* declares exp() */
31#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
32#include "options.h"
33#include "hash-table.h"
34#include "config.h"
35
36/* ============================== Portability ============================== */
37
38/* Assume ISO C++ 'for' scoping rule.  */
39#define for if (0) ; else for
40
41/* Dynamically allocated array with dynamic extent:
42
43   Example:
44       DYNAMIC_ARRAY (my_array, int, n);
45       ...
46       FREE_DYNAMIC_ARRAY (my_array);
47
48   Attention: depending on your implementation my_array is either the array
49   itself or a pointer to the array! Always use my_array only as expression!
50 */
51#if HAVE_DYNAMIC_ARRAY
52  #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size]
53  #define FREE_DYNAMIC_ARRAY(var)
54#else
55  #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size]
56  #define FREE_DYNAMIC_ARRAY(var) delete[] var
57#endif
58
59/* ================================ Theory ================================= */
60
61/* The general form of the hash function is
62
63      hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
64                       + len (keyword)
65
66   where Pos is a set of byte positions,
67   each alpha_inc[i] is a nonnegative integer,
68   each asso_values[c] is a nonnegative integer,
69   len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
70
71   Theorem 1: If all keywords are different, there is a set Pos such that
72   all tuples (keyword[i] : i in Pos) are different.
73
74   Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
75   are nonnegative integers alpha_inc[i] such that all multisets
76   {keyword[i] + alpha_inc[i] : i in Pos} are different.
77
78   Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
79
80   Theorem 3: If all multisets selchars[keyword] are different, there are
81   nonnegative integers asso_values[c] such that all hash values
82   sum (asso_values[c] : c in selchars[keyword]) are different.
83
84   Based on these three facts, we find the hash function in three steps:
85
86   Step 1 (Finding good byte positions):
87   Find a set Pos, as small as possible, such that all tuples
88   (keyword[i] : i in Pos) are different.
89
90   Step 2 (Finding good alpha increments):
91   Find nonnegative integers alpha_inc[i], as many of them as possible being
92   zero, and the others being as small as possible, such that all multisets
93   {keyword[i] + alpha_inc[i] : i in Pos} are different.
94
95   Step 3 (Finding good asso_values):
96   Find asso_values[c] such that all hash (keyword) are different.
97
98   In other words, each step finds a projection that is injective on the
99   given finite set:
100     proj1 : String --> Map (Pos --> N)
101     proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
102     proj3 : Map (Pos --> N) / S(Pos) --> N
103   where
104     N denotes the set of nonnegative integers,
105     Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
106     S(Pos) is the symmetric group over Pos.
107
108   This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
109   modifications apply:
110     proj1 : String --> Map (Pos --> N) x N
111     proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
112     proj3 : Map (Pos --> N) / S(Pos) x N --> N
113
114   For a case-insensitive hash function, the general form is
115
116      hash (keyword) =
117        sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
118        + len (keyword)
119
120   where alpha_unify[c] is chosen so that an upper/lower case change in
121   keyword[i] doesn't change  alpha_unify[keyword[i] + alpha_inc[i]].
122 */
123
124/* ==================== Initialization and Preparation ===================== */
125
126Search::Search (KeywordExt_List *list)
127  : _head (list)
128{
129}
130
131void
132Search::prepare ()
133{
134  /* Compute the total number of keywords.  */
135  _total_keys = 0;
136  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
137    _total_keys++;
138
139  /* Compute the minimum and maximum keyword length.  */
140  _max_key_len = INT_MIN;
141  _min_key_len = INT_MAX;
142  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
143    {
144      KeywordExt *keyword = temp->first();
145
146      if (_max_key_len < keyword->_allchars_length)
147        _max_key_len = keyword->_allchars_length;
148      if (_min_key_len > keyword->_allchars_length)
149        _min_key_len = keyword->_allchars_length;
150    }
151
152  /* Exit program if an empty string is used as keyword, since the comparison
153     expressions don't work correctly for looking up an empty string.  */
154  if (_min_key_len == 0)
155    {
156      fprintf (stderr, "Empty input keyword is not allowed.\n"
157               "To recognize an empty input keyword, your code should check for\n"
158               "len == 0 before calling the gperf generated lookup function.\n");
159      exit (1);
160    }
161
162  /* Exit program if the characters in the keywords are not in the required
163     range.  */
164  if (option[SEVENBIT])
165    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
166      {
167        KeywordExt *keyword = temp->first();
168
169        const char *k = keyword->_allchars;
170        for (int i = keyword->_allchars_length; i > 0; k++, i--)
171          if (!(static_cast<unsigned char>(*k) < 128))
172            {
173              fprintf (stderr, "Option --seven-bit has been specified,\n"
174                       "but keyword \"%.*s\" contains non-ASCII characters.\n"
175                       "Try removing option --seven-bit.\n",
176                       keyword->_allchars_length, keyword->_allchars);
177              exit (1);
178            }
179      }
180}
181
182/* ====================== Finding good byte positions ====================== */
183
184/* Computes the upper bound on the indices passed to asso_values[],
185   assuming no alpha_increments.  */
186unsigned int
187Search::compute_alpha_size () const
188{
189  return (option[SEVENBIT] ? 128 : 256);
190}
191
192/* Computes the unification rules between different asso_values[c],
193   assuming no alpha_increments.  */
194unsigned int *
195Search::compute_alpha_unify () const
196{
197  if (option[UPPERLOWER])
198    {
199      /* Uppercase to lowercase mapping.  */
200      unsigned int alpha_size = compute_alpha_size();
201      unsigned int *alpha_unify = new unsigned int[alpha_size];
202      for (unsigned int c = 0; c < alpha_size; c++)
203        alpha_unify[c] = c;
204      for (unsigned int c = 'A'; c <= 'Z'; c++)
205        alpha_unify[c] = c + ('a'-'A');
206      return alpha_unify;
207    }
208  else
209    /* Identity mapping.  */
210    return NULL;
211}
212
213/* Initializes each keyword's _selchars array.  */
214void
215Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
216{
217  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
218    temp->first()->init_selchars_tuple(positions, alpha_unify);
219}
220
221/* Deletes each keyword's _selchars array.  */
222void
223Search::delete_selchars () const
224{
225  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
226    temp->first()->delete_selchars();
227}
228
229/* Count the duplicate keywords that occur with a given set of positions.
230   In other words, it returns the difference
231     # K - # proj1 (K)
232   where K is the multiset of given keywords.  */
233unsigned int
234Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
235{
236  /* Run through the keyword list and count the duplicates incrementally.
237     The result does not depend on the order of the keyword list, thanks to
238     the formula above.  */
239  init_selchars_tuple (positions, alpha_unify);
240
241  unsigned int count = 0;
242  {
243    Hash_Table representatives (_total_keys, option[NOLENGTH]);
244    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
245      {
246        KeywordExt *keyword = temp->first();
247        if (representatives.insert (keyword))
248          count++;
249      }
250  }
251
252  delete_selchars ();
253
254  return count;
255}
256
257/* Find good key positions.  */
258
259void
260Search::find_positions ()
261{
262  /* If the user gave the key positions, we use them.  */
263  if (option[POSITIONS])
264    {
265      _key_positions = option.get_key_positions();
266      return;
267    }
268
269  /* Compute preliminary alpha_unify table.  */
270  unsigned int *alpha_unify = compute_alpha_unify ();
271
272  /* 1. Find positions that must occur in order to distinguish duplicates.  */
273  Positions mandatory;
274
275  if (!option[DUP])
276    {
277      for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
278        {
279          KeywordExt *keyword1 = l1->first();
280          for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
281            {
282              KeywordExt *keyword2 = l2->first();
283
284              /* If keyword1 and keyword2 have the same length and differ
285                 in just one position, and it is not the last character,
286                 this position is mandatory.  */
287              if (keyword1->_allchars_length == keyword2->_allchars_length)
288                {
289                  int n = keyword1->_allchars_length;
290                  int i;
291                  for (i = 0; i < n - 1; i++)
292                    {
293                      unsigned char c1 = keyword1->_allchars[i];
294                      unsigned char c2 = keyword2->_allchars[i];
295                      if (option[UPPERLOWER])
296                        {
297                          if (c1 >= 'A' && c1 <= 'Z')
298                            c1 += 'a' - 'A';
299                          if (c2 >= 'A' && c2 <= 'Z')
300                            c2 += 'a' - 'A';
301                        }
302                      if (c1 != c2)
303                        break;
304                    }
305                  if (i < n - 1)
306                    {
307                      int j;
308                      for (j = i + 1; j < n; j++)
309                        {
310                          unsigned char c1 = keyword1->_allchars[j];
311                          unsigned char c2 = keyword2->_allchars[j];
312                          if (option[UPPERLOWER])
313                            {
314                              if (c1 >= 'A' && c1 <= 'Z')
315                                c1 += 'a' - 'A';
316                              if (c2 >= 'A' && c2 <= 'Z')
317                                c2 += 'a' - 'A';
318                            }
319                          if (c1 != c2)
320                            break;
321                        }
322                      if (j >= n)
323                        {
324                          /* Position i is mandatory.  */
325                          if (!mandatory.contains (i))
326                            mandatory.add (i);
327                        }
328                    }
329                }
330            }
331        }
332    }
333
334  /* 2. Add positions, as long as this decreases the duplicates count.  */
335  int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
336              ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
337  Positions current = mandatory;
338  unsigned int current_duplicates_count =
339    count_duplicates_tuple (current, alpha_unify);
340  for (;;)
341    {
342      Positions best;
343      unsigned int best_duplicates_count = UINT_MAX;
344
345      for (int i = imax; i >= -1; i--)
346        if (!current.contains (i))
347          {
348            Positions tryal = current;
349            tryal.add (i);
350            unsigned int try_duplicates_count =
351              count_duplicates_tuple (tryal, alpha_unify);
352
353            /* We prefer 'try' to 'best' if it produces less duplicates,
354               or if it produces the same number of duplicates but with
355               a more efficient hash function.  */
356            if (try_duplicates_count < best_duplicates_count
357                || (try_duplicates_count == best_duplicates_count && i >= 0))
358              {
359                best = tryal;
360                best_duplicates_count = try_duplicates_count;
361              }
362          }
363
364      /* Stop adding positions when it gives no improvement.  */
365      if (best_duplicates_count >= current_duplicates_count)
366        break;
367
368      current = best;
369      current_duplicates_count = best_duplicates_count;
370    }
371
372  /* 3. Remove positions, as long as this doesn't increase the duplicates
373     count.  */
374  for (;;)
375    {
376      Positions best;
377      unsigned int best_duplicates_count = UINT_MAX;
378
379      for (int i = imax; i >= -1; i--)
380        if (current.contains (i) && !mandatory.contains (i))
381          {
382            Positions tryal = current;
383            tryal.remove (i);
384            unsigned int try_duplicates_count =
385              count_duplicates_tuple (tryal, alpha_unify);
386
387            /* We prefer 'try' to 'best' if it produces less duplicates,
388               or if it produces the same number of duplicates but with
389               a more efficient hash function.  */
390            if (try_duplicates_count < best_duplicates_count
391                || (try_duplicates_count == best_duplicates_count && i == -1))
392              {
393                best = tryal;
394                best_duplicates_count = try_duplicates_count;
395              }
396          }
397
398      /* Stop removing positions when it gives no improvement.  */
399      if (best_duplicates_count > current_duplicates_count)
400        break;
401
402      current = best;
403      current_duplicates_count = best_duplicates_count;
404    }
405
406  /* 4. Replace two positions by one, as long as this doesn't increase the
407     duplicates count.  */
408  for (;;)
409    {
410      Positions best;
411      unsigned int best_duplicates_count = UINT_MAX;
412
413      for (int i1 = imax; i1 >= -1; i1--)
414        if (current.contains (i1) && !mandatory.contains (i1))
415          for (int i2 = imax; i2 >= -1; i2--)
416            if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
417              for (int i3 = imax; i3 >= 0; i3--)
418                if (!current.contains (i3))
419                  {
420                    Positions tryal = current;
421                    tryal.remove (i1);
422                    tryal.remove (i2);
423                    tryal.add (i3);
424                    unsigned int try_duplicates_count =
425                      count_duplicates_tuple (tryal, alpha_unify);
426
427                    /* We prefer 'try' to 'best' if it produces less duplicates,
428                       or if it produces the same number of duplicates but with
429                       a more efficient hash function.  */
430                    if (try_duplicates_count < best_duplicates_count
431                        || (try_duplicates_count == best_duplicates_count
432                            && (i1 == -1 || i2 == -1 || i3 >= 0)))
433                      {
434                        best = tryal;
435                        best_duplicates_count = try_duplicates_count;
436                      }
437                  }
438
439      /* Stop removing positions when it gives no improvement.  */
440      if (best_duplicates_count > current_duplicates_count)
441        break;
442
443      current = best;
444      current_duplicates_count = best_duplicates_count;
445    }
446
447  /* That's it.  Hope it's good enough.  */
448  _key_positions = current;
449
450  if (option[DEBUG])
451    {
452      /* Print the result.  */
453      fprintf (stderr, "\nComputed positions: ");
454      PositionReverseIterator iter = _key_positions.reviterator();
455      bool seen_lastchar = false;
456      bool first = true;
457      for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
458        {
459          if (!first)
460            fprintf (stderr, ", ");
461          if (i == Positions::LASTCHAR)
462            seen_lastchar = true;
463          else
464            {
465              fprintf (stderr, "%d", i + 1);
466              first = false;
467            }
468        }
469      if (seen_lastchar)
470        {
471          if (!first)
472            fprintf (stderr, ", ");
473          fprintf (stderr, "$");
474        }
475      fprintf (stderr, "\n");
476    }
477
478  /* Free preliminary alpha_unify table.  */
479  delete[] alpha_unify;
480}
481
482/* Count the duplicate keywords that occur with the found set of positions.
483   In other words, it returns the difference
484     # K - # proj1 (K)
485   where K is the multiset of given keywords.  */
486unsigned int
487Search::count_duplicates_tuple () const
488{
489  unsigned int *alpha_unify = compute_alpha_unify ();
490  unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
491  delete[] alpha_unify;
492  return count;
493}
494
495/* ===================== Finding good alpha increments ===================== */
496
497/* Computes the upper bound on the indices passed to asso_values[].  */
498unsigned int
499Search::compute_alpha_size (const unsigned int *alpha_inc) const
500{
501  unsigned int max_alpha_inc = 0;
502  for (int i = 0; i < _max_key_len; i++)
503    if (max_alpha_inc < alpha_inc[i])
504      max_alpha_inc = alpha_inc[i];
505  return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
506}
507
508/* Computes the unification rules between different asso_values[c].  */
509unsigned int *
510Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
511{
512  if (option[UPPERLOWER])
513    {
514      /* Without alpha increments, we would simply unify
515           'A' -> 'a', ..., 'Z' -> 'z'.
516         But when a keyword contains at position i a character c,
517         we have the constraint
518            asso_values[tolower(c) + alpha_inc[i]] ==
519            asso_values[toupper(c) + alpha_inc[i]].
520         This introduces a unification
521           toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
522         Note that this unification can extend outside the range of
523         ASCII letters!  But still every unified character pair is at
524         a distance of 'a'-'A' = 32, or (after chained unification)
525         at a multiple of 32.  So in the end the alpha_unify vector has
526         the form    c -> c + 32 * f(c)   where f(c) is a nonnegative
527         integer.  */
528      unsigned int alpha_size = compute_alpha_size (alpha_inc);
529
530      unsigned int *alpha_unify = new unsigned int[alpha_size];
531      for (unsigned int c = 0; c < alpha_size; c++)
532        alpha_unify[c] = c;
533
534      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
535        {
536          KeywordExt *keyword = temp->first();
537
538          /* Iterate through the selected character positions.  */
539          PositionIterator iter = positions.iterator(keyword->_allchars_length);
540
541          for (int i; (i = iter.next ()) != PositionIterator::EOS; )
542            {
543              unsigned int c;
544              if (i == Positions::LASTCHAR)
545                c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
546              else if (i < keyword->_allchars_length)
547                c = static_cast<unsigned char>(keyword->_allchars[i]);
548              else
549                abort ();
550              if (c >= 'A' && c <= 'Z')
551                c += 'a' - 'A';
552              if (c >= 'a' && c <= 'z')
553                {
554                  if (i != Positions::LASTCHAR)
555                    c += alpha_inc[i];
556                  /* Unify c with c - ('a'-'A').  */
557                  unsigned int d = alpha_unify[c];
558                  unsigned int b = c - ('a'-'A');
559                  for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
560                    alpha_unify[a] = d;
561                }
562            }
563        }
564      return alpha_unify;
565    }
566  else
567    /* Identity mapping.  */
568    return NULL;
569}
570
571/* Initializes each keyword's _selchars array.  */
572void
573Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
574{
575  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
576    temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
577}
578
579/* Count the duplicate keywords that occur with the given set of positions
580   and a given alpha_inc[] array.
581   In other words, it returns the difference
582     # K - # proj2 (proj1 (K))
583   where K is the multiset of given keywords.  */
584unsigned int
585Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
586{
587  /* Run through the keyword list and count the duplicates incrementally.
588     The result does not depend on the order of the keyword list, thanks to
589     the formula above.  */
590  unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
591  init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
592
593  unsigned int count = 0;
594  {
595    Hash_Table representatives (_total_keys, option[NOLENGTH]);
596    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
597      {
598        KeywordExt *keyword = temp->first();
599        if (representatives.insert (keyword))
600          count++;
601      }
602  }
603
604  delete_selchars ();
605  delete[] alpha_unify;
606
607  return count;
608}
609
610/* Find good _alpha_inc[].  */
611
612void
613Search::find_alpha_inc ()
614{
615  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
616     artificial duplicates.
617     In other words, the goal is  # proj2 (proj1 (K)) = # proj1 (K).  */
618  unsigned int duplicates_goal = count_duplicates_tuple ();
619
620  /* Start with zero increments.  This is sufficient in most cases.  */
621  unsigned int *current = new unsigned int [_max_key_len];
622  for (int i = 0; i < _max_key_len; i++)
623    current[i] = 0;
624  unsigned int current_duplicates_count = count_duplicates_multiset (current);
625
626  if (current_duplicates_count > duplicates_goal)
627    {
628      /* Look which _alpha_inc[i] we are free to increment.  */
629      unsigned int nindices;
630      {
631        nindices = 0;
632        PositionIterator iter = _key_positions.iterator(_max_key_len);
633        for (;;)
634          {
635            int key_pos = iter.next ();
636            if (key_pos == PositionIterator::EOS)
637              break;
638            if (key_pos != Positions::LASTCHAR)
639              nindices++;
640          }
641      }
642
643      DYNAMIC_ARRAY (indices, unsigned int, nindices);
644      {
645        unsigned int j = 0;
646        PositionIterator iter = _key_positions.iterator(_max_key_len);
647        for (;;)
648          {
649            int key_pos = iter.next ();
650            if (key_pos == PositionIterator::EOS)
651              break;
652            if (key_pos != Positions::LASTCHAR)
653              indices[j++] = key_pos;
654          }
655        if (!(j == nindices))
656          abort ();
657      }
658
659      /* Perform several rounds of searching for a good alpha increment.
660         Each round reduces the number of artificial collisions by adding
661         an increment in a single key position.  */
662      DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
663      DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
664      do
665        {
666          /* An increment of 1 is not always enough.  Try higher increments
667             also.  */
668          for (unsigned int inc = 1; ; inc++)
669            {
670              unsigned int best_duplicates_count = UINT_MAX;
671
672              for (unsigned int j = 0; j < nindices; j++)
673                {
674                  memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
675                  tryal[indices[j]] += inc;
676                  unsigned int try_duplicates_count =
677                    count_duplicates_multiset (tryal);
678
679                  /* We prefer 'try' to 'best' if it produces less
680                     duplicates.  */
681                  if (try_duplicates_count < best_duplicates_count)
682                    {
683                      memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
684                      best_duplicates_count = try_duplicates_count;
685                    }
686                }
687
688              /* Stop this round when we got an improvement.  */
689              if (best_duplicates_count < current_duplicates_count)
690                {
691                  memcpy (current, best, _max_key_len * sizeof (unsigned int));
692                  current_duplicates_count = best_duplicates_count;
693                  break;
694                }
695            }
696        }
697      while (current_duplicates_count > duplicates_goal);
698      FREE_DYNAMIC_ARRAY (tryal);
699      FREE_DYNAMIC_ARRAY (best);
700
701      if (option[DEBUG])
702        {
703          /* Print the result.  */
704          fprintf (stderr, "\nComputed alpha increments: ");
705          bool first = true;
706          for (unsigned int j = nindices; j-- > 0; )
707            if (current[indices[j]] != 0)
708              {
709                if (!first)
710                  fprintf (stderr, ", ");
711                fprintf (stderr, "%u:+%u",
712                         indices[j] + 1, current[indices[j]]);
713                first = false;
714              }
715          fprintf (stderr, "\n");
716        }
717      FREE_DYNAMIC_ARRAY (indices);
718    }
719
720  _alpha_inc = current;
721  _alpha_size = compute_alpha_size (_alpha_inc);
722  _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
723}
724
725/* ======================= Finding good asso_values ======================== */
726
727/* Initializes the asso_values[] related parameters.  */
728
729void
730Search::prepare_asso_values ()
731{
732  KeywordExt_List *temp;
733
734  /* Initialize each keyword's _selchars array.  */
735  init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
736
737  /* Compute the maximum _selchars_length over all keywords.  */
738  _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
739
740  /* Check for duplicates, i.e. keywords with the same _selchars array
741     (and - if !option[NOLENGTH] - also the same length).
742     We deal with these by building an equivalence class, so that only
743     1 keyword is representative of the entire collection.  Only this
744     representative remains in the keyword list; the others are accessible
745     through the _duplicate_link chain, starting at the representative.
746     This *greatly* simplifies processing during later stages of the program.
747     Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
748  {
749    _list_len = _total_keys;
750    _total_duplicates = 0;
751    /* Make hash table for efficiency.  */
752    Hash_Table representatives (_list_len, option[NOLENGTH]);
753
754    KeywordExt_List *prev = NULL; /* list node before temp */
755    for (temp = _head; temp; )
756      {
757        KeywordExt *keyword = temp->first();
758        KeywordExt *other_keyword = representatives.insert (keyword);
759        KeywordExt_List *garbage = NULL;
760
761        if (other_keyword)
762          {
763            _total_duplicates++;
764            _list_len--;
765            /* Remove keyword from the main list.  */
766            prev->rest() = temp->rest();
767            garbage = temp;
768            /* And insert it on other_keyword's duplicate list.  */
769            keyword->_duplicate_link = other_keyword->_duplicate_link;
770            other_keyword->_duplicate_link = keyword;
771
772            /* Complain if user hasn't enabled the duplicate option. */
773            if (!option[DUP] || option[DEBUG])
774              {
775                fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
776                         keyword->_allchars_length, keyword->_allchars,
777                         other_keyword->_allchars_length, other_keyword->_allchars);
778                for (int j = 0; j < keyword->_selchars_length; j++)
779                  putc (keyword->_selchars[j], stderr);
780                fprintf (stderr, "\".\n");
781              }
782          }
783        else
784          {
785            keyword->_duplicate_link = NULL;
786            prev = temp;
787          }
788        temp = temp->rest();
789        if (garbage)
790          delete garbage;
791      }
792    if (option[DEBUG])
793      representatives.dump();
794  }
795
796  /* Exit program if duplicates exists and option[DUP] not set, since we
797     don't want to continue in this case.  (We don't want to turn on
798     option[DUP] implicitly, because the generated code is usually much
799     slower.  */
800  if (_total_duplicates)
801    {
802      if (option[DUP])
803        fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
804                         _total_duplicates);
805      else
806        {
807          fprintf (stderr, "%d input keys have identical hash values,\n",
808                           _total_duplicates);
809          if (option[POSITIONS])
810            fprintf (stderr, "try different key positions or use option -D.\n");
811          else
812            fprintf (stderr, "use option -D.\n");
813          exit (1);
814        }
815    }
816
817  /* Compute the occurrences of each character in the alphabet.  */
818  _occurrences = new int[_alpha_size];
819  memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
820  for (temp = _head; temp; temp = temp->rest())
821    {
822      KeywordExt *keyword = temp->first();
823      const unsigned int *ptr = keyword->_selchars;
824      for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
825        _occurrences[*ptr]++;
826    }
827
828  /* Memory allocation.  */
829  _asso_values = new int[_alpha_size];
830
831  int non_linked_length = _list_len;
832  unsigned int asso_value_max;
833
834  asso_value_max =
835    static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
836  /* Round up to the next power of two.  This makes it easy to ensure
837     an _asso_value[c] is >= 0 and < asso_value_max.  Also, the jump value
838     being odd, it guarantees that Search::try_asso_value() will iterate
839     through different values for _asso_value[c].  */
840  if (asso_value_max == 0)
841    asso_value_max = 1;
842  asso_value_max |= asso_value_max >> 1;
843  asso_value_max |= asso_value_max >> 2;
844  asso_value_max |= asso_value_max >> 4;
845  asso_value_max |= asso_value_max >> 8;
846  asso_value_max |= asso_value_max >> 16;
847  asso_value_max++;
848  _asso_value_max = asso_value_max;
849
850  /* Given the bound for _asso_values[c], we have a bound for the possible
851     hash values, as computed in compute_hash().  */
852  _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
853                    + (_asso_value_max - 1) * _max_selchars_length;
854  /* Allocate a sparse bit vector for detection of collisions of hash
855     values.  */
856  _collision_detector = new Bool_Array (_max_hash_value + 1);
857
858  if (option[DEBUG])
859    {
860      fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
861               "\nmaximum size of generated hash table is %d\n",
862               non_linked_length, asso_value_max, _max_hash_value);
863
864      int field_width;
865
866      field_width = 0;
867      {
868        for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
869          {
870            KeywordExt *keyword = temp->first();
871            if (field_width < keyword->_selchars_length)
872              field_width = keyword->_selchars_length;
873          }
874      }
875
876      fprintf (stderr, "\ndumping the keyword list without duplicates\n");
877      fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
878      int i = 0;
879      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
880        {
881          KeywordExt *keyword = temp->first();
882          fprintf (stderr, "%9d, ", ++i);
883          if (field_width > keyword->_selchars_length)
884            fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
885          for (int j = 0; j < keyword->_selchars_length; j++)
886            putc (keyword->_selchars[j], stderr);
887          fprintf (stderr, ", %.*s\n",
888                   keyword->_allchars_length, keyword->_allchars);
889        }
890      fprintf (stderr, "\nend of keyword list\n\n");
891    }
892
893  if (option[RANDOM] || option.get_jump () == 0)
894    /* We will use rand(), so initialize the random number generator.  */
895    srand (static_cast<long>(time (0)));
896
897  _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
898  _jump = option.get_jump ();
899}
900
901/* Finds some _asso_values[] that fit.  */
902
903/* The idea is to choose the _asso_values[] one by one, in a way that
904   a choice that has been made never needs to be undone later.  This
905   means that we split the work into several steps.  Each step chooses
906   one or more _asso_values[c].  The result of choosing one or more
907   _asso_values[c] is that the partitioning of the keyword set gets
908   broader.
909   Look at this partitioning:  After every step, the _asso_values[] of a
910   certain set C of characters are undetermined.  (At the beginning, C
911   is the set of characters c with _occurrences[c] > 0.  At the end, C
912   is empty.)  To each keyword K, we associate the multiset of _selchars
913   for which the _asso_values[] are undetermined:
914                    K  -->  K->_selchars intersect C.
915   Consider two keywords equivalent if their value under this mapping is
916   the same.  This introduces an equivalence relation on the set of
917   keywords.  The equivalence classes partition the keyword set.  (At the
918   beginning, the partition is the finest possible: each K is an equivalence
919   class by itself, because all K have a different _selchars.  At the end,
920   all K have been merged into a single equivalence class.)
921   The partition before a step is always a refinement of the partition
922   after the step.
923   We choose the steps in such a way that the partition really becomes
924   broader at each step.  (A step that only chooses an _asso_values[c]
925   without changing the partition is better merged with the previous step,
926   to avoid useless backtracking.)  */
927
928struct EquivalenceClass
929{
930  /* The keywords in this equivalence class.  */
931  KeywordExt_List *     _keywords;
932  KeywordExt_List *     _keywords_last;
933  /* The number of keywords in this equivalence class.  */
934  unsigned int          _cardinality;
935  /* The undetermined selected characters for the keywords in this
936     equivalence class, as a canonically reordered multiset.  */
937  unsigned int *        _undetermined_chars;
938  unsigned int          _undetermined_chars_length;
939
940  EquivalenceClass *    _next;
941};
942
943struct Step
944{
945  /* The characters whose values are being determined in this step.  */
946  unsigned int          _changing_count;
947  unsigned int *        _changing;
948  /* Exclusive upper bound for the _asso_values[c] of this step.
949     A power of 2.  */
950  unsigned int          _asso_value_max;
951  /* The characters whose values will be determined after this step.  */
952  bool *                _undetermined;
953  /* The keyword set partition after this step.  */
954  EquivalenceClass *    _partition;
955  /* The expected number of iterations in this step.  */
956  double                _expected_lower;
957  double                _expected_upper;
958
959  Step *                _next;
960};
961
962static inline bool
963equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
964{
965  while (len > 0)
966    {
967      if (*ptr1 != *ptr2)
968        return false;
969      ptr1++;
970      ptr2++;
971      len--;
972    }
973  return true;
974}
975
976EquivalenceClass *
977Search::compute_partition (bool *undetermined) const
978{
979  EquivalenceClass *partition = NULL;
980  EquivalenceClass *partition_last = NULL;
981  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
982    {
983      KeywordExt *keyword = temp->first();
984
985      /* Compute the undetermined characters for this keyword.  */
986      unsigned int *undetermined_chars =
987        new unsigned int[keyword->_selchars_length];
988      unsigned int undetermined_chars_length = 0;
989
990      for (int i = 0; i < keyword->_selchars_length; i++)
991        if (undetermined[keyword->_selchars[i]])
992          undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
993
994      /* Look up the equivalence class to which this keyword belongs.  */
995      EquivalenceClass *equclass;
996      for (equclass = partition; equclass; equclass = equclass->_next)
997        if (equclass->_undetermined_chars_length == undetermined_chars_length
998            && equals (equclass->_undetermined_chars, undetermined_chars,
999                       undetermined_chars_length))
1000          break;
1001      if (equclass == NULL)
1002        {
1003          equclass = new EquivalenceClass();
1004          equclass->_keywords = NULL;
1005          equclass->_keywords_last = NULL;
1006          equclass->_cardinality = 0;
1007          equclass->_undetermined_chars = undetermined_chars;
1008          equclass->_undetermined_chars_length = undetermined_chars_length;
1009          equclass->_next = NULL;
1010          if (partition)
1011            partition_last->_next = equclass;
1012          else
1013            partition = equclass;
1014          partition_last = equclass;
1015        }
1016      else
1017        delete[] undetermined_chars;
1018
1019      /* Add the keyword to the equivalence class.  */
1020      KeywordExt_List *cons = new KeywordExt_List(keyword);
1021      if (equclass->_keywords)
1022        equclass->_keywords_last->rest() = cons;
1023      else
1024        equclass->_keywords = cons;
1025      equclass->_keywords_last = cons;
1026      equclass->_cardinality++;
1027    }
1028
1029  /* Free some of the allocated memory.  The caller doesn't need it.  */
1030  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1031    delete[] cls->_undetermined_chars;
1032
1033  return partition;
1034}
1035
1036static void
1037delete_partition (EquivalenceClass *partition)
1038{
1039  while (partition != NULL)
1040    {
1041      EquivalenceClass *equclass = partition;
1042      partition = equclass->_next;
1043      delete_list (equclass->_keywords);
1044      //delete[] equclass->_undetermined_chars; // already freed above
1045      delete equclass;
1046    }
1047}
1048
1049/* Compute the possible number of collisions when _asso_values[c] is
1050   chosen, leading to the given partition.  */
1051unsigned int
1052Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
1053{
1054  /* Every equivalence class p is split according to the frequency of
1055     occurrence of c, leading to equivalence classes p1, p2, ...
1056     This leads to   (|p|^2 - |p1|^2 - |p2|^2 - ...)/2  possible collisions.
1057     Return the sum of this expression over all equivalence classes.  */
1058  unsigned int sum = 0;
1059  unsigned int m = _max_selchars_length;
1060  DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1);
1061  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1062    {
1063      for (unsigned int i = 0; i <= m; i++)
1064        split_cardinalities[i] = 0;
1065
1066      for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1067        {
1068          KeywordExt *keyword = temp->first();
1069
1070          unsigned int count = 0;
1071          for (int i = 0; i < keyword->_selchars_length; i++)
1072            if (keyword->_selchars[i] == c)
1073              count++;
1074
1075          split_cardinalities[count]++;
1076        }
1077
1078      sum += cls->_cardinality * cls->_cardinality;
1079      for (unsigned int i = 0; i <= m; i++)
1080        sum -= split_cardinalities[i] * split_cardinalities[i];
1081    }
1082  FREE_DYNAMIC_ARRAY (split_cardinalities);
1083  return sum;
1084}
1085
1086/* Test whether adding c to the undetermined characters changes the given
1087   partition.  */
1088bool
1089Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
1090{
1091  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1092    {
1093      unsigned int first_count = UINT_MAX;
1094
1095      for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1096        {
1097          KeywordExt *keyword = temp->first();
1098
1099          unsigned int count = 0;
1100          for (int i = 0; i < keyword->_selchars_length; i++)
1101            if (keyword->_selchars[i] == c)
1102              count++;
1103
1104          if (temp == cls->_keywords)
1105            first_count = count;
1106          else if (count != first_count)
1107            /* c would split this equivalence class.  */
1108            return false;
1109        }
1110    }
1111  return true;
1112}
1113
1114void
1115Search::find_asso_values ()
1116{
1117  Step *steps;
1118
1119  /* Determine the steps, starting with the last one.  */
1120  {
1121    bool *undetermined;
1122    bool *determined;
1123
1124    steps = NULL;
1125
1126    undetermined = new bool[_alpha_size];
1127    for (unsigned int c = 0; c < _alpha_size; c++)
1128      undetermined[c] = false;
1129
1130    determined = new bool[_alpha_size];
1131    for (unsigned int c = 0; c < _alpha_size; c++)
1132      determined[c] = true;
1133
1134    for (;;)
1135      {
1136        /* Compute the partition that needs to be refined.  */
1137        EquivalenceClass *partition = compute_partition (undetermined);
1138
1139        /* Determine the main character to be chosen in this step.
1140           Choosing such a character c has the effect of splitting every
1141           equivalence class (according the the frequency of occurrence of c).
1142           We choose the c with the minimum number of possible collisions,
1143           so that characters which lead to a large number of collisions get
1144           handled early during the search.  */
1145        unsigned int chosen_c;
1146        unsigned int chosen_possible_collisions;
1147        {
1148          unsigned int best_c = 0;
1149          unsigned int best_possible_collisions = UINT_MAX;
1150          for (unsigned int c = 0; c < _alpha_size; c++)
1151            if (_occurrences[c] > 0 && determined[c])
1152              {
1153                unsigned int possible_collisions =
1154                  count_possible_collisions (partition, c);
1155                if (possible_collisions < best_possible_collisions)
1156                  {
1157                    best_c = c;
1158                    best_possible_collisions = possible_collisions;
1159                  }
1160              }
1161          if (best_possible_collisions == UINT_MAX)
1162            {
1163              /* All c with _occurrences[c] > 0 are undetermined.  We are
1164                 are the starting situation and don't need any more step.  */
1165              delete_partition (partition);
1166              break;
1167            }
1168          chosen_c = best_c;
1169          chosen_possible_collisions = best_possible_collisions;
1170        }
1171
1172        /* We need one more step.  */
1173        Step *step = new Step();
1174
1175        step->_undetermined = new bool[_alpha_size];
1176        memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
1177
1178        step->_partition = partition;
1179
1180        /* Now determine how the equivalence classes will be before this
1181           step.  */
1182        undetermined[chosen_c] = true;
1183        partition = compute_partition (undetermined);
1184
1185        /* Now determine which other characters should be determined in this
1186           step, because they will not change the equivalence classes at
1187           this point.  It is the set of all c which, for all equivalence
1188           classes, have the same frequency of occurrence in every keyword
1189           of the equivalence class.  */
1190        for (unsigned int c = 0; c < _alpha_size; c++)
1191          if (_occurrences[c] > 0 && determined[c]
1192              && unchanged_partition (partition, c))
1193            {
1194              undetermined[c] = true;
1195              determined[c] = false;
1196            }
1197
1198        /* main_c must be one of these.  */
1199        if (determined[chosen_c])
1200          abort ();
1201
1202        /* Now the set of changing characters of this step.  */
1203        unsigned int changing_count;
1204
1205        changing_count = 0;
1206        for (unsigned int c = 0; c < _alpha_size; c++)
1207          if (undetermined[c] && !step->_undetermined[c])
1208            changing_count++;
1209
1210        unsigned int *changing = new unsigned int[changing_count];
1211        changing_count = 0;
1212        for (unsigned int c = 0; c < _alpha_size; c++)
1213          if (undetermined[c] && !step->_undetermined[c])
1214            changing[changing_count++] = c;
1215
1216        step->_changing = changing;
1217        step->_changing_count = changing_count;
1218
1219        step->_asso_value_max = _asso_value_max;
1220
1221        step->_expected_lower =
1222          exp (static_cast<double>(chosen_possible_collisions)
1223               / static_cast<double>(_max_hash_value));
1224        step->_expected_upper =
1225          exp (static_cast<double>(chosen_possible_collisions)
1226               / static_cast<double>(_asso_value_max));
1227
1228        delete_partition (partition);
1229
1230        step->_next = steps;
1231        steps = step;
1232      }
1233
1234    delete[] determined;
1235    delete[] undetermined;
1236  }
1237
1238  if (option[DEBUG])
1239    {
1240      unsigned int stepno = 0;
1241      for (Step *step = steps; step; step = step->_next)
1242        {
1243          stepno++;
1244          fprintf (stderr, "Step %u chooses _asso_values[", stepno);
1245          for (unsigned int i = 0; i < step->_changing_count; i++)
1246            {
1247              if (i > 0)
1248                fprintf (stderr, ",");
1249              fprintf (stderr, "'%c'", step->_changing[i]);
1250            }
1251          fprintf (stderr, "], expected number of iterations between %g and %g.\n",
1252                   step->_expected_lower, step->_expected_upper);
1253          fprintf (stderr, "Keyword equivalence classes:\n");
1254          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1255            {
1256              fprintf (stderr, "\n");
1257              for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1258                {
1259                  KeywordExt *keyword = temp->first();
1260                  fprintf (stderr, "  %.*s\n",
1261                           keyword->_allchars_length, keyword->_allchars);
1262                }
1263            }
1264          fprintf (stderr, "\n");
1265        }
1266    }
1267
1268  /* Initialize _asso_values[].  (The value given here matters only
1269     for those c which occur in all keywords with equal multiplicity.)  */
1270  for (unsigned int c = 0; c < _alpha_size; c++)
1271    _asso_values[c] = 0;
1272
1273  unsigned int stepno = 0;
1274  for (Step *step = steps; step; step = step->_next)
1275    {
1276      stepno++;
1277
1278      /* Initialize the asso_values[].  */
1279      unsigned int k = step->_changing_count;
1280      for (unsigned int i = 0; i < k; i++)
1281        {
1282          unsigned int c = step->_changing[i];
1283          _asso_values[c] =
1284            (_initial_asso_value < 0 ? rand () : _initial_asso_value)
1285            & (step->_asso_value_max - 1);
1286        }
1287
1288      unsigned int iterations = 0;
1289      DYNAMIC_ARRAY (iter, unsigned int, k);
1290      for (unsigned int i = 0; i < k; i++)
1291        iter[i] = 0;
1292      unsigned int ii = (_jump != 0 ? k - 1 : 0);
1293
1294      for (;;)
1295        {
1296          /* Test whether these asso_values[] lead to collisions among
1297             the equivalence classes that should be collision-free.  */
1298          bool has_collision = false;
1299          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1300            {
1301              /* Iteration Number array is a win, O(1) initialization time!  */
1302              _collision_detector->clear ();
1303
1304              for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
1305                {
1306                  KeywordExt *keyword = ptr->first();
1307
1308                  /* Compute the new hash code for the keyword, leaving apart
1309                     the yet undetermined asso_values[].  */
1310                  int hashcode;
1311                  {
1312                    int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1313                    const unsigned int *p = keyword->_selchars;
1314                    int i = keyword->_selchars_length;
1315                    for (; i > 0; p++, i--)
1316                      if (!step->_undetermined[*p])
1317                        sum += _asso_values[*p];
1318                    hashcode = sum;
1319                  }
1320
1321                  /* See whether it collides with another keyword's hash code,
1322                     from the same equivalence class.  */
1323                  if (_collision_detector->set_bit (hashcode))
1324                    {
1325                      has_collision = true;
1326                      break;
1327                    }
1328                }
1329
1330              /* Don't need to continue looking at the other equivalence
1331                 classes if we already have found a collision.  */
1332              if (has_collision)
1333                break;
1334            }
1335
1336          iterations++;
1337          if (!has_collision)
1338            break;
1339
1340          /* Try other asso_values[].  */
1341          if (_jump != 0)
1342            {
1343              /* The way we try various values for
1344                   asso_values[step->_changing[0],...step->_changing[k-1]]
1345                 is like this:
1346                 for (bound = 0,1,...)
1347                   for (ii = 0,...,k-1)
1348                     iter[ii] := bound
1349                     iter[0..ii-1] := values <= bound
1350                     iter[ii+1..k-1] := values < bound
1351                 and
1352                   asso_values[step->_changing[i]] =
1353                     _initial_asso_value + iter[i] * _jump.
1354                 This makes it more likely to find small asso_values[].
1355               */
1356              unsigned int bound = iter[ii];
1357              unsigned int i = 0;
1358              while (i < ii)
1359                {
1360                  unsigned int c = step->_changing[i];
1361                  iter[i]++;
1362                  _asso_values[c] =
1363                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1364                  if (iter[i] <= bound)
1365                    goto found_next;
1366                  _asso_values[c] =
1367                    (_asso_values[c] - iter[i] * _jump)
1368                    & (step->_asso_value_max - 1);
1369                  iter[i] = 0;
1370                  i++;
1371                }
1372              i = ii + 1;
1373              while (i < k)
1374                {
1375                  unsigned int c = step->_changing[i];
1376                  iter[i]++;
1377                  _asso_values[c] =
1378                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1379                  if (iter[i] < bound)
1380                    goto found_next;
1381                  _asso_values[c] =
1382                    (_asso_values[c] - iter[i] * _jump)
1383                    & (step->_asso_value_max - 1);
1384                  iter[i] = 0;
1385                  i++;
1386                }
1387              /* Switch from one ii to the next.  */
1388              {
1389                unsigned int c = step->_changing[ii];
1390                _asso_values[c] =
1391                  (_asso_values[c] - bound * _jump)
1392                  & (step->_asso_value_max - 1);
1393                iter[ii] = 0;
1394              }
1395              /* Here all iter[i] == 0.  */
1396              ii++;
1397              if (ii == k)
1398                {
1399                  ii = 0;
1400                  bound++;
1401                  if (bound == step->_asso_value_max)
1402                    {
1403                      /* Out of search space!  We can either backtrack, or
1404                         increase the available search space of this step.
1405                         It seems simpler to choose the latter solution.  */
1406                      step->_asso_value_max = 2 * step->_asso_value_max;
1407                      if (step->_asso_value_max > _asso_value_max)
1408                        {
1409                          _asso_value_max = step->_asso_value_max;
1410                          /* Reinitialize _max_hash_value.  */
1411                          _max_hash_value =
1412                            (option[NOLENGTH] ? 0 : _max_key_len)
1413                            + (_asso_value_max - 1) * _max_selchars_length;
1414                          /* Reinitialize _collision_detector.  */
1415                          delete _collision_detector;
1416                          _collision_detector =
1417                            new Bool_Array (_max_hash_value + 1);
1418                        }
1419                    }
1420                }
1421              {
1422                unsigned int c = step->_changing[ii];
1423                iter[ii] = bound;
1424                _asso_values[c] =
1425                  (_asso_values[c] + bound * _jump)
1426                  & (step->_asso_value_max - 1);
1427              }
1428             found_next: ;
1429            }
1430          else
1431            {
1432              /* Random.  */
1433              unsigned int c = step->_changing[ii];
1434              _asso_values[c] =
1435                (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
1436              /* Next time, change the next c.  */
1437              ii++;
1438              if (ii == k)
1439                ii = 0;
1440            }
1441        }
1442      FREE_DYNAMIC_ARRAY (iter);
1443
1444      if (option[DEBUG])
1445        {
1446          fprintf (stderr, "Step %u chose _asso_values[", stepno);
1447          for (unsigned int i = 0; i < step->_changing_count; i++)
1448            {
1449              if (i > 0)
1450                fprintf (stderr, ",");
1451              fprintf (stderr, "'%c'", step->_changing[i]);
1452            }
1453          fprintf (stderr, "] in %u iterations.\n", iterations);
1454        }
1455    }
1456
1457  /* Free allocated memory.  */
1458  while (steps != NULL)
1459    {
1460      Step *step = steps;
1461      steps = step->_next;
1462      delete[] step->_changing;
1463      delete[] step->_undetermined;
1464      delete_partition (step->_partition);
1465      delete step;
1466    }
1467}
1468
1469/* Computes a keyword's hash value, relative to the current _asso_values[],
1470   and stores it in keyword->_hash_value.  */
1471
1472inline int
1473Search::compute_hash (KeywordExt *keyword) const
1474{
1475  int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1476
1477  const unsigned int *p = keyword->_selchars;
1478  int i = keyword->_selchars_length;
1479  for (; i > 0; p++, i--)
1480    sum += _asso_values[*p];
1481
1482  return keyword->_hash_value = sum;
1483}
1484
1485/* Finds good _asso_values[].  */
1486
1487void
1488Search::find_good_asso_values ()
1489{
1490  prepare_asso_values ();
1491
1492  /* Search for good _asso_values[].  */
1493  int asso_iteration;
1494  if ((asso_iteration = option.get_asso_iterations ()) == 0)
1495    /* Try only the given _initial_asso_value and _jump.  */
1496    find_asso_values ();
1497  else
1498    {
1499      /* Try different pairs of _initial_asso_value and _jump, in the
1500         following order:
1501           (0, 1)
1502           (1, 1)
1503           (2, 1) (0, 3)
1504           (3, 1) (1, 3)
1505           (4, 1) (2, 3) (0, 5)
1506           (5, 1) (3, 3) (1, 5)
1507           ..... */
1508      KeywordExt_List *saved_head = _head;
1509      int best_initial_asso_value = 0;
1510      int best_jump = 1;
1511      int *best_asso_values = new int[_alpha_size];
1512      int best_collisions = INT_MAX;
1513      int best_max_hash_value = INT_MAX;
1514
1515      _initial_asso_value = 0; _jump = 1;
1516      for (;;)
1517        {
1518          /* Restore the keyword list in its original order.  */
1519          _head = copy_list (saved_head);
1520          /* Find good _asso_values[].  */
1521          find_asso_values ();
1522          /* Test whether it is the best solution so far.  */
1523          int collisions = 0;
1524          int max_hash_value = INT_MIN;
1525          _collision_detector->clear ();
1526          for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1527            {
1528              KeywordExt *keyword = ptr->first();
1529              int hashcode = compute_hash (keyword);
1530              if (max_hash_value < hashcode)
1531                max_hash_value = hashcode;
1532              if (_collision_detector->set_bit (hashcode))
1533                collisions++;
1534            }
1535          if (collisions < best_collisions
1536              || (collisions == best_collisions
1537                  && max_hash_value < best_max_hash_value))
1538            {
1539              memcpy (best_asso_values, _asso_values,
1540                      _alpha_size * sizeof (_asso_values[0]));
1541              best_collisions = collisions;
1542              best_max_hash_value = max_hash_value;
1543            }
1544          /* Delete the copied keyword list.  */
1545          delete_list (_head);
1546
1547          if (--asso_iteration == 0)
1548            break;
1549          /* Prepare for next iteration.  */
1550          if (_initial_asso_value >= 2)
1551            _initial_asso_value -= 2, _jump += 2;
1552          else
1553            _initial_asso_value += _jump, _jump = 1;
1554        }
1555      _head = saved_head;
1556      /* Install the best found asso_values.  */
1557      _initial_asso_value = best_initial_asso_value;
1558      _jump = best_jump;
1559      memcpy (_asso_values, best_asso_values,
1560              _alpha_size * sizeof (_asso_values[0]));
1561      delete[] best_asso_values;
1562      /* The keywords' _hash_value fields are recomputed below.  */
1563    }
1564}
1565
1566/* ========================================================================= */
1567
1568/* Comparison function for sorting by increasing _hash_value.  */
1569static bool
1570less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
1571{
1572  return keyword1->_hash_value < keyword2->_hash_value;
1573}
1574
1575/* Sorts the keyword list by hash value.  */
1576
1577void
1578Search::sort ()
1579{
1580  _head = mergesort_list (_head, less_by_hash_value);
1581}
1582
1583void
1584Search::optimize ()
1585{
1586  /* Preparations.  */
1587  prepare ();
1588
1589  /* Step 1: Finding good byte positions.  */
1590  find_positions ();
1591
1592  /* Step 2: Finding good alpha increments.  */
1593  find_alpha_inc ();
1594
1595  /* Step 3: Finding good asso_values.  */
1596  find_good_asso_values ();
1597
1598  /* Make one final check, just to make sure nothing weird happened.... */
1599  _collision_detector->clear ();
1600  for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
1601    {
1602      KeywordExt *curr = curr_ptr->first();
1603      unsigned int hashcode = compute_hash (curr);
1604      if (_collision_detector->set_bit (hashcode))
1605        {
1606          /* This shouldn't happen.  proj1, proj2, proj3 must have been
1607             computed to be injective on the given keyword set.  */
1608          fprintf (stderr,
1609                   "\nInternal error, unexpected duplicate hash code\n");
1610          if (option[POSITIONS])
1611            fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
1612          else
1613            fprintf (stderr, "try options -m or -r.\n\n");
1614          exit (1);
1615        }
1616    }
1617
1618  /* Sorts the keyword list by hash value.  */
1619  sort ();
1620
1621  /* Set unused asso_values[c] to max_hash_value + 1.  This is not absolutely
1622     necessary, but speeds up the lookup function in many cases of lookup
1623     failure: no string comparison is needed once the hash value of a string
1624     is larger than the hash value of any keyword.  */
1625  int max_hash_value;
1626  {
1627    KeywordExt_List *temp;
1628    for (temp = _head; temp->rest(); temp = temp->rest())
1629      ;
1630    max_hash_value = temp->first()->_hash_value;
1631  }
1632  for (unsigned int c = 0; c < _alpha_size; c++)
1633    if (_occurrences[c] == 0)
1634      _asso_values[c] = max_hash_value + 1;
1635
1636  /* Propagate unified asso_values.  */
1637  if (_alpha_unify)
1638    for (unsigned int c = 0; c < _alpha_size; c++)
1639      if (_alpha_unify[c] != c)
1640        _asso_values[c] = _asso_values[_alpha_unify[c]];
1641}
1642
1643/* Prints out some diagnostics upon completion.  */
1644
1645Search::~Search ()
1646{
1647  delete _collision_detector;
1648  if (option[DEBUG])
1649    {
1650      fprintf (stderr, "\ndumping occurrence and associated values tables\n");
1651
1652      for (unsigned int i = 0; i < _alpha_size; i++)
1653        if (_occurrences[i])
1654          fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
1655                   i, _asso_values[i], i, _occurrences[i]);
1656
1657      fprintf (stderr, "end table dumping\n");
1658
1659      fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
1660               "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
1661               _list_len, _total_keys, _total_duplicates, _max_key_len);
1662
1663      int field_width = _max_selchars_length;
1664      fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
1665               field_width, "selchars");
1666      for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1667        {
1668          fprintf (stderr, "%11d,%11d,%6d, ",
1669                   ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
1670          if (field_width > ptr->first()->_selchars_length)
1671            fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
1672          for (int j = 0; j < ptr->first()->_selchars_length; j++)
1673            putc (ptr->first()->_selchars[j], stderr);
1674          fprintf (stderr, ", %.*s\n",
1675                   ptr->first()->_allchars_length, ptr->first()->_allchars);
1676        }
1677
1678      fprintf (stderr, "End dumping list.\n\n");
1679    }
1680  delete[] _asso_values;
1681  delete[] _occurrences;
1682  delete[] _alpha_unify;
1683  delete[] _alpha_inc;
1684}
1685