1228060Sbapt/* Search algorithm.
2228060Sbapt   Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
3228060Sbapt   Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
4228060Sbapt   and Bruno Haible <bruno@clisp.org>.
5228060Sbapt
6228060Sbapt   This file is part of GNU GPERF.
7228060Sbapt
8228060Sbapt   GNU GPERF is free software; you can redistribute it and/or modify
9228060Sbapt   it under the terms of the GNU General Public License as published by
10228060Sbapt   the Free Software Foundation; either version 2, or (at your option)
11228060Sbapt   any later version.
12228060Sbapt
13228060Sbapt   GNU GPERF is distributed in the hope that it will be useful,
14228060Sbapt   but WITHOUT ANY WARRANTY; without even the implied warranty of
15228060Sbapt   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16228060Sbapt   GNU General Public License for more details.
17228060Sbapt
18228060Sbapt   You should have received a copy of the GNU General Public License
19228060Sbapt   along with this program; see the file COPYING.
20228060Sbapt   If not, write to the Free Software Foundation, Inc.,
21228060Sbapt   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
22228060Sbapt
23228060Sbapt/* Specification. */
24228060Sbapt#include "search.h"
25228060Sbapt
26228060Sbapt#include <stdio.h>
27228060Sbapt#include <stdlib.h> /* declares exit(), rand(), srand() */
28228060Sbapt#include <string.h> /* declares memset(), memcmp() */
29228060Sbapt#include <time.h> /* declares time() */
30228060Sbapt#include <math.h> /* declares exp() */
31228060Sbapt#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
32228060Sbapt#include "options.h"
33228060Sbapt#include "hash-table.h"
34228060Sbapt#include "config.h"
35228060Sbapt
36228060Sbapt/* ============================== Portability ============================== */
37228060Sbapt
38228060Sbapt/* Assume ISO C++ 'for' scoping rule.  */
39228060Sbapt#define for if (0) ; else for
40228060Sbapt
41228060Sbapt/* Dynamically allocated array with dynamic extent:
42228060Sbapt
43228060Sbapt   Example:
44228060Sbapt       DYNAMIC_ARRAY (my_array, int, n);
45228060Sbapt       ...
46228060Sbapt       FREE_DYNAMIC_ARRAY (my_array);
47228060Sbapt
48228060Sbapt   Attention: depending on your implementation my_array is either the array
49228060Sbapt   itself or a pointer to the array! Always use my_array only as expression!
50228060Sbapt */
51228060Sbapt#if HAVE_DYNAMIC_ARRAY
52228060Sbapt  #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size]
53228060Sbapt  #define FREE_DYNAMIC_ARRAY(var)
54228060Sbapt#else
55228060Sbapt  #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size]
56228060Sbapt  #define FREE_DYNAMIC_ARRAY(var) delete[] var
57228060Sbapt#endif
58228060Sbapt
59228060Sbapt/* ================================ Theory ================================= */
60228060Sbapt
61228060Sbapt/* The general form of the hash function is
62228060Sbapt
63228060Sbapt      hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
64228060Sbapt                       + len (keyword)
65228060Sbapt
66228060Sbapt   where Pos is a set of byte positions,
67228060Sbapt   each alpha_inc[i] is a nonnegative integer,
68228060Sbapt   each asso_values[c] is a nonnegative integer,
69228060Sbapt   len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
70228060Sbapt
71228060Sbapt   Theorem 1: If all keywords are different, there is a set Pos such that
72228060Sbapt   all tuples (keyword[i] : i in Pos) are different.
73228060Sbapt
74228060Sbapt   Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
75228060Sbapt   are nonnegative integers alpha_inc[i] such that all multisets
76228060Sbapt   {keyword[i] + alpha_inc[i] : i in Pos} are different.
77228060Sbapt
78228060Sbapt   Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
79228060Sbapt
80228060Sbapt   Theorem 3: If all multisets selchars[keyword] are different, there are
81228060Sbapt   nonnegative integers asso_values[c] such that all hash values
82228060Sbapt   sum (asso_values[c] : c in selchars[keyword]) are different.
83228060Sbapt
84228060Sbapt   Based on these three facts, we find the hash function in three steps:
85228060Sbapt
86228060Sbapt   Step 1 (Finding good byte positions):
87228060Sbapt   Find a set Pos, as small as possible, such that all tuples
88228060Sbapt   (keyword[i] : i in Pos) are different.
89228060Sbapt
90228060Sbapt   Step 2 (Finding good alpha increments):
91228060Sbapt   Find nonnegative integers alpha_inc[i], as many of them as possible being
92228060Sbapt   zero, and the others being as small as possible, such that all multisets
93228060Sbapt   {keyword[i] + alpha_inc[i] : i in Pos} are different.
94228060Sbapt
95228060Sbapt   Step 3 (Finding good asso_values):
96228060Sbapt   Find asso_values[c] such that all hash (keyword) are different.
97228060Sbapt
98228060Sbapt   In other words, each step finds a projection that is injective on the
99228060Sbapt   given finite set:
100228060Sbapt     proj1 : String --> Map (Pos --> N)
101228060Sbapt     proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
102228060Sbapt     proj3 : Map (Pos --> N) / S(Pos) --> N
103228060Sbapt   where
104228060Sbapt     N denotes the set of nonnegative integers,
105228060Sbapt     Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
106228060Sbapt     S(Pos) is the symmetric group over Pos.
107228060Sbapt
108228060Sbapt   This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
109228060Sbapt   modifications apply:
110228060Sbapt     proj1 : String --> Map (Pos --> N) x N
111228060Sbapt     proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
112228060Sbapt     proj3 : Map (Pos --> N) / S(Pos) x N --> N
113228060Sbapt
114228060Sbapt   For a case-insensitive hash function, the general form is
115228060Sbapt
116228060Sbapt      hash (keyword) =
117228060Sbapt        sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
118228060Sbapt        + len (keyword)
119228060Sbapt
120228060Sbapt   where alpha_unify[c] is chosen so that an upper/lower case change in
121228060Sbapt   keyword[i] doesn't change  alpha_unify[keyword[i] + alpha_inc[i]].
122228060Sbapt */
123228060Sbapt
124228060Sbapt/* ==================== Initialization and Preparation ===================== */
125228060Sbapt
126228060SbaptSearch::Search (KeywordExt_List *list)
127228060Sbapt  : _head (list)
128228060Sbapt{
129228060Sbapt}
130228060Sbapt
131228060Sbaptvoid
132228060SbaptSearch::prepare ()
133228060Sbapt{
134228060Sbapt  /* Compute the total number of keywords.  */
135228060Sbapt  _total_keys = 0;
136228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
137228060Sbapt    _total_keys++;
138228060Sbapt
139228060Sbapt  /* Compute the minimum and maximum keyword length.  */
140228060Sbapt  _max_key_len = INT_MIN;
141228060Sbapt  _min_key_len = INT_MAX;
142228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
143228060Sbapt    {
144228060Sbapt      KeywordExt *keyword = temp->first();
145228060Sbapt
146228060Sbapt      if (_max_key_len < keyword->_allchars_length)
147228060Sbapt        _max_key_len = keyword->_allchars_length;
148228060Sbapt      if (_min_key_len > keyword->_allchars_length)
149228060Sbapt        _min_key_len = keyword->_allchars_length;
150228060Sbapt    }
151228060Sbapt
152228060Sbapt  /* Exit program if an empty string is used as keyword, since the comparison
153228060Sbapt     expressions don't work correctly for looking up an empty string.  */
154228060Sbapt  if (_min_key_len == 0)
155228060Sbapt    {
156228060Sbapt      fprintf (stderr, "Empty input keyword is not allowed.\n"
157228060Sbapt               "To recognize an empty input keyword, your code should check for\n"
158228060Sbapt               "len == 0 before calling the gperf generated lookup function.\n");
159228060Sbapt      exit (1);
160228060Sbapt    }
161228060Sbapt
162228060Sbapt  /* Exit program if the characters in the keywords are not in the required
163228060Sbapt     range.  */
164228060Sbapt  if (option[SEVENBIT])
165228060Sbapt    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
166228060Sbapt      {
167228060Sbapt        KeywordExt *keyword = temp->first();
168228060Sbapt
169228060Sbapt        const char *k = keyword->_allchars;
170228060Sbapt        for (int i = keyword->_allchars_length; i > 0; k++, i--)
171228060Sbapt          if (!(static_cast<unsigned char>(*k) < 128))
172228060Sbapt            {
173228060Sbapt              fprintf (stderr, "Option --seven-bit has been specified,\n"
174228060Sbapt                       "but keyword \"%.*s\" contains non-ASCII characters.\n"
175228060Sbapt                       "Try removing option --seven-bit.\n",
176228060Sbapt                       keyword->_allchars_length, keyword->_allchars);
177228060Sbapt              exit (1);
178228060Sbapt            }
179228060Sbapt      }
180228060Sbapt}
181228060Sbapt
182228060Sbapt/* ====================== Finding good byte positions ====================== */
183228060Sbapt
184228060Sbapt/* Computes the upper bound on the indices passed to asso_values[],
185228060Sbapt   assuming no alpha_increments.  */
186228060Sbaptunsigned int
187228060SbaptSearch::compute_alpha_size () const
188228060Sbapt{
189228060Sbapt  return (option[SEVENBIT] ? 128 : 256);
190228060Sbapt}
191228060Sbapt
192228060Sbapt/* Computes the unification rules between different asso_values[c],
193228060Sbapt   assuming no alpha_increments.  */
194228060Sbaptunsigned int *
195228060SbaptSearch::compute_alpha_unify () const
196228060Sbapt{
197228060Sbapt  if (option[UPPERLOWER])
198228060Sbapt    {
199228060Sbapt      /* Uppercase to lowercase mapping.  */
200228060Sbapt      unsigned int alpha_size = compute_alpha_size();
201228060Sbapt      unsigned int *alpha_unify = new unsigned int[alpha_size];
202228060Sbapt      for (unsigned int c = 0; c < alpha_size; c++)
203228060Sbapt        alpha_unify[c] = c;
204228060Sbapt      for (unsigned int c = 'A'; c <= 'Z'; c++)
205228060Sbapt        alpha_unify[c] = c + ('a'-'A');
206228060Sbapt      return alpha_unify;
207228060Sbapt    }
208228060Sbapt  else
209228060Sbapt    /* Identity mapping.  */
210228060Sbapt    return NULL;
211228060Sbapt}
212228060Sbapt
213228060Sbapt/* Initializes each keyword's _selchars array.  */
214228060Sbaptvoid
215228060SbaptSearch::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
216228060Sbapt{
217228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
218228060Sbapt    temp->first()->init_selchars_tuple(positions, alpha_unify);
219228060Sbapt}
220228060Sbapt
221228060Sbapt/* Deletes each keyword's _selchars array.  */
222228060Sbaptvoid
223228060SbaptSearch::delete_selchars () const
224228060Sbapt{
225228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
226228060Sbapt    temp->first()->delete_selchars();
227228060Sbapt}
228228060Sbapt
229228060Sbapt/* Count the duplicate keywords that occur with a given set of positions.
230228060Sbapt   In other words, it returns the difference
231228060Sbapt     # K - # proj1 (K)
232228060Sbapt   where K is the multiset of given keywords.  */
233228060Sbaptunsigned int
234228060SbaptSearch::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
235228060Sbapt{
236228060Sbapt  /* Run through the keyword list and count the duplicates incrementally.
237228060Sbapt     The result does not depend on the order of the keyword list, thanks to
238228060Sbapt     the formula above.  */
239228060Sbapt  init_selchars_tuple (positions, alpha_unify);
240228060Sbapt
241228060Sbapt  unsigned int count = 0;
242228060Sbapt  {
243228060Sbapt    Hash_Table representatives (_total_keys, option[NOLENGTH]);
244228060Sbapt    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
245228060Sbapt      {
246228060Sbapt        KeywordExt *keyword = temp->first();
247228060Sbapt        if (representatives.insert (keyword))
248228060Sbapt          count++;
249228060Sbapt      }
250228060Sbapt  }
251228060Sbapt
252228060Sbapt  delete_selchars ();
253228060Sbapt
254228060Sbapt  return count;
255228060Sbapt}
256228060Sbapt
257228060Sbapt/* Find good key positions.  */
258228060Sbapt
259228060Sbaptvoid
260228060SbaptSearch::find_positions ()
261228060Sbapt{
262228060Sbapt  /* If the user gave the key positions, we use them.  */
263228060Sbapt  if (option[POSITIONS])
264228060Sbapt    {
265228060Sbapt      _key_positions = option.get_key_positions();
266228060Sbapt      return;
267228060Sbapt    }
268228060Sbapt
269228060Sbapt  /* Compute preliminary alpha_unify table.  */
270228060Sbapt  unsigned int *alpha_unify = compute_alpha_unify ();
271228060Sbapt
272228060Sbapt  /* 1. Find positions that must occur in order to distinguish duplicates.  */
273228060Sbapt  Positions mandatory;
274228060Sbapt
275228060Sbapt  if (!option[DUP])
276228060Sbapt    {
277228060Sbapt      for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
278228060Sbapt        {
279228060Sbapt          KeywordExt *keyword1 = l1->first();
280228060Sbapt          for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
281228060Sbapt            {
282228060Sbapt              KeywordExt *keyword2 = l2->first();
283228060Sbapt
284228060Sbapt              /* If keyword1 and keyword2 have the same length and differ
285228060Sbapt                 in just one position, and it is not the last character,
286228060Sbapt                 this position is mandatory.  */
287228060Sbapt              if (keyword1->_allchars_length == keyword2->_allchars_length)
288228060Sbapt                {
289228060Sbapt                  int n = keyword1->_allchars_length;
290228060Sbapt                  int i;
291228060Sbapt                  for (i = 0; i < n - 1; i++)
292228060Sbapt                    {
293228060Sbapt                      unsigned char c1 = keyword1->_allchars[i];
294228060Sbapt                      unsigned char c2 = keyword2->_allchars[i];
295228060Sbapt                      if (option[UPPERLOWER])
296228060Sbapt                        {
297228060Sbapt                          if (c1 >= 'A' && c1 <= 'Z')
298228060Sbapt                            c1 += 'a' - 'A';
299228060Sbapt                          if (c2 >= 'A' && c2 <= 'Z')
300228060Sbapt                            c2 += 'a' - 'A';
301228060Sbapt                        }
302228060Sbapt                      if (c1 != c2)
303228060Sbapt                        break;
304228060Sbapt                    }
305228060Sbapt                  if (i < n - 1)
306228060Sbapt                    {
307228060Sbapt                      int j;
308228060Sbapt                      for (j = i + 1; j < n; j++)
309228060Sbapt                        {
310228060Sbapt                          unsigned char c1 = keyword1->_allchars[j];
311228060Sbapt                          unsigned char c2 = keyword2->_allchars[j];
312228060Sbapt                          if (option[UPPERLOWER])
313228060Sbapt                            {
314228060Sbapt                              if (c1 >= 'A' && c1 <= 'Z')
315228060Sbapt                                c1 += 'a' - 'A';
316228060Sbapt                              if (c2 >= 'A' && c2 <= 'Z')
317228060Sbapt                                c2 += 'a' - 'A';
318228060Sbapt                            }
319228060Sbapt                          if (c1 != c2)
320228060Sbapt                            break;
321228060Sbapt                        }
322228060Sbapt                      if (j >= n)
323228060Sbapt                        {
324228060Sbapt                          /* Position i is mandatory.  */
325228060Sbapt                          if (!mandatory.contains (i))
326228060Sbapt                            mandatory.add (i);
327228060Sbapt                        }
328228060Sbapt                    }
329228060Sbapt                }
330228060Sbapt            }
331228060Sbapt        }
332228060Sbapt    }
333228060Sbapt
334228060Sbapt  /* 2. Add positions, as long as this decreases the duplicates count.  */
335228060Sbapt  int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
336228060Sbapt              ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
337228060Sbapt  Positions current = mandatory;
338228060Sbapt  unsigned int current_duplicates_count =
339228060Sbapt    count_duplicates_tuple (current, alpha_unify);
340228060Sbapt  for (;;)
341228060Sbapt    {
342228060Sbapt      Positions best;
343228060Sbapt      unsigned int best_duplicates_count = UINT_MAX;
344228060Sbapt
345228060Sbapt      for (int i = imax; i >= -1; i--)
346228060Sbapt        if (!current.contains (i))
347228060Sbapt          {
348228060Sbapt            Positions tryal = current;
349228060Sbapt            tryal.add (i);
350228060Sbapt            unsigned int try_duplicates_count =
351228060Sbapt              count_duplicates_tuple (tryal, alpha_unify);
352228060Sbapt
353228060Sbapt            /* We prefer 'try' to 'best' if it produces less duplicates,
354228060Sbapt               or if it produces the same number of duplicates but with
355228060Sbapt               a more efficient hash function.  */
356228060Sbapt            if (try_duplicates_count < best_duplicates_count
357228060Sbapt                || (try_duplicates_count == best_duplicates_count && i >= 0))
358228060Sbapt              {
359228060Sbapt                best = tryal;
360228060Sbapt                best_duplicates_count = try_duplicates_count;
361228060Sbapt              }
362228060Sbapt          }
363228060Sbapt
364228060Sbapt      /* Stop adding positions when it gives no improvement.  */
365228060Sbapt      if (best_duplicates_count >= current_duplicates_count)
366228060Sbapt        break;
367228060Sbapt
368228060Sbapt      current = best;
369228060Sbapt      current_duplicates_count = best_duplicates_count;
370228060Sbapt    }
371228060Sbapt
372228060Sbapt  /* 3. Remove positions, as long as this doesn't increase the duplicates
373228060Sbapt     count.  */
374228060Sbapt  for (;;)
375228060Sbapt    {
376228060Sbapt      Positions best;
377228060Sbapt      unsigned int best_duplicates_count = UINT_MAX;
378228060Sbapt
379228060Sbapt      for (int i = imax; i >= -1; i--)
380228060Sbapt        if (current.contains (i) && !mandatory.contains (i))
381228060Sbapt          {
382228060Sbapt            Positions tryal = current;
383228060Sbapt            tryal.remove (i);
384228060Sbapt            unsigned int try_duplicates_count =
385228060Sbapt              count_duplicates_tuple (tryal, alpha_unify);
386228060Sbapt
387228060Sbapt            /* We prefer 'try' to 'best' if it produces less duplicates,
388228060Sbapt               or if it produces the same number of duplicates but with
389228060Sbapt               a more efficient hash function.  */
390228060Sbapt            if (try_duplicates_count < best_duplicates_count
391228060Sbapt                || (try_duplicates_count == best_duplicates_count && i == -1))
392228060Sbapt              {
393228060Sbapt                best = tryal;
394228060Sbapt                best_duplicates_count = try_duplicates_count;
395228060Sbapt              }
396228060Sbapt          }
397228060Sbapt
398228060Sbapt      /* Stop removing positions when it gives no improvement.  */
399228060Sbapt      if (best_duplicates_count > current_duplicates_count)
400228060Sbapt        break;
401228060Sbapt
402228060Sbapt      current = best;
403228060Sbapt      current_duplicates_count = best_duplicates_count;
404228060Sbapt    }
405228060Sbapt
406228060Sbapt  /* 4. Replace two positions by one, as long as this doesn't increase the
407228060Sbapt     duplicates count.  */
408228060Sbapt  for (;;)
409228060Sbapt    {
410228060Sbapt      Positions best;
411228060Sbapt      unsigned int best_duplicates_count = UINT_MAX;
412228060Sbapt
413228060Sbapt      for (int i1 = imax; i1 >= -1; i1--)
414228060Sbapt        if (current.contains (i1) && !mandatory.contains (i1))
415228060Sbapt          for (int i2 = imax; i2 >= -1; i2--)
416228060Sbapt            if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
417228060Sbapt              for (int i3 = imax; i3 >= 0; i3--)
418228060Sbapt                if (!current.contains (i3))
419228060Sbapt                  {
420228060Sbapt                    Positions tryal = current;
421228060Sbapt                    tryal.remove (i1);
422228060Sbapt                    tryal.remove (i2);
423228060Sbapt                    tryal.add (i3);
424228060Sbapt                    unsigned int try_duplicates_count =
425228060Sbapt                      count_duplicates_tuple (tryal, alpha_unify);
426228060Sbapt
427228060Sbapt                    /* We prefer 'try' to 'best' if it produces less duplicates,
428228060Sbapt                       or if it produces the same number of duplicates but with
429228060Sbapt                       a more efficient hash function.  */
430228060Sbapt                    if (try_duplicates_count < best_duplicates_count
431228060Sbapt                        || (try_duplicates_count == best_duplicates_count
432228060Sbapt                            && (i1 == -1 || i2 == -1 || i3 >= 0)))
433228060Sbapt                      {
434228060Sbapt                        best = tryal;
435228060Sbapt                        best_duplicates_count = try_duplicates_count;
436228060Sbapt                      }
437228060Sbapt                  }
438228060Sbapt
439228060Sbapt      /* Stop removing positions when it gives no improvement.  */
440228060Sbapt      if (best_duplicates_count > current_duplicates_count)
441228060Sbapt        break;
442228060Sbapt
443228060Sbapt      current = best;
444228060Sbapt      current_duplicates_count = best_duplicates_count;
445228060Sbapt    }
446228060Sbapt
447228060Sbapt  /* That's it.  Hope it's good enough.  */
448228060Sbapt  _key_positions = current;
449228060Sbapt
450228060Sbapt  if (option[DEBUG])
451228060Sbapt    {
452228060Sbapt      /* Print the result.  */
453228060Sbapt      fprintf (stderr, "\nComputed positions: ");
454228060Sbapt      PositionReverseIterator iter = _key_positions.reviterator();
455228060Sbapt      bool seen_lastchar = false;
456228060Sbapt      bool first = true;
457228060Sbapt      for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
458228060Sbapt        {
459228060Sbapt          if (!first)
460228060Sbapt            fprintf (stderr, ", ");
461228060Sbapt          if (i == Positions::LASTCHAR)
462228060Sbapt            seen_lastchar = true;
463228060Sbapt          else
464228060Sbapt            {
465228060Sbapt              fprintf (stderr, "%d", i + 1);
466228060Sbapt              first = false;
467228060Sbapt            }
468228060Sbapt        }
469228060Sbapt      if (seen_lastchar)
470228060Sbapt        {
471228060Sbapt          if (!first)
472228060Sbapt            fprintf (stderr, ", ");
473228060Sbapt          fprintf (stderr, "$");
474228060Sbapt        }
475228060Sbapt      fprintf (stderr, "\n");
476228060Sbapt    }
477228060Sbapt
478228060Sbapt  /* Free preliminary alpha_unify table.  */
479228060Sbapt  delete[] alpha_unify;
480228060Sbapt}
481228060Sbapt
482228060Sbapt/* Count the duplicate keywords that occur with the found set of positions.
483228060Sbapt   In other words, it returns the difference
484228060Sbapt     # K - # proj1 (K)
485228060Sbapt   where K is the multiset of given keywords.  */
486228060Sbaptunsigned int
487228060SbaptSearch::count_duplicates_tuple () const
488228060Sbapt{
489228060Sbapt  unsigned int *alpha_unify = compute_alpha_unify ();
490228060Sbapt  unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
491228060Sbapt  delete[] alpha_unify;
492228060Sbapt  return count;
493228060Sbapt}
494228060Sbapt
495228060Sbapt/* ===================== Finding good alpha increments ===================== */
496228060Sbapt
497228060Sbapt/* Computes the upper bound on the indices passed to asso_values[].  */
498228060Sbaptunsigned int
499228060SbaptSearch::compute_alpha_size (const unsigned int *alpha_inc) const
500228060Sbapt{
501228060Sbapt  unsigned int max_alpha_inc = 0;
502228060Sbapt  for (int i = 0; i < _max_key_len; i++)
503228060Sbapt    if (max_alpha_inc < alpha_inc[i])
504228060Sbapt      max_alpha_inc = alpha_inc[i];
505228060Sbapt  return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
506228060Sbapt}
507228060Sbapt
508228060Sbapt/* Computes the unification rules between different asso_values[c].  */
509228060Sbaptunsigned int *
510228060SbaptSearch::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
511228060Sbapt{
512228060Sbapt  if (option[UPPERLOWER])
513228060Sbapt    {
514228060Sbapt      /* Without alpha increments, we would simply unify
515228060Sbapt           'A' -> 'a', ..., 'Z' -> 'z'.
516228060Sbapt         But when a keyword contains at position i a character c,
517228060Sbapt         we have the constraint
518228060Sbapt            asso_values[tolower(c) + alpha_inc[i]] ==
519228060Sbapt            asso_values[toupper(c) + alpha_inc[i]].
520228060Sbapt         This introduces a unification
521228060Sbapt           toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
522228060Sbapt         Note that this unification can extend outside the range of
523228060Sbapt         ASCII letters!  But still every unified character pair is at
524228060Sbapt         a distance of 'a'-'A' = 32, or (after chained unification)
525228060Sbapt         at a multiple of 32.  So in the end the alpha_unify vector has
526228060Sbapt         the form    c -> c + 32 * f(c)   where f(c) is a nonnegative
527228060Sbapt         integer.  */
528228060Sbapt      unsigned int alpha_size = compute_alpha_size (alpha_inc);
529228060Sbapt
530228060Sbapt      unsigned int *alpha_unify = new unsigned int[alpha_size];
531228060Sbapt      for (unsigned int c = 0; c < alpha_size; c++)
532228060Sbapt        alpha_unify[c] = c;
533228060Sbapt
534228060Sbapt      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
535228060Sbapt        {
536228060Sbapt          KeywordExt *keyword = temp->first();
537228060Sbapt
538228060Sbapt          /* Iterate through the selected character positions.  */
539228060Sbapt          PositionIterator iter = positions.iterator(keyword->_allchars_length);
540228060Sbapt
541228060Sbapt          for (int i; (i = iter.next ()) != PositionIterator::EOS; )
542228060Sbapt            {
543228060Sbapt              unsigned int c;
544228060Sbapt              if (i == Positions::LASTCHAR)
545228060Sbapt                c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
546228060Sbapt              else if (i < keyword->_allchars_length)
547228060Sbapt                c = static_cast<unsigned char>(keyword->_allchars[i]);
548228060Sbapt              else
549228060Sbapt                abort ();
550228060Sbapt              if (c >= 'A' && c <= 'Z')
551228060Sbapt                c += 'a' - 'A';
552228060Sbapt              if (c >= 'a' && c <= 'z')
553228060Sbapt                {
554228060Sbapt                  if (i != Positions::LASTCHAR)
555228060Sbapt                    c += alpha_inc[i];
556228060Sbapt                  /* Unify c with c - ('a'-'A').  */
557228060Sbapt                  unsigned int d = alpha_unify[c];
558228060Sbapt                  unsigned int b = c - ('a'-'A');
559228060Sbapt                  for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
560228060Sbapt                    alpha_unify[a] = d;
561228060Sbapt                }
562228060Sbapt            }
563228060Sbapt        }
564228060Sbapt      return alpha_unify;
565228060Sbapt    }
566228060Sbapt  else
567228060Sbapt    /* Identity mapping.  */
568228060Sbapt    return NULL;
569228060Sbapt}
570228060Sbapt
571228060Sbapt/* Initializes each keyword's _selchars array.  */
572228060Sbaptvoid
573228060SbaptSearch::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
574228060Sbapt{
575228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
576228060Sbapt    temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
577228060Sbapt}
578228060Sbapt
579228060Sbapt/* Count the duplicate keywords that occur with the given set of positions
580228060Sbapt   and a given alpha_inc[] array.
581228060Sbapt   In other words, it returns the difference
582228060Sbapt     # K - # proj2 (proj1 (K))
583228060Sbapt   where K is the multiset of given keywords.  */
584228060Sbaptunsigned int
585228060SbaptSearch::count_duplicates_multiset (const unsigned int *alpha_inc) const
586228060Sbapt{
587228060Sbapt  /* Run through the keyword list and count the duplicates incrementally.
588228060Sbapt     The result does not depend on the order of the keyword list, thanks to
589228060Sbapt     the formula above.  */
590228060Sbapt  unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
591228060Sbapt  init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
592228060Sbapt
593228060Sbapt  unsigned int count = 0;
594228060Sbapt  {
595228060Sbapt    Hash_Table representatives (_total_keys, option[NOLENGTH]);
596228060Sbapt    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
597228060Sbapt      {
598228060Sbapt        KeywordExt *keyword = temp->first();
599228060Sbapt        if (representatives.insert (keyword))
600228060Sbapt          count++;
601228060Sbapt      }
602228060Sbapt  }
603228060Sbapt
604228060Sbapt  delete_selchars ();
605228060Sbapt  delete[] alpha_unify;
606228060Sbapt
607228060Sbapt  return count;
608228060Sbapt}
609228060Sbapt
610228060Sbapt/* Find good _alpha_inc[].  */
611228060Sbapt
612228060Sbaptvoid
613228060SbaptSearch::find_alpha_inc ()
614228060Sbapt{
615228060Sbapt  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
616228060Sbapt     artificial duplicates.
617228060Sbapt     In other words, the goal is  # proj2 (proj1 (K)) = # proj1 (K).  */
618228060Sbapt  unsigned int duplicates_goal = count_duplicates_tuple ();
619228060Sbapt
620228060Sbapt  /* Start with zero increments.  This is sufficient in most cases.  */
621228060Sbapt  unsigned int *current = new unsigned int [_max_key_len];
622228060Sbapt  for (int i = 0; i < _max_key_len; i++)
623228060Sbapt    current[i] = 0;
624228060Sbapt  unsigned int current_duplicates_count = count_duplicates_multiset (current);
625228060Sbapt
626228060Sbapt  if (current_duplicates_count > duplicates_goal)
627228060Sbapt    {
628228060Sbapt      /* Look which _alpha_inc[i] we are free to increment.  */
629228060Sbapt      unsigned int nindices;
630228060Sbapt      {
631228060Sbapt        nindices = 0;
632228060Sbapt        PositionIterator iter = _key_positions.iterator(_max_key_len);
633228060Sbapt        for (;;)
634228060Sbapt          {
635228060Sbapt            int key_pos = iter.next ();
636228060Sbapt            if (key_pos == PositionIterator::EOS)
637228060Sbapt              break;
638228060Sbapt            if (key_pos != Positions::LASTCHAR)
639228060Sbapt              nindices++;
640228060Sbapt          }
641228060Sbapt      }
642228060Sbapt
643228060Sbapt      DYNAMIC_ARRAY (indices, unsigned int, nindices);
644228060Sbapt      {
645228060Sbapt        unsigned int j = 0;
646228060Sbapt        PositionIterator iter = _key_positions.iterator(_max_key_len);
647228060Sbapt        for (;;)
648228060Sbapt          {
649228060Sbapt            int key_pos = iter.next ();
650228060Sbapt            if (key_pos == PositionIterator::EOS)
651228060Sbapt              break;
652228060Sbapt            if (key_pos != Positions::LASTCHAR)
653228060Sbapt              indices[j++] = key_pos;
654228060Sbapt          }
655228060Sbapt        if (!(j == nindices))
656228060Sbapt          abort ();
657228060Sbapt      }
658228060Sbapt
659228060Sbapt      /* Perform several rounds of searching for a good alpha increment.
660228060Sbapt         Each round reduces the number of artificial collisions by adding
661228060Sbapt         an increment in a single key position.  */
662228060Sbapt      DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
663228060Sbapt      DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
664228060Sbapt      do
665228060Sbapt        {
666228060Sbapt          /* An increment of 1 is not always enough.  Try higher increments
667228060Sbapt             also.  */
668228060Sbapt          for (unsigned int inc = 1; ; inc++)
669228060Sbapt            {
670228060Sbapt              unsigned int best_duplicates_count = UINT_MAX;
671228060Sbapt
672228060Sbapt              for (unsigned int j = 0; j < nindices; j++)
673228060Sbapt                {
674228060Sbapt                  memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
675228060Sbapt                  tryal[indices[j]] += inc;
676228060Sbapt                  unsigned int try_duplicates_count =
677228060Sbapt                    count_duplicates_multiset (tryal);
678228060Sbapt
679228060Sbapt                  /* We prefer 'try' to 'best' if it produces less
680228060Sbapt                     duplicates.  */
681228060Sbapt                  if (try_duplicates_count < best_duplicates_count)
682228060Sbapt                    {
683228060Sbapt                      memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
684228060Sbapt                      best_duplicates_count = try_duplicates_count;
685228060Sbapt                    }
686228060Sbapt                }
687228060Sbapt
688228060Sbapt              /* Stop this round when we got an improvement.  */
689228060Sbapt              if (best_duplicates_count < current_duplicates_count)
690228060Sbapt                {
691228060Sbapt                  memcpy (current, best, _max_key_len * sizeof (unsigned int));
692228060Sbapt                  current_duplicates_count = best_duplicates_count;
693228060Sbapt                  break;
694228060Sbapt                }
695228060Sbapt            }
696228060Sbapt        }
697228060Sbapt      while (current_duplicates_count > duplicates_goal);
698228060Sbapt      FREE_DYNAMIC_ARRAY (tryal);
699228060Sbapt      FREE_DYNAMIC_ARRAY (best);
700228060Sbapt
701228060Sbapt      if (option[DEBUG])
702228060Sbapt        {
703228060Sbapt          /* Print the result.  */
704228060Sbapt          fprintf (stderr, "\nComputed alpha increments: ");
705228060Sbapt          bool first = true;
706228060Sbapt          for (unsigned int j = nindices; j-- > 0; )
707228060Sbapt            if (current[indices[j]] != 0)
708228060Sbapt              {
709228060Sbapt                if (!first)
710228060Sbapt                  fprintf (stderr, ", ");
711228060Sbapt                fprintf (stderr, "%u:+%u",
712228060Sbapt                         indices[j] + 1, current[indices[j]]);
713228060Sbapt                first = false;
714228060Sbapt              }
715228060Sbapt          fprintf (stderr, "\n");
716228060Sbapt        }
717228060Sbapt      FREE_DYNAMIC_ARRAY (indices);
718228060Sbapt    }
719228060Sbapt
720228060Sbapt  _alpha_inc = current;
721228060Sbapt  _alpha_size = compute_alpha_size (_alpha_inc);
722228060Sbapt  _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
723228060Sbapt}
724228060Sbapt
725228060Sbapt/* ======================= Finding good asso_values ======================== */
726228060Sbapt
727228060Sbapt/* Initializes the asso_values[] related parameters.  */
728228060Sbapt
729228060Sbaptvoid
730228060SbaptSearch::prepare_asso_values ()
731228060Sbapt{
732228060Sbapt  KeywordExt_List *temp;
733228060Sbapt
734228060Sbapt  /* Initialize each keyword's _selchars array.  */
735228060Sbapt  init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
736228060Sbapt
737228060Sbapt  /* Compute the maximum _selchars_length over all keywords.  */
738228060Sbapt  _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
739228060Sbapt
740228060Sbapt  /* Check for duplicates, i.e. keywords with the same _selchars array
741228060Sbapt     (and - if !option[NOLENGTH] - also the same length).
742228060Sbapt     We deal with these by building an equivalence class, so that only
743228060Sbapt     1 keyword is representative of the entire collection.  Only this
744228060Sbapt     representative remains in the keyword list; the others are accessible
745228060Sbapt     through the _duplicate_link chain, starting at the representative.
746228060Sbapt     This *greatly* simplifies processing during later stages of the program.
747228060Sbapt     Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
748228060Sbapt  {
749228060Sbapt    _list_len = _total_keys;
750228060Sbapt    _total_duplicates = 0;
751228060Sbapt    /* Make hash table for efficiency.  */
752228060Sbapt    Hash_Table representatives (_list_len, option[NOLENGTH]);
753228060Sbapt
754228060Sbapt    KeywordExt_List *prev = NULL; /* list node before temp */
755228060Sbapt    for (temp = _head; temp; )
756228060Sbapt      {
757228060Sbapt        KeywordExt *keyword = temp->first();
758228060Sbapt        KeywordExt *other_keyword = representatives.insert (keyword);
759228060Sbapt        KeywordExt_List *garbage = NULL;
760228060Sbapt
761228060Sbapt        if (other_keyword)
762228060Sbapt          {
763228060Sbapt            _total_duplicates++;
764228060Sbapt            _list_len--;
765228060Sbapt            /* Remove keyword from the main list.  */
766228060Sbapt            prev->rest() = temp->rest();
767228060Sbapt            garbage = temp;
768228060Sbapt            /* And insert it on other_keyword's duplicate list.  */
769228060Sbapt            keyword->_duplicate_link = other_keyword->_duplicate_link;
770228060Sbapt            other_keyword->_duplicate_link = keyword;
771228060Sbapt
772228060Sbapt            /* Complain if user hasn't enabled the duplicate option. */
773228060Sbapt            if (!option[DUP] || option[DEBUG])
774228060Sbapt              {
775228060Sbapt                fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
776228060Sbapt                         keyword->_allchars_length, keyword->_allchars,
777228060Sbapt                         other_keyword->_allchars_length, other_keyword->_allchars);
778228060Sbapt                for (int j = 0; j < keyword->_selchars_length; j++)
779228060Sbapt                  putc (keyword->_selchars[j], stderr);
780228060Sbapt                fprintf (stderr, "\".\n");
781228060Sbapt              }
782228060Sbapt          }
783228060Sbapt        else
784228060Sbapt          {
785228060Sbapt            keyword->_duplicate_link = NULL;
786228060Sbapt            prev = temp;
787228060Sbapt          }
788228060Sbapt        temp = temp->rest();
789228060Sbapt        if (garbage)
790228060Sbapt          delete garbage;
791228060Sbapt      }
792228060Sbapt    if (option[DEBUG])
793228060Sbapt      representatives.dump();
794228060Sbapt  }
795228060Sbapt
796228060Sbapt  /* Exit program if duplicates exists and option[DUP] not set, since we
797228060Sbapt     don't want to continue in this case.  (We don't want to turn on
798228060Sbapt     option[DUP] implicitly, because the generated code is usually much
799228060Sbapt     slower.  */
800228060Sbapt  if (_total_duplicates)
801228060Sbapt    {
802228060Sbapt      if (option[DUP])
803228060Sbapt        fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
804228060Sbapt                         _total_duplicates);
805228060Sbapt      else
806228060Sbapt        {
807228060Sbapt          fprintf (stderr, "%d input keys have identical hash values,\n",
808228060Sbapt                           _total_duplicates);
809228060Sbapt          if (option[POSITIONS])
810228060Sbapt            fprintf (stderr, "try different key positions or use option -D.\n");
811228060Sbapt          else
812228060Sbapt            fprintf (stderr, "use option -D.\n");
813228060Sbapt          exit (1);
814228060Sbapt        }
815228060Sbapt    }
816228060Sbapt
817228060Sbapt  /* Compute the occurrences of each character in the alphabet.  */
818228060Sbapt  _occurrences = new int[_alpha_size];
819228060Sbapt  memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
820228060Sbapt  for (temp = _head; temp; temp = temp->rest())
821228060Sbapt    {
822228060Sbapt      KeywordExt *keyword = temp->first();
823228060Sbapt      const unsigned int *ptr = keyword->_selchars;
824228060Sbapt      for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
825228060Sbapt        _occurrences[*ptr]++;
826228060Sbapt    }
827228060Sbapt
828228060Sbapt  /* Memory allocation.  */
829228060Sbapt  _asso_values = new int[_alpha_size];
830228060Sbapt
831228060Sbapt  int non_linked_length = _list_len;
832228060Sbapt  unsigned int asso_value_max;
833228060Sbapt
834228060Sbapt  asso_value_max =
835228060Sbapt    static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
836228060Sbapt  /* Round up to the next power of two.  This makes it easy to ensure
837228060Sbapt     an _asso_value[c] is >= 0 and < asso_value_max.  Also, the jump value
838228060Sbapt     being odd, it guarantees that Search::try_asso_value() will iterate
839228060Sbapt     through different values for _asso_value[c].  */
840228060Sbapt  if (asso_value_max == 0)
841228060Sbapt    asso_value_max = 1;
842228060Sbapt  asso_value_max |= asso_value_max >> 1;
843228060Sbapt  asso_value_max |= asso_value_max >> 2;
844228060Sbapt  asso_value_max |= asso_value_max >> 4;
845228060Sbapt  asso_value_max |= asso_value_max >> 8;
846228060Sbapt  asso_value_max |= asso_value_max >> 16;
847228060Sbapt  asso_value_max++;
848228060Sbapt  _asso_value_max = asso_value_max;
849228060Sbapt
850228060Sbapt  /* Given the bound for _asso_values[c], we have a bound for the possible
851228060Sbapt     hash values, as computed in compute_hash().  */
852228060Sbapt  _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
853228060Sbapt                    + (_asso_value_max - 1) * _max_selchars_length;
854228060Sbapt  /* Allocate a sparse bit vector for detection of collisions of hash
855228060Sbapt     values.  */
856228060Sbapt  _collision_detector = new Bool_Array (_max_hash_value + 1);
857228060Sbapt
858228060Sbapt  if (option[DEBUG])
859228060Sbapt    {
860228060Sbapt      fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
861228060Sbapt               "\nmaximum size of generated hash table is %d\n",
862228060Sbapt               non_linked_length, asso_value_max, _max_hash_value);
863228060Sbapt
864228060Sbapt      int field_width;
865228060Sbapt
866228060Sbapt      field_width = 0;
867228060Sbapt      {
868228060Sbapt        for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
869228060Sbapt          {
870228060Sbapt            KeywordExt *keyword = temp->first();
871228060Sbapt            if (field_width < keyword->_selchars_length)
872228060Sbapt              field_width = keyword->_selchars_length;
873228060Sbapt          }
874228060Sbapt      }
875228060Sbapt
876228060Sbapt      fprintf (stderr, "\ndumping the keyword list without duplicates\n");
877228060Sbapt      fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
878228060Sbapt      int i = 0;
879228060Sbapt      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
880228060Sbapt        {
881228060Sbapt          KeywordExt *keyword = temp->first();
882228060Sbapt          fprintf (stderr, "%9d, ", ++i);
883228060Sbapt          if (field_width > keyword->_selchars_length)
884228060Sbapt            fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
885228060Sbapt          for (int j = 0; j < keyword->_selchars_length; j++)
886228060Sbapt            putc (keyword->_selchars[j], stderr);
887228060Sbapt          fprintf (stderr, ", %.*s\n",
888228060Sbapt                   keyword->_allchars_length, keyword->_allchars);
889228060Sbapt        }
890228060Sbapt      fprintf (stderr, "\nend of keyword list\n\n");
891228060Sbapt    }
892228060Sbapt
893228060Sbapt  if (option[RANDOM] || option.get_jump () == 0)
894228060Sbapt    /* We will use rand(), so initialize the random number generator.  */
895228060Sbapt    srand (static_cast<long>(time (0)));
896228060Sbapt
897228060Sbapt  _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
898228060Sbapt  _jump = option.get_jump ();
899228060Sbapt}
900228060Sbapt
901228060Sbapt/* Finds some _asso_values[] that fit.  */
902228060Sbapt
903228060Sbapt/* The idea is to choose the _asso_values[] one by one, in a way that
904228060Sbapt   a choice that has been made never needs to be undone later.  This
905228060Sbapt   means that we split the work into several steps.  Each step chooses
906228060Sbapt   one or more _asso_values[c].  The result of choosing one or more
907228060Sbapt   _asso_values[c] is that the partitioning of the keyword set gets
908228060Sbapt   broader.
909228060Sbapt   Look at this partitioning:  After every step, the _asso_values[] of a
910228060Sbapt   certain set C of characters are undetermined.  (At the beginning, C
911228060Sbapt   is the set of characters c with _occurrences[c] > 0.  At the end, C
912228060Sbapt   is empty.)  To each keyword K, we associate the multiset of _selchars
913228060Sbapt   for which the _asso_values[] are undetermined:
914228060Sbapt                    K  -->  K->_selchars intersect C.
915228060Sbapt   Consider two keywords equivalent if their value under this mapping is
916228060Sbapt   the same.  This introduces an equivalence relation on the set of
917228060Sbapt   keywords.  The equivalence classes partition the keyword set.  (At the
918228060Sbapt   beginning, the partition is the finest possible: each K is an equivalence
919228060Sbapt   class by itself, because all K have a different _selchars.  At the end,
920228060Sbapt   all K have been merged into a single equivalence class.)
921228060Sbapt   The partition before a step is always a refinement of the partition
922228060Sbapt   after the step.
923228060Sbapt   We choose the steps in such a way that the partition really becomes
924228060Sbapt   broader at each step.  (A step that only chooses an _asso_values[c]
925228060Sbapt   without changing the partition is better merged with the previous step,
926228060Sbapt   to avoid useless backtracking.)  */
927228060Sbapt
928228060Sbaptstruct EquivalenceClass
929228060Sbapt{
930228060Sbapt  /* The keywords in this equivalence class.  */
931228060Sbapt  KeywordExt_List *     _keywords;
932228060Sbapt  KeywordExt_List *     _keywords_last;
933228060Sbapt  /* The number of keywords in this equivalence class.  */
934228060Sbapt  unsigned int          _cardinality;
935228060Sbapt  /* The undetermined selected characters for the keywords in this
936228060Sbapt     equivalence class, as a canonically reordered multiset.  */
937228060Sbapt  unsigned int *        _undetermined_chars;
938228060Sbapt  unsigned int          _undetermined_chars_length;
939228060Sbapt
940228060Sbapt  EquivalenceClass *    _next;
941228060Sbapt};
942228060Sbapt
943228060Sbaptstruct Step
944228060Sbapt{
945228060Sbapt  /* The characters whose values are being determined in this step.  */
946228060Sbapt  unsigned int          _changing_count;
947228060Sbapt  unsigned int *        _changing;
948228060Sbapt  /* Exclusive upper bound for the _asso_values[c] of this step.
949228060Sbapt     A power of 2.  */
950228060Sbapt  unsigned int          _asso_value_max;
951228060Sbapt  /* The characters whose values will be determined after this step.  */
952228060Sbapt  bool *                _undetermined;
953228060Sbapt  /* The keyword set partition after this step.  */
954228060Sbapt  EquivalenceClass *    _partition;
955228060Sbapt  /* The expected number of iterations in this step.  */
956228060Sbapt  double                _expected_lower;
957228060Sbapt  double                _expected_upper;
958228060Sbapt
959228060Sbapt  Step *                _next;
960228060Sbapt};
961228060Sbapt
962228060Sbaptstatic inline bool
963228060Sbaptequals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
964228060Sbapt{
965228060Sbapt  while (len > 0)
966228060Sbapt    {
967228060Sbapt      if (*ptr1 != *ptr2)
968228060Sbapt        return false;
969228060Sbapt      ptr1++;
970228060Sbapt      ptr2++;
971228060Sbapt      len--;
972228060Sbapt    }
973228060Sbapt  return true;
974228060Sbapt}
975228060Sbapt
976228060SbaptEquivalenceClass *
977228060SbaptSearch::compute_partition (bool *undetermined) const
978228060Sbapt{
979228060Sbapt  EquivalenceClass *partition = NULL;
980228060Sbapt  EquivalenceClass *partition_last = NULL;
981228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
982228060Sbapt    {
983228060Sbapt      KeywordExt *keyword = temp->first();
984228060Sbapt
985228060Sbapt      /* Compute the undetermined characters for this keyword.  */
986228060Sbapt      unsigned int *undetermined_chars =
987228060Sbapt        new unsigned int[keyword->_selchars_length];
988228060Sbapt      unsigned int undetermined_chars_length = 0;
989228060Sbapt
990228060Sbapt      for (int i = 0; i < keyword->_selchars_length; i++)
991228060Sbapt        if (undetermined[keyword->_selchars[i]])
992228060Sbapt          undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
993228060Sbapt
994228060Sbapt      /* Look up the equivalence class to which this keyword belongs.  */
995228060Sbapt      EquivalenceClass *equclass;
996228060Sbapt      for (equclass = partition; equclass; equclass = equclass->_next)
997228060Sbapt        if (equclass->_undetermined_chars_length == undetermined_chars_length
998228060Sbapt            && equals (equclass->_undetermined_chars, undetermined_chars,
999228060Sbapt                       undetermined_chars_length))
1000228060Sbapt          break;
1001228060Sbapt      if (equclass == NULL)
1002228060Sbapt        {
1003228060Sbapt          equclass = new EquivalenceClass();
1004228060Sbapt          equclass->_keywords = NULL;
1005228060Sbapt          equclass->_keywords_last = NULL;
1006228060Sbapt          equclass->_cardinality = 0;
1007228060Sbapt          equclass->_undetermined_chars = undetermined_chars;
1008228060Sbapt          equclass->_undetermined_chars_length = undetermined_chars_length;
1009228060Sbapt          equclass->_next = NULL;
1010228060Sbapt          if (partition)
1011228060Sbapt            partition_last->_next = equclass;
1012228060Sbapt          else
1013228060Sbapt            partition = equclass;
1014228060Sbapt          partition_last = equclass;
1015228060Sbapt        }
1016228060Sbapt      else
1017228060Sbapt        delete[] undetermined_chars;
1018228060Sbapt
1019228060Sbapt      /* Add the keyword to the equivalence class.  */
1020228060Sbapt      KeywordExt_List *cons = new KeywordExt_List(keyword);
1021228060Sbapt      if (equclass->_keywords)
1022228060Sbapt        equclass->_keywords_last->rest() = cons;
1023228060Sbapt      else
1024228060Sbapt        equclass->_keywords = cons;
1025228060Sbapt      equclass->_keywords_last = cons;
1026228060Sbapt      equclass->_cardinality++;
1027228060Sbapt    }
1028228060Sbapt
1029228060Sbapt  /* Free some of the allocated memory.  The caller doesn't need it.  */
1030228060Sbapt  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1031228060Sbapt    delete[] cls->_undetermined_chars;
1032228060Sbapt
1033228060Sbapt  return partition;
1034228060Sbapt}
1035228060Sbapt
1036228060Sbaptstatic void
1037228060Sbaptdelete_partition (EquivalenceClass *partition)
1038228060Sbapt{
1039228060Sbapt  while (partition != NULL)
1040228060Sbapt    {
1041228060Sbapt      EquivalenceClass *equclass = partition;
1042228060Sbapt      partition = equclass->_next;
1043228060Sbapt      delete_list (equclass->_keywords);
1044228060Sbapt      //delete[] equclass->_undetermined_chars; // already freed above
1045228060Sbapt      delete equclass;
1046228060Sbapt    }
1047228060Sbapt}
1048228060Sbapt
1049228060Sbapt/* Compute the possible number of collisions when _asso_values[c] is
1050228060Sbapt   chosen, leading to the given partition.  */
1051228060Sbaptunsigned int
1052228060SbaptSearch::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
1053228060Sbapt{
1054228060Sbapt  /* Every equivalence class p is split according to the frequency of
1055228060Sbapt     occurrence of c, leading to equivalence classes p1, p2, ...
1056228060Sbapt     This leads to   (|p|^2 - |p1|^2 - |p2|^2 - ...)/2  possible collisions.
1057228060Sbapt     Return the sum of this expression over all equivalence classes.  */
1058228060Sbapt  unsigned int sum = 0;
1059228060Sbapt  unsigned int m = _max_selchars_length;
1060228060Sbapt  DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1);
1061228060Sbapt  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1062228060Sbapt    {
1063228060Sbapt      for (unsigned int i = 0; i <= m; i++)
1064228060Sbapt        split_cardinalities[i] = 0;
1065228060Sbapt
1066228060Sbapt      for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1067228060Sbapt        {
1068228060Sbapt          KeywordExt *keyword = temp->first();
1069228060Sbapt
1070228060Sbapt          unsigned int count = 0;
1071228060Sbapt          for (int i = 0; i < keyword->_selchars_length; i++)
1072228060Sbapt            if (keyword->_selchars[i] == c)
1073228060Sbapt              count++;
1074228060Sbapt
1075228060Sbapt          split_cardinalities[count]++;
1076228060Sbapt        }
1077228060Sbapt
1078228060Sbapt      sum += cls->_cardinality * cls->_cardinality;
1079228060Sbapt      for (unsigned int i = 0; i <= m; i++)
1080228060Sbapt        sum -= split_cardinalities[i] * split_cardinalities[i];
1081228060Sbapt    }
1082228060Sbapt  FREE_DYNAMIC_ARRAY (split_cardinalities);
1083228060Sbapt  return sum;
1084228060Sbapt}
1085228060Sbapt
1086228060Sbapt/* Test whether adding c to the undetermined characters changes the given
1087228060Sbapt   partition.  */
1088228060Sbaptbool
1089228060SbaptSearch::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
1090228060Sbapt{
1091228060Sbapt  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1092228060Sbapt    {
1093228060Sbapt      unsigned int first_count = UINT_MAX;
1094228060Sbapt
1095228060Sbapt      for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1096228060Sbapt        {
1097228060Sbapt          KeywordExt *keyword = temp->first();
1098228060Sbapt
1099228060Sbapt          unsigned int count = 0;
1100228060Sbapt          for (int i = 0; i < keyword->_selchars_length; i++)
1101228060Sbapt            if (keyword->_selchars[i] == c)
1102228060Sbapt              count++;
1103228060Sbapt
1104228060Sbapt          if (temp == cls->_keywords)
1105228060Sbapt            first_count = count;
1106228060Sbapt          else if (count != first_count)
1107228060Sbapt            /* c would split this equivalence class.  */
1108228060Sbapt            return false;
1109228060Sbapt        }
1110228060Sbapt    }
1111228060Sbapt  return true;
1112228060Sbapt}
1113228060Sbapt
1114228060Sbaptvoid
1115228060SbaptSearch::find_asso_values ()
1116228060Sbapt{
1117228060Sbapt  Step *steps;
1118228060Sbapt
1119228060Sbapt  /* Determine the steps, starting with the last one.  */
1120228060Sbapt  {
1121228060Sbapt    bool *undetermined;
1122228060Sbapt    bool *determined;
1123228060Sbapt
1124228060Sbapt    steps = NULL;
1125228060Sbapt
1126228060Sbapt    undetermined = new bool[_alpha_size];
1127228060Sbapt    for (unsigned int c = 0; c < _alpha_size; c++)
1128228060Sbapt      undetermined[c] = false;
1129228060Sbapt
1130228060Sbapt    determined = new bool[_alpha_size];
1131228060Sbapt    for (unsigned int c = 0; c < _alpha_size; c++)
1132228060Sbapt      determined[c] = true;
1133228060Sbapt
1134228060Sbapt    for (;;)
1135228060Sbapt      {
1136228060Sbapt        /* Compute the partition that needs to be refined.  */
1137228060Sbapt        EquivalenceClass *partition = compute_partition (undetermined);
1138228060Sbapt
1139228060Sbapt        /* Determine the main character to be chosen in this step.
1140228060Sbapt           Choosing such a character c has the effect of splitting every
1141228060Sbapt           equivalence class (according the the frequency of occurrence of c).
1142228060Sbapt           We choose the c with the minimum number of possible collisions,
1143228060Sbapt           so that characters which lead to a large number of collisions get
1144228060Sbapt           handled early during the search.  */
1145228060Sbapt        unsigned int chosen_c;
1146228060Sbapt        unsigned int chosen_possible_collisions;
1147228060Sbapt        {
1148228060Sbapt          unsigned int best_c = 0;
1149228060Sbapt          unsigned int best_possible_collisions = UINT_MAX;
1150228060Sbapt          for (unsigned int c = 0; c < _alpha_size; c++)
1151228060Sbapt            if (_occurrences[c] > 0 && determined[c])
1152228060Sbapt              {
1153228060Sbapt                unsigned int possible_collisions =
1154228060Sbapt                  count_possible_collisions (partition, c);
1155228060Sbapt                if (possible_collisions < best_possible_collisions)
1156228060Sbapt                  {
1157228060Sbapt                    best_c = c;
1158228060Sbapt                    best_possible_collisions = possible_collisions;
1159228060Sbapt                  }
1160228060Sbapt              }
1161228060Sbapt          if (best_possible_collisions == UINT_MAX)
1162228060Sbapt            {
1163228060Sbapt              /* All c with _occurrences[c] > 0 are undetermined.  We are
1164228060Sbapt                 are the starting situation and don't need any more step.  */
1165228060Sbapt              delete_partition (partition);
1166228060Sbapt              break;
1167228060Sbapt            }
1168228060Sbapt          chosen_c = best_c;
1169228060Sbapt          chosen_possible_collisions = best_possible_collisions;
1170228060Sbapt        }
1171228060Sbapt
1172228060Sbapt        /* We need one more step.  */
1173228060Sbapt        Step *step = new Step();
1174228060Sbapt
1175228060Sbapt        step->_undetermined = new bool[_alpha_size];
1176228060Sbapt        memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
1177228060Sbapt
1178228060Sbapt        step->_partition = partition;
1179228060Sbapt
1180228060Sbapt        /* Now determine how the equivalence classes will be before this
1181228060Sbapt           step.  */
1182228060Sbapt        undetermined[chosen_c] = true;
1183228060Sbapt        partition = compute_partition (undetermined);
1184228060Sbapt
1185228060Sbapt        /* Now determine which other characters should be determined in this
1186228060Sbapt           step, because they will not change the equivalence classes at
1187228060Sbapt           this point.  It is the set of all c which, for all equivalence
1188228060Sbapt           classes, have the same frequency of occurrence in every keyword
1189228060Sbapt           of the equivalence class.  */
1190228060Sbapt        for (unsigned int c = 0; c < _alpha_size; c++)
1191228060Sbapt          if (_occurrences[c] > 0 && determined[c]
1192228060Sbapt              && unchanged_partition (partition, c))
1193228060Sbapt            {
1194228060Sbapt              undetermined[c] = true;
1195228060Sbapt              determined[c] = false;
1196228060Sbapt            }
1197228060Sbapt
1198228060Sbapt        /* main_c must be one of these.  */
1199228060Sbapt        if (determined[chosen_c])
1200228060Sbapt          abort ();
1201228060Sbapt
1202228060Sbapt        /* Now the set of changing characters of this step.  */
1203228060Sbapt        unsigned int changing_count;
1204228060Sbapt
1205228060Sbapt        changing_count = 0;
1206228060Sbapt        for (unsigned int c = 0; c < _alpha_size; c++)
1207228060Sbapt          if (undetermined[c] && !step->_undetermined[c])
1208228060Sbapt            changing_count++;
1209228060Sbapt
1210228060Sbapt        unsigned int *changing = new unsigned int[changing_count];
1211228060Sbapt        changing_count = 0;
1212228060Sbapt        for (unsigned int c = 0; c < _alpha_size; c++)
1213228060Sbapt          if (undetermined[c] && !step->_undetermined[c])
1214228060Sbapt            changing[changing_count++] = c;
1215228060Sbapt
1216228060Sbapt        step->_changing = changing;
1217228060Sbapt        step->_changing_count = changing_count;
1218228060Sbapt
1219228060Sbapt        step->_asso_value_max = _asso_value_max;
1220228060Sbapt
1221228060Sbapt        step->_expected_lower =
1222228060Sbapt          exp (static_cast<double>(chosen_possible_collisions)
1223228060Sbapt               / static_cast<double>(_max_hash_value));
1224228060Sbapt        step->_expected_upper =
1225228060Sbapt          exp (static_cast<double>(chosen_possible_collisions)
1226228060Sbapt               / static_cast<double>(_asso_value_max));
1227228060Sbapt
1228228060Sbapt        delete_partition (partition);
1229228060Sbapt
1230228060Sbapt        step->_next = steps;
1231228060Sbapt        steps = step;
1232228060Sbapt      }
1233228060Sbapt
1234228060Sbapt    delete[] determined;
1235228060Sbapt    delete[] undetermined;
1236228060Sbapt  }
1237228060Sbapt
1238228060Sbapt  if (option[DEBUG])
1239228060Sbapt    {
1240228060Sbapt      unsigned int stepno = 0;
1241228060Sbapt      for (Step *step = steps; step; step = step->_next)
1242228060Sbapt        {
1243228060Sbapt          stepno++;
1244228060Sbapt          fprintf (stderr, "Step %u chooses _asso_values[", stepno);
1245228060Sbapt          for (unsigned int i = 0; i < step->_changing_count; i++)
1246228060Sbapt            {
1247228060Sbapt              if (i > 0)
1248228060Sbapt                fprintf (stderr, ",");
1249228060Sbapt              fprintf (stderr, "'%c'", step->_changing[i]);
1250228060Sbapt            }
1251228060Sbapt          fprintf (stderr, "], expected number of iterations between %g and %g.\n",
1252228060Sbapt                   step->_expected_lower, step->_expected_upper);
1253228060Sbapt          fprintf (stderr, "Keyword equivalence classes:\n");
1254228060Sbapt          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1255228060Sbapt            {
1256228060Sbapt              fprintf (stderr, "\n");
1257228060Sbapt              for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1258228060Sbapt                {
1259228060Sbapt                  KeywordExt *keyword = temp->first();
1260228060Sbapt                  fprintf (stderr, "  %.*s\n",
1261228060Sbapt                           keyword->_allchars_length, keyword->_allchars);
1262228060Sbapt                }
1263228060Sbapt            }
1264228060Sbapt          fprintf (stderr, "\n");
1265228060Sbapt        }
1266228060Sbapt    }
1267228060Sbapt
1268228060Sbapt  /* Initialize _asso_values[].  (The value given here matters only
1269228060Sbapt     for those c which occur in all keywords with equal multiplicity.)  */
1270228060Sbapt  for (unsigned int c = 0; c < _alpha_size; c++)
1271228060Sbapt    _asso_values[c] = 0;
1272228060Sbapt
1273228060Sbapt  unsigned int stepno = 0;
1274228060Sbapt  for (Step *step = steps; step; step = step->_next)
1275228060Sbapt    {
1276228060Sbapt      stepno++;
1277228060Sbapt
1278228060Sbapt      /* Initialize the asso_values[].  */
1279228060Sbapt      unsigned int k = step->_changing_count;
1280228060Sbapt      for (unsigned int i = 0; i < k; i++)
1281228060Sbapt        {
1282228060Sbapt          unsigned int c = step->_changing[i];
1283228060Sbapt          _asso_values[c] =
1284228060Sbapt            (_initial_asso_value < 0 ? rand () : _initial_asso_value)
1285228060Sbapt            & (step->_asso_value_max - 1);
1286228060Sbapt        }
1287228060Sbapt
1288228060Sbapt      unsigned int iterations = 0;
1289228060Sbapt      DYNAMIC_ARRAY (iter, unsigned int, k);
1290228060Sbapt      for (unsigned int i = 0; i < k; i++)
1291228060Sbapt        iter[i] = 0;
1292228060Sbapt      unsigned int ii = (_jump != 0 ? k - 1 : 0);
1293228060Sbapt
1294228060Sbapt      for (;;)
1295228060Sbapt        {
1296228060Sbapt          /* Test whether these asso_values[] lead to collisions among
1297228060Sbapt             the equivalence classes that should be collision-free.  */
1298228060Sbapt          bool has_collision = false;
1299228060Sbapt          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1300228060Sbapt            {
1301228060Sbapt              /* Iteration Number array is a win, O(1) initialization time!  */
1302228060Sbapt              _collision_detector->clear ();
1303228060Sbapt
1304228060Sbapt              for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
1305228060Sbapt                {
1306228060Sbapt                  KeywordExt *keyword = ptr->first();
1307228060Sbapt
1308228060Sbapt                  /* Compute the new hash code for the keyword, leaving apart
1309228060Sbapt                     the yet undetermined asso_values[].  */
1310228060Sbapt                  int hashcode;
1311228060Sbapt                  {
1312228060Sbapt                    int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1313228060Sbapt                    const unsigned int *p = keyword->_selchars;
1314228060Sbapt                    int i = keyword->_selchars_length;
1315228060Sbapt                    for (; i > 0; p++, i--)
1316228060Sbapt                      if (!step->_undetermined[*p])
1317228060Sbapt                        sum += _asso_values[*p];
1318228060Sbapt                    hashcode = sum;
1319228060Sbapt                  }
1320228060Sbapt
1321228060Sbapt                  /* See whether it collides with another keyword's hash code,
1322228060Sbapt                     from the same equivalence class.  */
1323228060Sbapt                  if (_collision_detector->set_bit (hashcode))
1324228060Sbapt                    {
1325228060Sbapt                      has_collision = true;
1326228060Sbapt                      break;
1327228060Sbapt                    }
1328228060Sbapt                }
1329228060Sbapt
1330228060Sbapt              /* Don't need to continue looking at the other equivalence
1331228060Sbapt                 classes if we already have found a collision.  */
1332228060Sbapt              if (has_collision)
1333228060Sbapt                break;
1334228060Sbapt            }
1335228060Sbapt
1336228060Sbapt          iterations++;
1337228060Sbapt          if (!has_collision)
1338228060Sbapt            break;
1339228060Sbapt
1340228060Sbapt          /* Try other asso_values[].  */
1341228060Sbapt          if (_jump != 0)
1342228060Sbapt            {
1343228060Sbapt              /* The way we try various values for
1344228060Sbapt                   asso_values[step->_changing[0],...step->_changing[k-1]]
1345228060Sbapt                 is like this:
1346228060Sbapt                 for (bound = 0,1,...)
1347228060Sbapt                   for (ii = 0,...,k-1)
1348228060Sbapt                     iter[ii] := bound
1349228060Sbapt                     iter[0..ii-1] := values <= bound
1350228060Sbapt                     iter[ii+1..k-1] := values < bound
1351228060Sbapt                 and
1352228060Sbapt                   asso_values[step->_changing[i]] =
1353228060Sbapt                     _initial_asso_value + iter[i] * _jump.
1354228060Sbapt                 This makes it more likely to find small asso_values[].
1355228060Sbapt               */
1356228060Sbapt              unsigned int bound = iter[ii];
1357228060Sbapt              unsigned int i = 0;
1358228060Sbapt              while (i < ii)
1359228060Sbapt                {
1360228060Sbapt                  unsigned int c = step->_changing[i];
1361228060Sbapt                  iter[i]++;
1362228060Sbapt                  _asso_values[c] =
1363228060Sbapt                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1364228060Sbapt                  if (iter[i] <= bound)
1365228060Sbapt                    goto found_next;
1366228060Sbapt                  _asso_values[c] =
1367228060Sbapt                    (_asso_values[c] - iter[i] * _jump)
1368228060Sbapt                    & (step->_asso_value_max - 1);
1369228060Sbapt                  iter[i] = 0;
1370228060Sbapt                  i++;
1371228060Sbapt                }
1372228060Sbapt              i = ii + 1;
1373228060Sbapt              while (i < k)
1374228060Sbapt                {
1375228060Sbapt                  unsigned int c = step->_changing[i];
1376228060Sbapt                  iter[i]++;
1377228060Sbapt                  _asso_values[c] =
1378228060Sbapt                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1379228060Sbapt                  if (iter[i] < bound)
1380228060Sbapt                    goto found_next;
1381228060Sbapt                  _asso_values[c] =
1382228060Sbapt                    (_asso_values[c] - iter[i] * _jump)
1383228060Sbapt                    & (step->_asso_value_max - 1);
1384228060Sbapt                  iter[i] = 0;
1385228060Sbapt                  i++;
1386228060Sbapt                }
1387228060Sbapt              /* Switch from one ii to the next.  */
1388228060Sbapt              {
1389228060Sbapt                unsigned int c = step->_changing[ii];
1390228060Sbapt                _asso_values[c] =
1391228060Sbapt                  (_asso_values[c] - bound * _jump)
1392228060Sbapt                  & (step->_asso_value_max - 1);
1393228060Sbapt                iter[ii] = 0;
1394228060Sbapt              }
1395228060Sbapt              /* Here all iter[i] == 0.  */
1396228060Sbapt              ii++;
1397228060Sbapt              if (ii == k)
1398228060Sbapt                {
1399228060Sbapt                  ii = 0;
1400228060Sbapt                  bound++;
1401228060Sbapt                  if (bound == step->_asso_value_max)
1402228060Sbapt                    {
1403228060Sbapt                      /* Out of search space!  We can either backtrack, or
1404228060Sbapt                         increase the available search space of this step.
1405228060Sbapt                         It seems simpler to choose the latter solution.  */
1406228060Sbapt                      step->_asso_value_max = 2 * step->_asso_value_max;
1407228060Sbapt                      if (step->_asso_value_max > _asso_value_max)
1408228060Sbapt                        {
1409228060Sbapt                          _asso_value_max = step->_asso_value_max;
1410228060Sbapt                          /* Reinitialize _max_hash_value.  */
1411228060Sbapt                          _max_hash_value =
1412228060Sbapt                            (option[NOLENGTH] ? 0 : _max_key_len)
1413228060Sbapt                            + (_asso_value_max - 1) * _max_selchars_length;
1414228060Sbapt                          /* Reinitialize _collision_detector.  */
1415228060Sbapt                          delete _collision_detector;
1416228060Sbapt                          _collision_detector =
1417228060Sbapt                            new Bool_Array (_max_hash_value + 1);
1418228060Sbapt                        }
1419228060Sbapt                    }
1420228060Sbapt                }
1421228060Sbapt              {
1422228060Sbapt                unsigned int c = step->_changing[ii];
1423228060Sbapt                iter[ii] = bound;
1424228060Sbapt                _asso_values[c] =
1425228060Sbapt                  (_asso_values[c] + bound * _jump)
1426228060Sbapt                  & (step->_asso_value_max - 1);
1427228060Sbapt              }
1428228060Sbapt             found_next: ;
1429228060Sbapt            }
1430228060Sbapt          else
1431228060Sbapt            {
1432228060Sbapt              /* Random.  */
1433228060Sbapt              unsigned int c = step->_changing[ii];
1434228060Sbapt              _asso_values[c] =
1435228060Sbapt                (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
1436228060Sbapt              /* Next time, change the next c.  */
1437228060Sbapt              ii++;
1438228060Sbapt              if (ii == k)
1439228060Sbapt                ii = 0;
1440228060Sbapt            }
1441228060Sbapt        }
1442228060Sbapt      FREE_DYNAMIC_ARRAY (iter);
1443228060Sbapt
1444228060Sbapt      if (option[DEBUG])
1445228060Sbapt        {
1446228060Sbapt          fprintf (stderr, "Step %u chose _asso_values[", stepno);
1447228060Sbapt          for (unsigned int i = 0; i < step->_changing_count; i++)
1448228060Sbapt            {
1449228060Sbapt              if (i > 0)
1450228060Sbapt                fprintf (stderr, ",");
1451228060Sbapt              fprintf (stderr, "'%c'", step->_changing[i]);
1452228060Sbapt            }
1453228060Sbapt          fprintf (stderr, "] in %u iterations.\n", iterations);
1454228060Sbapt        }
1455228060Sbapt    }
1456228060Sbapt
1457228060Sbapt  /* Free allocated memory.  */
1458228060Sbapt  while (steps != NULL)
1459228060Sbapt    {
1460228060Sbapt      Step *step = steps;
1461228060Sbapt      steps = step->_next;
1462228060Sbapt      delete[] step->_changing;
1463228060Sbapt      delete[] step->_undetermined;
1464228060Sbapt      delete_partition (step->_partition);
1465228060Sbapt      delete step;
1466228060Sbapt    }
1467228060Sbapt}
1468228060Sbapt
1469228060Sbapt/* Computes a keyword's hash value, relative to the current _asso_values[],
1470228060Sbapt   and stores it in keyword->_hash_value.  */
1471228060Sbapt
1472228060Sbaptinline int
1473228060SbaptSearch::compute_hash (KeywordExt *keyword) const
1474228060Sbapt{
1475228060Sbapt  int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1476228060Sbapt
1477228060Sbapt  const unsigned int *p = keyword->_selchars;
1478228060Sbapt  int i = keyword->_selchars_length;
1479228060Sbapt  for (; i > 0; p++, i--)
1480228060Sbapt    sum += _asso_values[*p];
1481228060Sbapt
1482228060Sbapt  return keyword->_hash_value = sum;
1483228060Sbapt}
1484228060Sbapt
1485228060Sbapt/* Finds good _asso_values[].  */
1486228060Sbapt
1487228060Sbaptvoid
1488228060SbaptSearch::find_good_asso_values ()
1489228060Sbapt{
1490228060Sbapt  prepare_asso_values ();
1491228060Sbapt
1492228060Sbapt  /* Search for good _asso_values[].  */
1493228060Sbapt  int asso_iteration;
1494228060Sbapt  if ((asso_iteration = option.get_asso_iterations ()) == 0)
1495228060Sbapt    /* Try only the given _initial_asso_value and _jump.  */
1496228060Sbapt    find_asso_values ();
1497228060Sbapt  else
1498228060Sbapt    {
1499228060Sbapt      /* Try different pairs of _initial_asso_value and _jump, in the
1500228060Sbapt         following order:
1501228060Sbapt           (0, 1)
1502228060Sbapt           (1, 1)
1503228060Sbapt           (2, 1) (0, 3)
1504228060Sbapt           (3, 1) (1, 3)
1505228060Sbapt           (4, 1) (2, 3) (0, 5)
1506228060Sbapt           (5, 1) (3, 3) (1, 5)
1507228060Sbapt           ..... */
1508228060Sbapt      KeywordExt_List *saved_head = _head;
1509228060Sbapt      int best_initial_asso_value = 0;
1510228060Sbapt      int best_jump = 1;
1511228060Sbapt      int *best_asso_values = new int[_alpha_size];
1512228060Sbapt      int best_collisions = INT_MAX;
1513228060Sbapt      int best_max_hash_value = INT_MAX;
1514228060Sbapt
1515228060Sbapt      _initial_asso_value = 0; _jump = 1;
1516228060Sbapt      for (;;)
1517228060Sbapt        {
1518228060Sbapt          /* Restore the keyword list in its original order.  */
1519228060Sbapt          _head = copy_list (saved_head);
1520228060Sbapt          /* Find good _asso_values[].  */
1521228060Sbapt          find_asso_values ();
1522228060Sbapt          /* Test whether it is the best solution so far.  */
1523228060Sbapt          int collisions = 0;
1524228060Sbapt          int max_hash_value = INT_MIN;
1525228060Sbapt          _collision_detector->clear ();
1526228060Sbapt          for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1527228060Sbapt            {
1528228060Sbapt              KeywordExt *keyword = ptr->first();
1529228060Sbapt              int hashcode = compute_hash (keyword);
1530228060Sbapt              if (max_hash_value < hashcode)
1531228060Sbapt                max_hash_value = hashcode;
1532228060Sbapt              if (_collision_detector->set_bit (hashcode))
1533228060Sbapt                collisions++;
1534228060Sbapt            }
1535228060Sbapt          if (collisions < best_collisions
1536228060Sbapt              || (collisions == best_collisions
1537228060Sbapt                  && max_hash_value < best_max_hash_value))
1538228060Sbapt            {
1539228060Sbapt              memcpy (best_asso_values, _asso_values,
1540228060Sbapt                      _alpha_size * sizeof (_asso_values[0]));
1541228060Sbapt              best_collisions = collisions;
1542228060Sbapt              best_max_hash_value = max_hash_value;
1543228060Sbapt            }
1544228060Sbapt          /* Delete the copied keyword list.  */
1545228060Sbapt          delete_list (_head);
1546228060Sbapt
1547228060Sbapt          if (--asso_iteration == 0)
1548228060Sbapt            break;
1549228060Sbapt          /* Prepare for next iteration.  */
1550228060Sbapt          if (_initial_asso_value >= 2)
1551228060Sbapt            _initial_asso_value -= 2, _jump += 2;
1552228060Sbapt          else
1553228060Sbapt            _initial_asso_value += _jump, _jump = 1;
1554228060Sbapt        }
1555228060Sbapt      _head = saved_head;
1556228060Sbapt      /* Install the best found asso_values.  */
1557228060Sbapt      _initial_asso_value = best_initial_asso_value;
1558228060Sbapt      _jump = best_jump;
1559228060Sbapt      memcpy (_asso_values, best_asso_values,
1560228060Sbapt              _alpha_size * sizeof (_asso_values[0]));
1561228060Sbapt      delete[] best_asso_values;
1562228060Sbapt      /* The keywords' _hash_value fields are recomputed below.  */
1563228060Sbapt    }
1564228060Sbapt}
1565228060Sbapt
1566228060Sbapt/* ========================================================================= */
1567228060Sbapt
1568228060Sbapt/* Comparison function for sorting by increasing _hash_value.  */
1569228060Sbaptstatic bool
1570228060Sbaptless_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
1571228060Sbapt{
1572228060Sbapt  return keyword1->_hash_value < keyword2->_hash_value;
1573228060Sbapt}
1574228060Sbapt
1575228060Sbapt/* Sorts the keyword list by hash value.  */
1576228060Sbapt
1577228060Sbaptvoid
1578228060SbaptSearch::sort ()
1579228060Sbapt{
1580228060Sbapt  _head = mergesort_list (_head, less_by_hash_value);
1581228060Sbapt}
1582228060Sbapt
1583228060Sbaptvoid
1584228060SbaptSearch::optimize ()
1585228060Sbapt{
1586228060Sbapt  /* Preparations.  */
1587228060Sbapt  prepare ();
1588228060Sbapt
1589228060Sbapt  /* Step 1: Finding good byte positions.  */
1590228060Sbapt  find_positions ();
1591228060Sbapt
1592228060Sbapt  /* Step 2: Finding good alpha increments.  */
1593228060Sbapt  find_alpha_inc ();
1594228060Sbapt
1595228060Sbapt  /* Step 3: Finding good asso_values.  */
1596228060Sbapt  find_good_asso_values ();
1597228060Sbapt
1598228060Sbapt  /* Make one final check, just to make sure nothing weird happened.... */
1599228060Sbapt  _collision_detector->clear ();
1600228060Sbapt  for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
1601228060Sbapt    {
1602228060Sbapt      KeywordExt *curr = curr_ptr->first();
1603228060Sbapt      unsigned int hashcode = compute_hash (curr);
1604228060Sbapt      if (_collision_detector->set_bit (hashcode))
1605228060Sbapt        {
1606228060Sbapt          /* This shouldn't happen.  proj1, proj2, proj3 must have been
1607228060Sbapt             computed to be injective on the given keyword set.  */
1608228060Sbapt          fprintf (stderr,
1609228060Sbapt                   "\nInternal error, unexpected duplicate hash code\n");
1610228060Sbapt          if (option[POSITIONS])
1611228060Sbapt            fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
1612228060Sbapt          else
1613228060Sbapt            fprintf (stderr, "try options -m or -r.\n\n");
1614228060Sbapt          exit (1);
1615228060Sbapt        }
1616228060Sbapt    }
1617228060Sbapt
1618228060Sbapt  /* Sorts the keyword list by hash value.  */
1619228060Sbapt  sort ();
1620228060Sbapt
1621228060Sbapt  /* Set unused asso_values[c] to max_hash_value + 1.  This is not absolutely
1622228060Sbapt     necessary, but speeds up the lookup function in many cases of lookup
1623228060Sbapt     failure: no string comparison is needed once the hash value of a string
1624228060Sbapt     is larger than the hash value of any keyword.  */
1625228060Sbapt  int max_hash_value;
1626228060Sbapt  {
1627228060Sbapt    KeywordExt_List *temp;
1628228060Sbapt    for (temp = _head; temp->rest(); temp = temp->rest())
1629228060Sbapt      ;
1630228060Sbapt    max_hash_value = temp->first()->_hash_value;
1631228060Sbapt  }
1632228060Sbapt  for (unsigned int c = 0; c < _alpha_size; c++)
1633228060Sbapt    if (_occurrences[c] == 0)
1634228060Sbapt      _asso_values[c] = max_hash_value + 1;
1635228060Sbapt
1636228060Sbapt  /* Propagate unified asso_values.  */
1637228060Sbapt  if (_alpha_unify)
1638228060Sbapt    for (unsigned int c = 0; c < _alpha_size; c++)
1639228060Sbapt      if (_alpha_unify[c] != c)
1640228060Sbapt        _asso_values[c] = _asso_values[_alpha_unify[c]];
1641228060Sbapt}
1642228060Sbapt
1643228060Sbapt/* Prints out some diagnostics upon completion.  */
1644228060Sbapt
1645228060SbaptSearch::~Search ()
1646228060Sbapt{
1647228060Sbapt  delete _collision_detector;
1648228060Sbapt  if (option[DEBUG])
1649228060Sbapt    {
1650228060Sbapt      fprintf (stderr, "\ndumping occurrence and associated values tables\n");
1651228060Sbapt
1652228060Sbapt      for (unsigned int i = 0; i < _alpha_size; i++)
1653228060Sbapt        if (_occurrences[i])
1654228060Sbapt          fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
1655228060Sbapt                   i, _asso_values[i], i, _occurrences[i]);
1656228060Sbapt
1657228060Sbapt      fprintf (stderr, "end table dumping\n");
1658228060Sbapt
1659228060Sbapt      fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
1660228060Sbapt               "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
1661228060Sbapt               _list_len, _total_keys, _total_duplicates, _max_key_len);
1662228060Sbapt
1663228060Sbapt      int field_width = _max_selchars_length;
1664228060Sbapt      fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
1665228060Sbapt               field_width, "selchars");
1666228060Sbapt      for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1667228060Sbapt        {
1668228060Sbapt          fprintf (stderr, "%11d,%11d,%6d, ",
1669228060Sbapt                   ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
1670228060Sbapt          if (field_width > ptr->first()->_selchars_length)
1671228060Sbapt            fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
1672228060Sbapt          for (int j = 0; j < ptr->first()->_selchars_length; j++)
1673228060Sbapt            putc (ptr->first()->_selchars[j], stderr);
1674228060Sbapt          fprintf (stderr, ", %.*s\n",
1675228060Sbapt                   ptr->first()->_allchars_length, ptr->first()->_allchars);
1676228060Sbapt        }
1677228060Sbapt
1678228060Sbapt      fprintf (stderr, "End dumping list.\n\n");
1679228060Sbapt    }
1680228060Sbapt  delete[] _asso_values;
1681228060Sbapt  delete[] _occurrences;
1682228060Sbapt  delete[] _alpha_unify;
1683228060Sbapt  delete[] _alpha_inc;
1684228060Sbapt}
1685