1/* Search algorithm. 2 Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc. 3 Written by Douglas C. Schmidt <schmidt@ics.uci.edu> 4 and Bruno Haible <bruno@clisp.org>. 5 6 This file is part of GNU GPERF. 7 8 GNU GPERF is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 GNU GPERF is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; see the file COPYING. 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23/* Specification. */ 24#include "search.h" 25 26#include <stdio.h> 27#include <stdlib.h> /* declares exit(), rand(), srand() */ 28#include <string.h> /* declares memset(), memcmp() */ 29#include <time.h> /* declares time() */ 30#include <math.h> /* declares exp() */ 31#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */ 32#include "options.h" 33#include "hash-table.h" 34#include "config.h" 35 36/* ============================== Portability ============================== */ 37 38/* Assume ISO C++ 'for' scoping rule. */ 39#define for if (0) ; else for 40 41/* Dynamically allocated array with dynamic extent: 42 43 Example: 44 DYNAMIC_ARRAY (my_array, int, n); 45 ... 46 FREE_DYNAMIC_ARRAY (my_array); 47 48 Attention: depending on your implementation my_array is either the array 49 itself or a pointer to the array! Always use my_array only as expression! 50 */ 51#if HAVE_DYNAMIC_ARRAY 52 #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size] 53 #define FREE_DYNAMIC_ARRAY(var) 54#else 55 #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size] 56 #define FREE_DYNAMIC_ARRAY(var) delete[] var 57#endif 58 59/* ================================ Theory ================================= */ 60 61/* The general form of the hash function is 62 63 hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos) 64 + len (keyword) 65 66 where Pos is a set of byte positions, 67 each alpha_inc[i] is a nonnegative integer, 68 each asso_values[c] is a nonnegative integer, 69 len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise. 70 71 Theorem 1: If all keywords are different, there is a set Pos such that 72 all tuples (keyword[i] : i in Pos) are different. 73 74 Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there 75 are nonnegative integers alpha_inc[i] such that all multisets 76 {keyword[i] + alpha_inc[i] : i in Pos} are different. 77 78 Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}. 79 80 Theorem 3: If all multisets selchars[keyword] are different, there are 81 nonnegative integers asso_values[c] such that all hash values 82 sum (asso_values[c] : c in selchars[keyword]) are different. 83 84 Based on these three facts, we find the hash function in three steps: 85 86 Step 1 (Finding good byte positions): 87 Find a set Pos, as small as possible, such that all tuples 88 (keyword[i] : i in Pos) are different. 89 90 Step 2 (Finding good alpha increments): 91 Find nonnegative integers alpha_inc[i], as many of them as possible being 92 zero, and the others being as small as possible, such that all multisets 93 {keyword[i] + alpha_inc[i] : i in Pos} are different. 94 95 Step 3 (Finding good asso_values): 96 Find asso_values[c] such that all hash (keyword) are different. 97 98 In other words, each step finds a projection that is injective on the 99 given finite set: 100 proj1 : String --> Map (Pos --> N) 101 proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos) 102 proj3 : Map (Pos --> N) / S(Pos) --> N 103 where 104 N denotes the set of nonnegative integers, 105 Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and 106 S(Pos) is the symmetric group over Pos. 107 108 This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight 109 modifications apply: 110 proj1 : String --> Map (Pos --> N) x N 111 proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N 112 proj3 : Map (Pos --> N) / S(Pos) x N --> N 113 114 For a case-insensitive hash function, the general form is 115 116 hash (keyword) = 117 sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos) 118 + len (keyword) 119 120 where alpha_unify[c] is chosen so that an upper/lower case change in 121 keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]]. 122 */ 123 124/* ==================== Initialization and Preparation ===================== */ 125 126Search::Search (KeywordExt_List *list) 127 : _head (list) 128{ 129} 130 131void 132Search::prepare () 133{ 134 /* Compute the total number of keywords. */ 135 _total_keys = 0; 136 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 137 _total_keys++; 138 139 /* Compute the minimum and maximum keyword length. */ 140 _max_key_len = INT_MIN; 141 _min_key_len = INT_MAX; 142 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 143 { 144 KeywordExt *keyword = temp->first(); 145 146 if (_max_key_len < keyword->_allchars_length) 147 _max_key_len = keyword->_allchars_length; 148 if (_min_key_len > keyword->_allchars_length) 149 _min_key_len = keyword->_allchars_length; 150 } 151 152 /* Exit program if an empty string is used as keyword, since the comparison 153 expressions don't work correctly for looking up an empty string. */ 154 if (_min_key_len == 0) 155 { 156 fprintf (stderr, "Empty input keyword is not allowed.\n" 157 "To recognize an empty input keyword, your code should check for\n" 158 "len == 0 before calling the gperf generated lookup function.\n"); 159 exit (1); 160 } 161 162 /* Exit program if the characters in the keywords are not in the required 163 range. */ 164 if (option[SEVENBIT]) 165 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 166 { 167 KeywordExt *keyword = temp->first(); 168 169 const char *k = keyword->_allchars; 170 for (int i = keyword->_allchars_length; i > 0; k++, i--) 171 if (!(static_cast<unsigned char>(*k) < 128)) 172 { 173 fprintf (stderr, "Option --seven-bit has been specified,\n" 174 "but keyword \"%.*s\" contains non-ASCII characters.\n" 175 "Try removing option --seven-bit.\n", 176 keyword->_allchars_length, keyword->_allchars); 177 exit (1); 178 } 179 } 180} 181 182/* ====================== Finding good byte positions ====================== */ 183 184/* Computes the upper bound on the indices passed to asso_values[], 185 assuming no alpha_increments. */ 186unsigned int 187Search::compute_alpha_size () const 188{ 189 return (option[SEVENBIT] ? 128 : 256); 190} 191 192/* Computes the unification rules between different asso_values[c], 193 assuming no alpha_increments. */ 194unsigned int * 195Search::compute_alpha_unify () const 196{ 197 if (option[UPPERLOWER]) 198 { 199 /* Uppercase to lowercase mapping. */ 200 unsigned int alpha_size = compute_alpha_size(); 201 unsigned int *alpha_unify = new unsigned int[alpha_size]; 202 for (unsigned int c = 0; c < alpha_size; c++) 203 alpha_unify[c] = c; 204 for (unsigned int c = 'A'; c <= 'Z'; c++) 205 alpha_unify[c] = c + ('a'-'A'); 206 return alpha_unify; 207 } 208 else 209 /* Identity mapping. */ 210 return NULL; 211} 212 213/* Initializes each keyword's _selchars array. */ 214void 215Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const 216{ 217 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 218 temp->first()->init_selchars_tuple(positions, alpha_unify); 219} 220 221/* Deletes each keyword's _selchars array. */ 222void 223Search::delete_selchars () const 224{ 225 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 226 temp->first()->delete_selchars(); 227} 228 229/* Count the duplicate keywords that occur with a given set of positions. 230 In other words, it returns the difference 231 # K - # proj1 (K) 232 where K is the multiset of given keywords. */ 233unsigned int 234Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const 235{ 236 /* Run through the keyword list and count the duplicates incrementally. 237 The result does not depend on the order of the keyword list, thanks to 238 the formula above. */ 239 init_selchars_tuple (positions, alpha_unify); 240 241 unsigned int count = 0; 242 { 243 Hash_Table representatives (_total_keys, option[NOLENGTH]); 244 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 245 { 246 KeywordExt *keyword = temp->first(); 247 if (representatives.insert (keyword)) 248 count++; 249 } 250 } 251 252 delete_selchars (); 253 254 return count; 255} 256 257/* Find good key positions. */ 258 259void 260Search::find_positions () 261{ 262 /* If the user gave the key positions, we use them. */ 263 if (option[POSITIONS]) 264 { 265 _key_positions = option.get_key_positions(); 266 return; 267 } 268 269 /* Compute preliminary alpha_unify table. */ 270 unsigned int *alpha_unify = compute_alpha_unify (); 271 272 /* 1. Find positions that must occur in order to distinguish duplicates. */ 273 Positions mandatory; 274 275 if (!option[DUP]) 276 { 277 for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest()) 278 { 279 KeywordExt *keyword1 = l1->first(); 280 for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest()) 281 { 282 KeywordExt *keyword2 = l2->first(); 283 284 /* If keyword1 and keyword2 have the same length and differ 285 in just one position, and it is not the last character, 286 this position is mandatory. */ 287 if (keyword1->_allchars_length == keyword2->_allchars_length) 288 { 289 int n = keyword1->_allchars_length; 290 int i; 291 for (i = 0; i < n - 1; i++) 292 { 293 unsigned char c1 = keyword1->_allchars[i]; 294 unsigned char c2 = keyword2->_allchars[i]; 295 if (option[UPPERLOWER]) 296 { 297 if (c1 >= 'A' && c1 <= 'Z') 298 c1 += 'a' - 'A'; 299 if (c2 >= 'A' && c2 <= 'Z') 300 c2 += 'a' - 'A'; 301 } 302 if (c1 != c2) 303 break; 304 } 305 if (i < n - 1) 306 { 307 int j; 308 for (j = i + 1; j < n; j++) 309 { 310 unsigned char c1 = keyword1->_allchars[j]; 311 unsigned char c2 = keyword2->_allchars[j]; 312 if (option[UPPERLOWER]) 313 { 314 if (c1 >= 'A' && c1 <= 'Z') 315 c1 += 'a' - 'A'; 316 if (c2 >= 'A' && c2 <= 'Z') 317 c2 += 'a' - 'A'; 318 } 319 if (c1 != c2) 320 break; 321 } 322 if (j >= n) 323 { 324 /* Position i is mandatory. */ 325 if (!mandatory.contains (i)) 326 mandatory.add (i); 327 } 328 } 329 } 330 } 331 } 332 } 333 334 /* 2. Add positions, as long as this decreases the duplicates count. */ 335 int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1 336 ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1); 337 Positions current = mandatory; 338 unsigned int current_duplicates_count = 339 count_duplicates_tuple (current, alpha_unify); 340 for (;;) 341 { 342 Positions best; 343 unsigned int best_duplicates_count = UINT_MAX; 344 345 for (int i = imax; i >= -1; i--) 346 if (!current.contains (i)) 347 { 348 Positions tryal = current; 349 tryal.add (i); 350 unsigned int try_duplicates_count = 351 count_duplicates_tuple (tryal, alpha_unify); 352 353 /* We prefer 'try' to 'best' if it produces less duplicates, 354 or if it produces the same number of duplicates but with 355 a more efficient hash function. */ 356 if (try_duplicates_count < best_duplicates_count 357 || (try_duplicates_count == best_duplicates_count && i >= 0)) 358 { 359 best = tryal; 360 best_duplicates_count = try_duplicates_count; 361 } 362 } 363 364 /* Stop adding positions when it gives no improvement. */ 365 if (best_duplicates_count >= current_duplicates_count) 366 break; 367 368 current = best; 369 current_duplicates_count = best_duplicates_count; 370 } 371 372 /* 3. Remove positions, as long as this doesn't increase the duplicates 373 count. */ 374 for (;;) 375 { 376 Positions best; 377 unsigned int best_duplicates_count = UINT_MAX; 378 379 for (int i = imax; i >= -1; i--) 380 if (current.contains (i) && !mandatory.contains (i)) 381 { 382 Positions tryal = current; 383 tryal.remove (i); 384 unsigned int try_duplicates_count = 385 count_duplicates_tuple (tryal, alpha_unify); 386 387 /* We prefer 'try' to 'best' if it produces less duplicates, 388 or if it produces the same number of duplicates but with 389 a more efficient hash function. */ 390 if (try_duplicates_count < best_duplicates_count 391 || (try_duplicates_count == best_duplicates_count && i == -1)) 392 { 393 best = tryal; 394 best_duplicates_count = try_duplicates_count; 395 } 396 } 397 398 /* Stop removing positions when it gives no improvement. */ 399 if (best_duplicates_count > current_duplicates_count) 400 break; 401 402 current = best; 403 current_duplicates_count = best_duplicates_count; 404 } 405 406 /* 4. Replace two positions by one, as long as this doesn't increase the 407 duplicates count. */ 408 for (;;) 409 { 410 Positions best; 411 unsigned int best_duplicates_count = UINT_MAX; 412 413 for (int i1 = imax; i1 >= -1; i1--) 414 if (current.contains (i1) && !mandatory.contains (i1)) 415 for (int i2 = imax; i2 >= -1; i2--) 416 if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1) 417 for (int i3 = imax; i3 >= 0; i3--) 418 if (!current.contains (i3)) 419 { 420 Positions tryal = current; 421 tryal.remove (i1); 422 tryal.remove (i2); 423 tryal.add (i3); 424 unsigned int try_duplicates_count = 425 count_duplicates_tuple (tryal, alpha_unify); 426 427 /* We prefer 'try' to 'best' if it produces less duplicates, 428 or if it produces the same number of duplicates but with 429 a more efficient hash function. */ 430 if (try_duplicates_count < best_duplicates_count 431 || (try_duplicates_count == best_duplicates_count 432 && (i1 == -1 || i2 == -1 || i3 >= 0))) 433 { 434 best = tryal; 435 best_duplicates_count = try_duplicates_count; 436 } 437 } 438 439 /* Stop removing positions when it gives no improvement. */ 440 if (best_duplicates_count > current_duplicates_count) 441 break; 442 443 current = best; 444 current_duplicates_count = best_duplicates_count; 445 } 446 447 /* That's it. Hope it's good enough. */ 448 _key_positions = current; 449 450 if (option[DEBUG]) 451 { 452 /* Print the result. */ 453 fprintf (stderr, "\nComputed positions: "); 454 PositionReverseIterator iter = _key_positions.reviterator(); 455 bool seen_lastchar = false; 456 bool first = true; 457 for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; ) 458 { 459 if (!first) 460 fprintf (stderr, ", "); 461 if (i == Positions::LASTCHAR) 462 seen_lastchar = true; 463 else 464 { 465 fprintf (stderr, "%d", i + 1); 466 first = false; 467 } 468 } 469 if (seen_lastchar) 470 { 471 if (!first) 472 fprintf (stderr, ", "); 473 fprintf (stderr, "$"); 474 } 475 fprintf (stderr, "\n"); 476 } 477 478 /* Free preliminary alpha_unify table. */ 479 delete[] alpha_unify; 480} 481 482/* Count the duplicate keywords that occur with the found set of positions. 483 In other words, it returns the difference 484 # K - # proj1 (K) 485 where K is the multiset of given keywords. */ 486unsigned int 487Search::count_duplicates_tuple () const 488{ 489 unsigned int *alpha_unify = compute_alpha_unify (); 490 unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify); 491 delete[] alpha_unify; 492 return count; 493} 494 495/* ===================== Finding good alpha increments ===================== */ 496 497/* Computes the upper bound on the indices passed to asso_values[]. */ 498unsigned int 499Search::compute_alpha_size (const unsigned int *alpha_inc) const 500{ 501 unsigned int max_alpha_inc = 0; 502 for (int i = 0; i < _max_key_len; i++) 503 if (max_alpha_inc < alpha_inc[i]) 504 max_alpha_inc = alpha_inc[i]; 505 return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc; 506} 507 508/* Computes the unification rules between different asso_values[c]. */ 509unsigned int * 510Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const 511{ 512 if (option[UPPERLOWER]) 513 { 514 /* Without alpha increments, we would simply unify 515 'A' -> 'a', ..., 'Z' -> 'z'. 516 But when a keyword contains at position i a character c, 517 we have the constraint 518 asso_values[tolower(c) + alpha_inc[i]] == 519 asso_values[toupper(c) + alpha_inc[i]]. 520 This introduces a unification 521 toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i]. 522 Note that this unification can extend outside the range of 523 ASCII letters! But still every unified character pair is at 524 a distance of 'a'-'A' = 32, or (after chained unification) 525 at a multiple of 32. So in the end the alpha_unify vector has 526 the form c -> c + 32 * f(c) where f(c) is a nonnegative 527 integer. */ 528 unsigned int alpha_size = compute_alpha_size (alpha_inc); 529 530 unsigned int *alpha_unify = new unsigned int[alpha_size]; 531 for (unsigned int c = 0; c < alpha_size; c++) 532 alpha_unify[c] = c; 533 534 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 535 { 536 KeywordExt *keyword = temp->first(); 537 538 /* Iterate through the selected character positions. */ 539 PositionIterator iter = positions.iterator(keyword->_allchars_length); 540 541 for (int i; (i = iter.next ()) != PositionIterator::EOS; ) 542 { 543 unsigned int c; 544 if (i == Positions::LASTCHAR) 545 c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]); 546 else if (i < keyword->_allchars_length) 547 c = static_cast<unsigned char>(keyword->_allchars[i]); 548 else 549 abort (); 550 if (c >= 'A' && c <= 'Z') 551 c += 'a' - 'A'; 552 if (c >= 'a' && c <= 'z') 553 { 554 if (i != Positions::LASTCHAR) 555 c += alpha_inc[i]; 556 /* Unify c with c - ('a'-'A'). */ 557 unsigned int d = alpha_unify[c]; 558 unsigned int b = c - ('a'-'A'); 559 for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A')) 560 alpha_unify[a] = d; 561 } 562 } 563 } 564 return alpha_unify; 565 } 566 else 567 /* Identity mapping. */ 568 return NULL; 569} 570 571/* Initializes each keyword's _selchars array. */ 572void 573Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const 574{ 575 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 576 temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc); 577} 578 579/* Count the duplicate keywords that occur with the given set of positions 580 and a given alpha_inc[] array. 581 In other words, it returns the difference 582 # K - # proj2 (proj1 (K)) 583 where K is the multiset of given keywords. */ 584unsigned int 585Search::count_duplicates_multiset (const unsigned int *alpha_inc) const 586{ 587 /* Run through the keyword list and count the duplicates incrementally. 588 The result does not depend on the order of the keyword list, thanks to 589 the formula above. */ 590 unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc); 591 init_selchars_multiset (_key_positions, alpha_unify, alpha_inc); 592 593 unsigned int count = 0; 594 { 595 Hash_Table representatives (_total_keys, option[NOLENGTH]); 596 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 597 { 598 KeywordExt *keyword = temp->first(); 599 if (representatives.insert (keyword)) 600 count++; 601 } 602 } 603 604 delete_selchars (); 605 delete[] alpha_unify; 606 607 return count; 608} 609 610/* Find good _alpha_inc[]. */ 611 612void 613Search::find_alpha_inc () 614{ 615 /* The goal is to choose _alpha_inc[] such that it doesn't introduce 616 artificial duplicates. 617 In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */ 618 unsigned int duplicates_goal = count_duplicates_tuple (); 619 620 /* Start with zero increments. This is sufficient in most cases. */ 621 unsigned int *current = new unsigned int [_max_key_len]; 622 for (int i = 0; i < _max_key_len; i++) 623 current[i] = 0; 624 unsigned int current_duplicates_count = count_duplicates_multiset (current); 625 626 if (current_duplicates_count > duplicates_goal) 627 { 628 /* Look which _alpha_inc[i] we are free to increment. */ 629 unsigned int nindices; 630 { 631 nindices = 0; 632 PositionIterator iter = _key_positions.iterator(_max_key_len); 633 for (;;) 634 { 635 int key_pos = iter.next (); 636 if (key_pos == PositionIterator::EOS) 637 break; 638 if (key_pos != Positions::LASTCHAR) 639 nindices++; 640 } 641 } 642 643 DYNAMIC_ARRAY (indices, unsigned int, nindices); 644 { 645 unsigned int j = 0; 646 PositionIterator iter = _key_positions.iterator(_max_key_len); 647 for (;;) 648 { 649 int key_pos = iter.next (); 650 if (key_pos == PositionIterator::EOS) 651 break; 652 if (key_pos != Positions::LASTCHAR) 653 indices[j++] = key_pos; 654 } 655 if (!(j == nindices)) 656 abort (); 657 } 658 659 /* Perform several rounds of searching for a good alpha increment. 660 Each round reduces the number of artificial collisions by adding 661 an increment in a single key position. */ 662 DYNAMIC_ARRAY (best, unsigned int, _max_key_len); 663 DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len); 664 do 665 { 666 /* An increment of 1 is not always enough. Try higher increments 667 also. */ 668 for (unsigned int inc = 1; ; inc++) 669 { 670 unsigned int best_duplicates_count = UINT_MAX; 671 672 for (unsigned int j = 0; j < nindices; j++) 673 { 674 memcpy (tryal, current, _max_key_len * sizeof (unsigned int)); 675 tryal[indices[j]] += inc; 676 unsigned int try_duplicates_count = 677 count_duplicates_multiset (tryal); 678 679 /* We prefer 'try' to 'best' if it produces less 680 duplicates. */ 681 if (try_duplicates_count < best_duplicates_count) 682 { 683 memcpy (best, tryal, _max_key_len * sizeof (unsigned int)); 684 best_duplicates_count = try_duplicates_count; 685 } 686 } 687 688 /* Stop this round when we got an improvement. */ 689 if (best_duplicates_count < current_duplicates_count) 690 { 691 memcpy (current, best, _max_key_len * sizeof (unsigned int)); 692 current_duplicates_count = best_duplicates_count; 693 break; 694 } 695 } 696 } 697 while (current_duplicates_count > duplicates_goal); 698 FREE_DYNAMIC_ARRAY (tryal); 699 FREE_DYNAMIC_ARRAY (best); 700 701 if (option[DEBUG]) 702 { 703 /* Print the result. */ 704 fprintf (stderr, "\nComputed alpha increments: "); 705 bool first = true; 706 for (unsigned int j = nindices; j-- > 0; ) 707 if (current[indices[j]] != 0) 708 { 709 if (!first) 710 fprintf (stderr, ", "); 711 fprintf (stderr, "%u:+%u", 712 indices[j] + 1, current[indices[j]]); 713 first = false; 714 } 715 fprintf (stderr, "\n"); 716 } 717 FREE_DYNAMIC_ARRAY (indices); 718 } 719 720 _alpha_inc = current; 721 _alpha_size = compute_alpha_size (_alpha_inc); 722 _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc); 723} 724 725/* ======================= Finding good asso_values ======================== */ 726 727/* Initializes the asso_values[] related parameters. */ 728 729void 730Search::prepare_asso_values () 731{ 732 KeywordExt_List *temp; 733 734 /* Initialize each keyword's _selchars array. */ 735 init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc); 736 737 /* Compute the maximum _selchars_length over all keywords. */ 738 _max_selchars_length = _key_positions.iterator(_max_key_len).remaining(); 739 740 /* Check for duplicates, i.e. keywords with the same _selchars array 741 (and - if !option[NOLENGTH] - also the same length). 742 We deal with these by building an equivalence class, so that only 743 1 keyword is representative of the entire collection. Only this 744 representative remains in the keyword list; the others are accessible 745 through the _duplicate_link chain, starting at the representative. 746 This *greatly* simplifies processing during later stages of the program. 747 Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */ 748 { 749 _list_len = _total_keys; 750 _total_duplicates = 0; 751 /* Make hash table for efficiency. */ 752 Hash_Table representatives (_list_len, option[NOLENGTH]); 753 754 KeywordExt_List *prev = NULL; /* list node before temp */ 755 for (temp = _head; temp; ) 756 { 757 KeywordExt *keyword = temp->first(); 758 KeywordExt *other_keyword = representatives.insert (keyword); 759 KeywordExt_List *garbage = NULL; 760 761 if (other_keyword) 762 { 763 _total_duplicates++; 764 _list_len--; 765 /* Remove keyword from the main list. */ 766 prev->rest() = temp->rest(); 767 garbage = temp; 768 /* And insert it on other_keyword's duplicate list. */ 769 keyword->_duplicate_link = other_keyword->_duplicate_link; 770 other_keyword->_duplicate_link = keyword; 771 772 /* Complain if user hasn't enabled the duplicate option. */ 773 if (!option[DUP] || option[DEBUG]) 774 { 775 fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"", 776 keyword->_allchars_length, keyword->_allchars, 777 other_keyword->_allchars_length, other_keyword->_allchars); 778 for (int j = 0; j < keyword->_selchars_length; j++) 779 putc (keyword->_selchars[j], stderr); 780 fprintf (stderr, "\".\n"); 781 } 782 } 783 else 784 { 785 keyword->_duplicate_link = NULL; 786 prev = temp; 787 } 788 temp = temp->rest(); 789 if (garbage) 790 delete garbage; 791 } 792 if (option[DEBUG]) 793 representatives.dump(); 794 } 795 796 /* Exit program if duplicates exists and option[DUP] not set, since we 797 don't want to continue in this case. (We don't want to turn on 798 option[DUP] implicitly, because the generated code is usually much 799 slower. */ 800 if (_total_duplicates) 801 { 802 if (option[DUP]) 803 fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n", 804 _total_duplicates); 805 else 806 { 807 fprintf (stderr, "%d input keys have identical hash values,\n", 808 _total_duplicates); 809 if (option[POSITIONS]) 810 fprintf (stderr, "try different key positions or use option -D.\n"); 811 else 812 fprintf (stderr, "use option -D.\n"); 813 exit (1); 814 } 815 } 816 817 /* Compute the occurrences of each character in the alphabet. */ 818 _occurrences = new int[_alpha_size]; 819 memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0])); 820 for (temp = _head; temp; temp = temp->rest()) 821 { 822 KeywordExt *keyword = temp->first(); 823 const unsigned int *ptr = keyword->_selchars; 824 for (int count = keyword->_selchars_length; count > 0; ptr++, count--) 825 _occurrences[*ptr]++; 826 } 827 828 /* Memory allocation. */ 829 _asso_values = new int[_alpha_size]; 830 831 int non_linked_length = _list_len; 832 unsigned int asso_value_max; 833 834 asso_value_max = 835 static_cast<unsigned int>(non_linked_length * option.get_size_multiple()); 836 /* Round up to the next power of two. This makes it easy to ensure 837 an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value 838 being odd, it guarantees that Search::try_asso_value() will iterate 839 through different values for _asso_value[c]. */ 840 if (asso_value_max == 0) 841 asso_value_max = 1; 842 asso_value_max |= asso_value_max >> 1; 843 asso_value_max |= asso_value_max >> 2; 844 asso_value_max |= asso_value_max >> 4; 845 asso_value_max |= asso_value_max >> 8; 846 asso_value_max |= asso_value_max >> 16; 847 asso_value_max++; 848 _asso_value_max = asso_value_max; 849 850 /* Given the bound for _asso_values[c], we have a bound for the possible 851 hash values, as computed in compute_hash(). */ 852 _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len) 853 + (_asso_value_max - 1) * _max_selchars_length; 854 /* Allocate a sparse bit vector for detection of collisions of hash 855 values. */ 856 _collision_detector = new Bool_Array (_max_hash_value + 1); 857 858 if (option[DEBUG]) 859 { 860 fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d" 861 "\nmaximum size of generated hash table is %d\n", 862 non_linked_length, asso_value_max, _max_hash_value); 863 864 int field_width; 865 866 field_width = 0; 867 { 868 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 869 { 870 KeywordExt *keyword = temp->first(); 871 if (field_width < keyword->_selchars_length) 872 field_width = keyword->_selchars_length; 873 } 874 } 875 876 fprintf (stderr, "\ndumping the keyword list without duplicates\n"); 877 fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig"); 878 int i = 0; 879 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 880 { 881 KeywordExt *keyword = temp->first(); 882 fprintf (stderr, "%9d, ", ++i); 883 if (field_width > keyword->_selchars_length) 884 fprintf (stderr, "%*s", field_width - keyword->_selchars_length, ""); 885 for (int j = 0; j < keyword->_selchars_length; j++) 886 putc (keyword->_selchars[j], stderr); 887 fprintf (stderr, ", %.*s\n", 888 keyword->_allchars_length, keyword->_allchars); 889 } 890 fprintf (stderr, "\nend of keyword list\n\n"); 891 } 892 893 if (option[RANDOM] || option.get_jump () == 0) 894 /* We will use rand(), so initialize the random number generator. */ 895 srand (static_cast<long>(time (0))); 896 897 _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ()); 898 _jump = option.get_jump (); 899} 900 901/* Finds some _asso_values[] that fit. */ 902 903/* The idea is to choose the _asso_values[] one by one, in a way that 904 a choice that has been made never needs to be undone later. This 905 means that we split the work into several steps. Each step chooses 906 one or more _asso_values[c]. The result of choosing one or more 907 _asso_values[c] is that the partitioning of the keyword set gets 908 broader. 909 Look at this partitioning: After every step, the _asso_values[] of a 910 certain set C of characters are undetermined. (At the beginning, C 911 is the set of characters c with _occurrences[c] > 0. At the end, C 912 is empty.) To each keyword K, we associate the multiset of _selchars 913 for which the _asso_values[] are undetermined: 914 K --> K->_selchars intersect C. 915 Consider two keywords equivalent if their value under this mapping is 916 the same. This introduces an equivalence relation on the set of 917 keywords. The equivalence classes partition the keyword set. (At the 918 beginning, the partition is the finest possible: each K is an equivalence 919 class by itself, because all K have a different _selchars. At the end, 920 all K have been merged into a single equivalence class.) 921 The partition before a step is always a refinement of the partition 922 after the step. 923 We choose the steps in such a way that the partition really becomes 924 broader at each step. (A step that only chooses an _asso_values[c] 925 without changing the partition is better merged with the previous step, 926 to avoid useless backtracking.) */ 927 928struct EquivalenceClass 929{ 930 /* The keywords in this equivalence class. */ 931 KeywordExt_List * _keywords; 932 KeywordExt_List * _keywords_last; 933 /* The number of keywords in this equivalence class. */ 934 unsigned int _cardinality; 935 /* The undetermined selected characters for the keywords in this 936 equivalence class, as a canonically reordered multiset. */ 937 unsigned int * _undetermined_chars; 938 unsigned int _undetermined_chars_length; 939 940 EquivalenceClass * _next; 941}; 942 943struct Step 944{ 945 /* The characters whose values are being determined in this step. */ 946 unsigned int _changing_count; 947 unsigned int * _changing; 948 /* Exclusive upper bound for the _asso_values[c] of this step. 949 A power of 2. */ 950 unsigned int _asso_value_max; 951 /* The characters whose values will be determined after this step. */ 952 bool * _undetermined; 953 /* The keyword set partition after this step. */ 954 EquivalenceClass * _partition; 955 /* The expected number of iterations in this step. */ 956 double _expected_lower; 957 double _expected_upper; 958 959 Step * _next; 960}; 961 962static inline bool 963equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len) 964{ 965 while (len > 0) 966 { 967 if (*ptr1 != *ptr2) 968 return false; 969 ptr1++; 970 ptr2++; 971 len--; 972 } 973 return true; 974} 975 976EquivalenceClass * 977Search::compute_partition (bool *undetermined) const 978{ 979 EquivalenceClass *partition = NULL; 980 EquivalenceClass *partition_last = NULL; 981 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 982 { 983 KeywordExt *keyword = temp->first(); 984 985 /* Compute the undetermined characters for this keyword. */ 986 unsigned int *undetermined_chars = 987 new unsigned int[keyword->_selchars_length]; 988 unsigned int undetermined_chars_length = 0; 989 990 for (int i = 0; i < keyword->_selchars_length; i++) 991 if (undetermined[keyword->_selchars[i]]) 992 undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i]; 993 994 /* Look up the equivalence class to which this keyword belongs. */ 995 EquivalenceClass *equclass; 996 for (equclass = partition; equclass; equclass = equclass->_next) 997 if (equclass->_undetermined_chars_length == undetermined_chars_length 998 && equals (equclass->_undetermined_chars, undetermined_chars, 999 undetermined_chars_length)) 1000 break; 1001 if (equclass == NULL) 1002 { 1003 equclass = new EquivalenceClass(); 1004 equclass->_keywords = NULL; 1005 equclass->_keywords_last = NULL; 1006 equclass->_cardinality = 0; 1007 equclass->_undetermined_chars = undetermined_chars; 1008 equclass->_undetermined_chars_length = undetermined_chars_length; 1009 equclass->_next = NULL; 1010 if (partition) 1011 partition_last->_next = equclass; 1012 else 1013 partition = equclass; 1014 partition_last = equclass; 1015 } 1016 else 1017 delete[] undetermined_chars; 1018 1019 /* Add the keyword to the equivalence class. */ 1020 KeywordExt_List *cons = new KeywordExt_List(keyword); 1021 if (equclass->_keywords) 1022 equclass->_keywords_last->rest() = cons; 1023 else 1024 equclass->_keywords = cons; 1025 equclass->_keywords_last = cons; 1026 equclass->_cardinality++; 1027 } 1028 1029 /* Free some of the allocated memory. The caller doesn't need it. */ 1030 for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1031 delete[] cls->_undetermined_chars; 1032 1033 return partition; 1034} 1035 1036static void 1037delete_partition (EquivalenceClass *partition) 1038{ 1039 while (partition != NULL) 1040 { 1041 EquivalenceClass *equclass = partition; 1042 partition = equclass->_next; 1043 delete_list (equclass->_keywords); 1044 //delete[] equclass->_undetermined_chars; // already freed above 1045 delete equclass; 1046 } 1047} 1048 1049/* Compute the possible number of collisions when _asso_values[c] is 1050 chosen, leading to the given partition. */ 1051unsigned int 1052Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const 1053{ 1054 /* Every equivalence class p is split according to the frequency of 1055 occurrence of c, leading to equivalence classes p1, p2, ... 1056 This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions. 1057 Return the sum of this expression over all equivalence classes. */ 1058 unsigned int sum = 0; 1059 unsigned int m = _max_selchars_length; 1060 DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1); 1061 for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1062 { 1063 for (unsigned int i = 0; i <= m; i++) 1064 split_cardinalities[i] = 0; 1065 1066 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1067 { 1068 KeywordExt *keyword = temp->first(); 1069 1070 unsigned int count = 0; 1071 for (int i = 0; i < keyword->_selchars_length; i++) 1072 if (keyword->_selchars[i] == c) 1073 count++; 1074 1075 split_cardinalities[count]++; 1076 } 1077 1078 sum += cls->_cardinality * cls->_cardinality; 1079 for (unsigned int i = 0; i <= m; i++) 1080 sum -= split_cardinalities[i] * split_cardinalities[i]; 1081 } 1082 FREE_DYNAMIC_ARRAY (split_cardinalities); 1083 return sum; 1084} 1085 1086/* Test whether adding c to the undetermined characters changes the given 1087 partition. */ 1088bool 1089Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const 1090{ 1091 for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1092 { 1093 unsigned int first_count = UINT_MAX; 1094 1095 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1096 { 1097 KeywordExt *keyword = temp->first(); 1098 1099 unsigned int count = 0; 1100 for (int i = 0; i < keyword->_selchars_length; i++) 1101 if (keyword->_selchars[i] == c) 1102 count++; 1103 1104 if (temp == cls->_keywords) 1105 first_count = count; 1106 else if (count != first_count) 1107 /* c would split this equivalence class. */ 1108 return false; 1109 } 1110 } 1111 return true; 1112} 1113 1114void 1115Search::find_asso_values () 1116{ 1117 Step *steps; 1118 1119 /* Determine the steps, starting with the last one. */ 1120 { 1121 bool *undetermined; 1122 bool *determined; 1123 1124 steps = NULL; 1125 1126 undetermined = new bool[_alpha_size]; 1127 for (unsigned int c = 0; c < _alpha_size; c++) 1128 undetermined[c] = false; 1129 1130 determined = new bool[_alpha_size]; 1131 for (unsigned int c = 0; c < _alpha_size; c++) 1132 determined[c] = true; 1133 1134 for (;;) 1135 { 1136 /* Compute the partition that needs to be refined. */ 1137 EquivalenceClass *partition = compute_partition (undetermined); 1138 1139 /* Determine the main character to be chosen in this step. 1140 Choosing such a character c has the effect of splitting every 1141 equivalence class (according the the frequency of occurrence of c). 1142 We choose the c with the minimum number of possible collisions, 1143 so that characters which lead to a large number of collisions get 1144 handled early during the search. */ 1145 unsigned int chosen_c; 1146 unsigned int chosen_possible_collisions; 1147 { 1148 unsigned int best_c = 0; 1149 unsigned int best_possible_collisions = UINT_MAX; 1150 for (unsigned int c = 0; c < _alpha_size; c++) 1151 if (_occurrences[c] > 0 && determined[c]) 1152 { 1153 unsigned int possible_collisions = 1154 count_possible_collisions (partition, c); 1155 if (possible_collisions < best_possible_collisions) 1156 { 1157 best_c = c; 1158 best_possible_collisions = possible_collisions; 1159 } 1160 } 1161 if (best_possible_collisions == UINT_MAX) 1162 { 1163 /* All c with _occurrences[c] > 0 are undetermined. We are 1164 are the starting situation and don't need any more step. */ 1165 delete_partition (partition); 1166 break; 1167 } 1168 chosen_c = best_c; 1169 chosen_possible_collisions = best_possible_collisions; 1170 } 1171 1172 /* We need one more step. */ 1173 Step *step = new Step(); 1174 1175 step->_undetermined = new bool[_alpha_size]; 1176 memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool)); 1177 1178 step->_partition = partition; 1179 1180 /* Now determine how the equivalence classes will be before this 1181 step. */ 1182 undetermined[chosen_c] = true; 1183 partition = compute_partition (undetermined); 1184 1185 /* Now determine which other characters should be determined in this 1186 step, because they will not change the equivalence classes at 1187 this point. It is the set of all c which, for all equivalence 1188 classes, have the same frequency of occurrence in every keyword 1189 of the equivalence class. */ 1190 for (unsigned int c = 0; c < _alpha_size; c++) 1191 if (_occurrences[c] > 0 && determined[c] 1192 && unchanged_partition (partition, c)) 1193 { 1194 undetermined[c] = true; 1195 determined[c] = false; 1196 } 1197 1198 /* main_c must be one of these. */ 1199 if (determined[chosen_c]) 1200 abort (); 1201 1202 /* Now the set of changing characters of this step. */ 1203 unsigned int changing_count; 1204 1205 changing_count = 0; 1206 for (unsigned int c = 0; c < _alpha_size; c++) 1207 if (undetermined[c] && !step->_undetermined[c]) 1208 changing_count++; 1209 1210 unsigned int *changing = new unsigned int[changing_count]; 1211 changing_count = 0; 1212 for (unsigned int c = 0; c < _alpha_size; c++) 1213 if (undetermined[c] && !step->_undetermined[c]) 1214 changing[changing_count++] = c; 1215 1216 step->_changing = changing; 1217 step->_changing_count = changing_count; 1218 1219 step->_asso_value_max = _asso_value_max; 1220 1221 step->_expected_lower = 1222 exp (static_cast<double>(chosen_possible_collisions) 1223 / static_cast<double>(_max_hash_value)); 1224 step->_expected_upper = 1225 exp (static_cast<double>(chosen_possible_collisions) 1226 / static_cast<double>(_asso_value_max)); 1227 1228 delete_partition (partition); 1229 1230 step->_next = steps; 1231 steps = step; 1232 } 1233 1234 delete[] determined; 1235 delete[] undetermined; 1236 } 1237 1238 if (option[DEBUG]) 1239 { 1240 unsigned int stepno = 0; 1241 for (Step *step = steps; step; step = step->_next) 1242 { 1243 stepno++; 1244 fprintf (stderr, "Step %u chooses _asso_values[", stepno); 1245 for (unsigned int i = 0; i < step->_changing_count; i++) 1246 { 1247 if (i > 0) 1248 fprintf (stderr, ","); 1249 fprintf (stderr, "'%c'", step->_changing[i]); 1250 } 1251 fprintf (stderr, "], expected number of iterations between %g and %g.\n", 1252 step->_expected_lower, step->_expected_upper); 1253 fprintf (stderr, "Keyword equivalence classes:\n"); 1254 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next) 1255 { 1256 fprintf (stderr, "\n"); 1257 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1258 { 1259 KeywordExt *keyword = temp->first(); 1260 fprintf (stderr, " %.*s\n", 1261 keyword->_allchars_length, keyword->_allchars); 1262 } 1263 } 1264 fprintf (stderr, "\n"); 1265 } 1266 } 1267 1268 /* Initialize _asso_values[]. (The value given here matters only 1269 for those c which occur in all keywords with equal multiplicity.) */ 1270 for (unsigned int c = 0; c < _alpha_size; c++) 1271 _asso_values[c] = 0; 1272 1273 unsigned int stepno = 0; 1274 for (Step *step = steps; step; step = step->_next) 1275 { 1276 stepno++; 1277 1278 /* Initialize the asso_values[]. */ 1279 unsigned int k = step->_changing_count; 1280 for (unsigned int i = 0; i < k; i++) 1281 { 1282 unsigned int c = step->_changing[i]; 1283 _asso_values[c] = 1284 (_initial_asso_value < 0 ? rand () : _initial_asso_value) 1285 & (step->_asso_value_max - 1); 1286 } 1287 1288 unsigned int iterations = 0; 1289 DYNAMIC_ARRAY (iter, unsigned int, k); 1290 for (unsigned int i = 0; i < k; i++) 1291 iter[i] = 0; 1292 unsigned int ii = (_jump != 0 ? k - 1 : 0); 1293 1294 for (;;) 1295 { 1296 /* Test whether these asso_values[] lead to collisions among 1297 the equivalence classes that should be collision-free. */ 1298 bool has_collision = false; 1299 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next) 1300 { 1301 /* Iteration Number array is a win, O(1) initialization time! */ 1302 _collision_detector->clear (); 1303 1304 for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest()) 1305 { 1306 KeywordExt *keyword = ptr->first(); 1307 1308 /* Compute the new hash code for the keyword, leaving apart 1309 the yet undetermined asso_values[]. */ 1310 int hashcode; 1311 { 1312 int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length; 1313 const unsigned int *p = keyword->_selchars; 1314 int i = keyword->_selchars_length; 1315 for (; i > 0; p++, i--) 1316 if (!step->_undetermined[*p]) 1317 sum += _asso_values[*p]; 1318 hashcode = sum; 1319 } 1320 1321 /* See whether it collides with another keyword's hash code, 1322 from the same equivalence class. */ 1323 if (_collision_detector->set_bit (hashcode)) 1324 { 1325 has_collision = true; 1326 break; 1327 } 1328 } 1329 1330 /* Don't need to continue looking at the other equivalence 1331 classes if we already have found a collision. */ 1332 if (has_collision) 1333 break; 1334 } 1335 1336 iterations++; 1337 if (!has_collision) 1338 break; 1339 1340 /* Try other asso_values[]. */ 1341 if (_jump != 0) 1342 { 1343 /* The way we try various values for 1344 asso_values[step->_changing[0],...step->_changing[k-1]] 1345 is like this: 1346 for (bound = 0,1,...) 1347 for (ii = 0,...,k-1) 1348 iter[ii] := bound 1349 iter[0..ii-1] := values <= bound 1350 iter[ii+1..k-1] := values < bound 1351 and 1352 asso_values[step->_changing[i]] = 1353 _initial_asso_value + iter[i] * _jump. 1354 This makes it more likely to find small asso_values[]. 1355 */ 1356 unsigned int bound = iter[ii]; 1357 unsigned int i = 0; 1358 while (i < ii) 1359 { 1360 unsigned int c = step->_changing[i]; 1361 iter[i]++; 1362 _asso_values[c] = 1363 (_asso_values[c] + _jump) & (step->_asso_value_max - 1); 1364 if (iter[i] <= bound) 1365 goto found_next; 1366 _asso_values[c] = 1367 (_asso_values[c] - iter[i] * _jump) 1368 & (step->_asso_value_max - 1); 1369 iter[i] = 0; 1370 i++; 1371 } 1372 i = ii + 1; 1373 while (i < k) 1374 { 1375 unsigned int c = step->_changing[i]; 1376 iter[i]++; 1377 _asso_values[c] = 1378 (_asso_values[c] + _jump) & (step->_asso_value_max - 1); 1379 if (iter[i] < bound) 1380 goto found_next; 1381 _asso_values[c] = 1382 (_asso_values[c] - iter[i] * _jump) 1383 & (step->_asso_value_max - 1); 1384 iter[i] = 0; 1385 i++; 1386 } 1387 /* Switch from one ii to the next. */ 1388 { 1389 unsigned int c = step->_changing[ii]; 1390 _asso_values[c] = 1391 (_asso_values[c] - bound * _jump) 1392 & (step->_asso_value_max - 1); 1393 iter[ii] = 0; 1394 } 1395 /* Here all iter[i] == 0. */ 1396 ii++; 1397 if (ii == k) 1398 { 1399 ii = 0; 1400 bound++; 1401 if (bound == step->_asso_value_max) 1402 { 1403 /* Out of search space! We can either backtrack, or 1404 increase the available search space of this step. 1405 It seems simpler to choose the latter solution. */ 1406 step->_asso_value_max = 2 * step->_asso_value_max; 1407 if (step->_asso_value_max > _asso_value_max) 1408 { 1409 _asso_value_max = step->_asso_value_max; 1410 /* Reinitialize _max_hash_value. */ 1411 _max_hash_value = 1412 (option[NOLENGTH] ? 0 : _max_key_len) 1413 + (_asso_value_max - 1) * _max_selchars_length; 1414 /* Reinitialize _collision_detector. */ 1415 delete _collision_detector; 1416 _collision_detector = 1417 new Bool_Array (_max_hash_value + 1); 1418 } 1419 } 1420 } 1421 { 1422 unsigned int c = step->_changing[ii]; 1423 iter[ii] = bound; 1424 _asso_values[c] = 1425 (_asso_values[c] + bound * _jump) 1426 & (step->_asso_value_max - 1); 1427 } 1428 found_next: ; 1429 } 1430 else 1431 { 1432 /* Random. */ 1433 unsigned int c = step->_changing[ii]; 1434 _asso_values[c] = 1435 (_asso_values[c] + rand ()) & (step->_asso_value_max - 1); 1436 /* Next time, change the next c. */ 1437 ii++; 1438 if (ii == k) 1439 ii = 0; 1440 } 1441 } 1442 FREE_DYNAMIC_ARRAY (iter); 1443 1444 if (option[DEBUG]) 1445 { 1446 fprintf (stderr, "Step %u chose _asso_values[", stepno); 1447 for (unsigned int i = 0; i < step->_changing_count; i++) 1448 { 1449 if (i > 0) 1450 fprintf (stderr, ","); 1451 fprintf (stderr, "'%c'", step->_changing[i]); 1452 } 1453 fprintf (stderr, "] in %u iterations.\n", iterations); 1454 } 1455 } 1456 1457 /* Free allocated memory. */ 1458 while (steps != NULL) 1459 { 1460 Step *step = steps; 1461 steps = step->_next; 1462 delete[] step->_changing; 1463 delete[] step->_undetermined; 1464 delete_partition (step->_partition); 1465 delete step; 1466 } 1467} 1468 1469/* Computes a keyword's hash value, relative to the current _asso_values[], 1470 and stores it in keyword->_hash_value. */ 1471 1472inline int 1473Search::compute_hash (KeywordExt *keyword) const 1474{ 1475 int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length; 1476 1477 const unsigned int *p = keyword->_selchars; 1478 int i = keyword->_selchars_length; 1479 for (; i > 0; p++, i--) 1480 sum += _asso_values[*p]; 1481 1482 return keyword->_hash_value = sum; 1483} 1484 1485/* Finds good _asso_values[]. */ 1486 1487void 1488Search::find_good_asso_values () 1489{ 1490 prepare_asso_values (); 1491 1492 /* Search for good _asso_values[]. */ 1493 int asso_iteration; 1494 if ((asso_iteration = option.get_asso_iterations ()) == 0) 1495 /* Try only the given _initial_asso_value and _jump. */ 1496 find_asso_values (); 1497 else 1498 { 1499 /* Try different pairs of _initial_asso_value and _jump, in the 1500 following order: 1501 (0, 1) 1502 (1, 1) 1503 (2, 1) (0, 3) 1504 (3, 1) (1, 3) 1505 (4, 1) (2, 3) (0, 5) 1506 (5, 1) (3, 3) (1, 5) 1507 ..... */ 1508 KeywordExt_List *saved_head = _head; 1509 int best_initial_asso_value = 0; 1510 int best_jump = 1; 1511 int *best_asso_values = new int[_alpha_size]; 1512 int best_collisions = INT_MAX; 1513 int best_max_hash_value = INT_MAX; 1514 1515 _initial_asso_value = 0; _jump = 1; 1516 for (;;) 1517 { 1518 /* Restore the keyword list in its original order. */ 1519 _head = copy_list (saved_head); 1520 /* Find good _asso_values[]. */ 1521 find_asso_values (); 1522 /* Test whether it is the best solution so far. */ 1523 int collisions = 0; 1524 int max_hash_value = INT_MIN; 1525 _collision_detector->clear (); 1526 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest()) 1527 { 1528 KeywordExt *keyword = ptr->first(); 1529 int hashcode = compute_hash (keyword); 1530 if (max_hash_value < hashcode) 1531 max_hash_value = hashcode; 1532 if (_collision_detector->set_bit (hashcode)) 1533 collisions++; 1534 } 1535 if (collisions < best_collisions 1536 || (collisions == best_collisions 1537 && max_hash_value < best_max_hash_value)) 1538 { 1539 memcpy (best_asso_values, _asso_values, 1540 _alpha_size * sizeof (_asso_values[0])); 1541 best_collisions = collisions; 1542 best_max_hash_value = max_hash_value; 1543 } 1544 /* Delete the copied keyword list. */ 1545 delete_list (_head); 1546 1547 if (--asso_iteration == 0) 1548 break; 1549 /* Prepare for next iteration. */ 1550 if (_initial_asso_value >= 2) 1551 _initial_asso_value -= 2, _jump += 2; 1552 else 1553 _initial_asso_value += _jump, _jump = 1; 1554 } 1555 _head = saved_head; 1556 /* Install the best found asso_values. */ 1557 _initial_asso_value = best_initial_asso_value; 1558 _jump = best_jump; 1559 memcpy (_asso_values, best_asso_values, 1560 _alpha_size * sizeof (_asso_values[0])); 1561 delete[] best_asso_values; 1562 /* The keywords' _hash_value fields are recomputed below. */ 1563 } 1564} 1565 1566/* ========================================================================= */ 1567 1568/* Comparison function for sorting by increasing _hash_value. */ 1569static bool 1570less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2) 1571{ 1572 return keyword1->_hash_value < keyword2->_hash_value; 1573} 1574 1575/* Sorts the keyword list by hash value. */ 1576 1577void 1578Search::sort () 1579{ 1580 _head = mergesort_list (_head, less_by_hash_value); 1581} 1582 1583void 1584Search::optimize () 1585{ 1586 /* Preparations. */ 1587 prepare (); 1588 1589 /* Step 1: Finding good byte positions. */ 1590 find_positions (); 1591 1592 /* Step 2: Finding good alpha increments. */ 1593 find_alpha_inc (); 1594 1595 /* Step 3: Finding good asso_values. */ 1596 find_good_asso_values (); 1597 1598 /* Make one final check, just to make sure nothing weird happened.... */ 1599 _collision_detector->clear (); 1600 for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest()) 1601 { 1602 KeywordExt *curr = curr_ptr->first(); 1603 unsigned int hashcode = compute_hash (curr); 1604 if (_collision_detector->set_bit (hashcode)) 1605 { 1606 /* This shouldn't happen. proj1, proj2, proj3 must have been 1607 computed to be injective on the given keyword set. */ 1608 fprintf (stderr, 1609 "\nInternal error, unexpected duplicate hash code\n"); 1610 if (option[POSITIONS]) 1611 fprintf (stderr, "try options -m or -r, or use new key positions.\n\n"); 1612 else 1613 fprintf (stderr, "try options -m or -r.\n\n"); 1614 exit (1); 1615 } 1616 } 1617 1618 /* Sorts the keyword list by hash value. */ 1619 sort (); 1620 1621 /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely 1622 necessary, but speeds up the lookup function in many cases of lookup 1623 failure: no string comparison is needed once the hash value of a string 1624 is larger than the hash value of any keyword. */ 1625 int max_hash_value; 1626 { 1627 KeywordExt_List *temp; 1628 for (temp = _head; temp->rest(); temp = temp->rest()) 1629 ; 1630 max_hash_value = temp->first()->_hash_value; 1631 } 1632 for (unsigned int c = 0; c < _alpha_size; c++) 1633 if (_occurrences[c] == 0) 1634 _asso_values[c] = max_hash_value + 1; 1635 1636 /* Propagate unified asso_values. */ 1637 if (_alpha_unify) 1638 for (unsigned int c = 0; c < _alpha_size; c++) 1639 if (_alpha_unify[c] != c) 1640 _asso_values[c] = _asso_values[_alpha_unify[c]]; 1641} 1642 1643/* Prints out some diagnostics upon completion. */ 1644 1645Search::~Search () 1646{ 1647 delete _collision_detector; 1648 if (option[DEBUG]) 1649 { 1650 fprintf (stderr, "\ndumping occurrence and associated values tables\n"); 1651 1652 for (unsigned int i = 0; i < _alpha_size; i++) 1653 if (_occurrences[i]) 1654 fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n", 1655 i, _asso_values[i], i, _occurrences[i]); 1656 1657 fprintf (stderr, "end table dumping\n"); 1658 1659 fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d" 1660 "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n", 1661 _list_len, _total_keys, _total_duplicates, _max_key_len); 1662 1663 int field_width = _max_selchars_length; 1664 fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n", 1665 field_width, "selchars"); 1666 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest()) 1667 { 1668 fprintf (stderr, "%11d,%11d,%6d, ", 1669 ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index); 1670 if (field_width > ptr->first()->_selchars_length) 1671 fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, ""); 1672 for (int j = 0; j < ptr->first()->_selchars_length; j++) 1673 putc (ptr->first()->_selchars[j], stderr); 1674 fprintf (stderr, ", %.*s\n", 1675 ptr->first()->_allchars_length, ptr->first()->_allchars); 1676 } 1677 1678 fprintf (stderr, "End dumping list.\n\n"); 1679 } 1680 delete[] _asso_values; 1681 delete[] _occurrences; 1682 delete[] _alpha_unify; 1683 delete[] _alpha_inc; 1684} 1685