index.cpp revision 151498
1// -*- C++ -*- 2/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2004 3 Free Software Foundation, Inc. 4 Written by James Clark (jjc@jclark.com) 5 6This file is part of groff. 7 8groff is free software; you can redistribute it and/or modify it under 9the terms of the GNU General Public License as published by the Free 10Software Foundation; either version 2, or (at your option) any later 11version. 12 13groff is distributed in the hope that it will be useful, but WITHOUT ANY 14WARRANTY; without even the implied warranty of MERCHANTABILITY or 15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16for more details. 17 18You should have received a copy of the GNU General Public License along 19with groff; see the file COPYING. If not, write to the Free Software 20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 21 22#include "lib.h" 23 24#include <stdlib.h> 25#include <errno.h> 26 27#include "posix.h" 28#include "cset.h" 29#include "cmap.h" 30#include "errarg.h" 31#include "error.h" 32 33#include "refid.h" 34#include "search.h" 35#include "index.h" 36#include "defs.h" 37 38#include "nonposix.h" 39 40// Interface to mmap. 41extern "C" { 42 void *mapread(int fd, int len); 43 int unmap(void *, int len); 44} 45 46#if 0 47const 48#endif 49int minus_one = -1; 50 51int verify_flag = 0; 52 53struct word_list; 54 55class index_search_item : public search_item { 56 search_item *out_of_date_files; 57 index_header header; 58 char *buffer; 59 void *map_addr; 60 int map_len; 61 tag *tags; 62 int *table; 63 int *lists; 64 char *pool; 65 char *key_buffer; 66 char *filename_buffer; 67 int filename_buflen; 68 char **common_words_table; 69 int common_words_table_size; 70 const char *ignore_fields; 71 time_t mtime; 72 73 const char *do_verify(); 74 const int *search1(const char **pp, const char *end); 75 const int *search(const char *ptr, int length, int **temp_listp); 76 const char *munge_filename(const char *); 77 void read_common_words_file(); 78 void add_out_of_date_file(int fd, const char *filename, int fid); 79public: 80 index_search_item(const char *, int); 81 ~index_search_item(); 82 int load(int fd); 83 search_item_iterator *make_search_item_iterator(const char *); 84 int verify(); 85 void check_files(); 86 int next_filename_id() const; 87 friend class index_search_item_iterator; 88}; 89 90class index_search_item_iterator : public search_item_iterator { 91 index_search_item *indx; 92 search_item_iterator *out_of_date_files_iter; 93 search_item *next_out_of_date_file; 94 const int *found_list; 95 int *temp_list; 96 char *buf; 97 int buflen; 98 linear_searcher searcher; 99 char *query; 100 int get_tag(int tagno, const linear_searcher &, const char **, int *, 101 reference_id *); 102public: 103 index_search_item_iterator(index_search_item *, const char *); 104 ~index_search_item_iterator(); 105 int next(const linear_searcher &, const char **, int *, reference_id *); 106}; 107 108 109index_search_item::index_search_item(const char *filename, int fid) 110: search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0), 111 map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0), 112 common_words_table(0) 113{ 114} 115 116index_search_item::~index_search_item() 117{ 118 if (buffer) 119 free(buffer); 120 if (map_addr) { 121 if (unmap(map_addr, map_len) < 0) 122 error("unmap: %1", strerror(errno)); 123 } 124 while (out_of_date_files) { 125 search_item *tem = out_of_date_files; 126 out_of_date_files = out_of_date_files->next; 127 delete tem; 128 } 129 a_delete filename_buffer; 130 a_delete key_buffer; 131 if (common_words_table) { 132 for (int i = 0; i < common_words_table_size; i++) 133 a_delete common_words_table[i]; 134 a_delete common_words_table; 135 } 136} 137 138class file_closer { 139 int *fdp; 140public: 141 file_closer(int &fd) : fdp(&fd) { } 142 ~file_closer() { close(*fdp); } 143}; 144 145// Tell the compiler that a variable is intentionally unused. 146inline void unused(void *) { } 147 148int index_search_item::load(int fd) 149{ 150 file_closer fd_closer(fd); // close fd on return 151 unused(&fd_closer); 152 struct stat sb; 153 if (fstat(fd, &sb) < 0) { 154 error("can't fstat `%1': %2", name, strerror(errno)); 155 return 0; 156 } 157 if (!S_ISREG(sb.st_mode)) { 158 error("`%1' is not a regular file", name); 159 return 0; 160 } 161 mtime = sb.st_mtime; 162 int size = int(sb.st_size); 163 char *addr; 164 map_addr = mapread(fd, size); 165 if (map_addr) { 166 addr = (char *)map_addr; 167 map_len = size; 168 } 169 else { 170 addr = buffer = (char *)malloc(size); 171 if (buffer == 0) { 172 error("can't allocate buffer for `%1'", name); 173 return 0; 174 } 175 char *ptr = buffer; 176 int bytes_to_read = size; 177 while (bytes_to_read > 0) { 178 int nread = read(fd, ptr, bytes_to_read); 179 if (nread == 0) { 180 error("unexpected EOF on `%1'", name); 181 return 0; 182 } 183 if (nread < 0) { 184 error("read error on `%1': %2", name, strerror(errno)); 185 return 0; 186 } 187 bytes_to_read -= nread; 188 ptr += nread; 189 } 190 } 191 header = *(index_header *)addr; 192 if (header.magic != INDEX_MAGIC) { 193 error("`%1' is not an index file: wrong magic number", name); 194 return 0; 195 } 196 if (header.version != INDEX_VERSION) { 197 error("version number in `%1' is wrong: was %2, should be %3", 198 name, header.version, INDEX_VERSION); 199 return 0; 200 } 201 int sz = (header.tags_size * sizeof(tag) 202 + header.lists_size * sizeof(int) 203 + header.table_size * sizeof(int) 204 + header.strings_size 205 + sizeof(header)); 206 if (sz != size) { 207 error("size of `%1' is wrong: was %2, should be %3", 208 name, size, sz); 209 return 0; 210 } 211 tags = (tag *)(addr + sizeof(header)); 212 lists = (int *)(tags + header.tags_size); 213 table = (int *)(lists + header.lists_size); 214 pool = (char *)(table + header.table_size); 215 ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1; 216 key_buffer = new char[header.truncate]; 217 read_common_words_file(); 218 return 1; 219} 220 221const char *index_search_item::do_verify() 222{ 223 if (tags == 0) 224 return "not loaded"; 225 if (lists[header.lists_size - 1] >= 0) 226 return "last list element not negative"; 227 int i; 228 for (i = 0; i < header.table_size; i++) { 229 int li = table[i]; 230 if (li >= header.lists_size) 231 return "bad list index"; 232 if (li >= 0) { 233 for (int *ptr = lists + li; *ptr >= 0; ptr++) { 234 if (*ptr >= header.tags_size) 235 return "bad tag index"; 236 if (*ptr >= ptr[1] && ptr[1] >= 0) 237 return "list not ordered"; 238 } 239 } 240 } 241 for (i = 0; i < header.tags_size; i++) { 242 if (tags[i].filename_index >= header.strings_size) 243 return "bad index in tags"; 244 if (tags[i].length < 0) 245 return "bad length in tags"; 246 if (tags[i].start < 0) 247 return "bad start in tags"; 248 } 249 if (pool[header.strings_size - 1] != '\0') 250 return "last character in pool not nul"; 251 return 0; 252} 253 254int index_search_item::verify() 255{ 256 const char *reason = do_verify(); 257 if (!reason) 258 return 1; 259 error("`%1' is bad: %2", name, reason); 260 return 0; 261} 262 263int index_search_item::next_filename_id() const 264{ 265 return filename_id + header.strings_size + 1; 266} 267 268search_item_iterator *index_search_item::make_search_item_iterator( 269 const char *query) 270{ 271 return new index_search_item_iterator(this, query); 272} 273 274search_item *make_index_search_item(const char *filename, int fid) 275{ 276 char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)]; 277 strcpy(index_filename, filename); 278 strcat(index_filename, INDEX_SUFFIX); 279 int fd = open(index_filename, O_RDONLY | O_BINARY); 280 if (fd < 0) 281 return 0; 282 index_search_item *item = new index_search_item(index_filename, fid); 283 a_delete index_filename; 284 if (!item->load(fd)) { 285 close(fd); 286 delete item; 287 return 0; 288 } 289 else if (verify_flag && !item->verify()) { 290 delete item; 291 return 0; 292 } 293 else { 294 item->check_files(); 295 return item; 296 } 297} 298 299 300index_search_item_iterator::index_search_item_iterator(index_search_item *ind, 301 const char *q) 302: indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0), 303 buf(0), buflen(0), 304 searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate), 305 query(strsave(q)) 306{ 307 found_list = indx->search(q, strlen(q), &temp_list); 308 if (!found_list) { 309 found_list = &minus_one; 310 warning("all keys would have been discarded in constructing index `%1'", 311 indx->name); 312 } 313} 314 315index_search_item_iterator::~index_search_item_iterator() 316{ 317 a_delete temp_list; 318 a_delete buf; 319 a_delete query; 320 delete out_of_date_files_iter; 321} 322 323int index_search_item_iterator::next(const linear_searcher &, 324 const char **pp, int *lenp, 325 reference_id *ridp) 326{ 327 if (found_list) { 328 for (;;) { 329 int tagno = *found_list; 330 if (tagno == -1) 331 break; 332 found_list++; 333 if (get_tag(tagno, searcher, pp, lenp, ridp)) 334 return 1; 335 } 336 found_list = 0; 337 next_out_of_date_file = indx->out_of_date_files; 338 } 339 while (next_out_of_date_file) { 340 if (out_of_date_files_iter == 0) 341 out_of_date_files_iter 342 = next_out_of_date_file->make_search_item_iterator(query); 343 if (out_of_date_files_iter->next(searcher, pp, lenp, ridp)) 344 return 1; 345 delete out_of_date_files_iter; 346 out_of_date_files_iter = 0; 347 next_out_of_date_file = next_out_of_date_file->next; 348 } 349 return 0; 350} 351 352int index_search_item_iterator::get_tag(int tagno, 353 const linear_searcher &searchr, 354 const char **pp, int *lenp, 355 reference_id *ridp) 356{ 357 if (tagno < 0 || tagno >= indx->header.tags_size) { 358 error("bad tag number"); 359 return 0; 360 } 361 tag *tp = indx->tags + tagno; 362 const char *filename = indx->munge_filename(indx->pool + tp->filename_index); 363 int fd = open(filename, O_RDONLY | O_BINARY); 364 if (fd < 0) { 365 error("can't open `%1': %2", filename, strerror(errno)); 366 return 0; 367 } 368 struct stat sb; 369 if (fstat(fd, &sb) < 0) { 370 error("can't fstat: %1", strerror(errno)); 371 close(fd); 372 return 0; 373 } 374 time_t mtime = sb.st_mtime; 375 if (mtime > indx->mtime) { 376 indx->add_out_of_date_file(fd, filename, 377 indx->filename_id + tp->filename_index); 378 return 0; 379 } 380 int res = 0; 381 FILE *fp = fdopen(fd, FOPEN_RB); 382 if (!fp) { 383 error("fdopen failed"); 384 close(fd); 385 return 0; 386 } 387 if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0) 388 error("can't seek on `%1': %2", filename, strerror(errno)); 389 else { 390 int length = tp->length; 391 int err = 0; 392 if (length == 0) { 393 if (fstat(fileno(fp), &sb) < 0) { 394 error("can't stat `%1': %2", filename, strerror(errno)); 395 err = 1; 396 } 397 else if (!S_ISREG(sb.st_mode)) { 398 error("`%1' is not a regular file", filename); 399 err = 1; 400 } 401 else 402 length = int(sb.st_size); 403 } 404 if (!err) { 405 if (length + 2 > buflen) { 406 a_delete buf; 407 buflen = length + 2; 408 buf = new char[buflen]; 409 } 410 if (fread(buf + 1, 1, length, fp) != (size_t)length) 411 error("fread on `%1' failed: %2", filename, strerror(errno)); 412 else { 413 buf[0] = '\n'; 414 // Remove the CR characters from CRLF pairs. 415 int sidx = 1, didx = 1; 416 for ( ; sidx < length + 1; sidx++, didx++) 417 { 418 if (buf[sidx] == '\r') 419 { 420 if (buf[++sidx] != '\n') 421 buf[didx++] = '\r'; 422 else 423 length--; 424 } 425 if (sidx != didx) 426 buf[didx] = buf[sidx]; 427 } 428 buf[length + 1] = '\n'; 429 res = searchr.search(buf + 1, buf + 2 + length, pp, lenp); 430 if (res && ridp) 431 *ridp = reference_id(indx->filename_id + tp->filename_index, 432 tp->start); 433 } 434 } 435 } 436 fclose(fp); 437 return res; 438} 439 440const char *index_search_item::munge_filename(const char *filename) 441{ 442 if (IS_ABSOLUTE(filename)) 443 return filename; 444 const char *cwd = pool; 445 int need_slash = (cwd[0] != 0 446 && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0); 447 int len = strlen(cwd) + strlen(filename) + need_slash + 1; 448 if (len > filename_buflen) { 449 a_delete filename_buffer; 450 filename_buflen = len; 451 filename_buffer = new char[len]; 452 } 453 strcpy(filename_buffer, cwd); 454 if (need_slash) 455 strcat(filename_buffer, "/"); 456 strcat(filename_buffer, filename); 457 return filename_buffer; 458} 459 460const int *index_search_item::search1(const char **pp, const char *end) 461{ 462 while (*pp < end && !csalnum(**pp)) 463 *pp += 1; 464 if (*pp >= end) 465 return 0; 466 const char *start = *pp; 467 while (*pp < end && csalnum(**pp)) 468 *pp += 1; 469 int len = *pp - start; 470 if (len < header.shortest) 471 return 0; 472 if (len > header.truncate) 473 len = header.truncate; 474 int is_number = 1; 475 for (int i = 0; i < len; i++) 476 if (csdigit(start[i])) 477 key_buffer[i] = start[i]; 478 else { 479 key_buffer[i] = cmlower(start[i]); 480 is_number = 0; 481 } 482 if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9')) 483 return 0; 484 unsigned hc = hash(key_buffer, len); 485 if (common_words_table) { 486 for (int h = hc % common_words_table_size; 487 common_words_table[h]; 488 --h) { 489 if (strlen(common_words_table[h]) == (size_t)len 490 && memcmp(common_words_table[h], key_buffer, len) == 0) 491 return 0; 492 if (h == 0) 493 h = common_words_table_size; 494 } 495 } 496 int li = table[int(hc % header.table_size)]; 497 return li < 0 ? &minus_one : lists + li; 498} 499 500static void merge(int *result, const int *s1, const int *s2) 501{ 502 for (; *s1 >= 0; s1++) { 503 while (*s2 >= 0 && *s2 < *s1) 504 s2++; 505 if (*s2 == *s1) 506 *result++ = *s2; 507 } 508 *result++ = -1; 509} 510 511const int *index_search_item::search(const char *ptr, int length, 512 int **temp_listp) 513{ 514 const char *end = ptr + length; 515 if (*temp_listp) { 516 a_delete *temp_listp; 517 *temp_listp = 0; 518 } 519 const int *first_list = 0; 520 while (ptr < end && (first_list = search1(&ptr, end)) == 0) 521 ; 522 if (!first_list) 523 return 0; 524 if (*first_list < 0) 525 return first_list; 526 const int *second_list = 0; 527 while (ptr < end && (second_list = search1(&ptr, end)) == 0) 528 ; 529 if (!second_list) 530 return first_list; 531 if (*second_list < 0) 532 return second_list; 533 const int *p; 534 for (p = first_list; *p >= 0; p++) 535 ; 536 int len = p - first_list; 537 for (p = second_list; *p >= 0; p++) 538 ; 539 if (p - second_list < len) 540 len = p - second_list; 541 int *matches = new int[len + 1]; 542 merge(matches, first_list, second_list); 543 while (ptr < end) { 544 const int *list = search1(&ptr, end); 545 if (list != 0) { 546 if (*list < 0) { 547 a_delete matches; 548 return list; 549 } 550 merge(matches, matches, list); 551 if (*matches < 0) { 552 a_delete matches; 553 return &minus_one; 554 } 555 } 556 } 557 *temp_listp = matches; 558 return matches; 559} 560 561void index_search_item::read_common_words_file() 562{ 563 if (header.common <= 0) 564 return; 565 const char *common_words_file = munge_filename(strchr(pool, '\0') + 1); 566 errno = 0; 567 FILE *fp = fopen(common_words_file, "r"); 568 if (!fp) { 569 error("can't open `%1': %2", common_words_file, strerror(errno)); 570 return; 571 } 572 common_words_table_size = 2*header.common + 1; 573 while (!is_prime(common_words_table_size)) 574 common_words_table_size++; 575 common_words_table = new char *[common_words_table_size]; 576 for (int i = 0; i < common_words_table_size; i++) 577 common_words_table[i] = 0; 578 int count = 0; 579 int key_len = 0; 580 for (;;) { 581 int c = getc(fp); 582 while (c != EOF && !csalnum(c)) 583 c = getc(fp); 584 if (c == EOF) 585 break; 586 do { 587 if (key_len < header.truncate) 588 key_buffer[key_len++] = cmlower(c); 589 c = getc(fp); 590 } while (c != EOF && csalnum(c)); 591 if (key_len >= header.shortest) { 592 int h = hash(key_buffer, key_len) % common_words_table_size; 593 while (common_words_table[h]) { 594 if (h == 0) 595 h = common_words_table_size; 596 --h; 597 } 598 common_words_table[h] = new char[key_len + 1]; 599 memcpy(common_words_table[h], key_buffer, key_len); 600 common_words_table[h][key_len] = '\0'; 601 } 602 if (++count >= header.common) 603 break; 604 key_len = 0; 605 if (c == EOF) 606 break; 607 } 608 fclose(fp); 609} 610 611void index_search_item::add_out_of_date_file(int fd, const char *filename, 612 int fid) 613{ 614 search_item **pp; 615 for (pp = &out_of_date_files; *pp; pp = &(*pp)->next) 616 if ((*pp)->is_named(filename)) 617 return; 618 *pp = make_linear_search_item(fd, filename, fid); 619 warning("`%1' modified since `%2' created", filename, name); 620} 621 622void index_search_item::check_files() 623{ 624 const char *pool_end = pool + header.strings_size; 625 for (const char *ptr = strchr(ignore_fields, '\0') + 1; 626 ptr < pool_end; 627 ptr = strchr(ptr, '\0') + 1) { 628 const char *path = munge_filename(ptr); 629 struct stat sb; 630 if (stat(path, &sb) < 0) 631 error("can't stat `%1': %2", path, strerror(errno)); 632 else if (sb.st_mtime > mtime) { 633 int fd = open(path, O_RDONLY | O_BINARY); 634 if (fd < 0) 635 error("can't open `%1': %2", path, strerror(errno)); 636 else 637 add_out_of_date_file(fd, path, filename_id + (ptr - pool)); 638 } 639 } 640} 641