1// -*- C++ -*-
2/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2004
3   Free Software Foundation, Inc.
4     Written by James Clark (jjc@jclark.com)
5
6This file is part of groff.
7
8groff is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 2, or (at your option) any later
11version.
12
13groff is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License along
19with groff; see the file COPYING.  If not, write to the Free Software
20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21
22#include "lib.h"
23
24#include <stdlib.h>
25#include <errno.h>
26
27#include "posix.h"
28#include "cset.h"
29#include "cmap.h"
30#include "errarg.h"
31#include "error.h"
32
33#include "refid.h"
34#include "search.h"
35#include "index.h"
36#include "defs.h"
37
38#include "nonposix.h"
39
40// Interface to mmap.
41extern "C" {
42  void *mapread(int fd, int len);
43  int unmap(void *, int len);
44}
45
46#if 0
47const
48#endif
49int minus_one = -1;
50
51int verify_flag = 0;
52
53struct word_list;
54
55class index_search_item : public search_item {
56  search_item *out_of_date_files;
57  index_header header;
58  char *buffer;
59  void *map_addr;
60  int map_len;
61  tag *tags;
62  int *table;
63  int *lists;
64  char *pool;
65  char *key_buffer;
66  char *filename_buffer;
67  int filename_buflen;
68  char **common_words_table;
69  int common_words_table_size;
70  const char *ignore_fields;
71  time_t mtime;
72
73  const char *do_verify();
74  const int *search1(const char **pp, const char *end);
75  const int *search(const char *ptr, int length, int **temp_listp);
76  const char *munge_filename(const char *);
77  void read_common_words_file();
78  void add_out_of_date_file(int fd, const char *filename, int fid);
79public:
80  index_search_item(const char *, int);
81  ~index_search_item();
82  int load(int fd);
83  search_item_iterator *make_search_item_iterator(const char *);
84  int verify();
85  void check_files();
86  int next_filename_id() const;
87  friend class index_search_item_iterator;
88};
89
90class index_search_item_iterator : public search_item_iterator {
91  index_search_item *indx;
92  search_item_iterator *out_of_date_files_iter;
93  search_item *next_out_of_date_file;
94  const int *found_list;
95  int *temp_list;
96  char *buf;
97  int buflen;
98  linear_searcher searcher;
99  char *query;
100  int get_tag(int tagno, const linear_searcher &, const char **, int *,
101	      reference_id *);
102public:
103  index_search_item_iterator(index_search_item *, const char *);
104  ~index_search_item_iterator();
105  int next(const linear_searcher &, const char **, int *, reference_id *);
106};
107
108
109index_search_item::index_search_item(const char *filename, int fid)
110: search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0),
111  map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0),
112  common_words_table(0)
113{
114}
115
116index_search_item::~index_search_item()
117{
118  if (buffer)
119    free(buffer);
120  if (map_addr) {
121    if (unmap(map_addr, map_len) < 0)
122      error("unmap: %1", strerror(errno));
123  }
124  while (out_of_date_files) {
125    search_item *tem = out_of_date_files;
126    out_of_date_files = out_of_date_files->next;
127    delete tem;
128  }
129  a_delete filename_buffer;
130  a_delete key_buffer;
131  if (common_words_table) {
132    for (int i = 0; i < common_words_table_size; i++)
133      a_delete common_words_table[i];
134    a_delete common_words_table;
135  }
136}
137
138class file_closer {
139  int *fdp;
140public:
141  file_closer(int &fd) : fdp(&fd) { }
142  ~file_closer() { close(*fdp); }
143};
144
145// Tell the compiler that a variable is intentionally unused.
146inline void unused(void *) { }
147
148int index_search_item::load(int fd)
149{
150  file_closer fd_closer(fd);	// close fd on return
151  unused(&fd_closer);
152  struct stat sb;
153  if (fstat(fd, &sb) < 0) {
154    error("can't fstat `%1': %2", name, strerror(errno));
155    return 0;
156  }
157  if (!S_ISREG(sb.st_mode)) {
158    error("`%1' is not a regular file", name);
159    return 0;
160  }
161  mtime = sb.st_mtime;
162  int size = int(sb.st_size);
163  char *addr;
164  map_addr = mapread(fd, size);
165  if (map_addr) {
166    addr = (char *)map_addr;
167    map_len = size;
168  }
169  else {
170    addr = buffer = (char *)malloc(size);
171    if (buffer == 0) {
172      error("can't allocate buffer for `%1'", name);
173      return 0;
174    }
175    char *ptr = buffer;
176    int bytes_to_read = size;
177    while (bytes_to_read > 0) {
178      int nread = read(fd, ptr, bytes_to_read);
179      if (nread == 0) {
180	error("unexpected EOF on `%1'", name);
181	return 0;
182      }
183      if (nread < 0) {
184	error("read error on `%1': %2", name, strerror(errno));
185	return 0;
186      }
187      bytes_to_read -= nread;
188      ptr += nread;
189    }
190  }
191  header = *(index_header *)addr;
192  if (header.magic != INDEX_MAGIC) {
193    error("`%1' is not an index file: wrong magic number", name);
194    return 0;
195  }
196  if (header.version != INDEX_VERSION) {
197    error("version number in `%1' is wrong: was %2, should be %3",
198	  name, header.version, INDEX_VERSION);
199    return 0;
200  }
201  int sz = (header.tags_size * sizeof(tag)
202	    + header.lists_size * sizeof(int)
203	    + header.table_size * sizeof(int)
204	    + header.strings_size
205	    + sizeof(header));
206  if (sz != size) {
207    error("size of `%1' is wrong: was %2, should be %3",
208	  name, size, sz);
209    return 0;
210  }
211  tags = (tag *)(addr + sizeof(header));
212  lists = (int *)(tags + header.tags_size);
213  table = (int *)(lists + header.lists_size);
214  pool = (char *)(table + header.table_size);
215  ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1;
216  key_buffer = new char[header.truncate];
217  read_common_words_file();
218  return 1;
219}
220
221const char *index_search_item::do_verify()
222{
223  if (tags == 0)
224    return "not loaded";
225  if (lists[header.lists_size - 1] >= 0)
226    return "last list element not negative";
227  int i;
228  for (i = 0; i < header.table_size; i++) {
229    int li = table[i];
230    if (li >= header.lists_size)
231      return "bad list index";
232    if (li >= 0) {
233      for (int *ptr = lists + li; *ptr >= 0; ptr++) {
234	if (*ptr >= header.tags_size)
235	  return "bad tag index";
236	if (*ptr >= ptr[1] && ptr[1] >= 0)
237	  return "list not ordered";
238      }
239    }
240  }
241  for (i = 0; i < header.tags_size; i++) {
242    if (tags[i].filename_index >= header.strings_size)
243      return "bad index in tags";
244    if (tags[i].length < 0)
245      return "bad length in tags";
246    if (tags[i].start < 0)
247      return "bad start in tags";
248  }
249  if (pool[header.strings_size - 1] != '\0')
250    return "last character in pool not nul";
251  return 0;
252}
253
254int index_search_item::verify()
255{
256  const char *reason = do_verify();
257  if (!reason)
258    return 1;
259  error("`%1' is bad: %2", name, reason);
260  return 0;
261}
262
263int index_search_item::next_filename_id() const
264{
265  return filename_id + header.strings_size + 1;
266}
267
268search_item_iterator *index_search_item::make_search_item_iterator(
269  const char *query)
270{
271  return new index_search_item_iterator(this, query);
272}
273
274search_item *make_index_search_item(const char *filename, int fid)
275{
276  char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)];
277  strcpy(index_filename, filename);
278  strcat(index_filename, INDEX_SUFFIX);
279  int fd = open(index_filename, O_RDONLY | O_BINARY);
280  if (fd < 0)
281    return 0;
282  index_search_item *item = new index_search_item(index_filename, fid);
283  a_delete index_filename;
284  if (!item->load(fd)) {
285    close(fd);
286    delete item;
287    return 0;
288  }
289  else if (verify_flag && !item->verify()) {
290    delete item;
291    return 0;
292  }
293  else {
294    item->check_files();
295    return item;
296  }
297}
298
299
300index_search_item_iterator::index_search_item_iterator(index_search_item *ind,
301						       const char *q)
302: indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0),
303  buf(0), buflen(0),
304  searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate),
305  query(strsave(q))
306{
307  found_list = indx->search(q, strlen(q), &temp_list);
308  if (!found_list) {
309    found_list = &minus_one;
310    warning("all keys would have been discarded in constructing index `%1'",
311	    indx->name);
312  }
313}
314
315index_search_item_iterator::~index_search_item_iterator()
316{
317  a_delete temp_list;
318  a_delete buf;
319  a_delete query;
320  delete out_of_date_files_iter;
321}
322
323int index_search_item_iterator::next(const linear_searcher &,
324				     const char **pp, int *lenp,
325				     reference_id *ridp)
326{
327  if (found_list) {
328    for (;;) {
329      int tagno = *found_list;
330      if (tagno == -1)
331	break;
332      found_list++;
333      if (get_tag(tagno, searcher, pp, lenp, ridp))
334	return 1;
335    }
336    found_list = 0;
337    next_out_of_date_file = indx->out_of_date_files;
338  }
339  while (next_out_of_date_file) {
340    if (out_of_date_files_iter == 0)
341      out_of_date_files_iter
342	= next_out_of_date_file->make_search_item_iterator(query);
343    if (out_of_date_files_iter->next(searcher, pp, lenp, ridp))
344      return 1;
345    delete out_of_date_files_iter;
346    out_of_date_files_iter = 0;
347    next_out_of_date_file = next_out_of_date_file->next;
348  }
349  return 0;
350}
351
352int index_search_item_iterator::get_tag(int tagno,
353					const linear_searcher &searchr,
354					const char **pp, int *lenp,
355					reference_id *ridp)
356{
357  if (tagno < 0 || tagno >= indx->header.tags_size) {
358    error("bad tag number");
359    return 0;
360  }
361  tag *tp = indx->tags + tagno;
362  const char *filename = indx->munge_filename(indx->pool + tp->filename_index);
363  int fd = open(filename, O_RDONLY | O_BINARY);
364  if (fd < 0) {
365    error("can't open `%1': %2", filename, strerror(errno));
366    return 0;
367  }
368  struct stat sb;
369  if (fstat(fd, &sb) < 0) {
370    error("can't fstat: %1", strerror(errno));
371    close(fd);
372    return 0;
373  }
374  time_t mtime = sb.st_mtime;
375  if (mtime > indx->mtime) {
376    indx->add_out_of_date_file(fd, filename,
377			       indx->filename_id + tp->filename_index);
378    return 0;
379  }
380  int res = 0;
381  FILE *fp = fdopen(fd, FOPEN_RB);
382  if (!fp) {
383    error("fdopen failed");
384    close(fd);
385    return 0;
386  }
387  if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0)
388    error("can't seek on `%1': %2", filename, strerror(errno));
389  else {
390    int length = tp->length;
391    int err = 0;
392    if (length == 0) {
393      if (fstat(fileno(fp), &sb) < 0) {
394	error("can't stat `%1': %2", filename, strerror(errno));
395	err = 1;
396      }
397      else if (!S_ISREG(sb.st_mode)) {
398	error("`%1' is not a regular file", filename);
399	err = 1;
400      }
401      else
402	length = int(sb.st_size);
403    }
404    if (!err) {
405      if (length + 2 > buflen) {
406	a_delete buf;
407	buflen = length + 2;
408	buf = new char[buflen];
409      }
410      if (fread(buf + 1, 1, length, fp) != (size_t)length)
411	error("fread on `%1' failed: %2", filename, strerror(errno));
412      else {
413	buf[0] = '\n';
414	// Remove the CR characters from CRLF pairs.
415	int sidx = 1, didx = 1;
416	for ( ; sidx < length + 1; sidx++, didx++)
417	  {
418	    if (buf[sidx] == '\r')
419	      {
420		if (buf[++sidx] != '\n')
421		  buf[didx++] = '\r';
422		else
423		  length--;
424	      }
425	    if (sidx != didx)
426	      buf[didx] = buf[sidx];
427	  }
428	buf[length + 1] = '\n';
429	res = searchr.search(buf + 1, buf + 2 + length, pp, lenp);
430	if (res && ridp)
431	  *ridp = reference_id(indx->filename_id + tp->filename_index,
432			       tp->start);
433      }
434    }
435  }
436  fclose(fp);
437  return res;
438}
439
440const char *index_search_item::munge_filename(const char *filename)
441{
442  if (IS_ABSOLUTE(filename))
443    return filename;
444  const char *cwd = pool;
445  int need_slash = (cwd[0] != 0
446		    && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0);
447  int len = strlen(cwd) + strlen(filename) + need_slash + 1;
448  if (len > filename_buflen) {
449    a_delete filename_buffer;
450    filename_buflen = len;
451    filename_buffer = new char[len];
452  }
453  strcpy(filename_buffer, cwd);
454  if (need_slash)
455    strcat(filename_buffer, "/");
456  strcat(filename_buffer, filename);
457  return filename_buffer;
458}
459
460const int *index_search_item::search1(const char **pp, const char *end)
461{
462  while (*pp < end && !csalnum(**pp))
463    *pp += 1;
464  if (*pp >= end)
465    return 0;
466  const char *start = *pp;
467  while (*pp < end && csalnum(**pp))
468    *pp += 1;
469  int len = *pp - start;
470  if (len < header.shortest)
471    return 0;
472  if (len > header.truncate)
473    len = header.truncate;
474  int is_number = 1;
475  for (int i = 0; i < len; i++)
476    if (csdigit(start[i]))
477      key_buffer[i] = start[i];
478    else {
479      key_buffer[i] = cmlower(start[i]);
480      is_number = 0;
481    }
482  if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9'))
483    return 0;
484  unsigned hc = hash(key_buffer, len);
485  if (common_words_table) {
486    for (int h = hc % common_words_table_size;
487	 common_words_table[h];
488	 --h) {
489      if (strlen(common_words_table[h]) == (size_t)len
490	  && memcmp(common_words_table[h], key_buffer, len) == 0)
491	return 0;
492      if (h == 0)
493	h = common_words_table_size;
494    }
495  }
496  int li = table[int(hc % header.table_size)];
497  return li < 0 ? &minus_one : lists + li;
498}
499
500static void merge(int *result, const int *s1, const int *s2)
501{
502  for (; *s1 >= 0; s1++) {
503    while (*s2 >= 0 && *s2 < *s1)
504      s2++;
505    if (*s2 == *s1)
506      *result++ = *s2;
507  }
508  *result++ = -1;
509}
510
511const int *index_search_item::search(const char *ptr, int length,
512				     int **temp_listp)
513{
514  const char *end = ptr + length;
515  if (*temp_listp) {
516    a_delete *temp_listp;
517    *temp_listp = 0;
518  }
519  const int *first_list = 0;
520  while (ptr < end && (first_list = search1(&ptr, end)) == 0)
521    ;
522  if (!first_list)
523    return 0;
524  if (*first_list < 0)
525    return first_list;
526  const int *second_list = 0;
527  while (ptr < end && (second_list = search1(&ptr, end)) == 0)
528    ;
529  if (!second_list)
530    return first_list;
531  if (*second_list < 0)
532    return second_list;
533  const int *p;
534  for (p = first_list; *p >= 0; p++)
535    ;
536  int len = p - first_list;
537  for (p = second_list; *p >= 0; p++)
538    ;
539  if (p - second_list < len)
540    len = p - second_list;
541  int *matches = new int[len + 1];
542  merge(matches, first_list, second_list);
543  while (ptr < end) {
544    const int *list = search1(&ptr, end);
545    if (list != 0) {
546      if (*list < 0) {
547	a_delete matches;
548	return list;
549      }
550      merge(matches, matches, list);
551      if (*matches < 0) {
552	a_delete matches;
553	return &minus_one;
554      }
555    }
556  }
557  *temp_listp = matches;
558  return matches;
559}
560
561void index_search_item::read_common_words_file()
562{
563  if (header.common <= 0)
564    return;
565  const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
566  errno = 0;
567  FILE *fp = fopen(common_words_file, "r");
568  if (!fp) {
569    error("can't open `%1': %2", common_words_file, strerror(errno));
570    return;
571  }
572  common_words_table_size = 2*header.common + 1;
573  while (!is_prime(common_words_table_size))
574    common_words_table_size++;
575  common_words_table = new char *[common_words_table_size];
576  for (int i = 0; i < common_words_table_size; i++)
577    common_words_table[i] = 0;
578  int count = 0;
579  int key_len = 0;
580  for (;;) {
581    int c = getc(fp);
582    while (c != EOF && !csalnum(c))
583      c = getc(fp);
584    if (c == EOF)
585      break;
586    do {
587      if (key_len < header.truncate)
588	key_buffer[key_len++] = cmlower(c);
589      c = getc(fp);
590    } while (c != EOF && csalnum(c));
591    if (key_len >= header.shortest) {
592      int h = hash(key_buffer, key_len) % common_words_table_size;
593      while (common_words_table[h]) {
594	if (h == 0)
595	  h = common_words_table_size;
596	--h;
597      }
598      common_words_table[h] = new char[key_len + 1];
599      memcpy(common_words_table[h], key_buffer, key_len);
600      common_words_table[h][key_len] = '\0';
601    }
602    if (++count >= header.common)
603      break;
604    key_len = 0;
605    if (c == EOF)
606      break;
607  }
608  fclose(fp);
609}
610
611void index_search_item::add_out_of_date_file(int fd, const char *filename,
612					     int fid)
613{
614  search_item **pp;
615  for (pp = &out_of_date_files; *pp; pp = &(*pp)->next)
616    if ((*pp)->is_named(filename))
617      return;
618  *pp = make_linear_search_item(fd, filename, fid);
619  warning("`%1' modified since `%2' created", filename, name);
620}
621
622void index_search_item::check_files()
623{
624  const char *pool_end = pool + header.strings_size;
625  for (const char *ptr = strchr(ignore_fields, '\0') + 1;
626       ptr < pool_end;
627       ptr = strchr(ptr, '\0') + 1) {
628    const char *path = munge_filename(ptr);
629    struct stat sb;
630    if (stat(path, &sb) < 0)
631      error("can't stat `%1': %2", path, strerror(errno));
632    else if (sb.st_mtime > mtime) {
633      int fd = open(path, O_RDONLY | O_BINARY);
634      if (fd < 0)
635	error("can't open `%1': %2", path, strerror(errno));
636      else
637	add_out_of_date_file(fd, path, filename_id + (ptr - pool));
638    }
639  }
640}
641