// -*- C++ -*-
/* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
     Written by James Clark (jjc@jclark.com)

This file is part of groff.

groff is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

groff is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with groff; see the file COPYING.  If not, write to the Free Software
Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
20
21#include "refer.h"
22#include "token.h"
23
// Number of slots in the open-addressed token table.  store_token() asserts
// when the last free slot would be taken, so this must stay larger than the
// number of distinct tokens registered at start-up.
#define TOKEN_TABLE_SIZE 1009
// I believe that in Icelandic, thorn sorts after z.  "{" is the ASCII
// character immediately following 'z', so as a sort key it compares greater
// than any lower-case ASCII letter.
#define THORN_SORT_KEY "{"
27
28struct token_table_entry {
29  const char *tok;
30  token_info ti;
31  token_table_entry();
32};
33
34token_table_entry token_table[TOKEN_TABLE_SIZE];
35int ntokens = 0;
36
// Advance *ptr past one "name" in the text [*ptr, end):
//   (xx     an opening parenthesis plus up to two further characters,
//   [...]   everything up to and including the closing bracket (or up to
//           end if there is none),
//   x       any other single character.
// Nothing is consumed when *ptr is already at end.
static void skip_name(const char **ptr, const char *end)
{
  const char *p = *ptr;
  if (p >= end)
    return;
  const char first = *p++;
  if (first == '(') {
    // Two-character name; tolerate truncated input.
    for (int taken = 0; taken < 2 && p < end; taken++)
      p++;
  }
  else if (first == '[') {
    // Long name; the terminating ']' is consumed as well.
    while (p < end && *p++ != ']')
      ;
  }
  *ptr = p;
}
56
57int get_token(const char **ptr, const char *end)
58{
59  if (*ptr >= end)
60    return 0;
61  char c = *(*ptr)++;
62  if (c == '\\' && *ptr < end) {
63    switch (**ptr) {
64    default:
65      *ptr += 1;
66      break;
67    case '(':
68    case '[':
69      skip_name(ptr, end);
70      break;
71    case '*':
72    case 'f':
73      *ptr += 1;
74      skip_name(ptr, end);
75      break;
76    }
77  }
78  return 1;
79}
80
81token_info::token_info()
82: type(TOKEN_OTHER), sort_key(0), other_case(0)
83{
84}
85
86void token_info::set(token_type t, const char *sk, const char *oc)
87{
88  assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
89  type = t;
90  sort_key = sk;
91  other_case = oc;
92}
93
94void token_info::sortify(const char *start, const char *end, string &result)
95     const
96{
97  if (sort_key)
98    result += sort_key;
99  else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
100    for (; start < end; start++)
101      if (csalpha(*start))
102	result += cmlower(*start);
103  }
104}
105
106int token_info::sortify_non_empty(const char *start, const char *end) const
107{
108  if (sort_key)
109    return *sort_key != '\0';
110  if (type != TOKEN_UPPER && type != TOKEN_LOWER)
111    return 0;
112  for (; start < end; start++)
113    if (csalpha(*start))
114      return 1;
115  return 0;
116}
117
118
119void token_info::lower_case(const char *start, const char *end,
120			    string &result) const
121{
122  if (type != TOKEN_UPPER) {
123    while (start < end)
124      result += *start++;
125  }
126  else if (other_case)
127    result += other_case;
128  else {
129    while (start < end)
130      result += cmlower(*start++);
131  }
132}
133
134void token_info::upper_case(const char *start, const char *end,
135			    string &result) const
136{
137  if (type != TOKEN_LOWER) {
138    while (start < end)
139      result += *start++;
140  }
141  else if (other_case)
142    result += other_case;
143  else {
144    while (start < end)
145      result += cmupper(*start++);
146  }
147}
148
149token_table_entry::token_table_entry()
150: tok(0)
151{
152}
153
154static void store_token(const char *tok, token_type typ,
155			const char *sk = 0, const char *oc = 0)
156{
157  unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
158  for (;;) {
159    if (token_table[n].tok == 0) {
160      if (++ntokens == TOKEN_TABLE_SIZE)
161	assert(0);
162      token_table[n].tok = tok;
163      break;
164    }
165    if (strcmp(tok, token_table[n].tok) == 0)
166      break;
167    if (n == 0)
168      n = TOKEN_TABLE_SIZE - 1;
169    else
170      --n;
171  }
172  token_table[n].ti.set(typ, sk, oc);
173}
174
175
176token_info default_token_info;
177
178const token_info *lookup_token(const char *start, const char *end)
179{
180  unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
181  for (;;) {
182    if (token_table[n].tok == 0)
183      break;
184    if (strlen(token_table[n].tok) == size_t(end - start)
185	&& memcmp(token_table[n].tok, start, end - start) == 0)
186      return &(token_table[n].ti);
187    if (n == 0)
188      n = TOKEN_TABLE_SIZE - 1;
189    else
190      --n;
191  }
192  return &default_token_info;
193}
194
195static void init_ascii()
196{
197  const char *p;
198  for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
199    char buf[2];
200    buf[0] = *p;
201    buf[1] = '\0';
202    store_token(strsave(buf), TOKEN_LOWER);
203    buf[0] = cmupper(buf[0]);
204    store_token(strsave(buf), TOKEN_UPPER);
205  }
206  for (p = "0123456789"; *p; p++) {
207    char buf[2];
208    buf[0] = *p;
209    buf[1] = '\0';
210    const char *s = strsave(buf);
211    store_token(s, TOKEN_OTHER, s);
212  }
213  for (p = ".,:;?!"; *p; p++) {
214    char buf[2];
215    buf[0] = *p;
216    buf[1] = '\0';
217    store_token(strsave(buf), TOKEN_PUNCT);
218  }
219  store_token("-", TOKEN_HYPHEN);
220}
221
222static void store_letter(const char *lower, const char *upper,
223		  const char *sort_key = 0)
224{
225  store_token(lower, TOKEN_LOWER, sort_key, upper);
226  store_token(upper, TOKEN_UPPER, sort_key, lower);
227}
228
229static void init_letter(unsigned char uc_code, unsigned char lc_code,
230		 const char *sort_key)
231{
232  char lbuf[2];
233  lbuf[0] = lc_code;
234  lbuf[1] = 0;
235  char ubuf[2];
236  ubuf[0] = uc_code;
237  ubuf[1] = 0;
238  store_letter(strsave(lbuf), strsave(ubuf), sort_key);
239}
240
241static void init_latin1()
242{
243  init_letter(0xc0, 0xe0, "a");
244  init_letter(0xc1, 0xe1, "a");
245  init_letter(0xc2, 0xe2, "a");
246  init_letter(0xc3, 0xe3, "a");
247  init_letter(0xc4, 0xe4, "a");
248  init_letter(0xc5, 0xe5, "a");
249  init_letter(0xc6, 0xe6, "ae");
250  init_letter(0xc7, 0xe7, "c");
251  init_letter(0xc8, 0xe8, "e");
252  init_letter(0xc9, 0xe9, "e");
253  init_letter(0xca, 0xea, "e");
254  init_letter(0xcb, 0xeb, "e");
255  init_letter(0xcc, 0xec, "i");
256  init_letter(0xcd, 0xed, "i");
257  init_letter(0xce, 0xee, "i");
258  init_letter(0xcf, 0xef, "i");
259
260  init_letter(0xd0, 0xf0, "d");
261  init_letter(0xd1, 0xf1, "n");
262  init_letter(0xd2, 0xf2, "o");
263  init_letter(0xd3, 0xf3, "o");
264  init_letter(0xd4, 0xf4, "o");
265  init_letter(0xd5, 0xf5, "o");
266  init_letter(0xd6, 0xf6, "o");
267  init_letter(0xd8, 0xf8, "o");
268  init_letter(0xd9, 0xf9, "u");
269  init_letter(0xda, 0xfa, "u");
270  init_letter(0xdb, 0xfb, "u");
271  init_letter(0xdc, 0xfc, "u");
272  init_letter(0xdd, 0xfd, "y");
273  init_letter(0xde, 0xfe, THORN_SORT_KEY);
274
275  store_token("\337", TOKEN_LOWER, "ss", "SS");
276  store_token("\377", TOKEN_LOWER, "y", "Y");
277}
278
279static void init_two_char_letter(char l1, char l2, char u1, char u2,
280				 const char *sk = 0)
281{
282  char buf[6];
283  buf[0] = '\\';
284  buf[1] = '(';
285  buf[2] = l1;
286  buf[3] = l2;
287  buf[4] = '\0';
288  const char *p = strsave(buf);
289  buf[2] = u1;
290  buf[3] = u2;
291  store_letter(p, strsave(buf), sk);
292  buf[1] = '[';
293  buf[4] = ']';
294  buf[5] = '\0';
295  p = strsave(buf);
296  buf[2] = l1;
297  buf[3] = l2;
298  store_letter(strsave(buf), p, sk);
299
300}
301
302static void init_special_chars()
303{
304  const char *p;
305  for (p = "':^`~"; *p; p++)
306    for (const char *q = "aeiouy"; *q; q++) {
307      // Use a variable to work around bug in gcc 2.0
308      char c = cmupper(*q);
309      init_two_char_letter(*p, *q, *p, c);
310    }
311  for (p = "/l/o~n,coeaeij"; *p; p += 2) {
312    // Use variables to work around bug in gcc 2.0
313    char c0 = cmupper(p[0]);
314    char c1 = cmupper(p[1]);
315    init_two_char_letter(p[0], p[1], c0, c1);
316  }
317  init_two_char_letter('v', 's', 'v', 'S', "s");
318  init_two_char_letter('v', 'z', 'v', 'Z', "z");
319  init_two_char_letter('o', 'a', 'o', 'A', "a");
320  init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
321  init_two_char_letter('-', 'd', '-', 'D');
322
323  store_token("\\(ss", TOKEN_LOWER, 0, "SS");
324  store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
325
326  store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
327  store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
328  store_token("\\(hy", TOKEN_HYPHEN);
329  store_token("\\[hy]", TOKEN_HYPHEN);
330  store_token("\\(en", TOKEN_RANGE_SEP);
331  store_token("\\[en]", TOKEN_RANGE_SEP);
332}
333
334static void init_strings()
335{
336  char buf[6];
337  buf[0] = '\\';
338  buf[1] = '*';
339  for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
340    buf[2] = *p;
341    buf[3] = '\0';
342    store_token(strsave(buf), TOKEN_ACCENT);
343    buf[2] = '[';
344    buf[3] = *p;
345    buf[4] = ']';
346    buf[5] = '\0';
347    store_token(strsave(buf), TOKEN_ACCENT);
348  }
349
350  // -ms special letters
351  store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
352  store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
353  store_letter("\\*(d-", "\\*(D-");
354  store_letter("\\*[d-]", "\\*[D-]");
355  store_letter("\\*(ae", "\\*(Ae", "ae");
356  store_letter("\\*[ae]", "\\*[Ae]", "ae");
357  store_letter("\\*(oe", "\\*(Oe", "oe");
358  store_letter("\\*[oe]", "\\*[Oe]", "oe");
359
360  store_token("\\*3", TOKEN_LOWER, "y", "Y");
361  store_token("\\*8", TOKEN_LOWER, "ss", "SS");
362  store_token("\\*q", TOKEN_LOWER, "o", "O");
363}
364
365struct token_initer {
366  token_initer();
367};
368
369static token_initer the_token_initer;
370
371token_initer::token_initer()
372{
373  init_ascii();
374  init_latin1();
375  init_special_chars();
376  init_strings();
377  default_token_info.set(TOKEN_OTHER);
378}
379