153451Speter/* dfa.c - deterministic extended regexp routines for GNU
2126435Sache   Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
353451Speter
453451Speter   This program is free software; you can redistribute it and/or modify
553451Speter   it under the terms of the GNU General Public License as published by
653451Speter   the Free Software Foundation; either version 2, or (at your option)
753451Speter   any later version.
853451Speter
953451Speter   This program is distributed in the hope that it will be useful,
1053451Speter   but WITHOUT ANY WARRANTY; without even the implied warranty of
1153451Speter   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1253451Speter   GNU General Public License for more details.
1353451Speter
1453451Speter   You should have received a copy of the GNU General Public License
1553451Speter   along with this program; if not, write to the Free Software
1653472Sobrien   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
1753451Speter
1853451Speter/* Written June, 1988 by Mike Haertel
1953451Speter   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
2053451Speter
2153472Sobrien/* $FreeBSD$ */
2253472Sobrien
2353472Sobrien#ifdef HAVE_CONFIG_H
2453472Sobrien#include <config.h>
2553472Sobrien#endif
2653472Sobrien
2753451Speter#include <assert.h>
2853451Speter#include <ctype.h>
2953451Speter#include <stdio.h>
3053451Speter
3153472Sobrien#include <sys/types.h>
3253451Speter#ifdef STDC_HEADERS
3353451Speter#include <stdlib.h>
3453451Speter#else
3553451Speterextern char *calloc(), *malloc(), *realloc();
3653451Speterextern void free();
3753451Speter#endif
3853451Speter
3953451Speter#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
4053451Speter#include <string.h>
4153451Speter#else
4253451Speter#include <strings.h>
4353451Speter#endif
4453451Speter
45131557Stjr#if HAVE_SETLOCALE
46131557Stjr# include <locale.h>
47131557Stjr#endif
48131557Stjr
49131557Stjr#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
50131557Stjr/* We can handle multibyte string.  */
51131557Stjr# define MBS_SUPPORT
52131557Stjr#endif
53131557Stjr
54131557Stjr#ifdef MBS_SUPPORT
55131557Stjr# include <wchar.h>
56131557Stjr# include <wctype.h>
57131557Stjr#endif
58131557Stjr
5953472Sobrien#ifndef DEBUG	/* use the same approach as regex.c */
6053472Sobrien#undef assert
6153472Sobrien#define assert(e)
6253472Sobrien#endif /* DEBUG */
6353472Sobrien
6453451Speter#ifndef isgraph
6553472Sobrien#define isgraph(C) (isprint(C) && !isspace(C))
6653451Speter#endif
6753451Speter
6853472Sobrien#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
6953472Sobrien#define ISALPHA(C) isalpha(C)
7053472Sobrien#define ISUPPER(C) isupper(C)
7153472Sobrien#define ISLOWER(C) islower(C)
7253472Sobrien#define ISDIGIT(C) isdigit(C)
7353472Sobrien#define ISXDIGIT(C) isxdigit(C)
7453472Sobrien#define ISSPACE(C) isspace(C)
7553472Sobrien#define ISPUNCT(C) ispunct(C)
7653472Sobrien#define ISALNUM(C) isalnum(C)
7753472Sobrien#define ISPRINT(C) isprint(C)
7853472Sobrien#define ISGRAPH(C) isgraph(C)
7953472Sobrien#define ISCNTRL(C) iscntrl(C)
8053472Sobrien#else
8153472Sobrien#define ISALPHA(C) (isascii(C) && isalpha(C))
8253472Sobrien#define ISUPPER(C) (isascii(C) && isupper(C))
8353472Sobrien#define ISLOWER(C) (isascii(C) && islower(C))
8453472Sobrien#define ISDIGIT(C) (isascii(C) && isdigit(C))
8553472Sobrien#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
8653472Sobrien#define ISSPACE(C) (isascii(C) && isspace(C))
8753472Sobrien#define ISPUNCT(C) (isascii(C) && ispunct(C))
8853472Sobrien#define ISALNUM(C) (isascii(C) && isalnum(C))
8953472Sobrien#define ISPRINT(C) (isascii(C) && isprint(C))
9053472Sobrien#define ISGRAPH(C) (isascii(C) && isgraph(C))
9153472Sobrien#define ISCNTRL(C) (isascii(C) && iscntrl(C))
9253472Sobrien#endif
9353451Speter
9456919Sru/* ISASCIIDIGIT differs from ISDIGIT, as follows:
9556919Sru   - Its arg may be any int or unsigned int; it need not be an unsigned char.
9656919Sru   - It's guaranteed to evaluate its argument exactly once.
9756919Sru   - It's typically faster.
9856919Sru   Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
9956919Sru   only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
10056919Sru   it's important to use the locale's definition of `digit' even when the
10156919Sru   host does not conform to Posix.  */
10256919Sru#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
10356919Sru
10453472Sobrien/* If we (don't) have I18N.  */
10553472Sobrien/* glibc defines _ */
10653472Sobrien#ifndef _
10753472Sobrien# ifdef HAVE_LIBINTL_H
10853472Sobrien#  include <libintl.h>
10953472Sobrien#  ifndef _
11053472Sobrien#   define _(Str) gettext (Str)
11153472Sobrien#  endif
11253472Sobrien# else
11353472Sobrien#  define _(Str) (Str)
11453472Sobrien# endif
11553472Sobrien#endif
11653472Sobrien
11753472Sobrien#include "regex.h"
11853472Sobrien#include "dfa.h"
119131557Stjr#include "hard-locale.h"
12053451Speter
12153472Sobrien/* HPUX, define those as macros in sys/param.h */
12253472Sobrien#ifdef setbit
12353472Sobrien# undef setbit
12453472Sobrien#endif
12553472Sobrien#ifdef clrbit
12653472Sobrien# undef clrbit
12753472Sobrien#endif
12853451Speter
12953472Sobrienstatic void dfamust PARAMS ((struct dfa *dfa));
13053472Sobrienstatic void regexp PARAMS ((int toplevel));
13153472Sobrien
13253451Speterstatic ptr_t
13356919Sruxcalloc (size_t n, size_t s)
13453451Speter{
13553451Speter  ptr_t r = calloc(n, s);
13653451Speter
13753451Speter  if (!r)
13853472Sobrien    dfaerror(_("Memory exhausted"));
13953451Speter  return r;
14053451Speter}
14153451Speter
14253451Speterstatic ptr_t
14356919Sruxmalloc (size_t n)
14453451Speter{
14553451Speter  ptr_t r = malloc(n);
14653451Speter
14753451Speter  assert(n != 0);
14853451Speter  if (!r)
14953472Sobrien    dfaerror(_("Memory exhausted"));
15053451Speter  return r;
15153451Speter}
15253451Speter
15353451Speterstatic ptr_t
15456919Sruxrealloc (ptr_t p, size_t n)
15553451Speter{
15653451Speter  ptr_t r = realloc(p, n);
15753451Speter
15853451Speter  assert(n != 0);
15953451Speter  if (!r)
16053472Sobrien    dfaerror(_("Memory exhausted"));
16153451Speter  return r;
16253451Speter}
16353451Speter
16453472Sobrien#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
16553451Speter#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
16653451Speter#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
16753451Speter
16853451Speter/* Reallocate an array of type t if nalloc is too small for index. */
16953451Speter#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
17053451Speter  if ((index) >= (nalloc))			  \
17153451Speter    {						  \
172131557Stjr      do					  \
17353451Speter	(nalloc) *= 2;				  \
174131557Stjr      while ((index) >= (nalloc));		  \
17553451Speter      REALLOC(p, t, nalloc);			  \
17653451Speter    }
17753451Speter
17853451Speter#ifdef DEBUG
17953451Speter
18053451Speterstatic void
18156919Sruprtok (token t)
18253451Speter{
183131557Stjr  char const *s;
18453451Speter
18553451Speter  if (t < 0)
18653451Speter    fprintf(stderr, "END");
18753451Speter  else if (t < NOTCHAR)
18853451Speter    fprintf(stderr, "%c", t);
18953451Speter  else
19053451Speter    {
19153451Speter      switch (t)
19253451Speter	{
19353451Speter	case EMPTY: s = "EMPTY"; break;
19453451Speter	case BACKREF: s = "BACKREF"; break;
19553451Speter	case BEGLINE: s = "BEGLINE"; break;
19653451Speter	case ENDLINE: s = "ENDLINE"; break;
19753451Speter	case BEGWORD: s = "BEGWORD"; break;
19853451Speter	case ENDWORD: s = "ENDWORD"; break;
19953451Speter	case LIMWORD: s = "LIMWORD"; break;
20053451Speter	case NOTLIMWORD: s = "NOTLIMWORD"; break;
20153451Speter	case QMARK: s = "QMARK"; break;
20253451Speter	case STAR: s = "STAR"; break;
20353451Speter	case PLUS: s = "PLUS"; break;
20453451Speter	case CAT: s = "CAT"; break;
20553451Speter	case OR: s = "OR"; break;
20653451Speter	case ORTOP: s = "ORTOP"; break;
20753451Speter	case LPAREN: s = "LPAREN"; break;
20853451Speter	case RPAREN: s = "RPAREN"; break;
209131557Stjr	case CRANGE: s = "CRANGE"; break;
210131557Stjr#ifdef MBS_SUPPORT
211131557Stjr	case ANYCHAR: s = "ANYCHAR"; break;
212131557Stjr	case MBCSET: s = "MBCSET"; break;
213131557Stjr#endif /* MBS_SUPPORT */
21453451Speter	default: s = "CSET"; break;
21553451Speter	}
21653451Speter      fprintf(stderr, "%s", s);
21753451Speter    }
21853451Speter}
21953451Speter#endif /* DEBUG */
22053451Speter
22153451Speter/* Stuff pertaining to charclasses. */
22253451Speter
22353451Speterstatic int
224131557Stjrtstbit (unsigned b, charclass c)
22553451Speter{
22653451Speter  return c[b / INTBITS] & 1 << b % INTBITS;
22753451Speter}
22853451Speter
22953451Speterstatic void
230131557Stjrsetbit (unsigned b, charclass c)
23153451Speter{
23253451Speter  c[b / INTBITS] |= 1 << b % INTBITS;
23353451Speter}
23453451Speter
23553451Speterstatic void
236131557Stjrclrbit (unsigned b, charclass c)
23753451Speter{
23853451Speter  c[b / INTBITS] &= ~(1 << b % INTBITS);
23953451Speter}
24053451Speter
24153451Speterstatic void
24256919Srucopyset (charclass src, charclass dst)
24353451Speter{
244131557Stjr  memcpy (dst, src, sizeof (charclass));
24553451Speter}
24653451Speter
24753451Speterstatic void
24856919Sruzeroset (charclass s)
24953451Speter{
250131557Stjr  memset (s, 0, sizeof (charclass));
25153451Speter}
25253451Speter
25353451Speterstatic void
25456919Srunotset (charclass s)
25553451Speter{
25653451Speter  int i;
25753451Speter
25853451Speter  for (i = 0; i < CHARCLASS_INTS; ++i)
25953451Speter    s[i] = ~s[i];
26053451Speter}
26153451Speter
26253451Speterstatic int
26356919Sruequal (charclass s1, charclass s2)
26453451Speter{
265131557Stjr  return memcmp (s1, s2, sizeof (charclass)) == 0;
26653451Speter}
26753451Speter
26853451Speter/* A pointer to the current dfa is kept here during parsing. */
26953451Speterstatic struct dfa *dfa;
27053451Speter
27153451Speter/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
27253451Speterstatic int
27356919Srucharclass_index (charclass s)
27453451Speter{
27553451Speter  int i;
27653451Speter
27753451Speter  for (i = 0; i < dfa->cindex; ++i)
27853451Speter    if (equal(s, dfa->charclasses[i]))
27953451Speter      return i;
28053451Speter  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
28153451Speter  ++dfa->cindex;
28253451Speter  copyset(s, dfa->charclasses[i]);
28353451Speter  return i;
28453451Speter}
28553451Speter
28653451Speter/* Syntax bits controlling the behavior of the lexical analyzer. */
28753472Sobrienstatic reg_syntax_t syntax_bits, syntax_bits_set;
28853451Speter
28953451Speter/* Flag for case-folding letters into sets. */
29053451Speterstatic int case_fold;
29153451Speter
29255379Sobrien/* End-of-line byte in data.  */
29355379Sobrienstatic unsigned char eolbyte;
29455379Sobrien
29553451Speter/* Entry point to set syntax options. */
29653451Spetervoid
297131557Stjrdfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
29853451Speter{
29953451Speter  syntax_bits_set = 1;
30053451Speter  syntax_bits = bits;
30153451Speter  case_fold = fold;
30255379Sobrien  eolbyte = eol;
30353451Speter}
30453451Speter
305131557Stjr/* Like setbit, but if case is folded, set both cases of a letter.  */
306131557Stjrstatic void
307131557Stjrsetbit_case_fold (unsigned b, charclass c)
308131557Stjr{
309131557Stjr  setbit (b, c);
310131557Stjr  if (case_fold)
311131557Stjr    {
312131557Stjr      if (ISUPPER (b))
313131557Stjr	setbit (tolower (b), c);
314131557Stjr      else if (ISLOWER (b))
315131557Stjr	setbit (toupper (b), c);
316131557Stjr    }
317131557Stjr}
318131557Stjr
31953451Speter/* Lexical analyzer.  All the dross that deals with the obnoxious
32053451Speter   GNU Regex syntax bits is located here.  The poor, suffering
32153451Speter   reader is referred to the GNU Regex documentation for the
32253451Speter   meaning of the @#%!@#%^!@ syntax bits. */
32353451Speter
324131557Stjrstatic char const *lexstart;	/* Pointer to beginning of input string. */
325131557Stjrstatic char const *lexptr;	/* Pointer to next input character. */
32653472Sobrienstatic int lexleft;		/* Number of characters remaining. */
32753451Speterstatic token lasttok;		/* Previous token returned; initially END. */
32853451Speterstatic int laststart;		/* True if we're separated from beginning or (, |
32953451Speter				   only by zero-width characters. */
33053451Speterstatic int parens;		/* Count of outstanding left parens. */
33153451Speterstatic int minrep, maxrep;	/* Repeat counts for {m,n}. */
332131557Stjrstatic int hard_LC_COLLATE;	/* Nonzero if LC_COLLATE is hard.  */
33353451Speter
334131557Stjr#ifdef MBS_SUPPORT
335131557Stjr/* These variables are used only if (MB_CUR_MAX > 1).  */
336131557Stjrstatic mbstate_t mbs;		/* Mbstate for mbrlen().  */
337250823Spfgstatic ssize_t cur_mb_len;	/* Byte length of the current scanning
338250823Spfg				   multibyte character.  Must also handle
339250823Spfg				   negative result from mbrlen().  */
340250823Spfgstatic ssize_t cur_mb_index;	/* Byte index of the current scanning multibyte
341131557Stjr                                   character.
342131557Stjr
343131557Stjr				   singlebyte character : cur_mb_index = 0
344131557Stjr				   multibyte character
345131557Stjr				       1st byte : cur_mb_index = 1
346131557Stjr				       2nd byte : cur_mb_index = 2
347131557Stjr				         ...
348131557Stjr				       nth byte : cur_mb_index = n  */
349131557Stjrstatic unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
350131557Stjr                                  Each element store the amount of remain
351131557Stjr                                  byte of corresponding multibyte character
352131557Stjr                                  in the input string.  A element's value
353131557Stjr                                  is 0 if corresponding character is a
354131557Stjr                                  singlebyte chracter.
355131557Stjr                                  e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
356131557Stjr                                   mblen_buf :   0,       3,       2,       1
357131557Stjr                               */
358131557Stjrstatic wchar_t *inputwcs;	/* Wide character representation of input
359131557Stjr				   string in dfaexec().
360131557Stjr				   The length of this array is same as
361131557Stjr				   the length of input string(char array).
362131557Stjr				   inputstring[i] is a single-byte char,
363131557Stjr				   or 1st byte of a multibyte char.
364131557Stjr				   And inputwcs[i] is the codepoint.  */
365131557Stjrstatic unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
366131557Stjrstatic unsigned char const *buf_end;	/* refference to end in dfaexec().  */
367131557Stjr#endif /* MBS_SUPPORT  */
368131557Stjr
369131557Stjr#ifdef MBS_SUPPORT
370131557Stjr/* This function update cur_mb_len, and cur_mb_index.
371131557Stjr   p points current lexptr, len is the remaining buffer length.  */
372131557Stjrstatic void
373250823Spfgupdate_mb_len_index (unsigned char const *p, size_t len)
374131557Stjr{
375131557Stjr  /* If last character is a part of a multibyte character,
376131557Stjr     we update cur_mb_index.  */
377131557Stjr  if (cur_mb_index)
378131557Stjr    cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
379131557Stjr			: cur_mb_index + 1;
380131557Stjr
381131557Stjr  /* If last character is a single byte character, or the
382131557Stjr     last portion of a multibyte character, we check whether
383131557Stjr     next character is a multibyte character or not.  */
384131557Stjr  if (! cur_mb_index)
385131557Stjr    {
386131557Stjr      cur_mb_len = mbrlen(p, len, &mbs);
387131557Stjr      if (cur_mb_len > 1)
388131557Stjr	/* It is a multibyte character.
389131557Stjr	   cur_mb_len was already set by mbrlen().  */
390131557Stjr	cur_mb_index = 1;
391131557Stjr      else if (cur_mb_len < 1)
392131557Stjr	/* Invalid sequence.  We treat it as a singlebyte character.
393131557Stjr	   cur_mb_index is aleady 0.  */
394131557Stjr	cur_mb_len = 1;
395131557Stjr      /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
396131557Stjr	 cur_mb_index is aleady 0.  */
397131557Stjr    }
398131557Stjr}
399131557Stjr#endif /* MBS_SUPPORT */
400131557Stjr
401131557Stjr#ifdef MBS_SUPPORT
40253451Speter/* Note that characters become unsigned here. */
403131557Stjr# define FETCH(c, eoferr)			\
404131557Stjr  {						\
405131557Stjr    if (! lexleft)				\
406131557Stjr     {						\
407131557Stjr	if (eoferr != 0)			\
408131557Stjr	  dfaerror (eoferr);			\
409131557Stjr	else					\
410131557Stjr	  return lasttok = END;			\
411131557Stjr      }						\
412131557Stjr    if (MB_CUR_MAX > 1)				\
413131557Stjr      update_mb_len_index(lexptr, lexleft);	\
414131557Stjr    (c) = (unsigned char) *lexptr++;		\
415131557Stjr    --lexleft;					\
416131557Stjr  }
417131557Stjr
418131557Stjr/* This function fetch a wide character, and update cur_mb_len,
419131557Stjr   used only if the current locale is a multibyte environment.  */
420131564Stjrstatic wint_t
421131557Stjrfetch_wc (char const *eoferr)
422131557Stjr{
423131557Stjr  wchar_t wc;
424131557Stjr  if (! lexleft)
425131557Stjr    {
426131557Stjr      if (eoferr != 0)
427131557Stjr	dfaerror (eoferr);
428131557Stjr      else
429131564Stjr	return WEOF;
430131557Stjr    }
431131557Stjr
432131557Stjr  cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
433131557Stjr  if (cur_mb_len <= 0)
434131557Stjr   {
435131557Stjr      cur_mb_len = 1;
436131557Stjr      wc = *lexptr;
437131557Stjr    }
438131557Stjr  lexptr += cur_mb_len;
439131557Stjr  lexleft -= cur_mb_len;
440131557Stjr  return wc;
441131557Stjr}
442131557Stjr#else
443131557Stjr/* Note that characters become unsigned here. */
444131557Stjr# define FETCH(c, eoferr)   	      \
44553451Speter  {			   	      \
44653451Speter    if (! lexleft)	   	      \
44756919Sru      {				      \
44856919Sru	if (eoferr != 0)	      \
44956919Sru	  dfaerror (eoferr);	      \
45056919Sru	else		   	      \
45156919Sru	  return lasttok = END;	      \
45256919Sru      }				      \
45353451Speter    (c) = (unsigned char) *lexptr++;  \
45453451Speter    --lexleft;		   	      \
45553451Speter  }
456131557Stjr#endif /* MBS_SUPPORT */
45753451Speter
458131557Stjr#ifdef MBS_SUPPORT
459131557Stjr/* Multibyte character handling sub-routin for lex.
460131557Stjr   This function  parse a bracket expression and build a struct
461131557Stjr   mb_char_classes.  */
462131557Stjrstatic void
463131557Stjrparse_bracket_exp_mb ()
464131557Stjr{
465131564Stjr  wint_t wc, wc1, wc2;
466131557Stjr
467131557Stjr  /* Work area to build a mb_char_classes.  */
468131557Stjr  struct mb_char_classes *work_mbc;
469131557Stjr  int chars_al, range_sts_al, range_ends_al, ch_classes_al,
470131557Stjr    equivs_al, coll_elems_al;
471131557Stjr
472131557Stjr  REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
473131557Stjr		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
474131557Stjr  /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
475131557Stjr     We will update dfa->multibyte_prop in addtok(), because we can't
476131557Stjr     decide the index in dfa->tokens[].  */
477131557Stjr
478131557Stjr  /* Initialize work are */
479131557Stjr  work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
480131557Stjr
481131557Stjr  chars_al = 1;
482131557Stjr  range_sts_al = range_ends_al = 0;
483131557Stjr  ch_classes_al = equivs_al = coll_elems_al = 0;
484131557Stjr  MALLOC(work_mbc->chars, wchar_t, chars_al);
485131557Stjr
486131557Stjr  work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
487131557Stjr  work_mbc->nequivs = work_mbc->ncoll_elems = 0;
488131557Stjr  work_mbc->chars = work_mbc->ch_classes = NULL;
489131557Stjr  work_mbc->range_sts = work_mbc->range_ends = NULL;
490131557Stjr  work_mbc->equivs = work_mbc->coll_elems = NULL;
491131557Stjr
492131557Stjr  wc = fetch_wc(_("Unbalanced ["));
493131557Stjr  if (wc == L'^')
494131557Stjr    {
495131557Stjr      wc = fetch_wc(_("Unbalanced ["));
496131557Stjr      work_mbc->invert = 1;
497131557Stjr    }
498131557Stjr  else
499131557Stjr    work_mbc->invert = 0;
500131557Stjr  do
501131557Stjr    {
502131564Stjr      wc1 = WEOF; /* mark wc1 is not initialized".  */
503131557Stjr
504131557Stjr      /* Note that if we're looking at some other [:...:] construct,
505131557Stjr	 we just treat it as a bunch of ordinary characters.  We can do
506131557Stjr	 this because we assume regex has checked for syntax errors before
507131557Stjr	 dfa is ever called. */
508131557Stjr      if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
509131557Stjr	{
510131557Stjr#define BRACKET_BUFFER_SIZE 128
511131557Stjr	  char str[BRACKET_BUFFER_SIZE];
512131557Stjr	  wc1 = wc;
513131557Stjr	  wc = fetch_wc(_("Unbalanced ["));
514131557Stjr
515131557Stjr	  /* If pattern contains `[[:', `[[.', or `[[='.  */
516131557Stjr	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
517131557Stjr	    {
518131557Stjr	      unsigned char c;
519131557Stjr	      unsigned char delim = (unsigned char)wc;
520131557Stjr	      int len = 0;
521131557Stjr	      for (;;)
522131557Stjr		{
523131557Stjr		  if (! lexleft)
524131557Stjr		    dfaerror (_("Unbalanced ["));
525131557Stjr		  c = (unsigned char) *lexptr++;
526131557Stjr		  --lexleft;
527131557Stjr
528131557Stjr		  if ((c == delim && *lexptr == ']') || lexleft == 0)
529131557Stjr		    break;
530131557Stjr		  if (len < BRACKET_BUFFER_SIZE)
531131557Stjr		    str[len++] = c;
532131557Stjr		  else
533131557Stjr		    /* This is in any case an invalid class name.  */
534131557Stjr		    str[0] = '\0';
535131557Stjr		}
536131557Stjr	      str[len] = '\0';
537131557Stjr
538131557Stjr	      if (lexleft == 0)
539131557Stjr		{
540131557Stjr		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
541131557Stjr				       work_mbc->nchars + 2);
542131557Stjr		  work_mbc->chars[work_mbc->nchars++] = L'[';
543131557Stjr		  work_mbc->chars[work_mbc->nchars++] = delim;
544131557Stjr		  break;
545131557Stjr		}
546131557Stjr
547131557Stjr	      if (--lexleft, *lexptr++ != ']')
548131557Stjr		dfaerror (_("Unbalanced ["));
549131557Stjr	      if (delim == ':')
550131557Stjr		/* build character class.  */
551131557Stjr		{
552131557Stjr		  wctype_t wt;
553131557Stjr		  /* Query the character class as wctype_t.  */
554131557Stjr		  wt = wctype (str);
555131557Stjr
556131557Stjr		  if (ch_classes_al == 0)
557131557Stjr		    MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);
558131557Stjr		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
559131557Stjr				       ch_classes_al,
560131557Stjr				       work_mbc->nch_classes + 1);
561131557Stjr		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
562131557Stjr
563131557Stjr 		}
564131557Stjr	      else if (delim == '=' || delim == '.')
565131557Stjr		{
566131557Stjr		  char *elem;
567131557Stjr		  MALLOC(elem, char, len + 1);
568131557Stjr		  strncpy(elem, str, len + 1);
569131557Stjr
570131557Stjr		  if (delim == '=')
571131557Stjr		    /* build equivalent class.  */
572131557Stjr		    {
573131557Stjr		      if (equivs_al == 0)
574131557Stjr			MALLOC(work_mbc->equivs, char*, ++equivs_al);
575131557Stjr		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
576131557Stjr					   equivs_al,
577131557Stjr					   work_mbc->nequivs + 1);
578131557Stjr		      work_mbc->equivs[work_mbc->nequivs++] = elem;
579131557Stjr		    }
580131557Stjr
581131557Stjr		  if (delim == '.')
582131557Stjr		    /* build collating element.  */
583131557Stjr		    {
584131557Stjr		      if (coll_elems_al == 0)
585131557Stjr			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
586131557Stjr		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
587131557Stjr					   coll_elems_al,
588131557Stjr					   work_mbc->ncoll_elems + 1);
589131557Stjr		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
590131557Stjr		    }
591131557Stjr 		}
592131578Stjr	      wc1 = wc = WEOF;
593131557Stjr	    }
594131557Stjr	  else
595131557Stjr	    /* We treat '[' as a normal character here.  */
596131557Stjr	    {
597131557Stjr	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
598131557Stjr	    }
599131557Stjr	}
600131557Stjr      else
601131557Stjr	{
602131557Stjr	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
603131557Stjr	    wc = fetch_wc(("Unbalanced ["));
604131557Stjr	}
605131557Stjr
606131564Stjr      if (wc1 == WEOF)
607131557Stjr	wc1 = fetch_wc(_("Unbalanced ["));
608131557Stjr
609131557Stjr      if (wc1 == L'-')
610131557Stjr	/* build range characters.  */
611131557Stjr	{
612131557Stjr	  wc2 = fetch_wc(_("Unbalanced ["));
613131557Stjr	  if (wc2 == L']')
614131557Stjr	    {
615131557Stjr	      /* In the case [x-], the - is an ordinary hyphen,
616131557Stjr		 which is left in c1, the lookahead character. */
617131557Stjr	      lexptr -= cur_mb_len;
618131557Stjr	      lexleft += cur_mb_len;
619131557Stjr	      wc2 = wc;
620131557Stjr	    }
621131557Stjr	  else
622131557Stjr	    {
623131557Stjr	      if (wc2 == L'\\'
624131557Stjr		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
625131557Stjr		wc2 = fetch_wc(_("Unbalanced ["));
626131557Stjr	      wc1 = fetch_wc(_("Unbalanced ["));
627131557Stjr	    }
628131557Stjr
629131557Stjr	  if (range_sts_al == 0)
630131557Stjr	    {
631131557Stjr	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
632131557Stjr	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
633131557Stjr	    }
634131557Stjr	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
635131557Stjr			       range_sts_al, work_mbc->nranges + 1);
636131564Stjr	  work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
637131557Stjr	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
638131557Stjr			       range_ends_al, work_mbc->nranges + 1);
639131564Stjr	  work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
640131557Stjr	}
641131564Stjr      else if (wc != WEOF)
642131557Stjr	/* build normal characters.  */
643131557Stjr	{
644131557Stjr	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
645131557Stjr			       work_mbc->nchars + 1);
646131564Stjr	  work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
647131557Stjr	}
648131557Stjr    }
649131557Stjr  while ((wc = wc1) != L']');
650131557Stjr}
651131557Stjr#endif /* MBS_SUPPORT */
652131557Stjr
65353472Sobrien#ifdef __STDC__
65453472Sobrien#define FUNC(F, P) static int F(int c) { return P(c); }
65553472Sobrien#else
65653451Speter#define FUNC(F, P) static int F(c) int c; { return P(c); }
65753472Sobrien#endif
65853451Speter
65953451SpeterFUNC(is_alpha, ISALPHA)
66053451SpeterFUNC(is_upper, ISUPPER)
66153451SpeterFUNC(is_lower, ISLOWER)
66253451SpeterFUNC(is_digit, ISDIGIT)
66353451SpeterFUNC(is_xdigit, ISXDIGIT)
66453451SpeterFUNC(is_space, ISSPACE)
66553451SpeterFUNC(is_punct, ISPUNCT)
66653451SpeterFUNC(is_alnum, ISALNUM)
66753451SpeterFUNC(is_print, ISPRINT)
66853451SpeterFUNC(is_graph, ISGRAPH)
66953451SpeterFUNC(is_cntrl, ISCNTRL)
67053451Speter
67156919Srustatic int
67256919Sruis_blank (int c)
67353472Sobrien{
67453472Sobrien   return (c == ' ' || c == '\t');
67553472Sobrien}
67653472Sobrien
67753451Speter/* The following list maps the names of the Posix named character classes
67853451Speter   to predicate functions that determine whether a given character is in
67953451Speter   the class.  The leading [ has already been eaten by the lexical analyzer. */
68053451Speterstatic struct {
68153472Sobrien  const char *name;
68253472Sobrien  int (*pred) PARAMS ((int));
683131557Stjr} const prednames[] = {
68453472Sobrien  { ":alpha:]", is_alpha },
68553472Sobrien  { ":upper:]", is_upper },
68653472Sobrien  { ":lower:]", is_lower },
68753472Sobrien  { ":digit:]", is_digit },
68853472Sobrien  { ":xdigit:]", is_xdigit },
68953472Sobrien  { ":space:]", is_space },
69053472Sobrien  { ":punct:]", is_punct },
69153472Sobrien  { ":alnum:]", is_alnum },
69253472Sobrien  { ":print:]", is_print },
69353472Sobrien  { ":graph:]", is_graph },
69453472Sobrien  { ":cntrl:]", is_cntrl },
69553472Sobrien  { ":blank:]", is_blank },
69653472Sobrien  { 0 }
69753451Speter};
69853451Speter
69953472Sobrien/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
70053472Sobrien#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
70153472Sobrien
70253451Speterstatic int
70356919Srulooking_at (char const *s)
70453451Speter{
70553472Sobrien  size_t len;
70653451Speter
70753451Speter  len = strlen(s);
70853451Speter  if (lexleft < len)
70953451Speter    return 0;
71053451Speter  return strncmp(s, lexptr, len) == 0;
71153451Speter}
71253451Speter
71353451Speterstatic token
71456919Srulex (void)
71553451Speter{
716131557Stjr  unsigned c, c1, c2;
71753451Speter  int backslash = 0, invert;
71853451Speter  charclass ccl;
71953451Speter  int i;
72053451Speter
72153451Speter  /* Basic plan: We fetch a character.  If it's a backslash,
72253451Speter     we set the backslash flag and go through the loop again.
72353451Speter     On the plus side, this avoids having a duplicate of the
72453451Speter     main switch inside the backslash case.  On the minus side,
72553451Speter     it means that just about every case begins with
72653451Speter     "if (backslash) ...".  */
72753451Speter  for (i = 0; i < 2; ++i)
72853451Speter    {
72953451Speter      FETCH(c, 0);
730131557Stjr#ifdef MBS_SUPPORT
731131557Stjr      if (MB_CUR_MAX > 1 && cur_mb_index)
732131557Stjr	/* If this is a part of a multi-byte character, we must treat
733131557Stjr	   this byte data as a normal character.
734131557Stjr	   e.g. In case of SJIS encoding, some character contains '\',
735131557Stjr	        but they must not be backslash.  */
736131557Stjr	goto normal_char;
737131557Stjr#endif /* MBS_SUPPORT  */
73853451Speter      switch (c)
73953451Speter	{
74053451Speter	case '\\':
74153451Speter	  if (backslash)
74253451Speter	    goto normal_char;
74353451Speter	  if (lexleft == 0)
74453472Sobrien	    dfaerror(_("Unfinished \\ escape"));
74553451Speter	  backslash = 1;
74653451Speter	  break;
74753451Speter
74853451Speter	case '^':
74953451Speter	  if (backslash)
75053451Speter	    goto normal_char;
75153451Speter	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
75253451Speter	      || lasttok == END
75353451Speter	      || lasttok == LPAREN
75453451Speter	      || lasttok == OR)
75553451Speter	    return lasttok = BEGLINE;
75653451Speter	  goto normal_char;
75753451Speter
75853451Speter	case '$':
75953451Speter	  if (backslash)
76053451Speter	    goto normal_char;
76153451Speter	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
76253451Speter	      || lexleft == 0
76353451Speter	      || (syntax_bits & RE_NO_BK_PARENS
76453451Speter		  ? lexleft > 0 && *lexptr == ')'
76553451Speter		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
76653451Speter	      || (syntax_bits & RE_NO_BK_VBAR
76753451Speter		  ? lexleft > 0 && *lexptr == '|'
76853451Speter		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
76953451Speter	      || ((syntax_bits & RE_NEWLINE_ALT)
77053451Speter	          && lexleft > 0 && *lexptr == '\n'))
77153451Speter	    return lasttok = ENDLINE;
77253451Speter	  goto normal_char;
77353451Speter
77453451Speter	case '1':
77553451Speter	case '2':
77653451Speter	case '3':
77753451Speter	case '4':
77853451Speter	case '5':
77953451Speter	case '6':
78053451Speter	case '7':
78153451Speter	case '8':
78253451Speter	case '9':
78353451Speter	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
78453451Speter	    {
78553451Speter	      laststart = 0;
78653451Speter	      return lasttok = BACKREF;
78753451Speter	    }
78853451Speter	  goto normal_char;
78953451Speter
79053472Sobrien	case '`':
79153472Sobrien	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
79253472Sobrien	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
79353472Sobrien	  goto normal_char;
79453472Sobrien
79553472Sobrien	case '\'':
79653472Sobrien	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
79753472Sobrien	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
79853472Sobrien	  goto normal_char;
79953472Sobrien
80053451Speter	case '<':
80153472Sobrien	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
80253451Speter	    return lasttok = BEGWORD;
80353451Speter	  goto normal_char;
80453451Speter
80553451Speter	case '>':
80653472Sobrien	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
80753451Speter	    return lasttok = ENDWORD;
80853451Speter	  goto normal_char;
80953451Speter
81053451Speter	case 'b':
81153472Sobrien	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
81253451Speter	    return lasttok = LIMWORD;
81353451Speter	  goto normal_char;
81453451Speter
81553451Speter	case 'B':
81653472Sobrien	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
81753451Speter	    return lasttok = NOTLIMWORD;
81853451Speter	  goto normal_char;
81953451Speter
82053451Speter	case '?':
82153451Speter	  if (syntax_bits & RE_LIMITED_OPS)
82253451Speter	    goto normal_char;
82353451Speter	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
82453451Speter	    goto normal_char;
82553451Speter	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
82653451Speter	    goto normal_char;
82753451Speter	  return lasttok = QMARK;
82853451Speter
82953451Speter	case '*':
83053451Speter	  if (backslash)
83153451Speter	    goto normal_char;
83253451Speter	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
83353451Speter	    goto normal_char;
83453451Speter	  return lasttok = STAR;
83553451Speter
83653451Speter	case '+':
83753451Speter	  if (syntax_bits & RE_LIMITED_OPS)
83853451Speter	    goto normal_char;
83953451Speter	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
84053451Speter	    goto normal_char;
84153451Speter	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
84253451Speter	    goto normal_char;
84353451Speter	  return lasttok = PLUS;
84453451Speter
84553451Speter	case '{':
84653451Speter	  if (!(syntax_bits & RE_INTERVALS))
84753451Speter	    goto normal_char;
84853451Speter	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
84953451Speter	    goto normal_char;
85055379Sobrien	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
85155379Sobrien	    goto normal_char;
85255379Sobrien
85355379Sobrien	  if (syntax_bits & RE_NO_BK_BRACES)
85455379Sobrien	    {
85555379Sobrien	      /* Scan ahead for a valid interval; if it's not valid,
85655379Sobrien		 treat it as a literal '{'.  */
85755379Sobrien	      int lo = -1, hi = -1;
85855379Sobrien	      char const *p = lexptr;
85955379Sobrien	      char const *lim = p + lexleft;
86056919Sru	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
86155379Sobrien		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
86255379Sobrien	      if (p != lim && *p == ',')
86356919Sru		while (++p != lim && ISASCIIDIGIT (*p))
86455379Sobrien		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
86555379Sobrien	      else
86655379Sobrien		hi = lo;
86755379Sobrien	      if (p == lim || *p != '}'
86855379Sobrien		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
86955379Sobrien		goto normal_char;
87055379Sobrien	    }
87155379Sobrien
87255379Sobrien	  minrep = 0;
87353451Speter	  /* Cases:
87453451Speter	     {M} - exact count
87553451Speter	     {M,} - minimum count, maximum is infinity
87653451Speter	     {M,N} - M through N */
87753472Sobrien	  FETCH(c, _("unfinished repeat count"));
87856919Sru	  if (ISASCIIDIGIT (c))
87953451Speter	    {
88053451Speter	      minrep = c - '0';
88153451Speter	      for (;;)
88253451Speter		{
88353472Sobrien		  FETCH(c, _("unfinished repeat count"));
88456919Sru		  if (! ISASCIIDIGIT (c))
88553451Speter		    break;
88653451Speter		  minrep = 10 * minrep + c - '0';
88753451Speter		}
88853451Speter	    }
88955379Sobrien	  else
89053472Sobrien	    dfaerror(_("malformed repeat count"));
89153451Speter	  if (c == ',')
89255379Sobrien	    {
89355379Sobrien	      FETCH (c, _("unfinished repeat count"));
89456919Sru	      if (! ISASCIIDIGIT (c))
89555379Sobrien		maxrep = -1;
89655379Sobrien	      else
89755379Sobrien		{
89855379Sobrien		  maxrep = c - '0';
89955379Sobrien		  for (;;)
90055379Sobrien		    {
90155379Sobrien		      FETCH (c, _("unfinished repeat count"));
90256919Sru		      if (! ISASCIIDIGIT (c))
90355379Sobrien			break;
90455379Sobrien		      maxrep = 10 * maxrep + c - '0';
90555379Sobrien		    }
90655379Sobrien		  if (0 <= maxrep && maxrep < minrep)
90755379Sobrien		    dfaerror (_("malformed repeat count"));
90855379Sobrien		}
90955379Sobrien	    }
91053451Speter	  else
91153451Speter	    maxrep = minrep;
91253451Speter	  if (!(syntax_bits & RE_NO_BK_BRACES))
91353451Speter	    {
91453451Speter	      if (c != '\\')
91553472Sobrien		dfaerror(_("malformed repeat count"));
91653472Sobrien	      FETCH(c, _("unfinished repeat count"));
91753451Speter	    }
91853451Speter	  if (c != '}')
91953472Sobrien	    dfaerror(_("malformed repeat count"));
92053451Speter	  laststart = 0;
92153451Speter	  return lasttok = REPMN;
92253451Speter
92353451Speter	case '|':
92453451Speter	  if (syntax_bits & RE_LIMITED_OPS)
92553451Speter	    goto normal_char;
92653451Speter	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
92753451Speter	    goto normal_char;
92853451Speter	  laststart = 1;
92953451Speter	  return lasttok = OR;
93053451Speter
93153451Speter	case '\n':
93253451Speter	  if (syntax_bits & RE_LIMITED_OPS
93353451Speter	      || backslash
93453451Speter	      || !(syntax_bits & RE_NEWLINE_ALT))
93553451Speter	    goto normal_char;
93653451Speter	  laststart = 1;
93753451Speter	  return lasttok = OR;
93853451Speter
93953451Speter	case '(':
94053451Speter	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
94153451Speter	    goto normal_char;
94253451Speter	  ++parens;
94353451Speter	  laststart = 1;
94453451Speter	  return lasttok = LPAREN;
94553451Speter
94653451Speter	case ')':
94753451Speter	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
94853451Speter	    goto normal_char;
94953451Speter	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
95053451Speter	    goto normal_char;
95153451Speter	  --parens;
95253451Speter	  laststart = 0;
95353451Speter	  return lasttok = RPAREN;
95453451Speter
95553451Speter	case '.':
95653451Speter	  if (backslash)
95753451Speter	    goto normal_char;
958131557Stjr#ifdef MBS_SUPPORT
959131557Stjr	  if (MB_CUR_MAX > 1)
960131557Stjr	    {
961131557Stjr	      /* In multibyte environment period must match with a single
962131557Stjr		 character not a byte.  So we use ANYCHAR.  */
963131557Stjr	      laststart = 0;
964131557Stjr	      return lasttok = ANYCHAR;
965131557Stjr	    }
966131557Stjr#endif /* MBS_SUPPORT */
96753451Speter	  zeroset(ccl);
96853451Speter	  notset(ccl);
96953451Speter	  if (!(syntax_bits & RE_DOT_NEWLINE))
97055379Sobrien	    clrbit(eolbyte, ccl);
97153451Speter	  if (syntax_bits & RE_DOT_NOT_NULL)
97253451Speter	    clrbit('\0', ccl);
97353451Speter	  laststart = 0;
97453451Speter	  return lasttok = CSET + charclass_index(ccl);
97553451Speter
97653451Speter	case 'w':
97753451Speter	case 'W':
97853472Sobrien	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
97953451Speter	    goto normal_char;
98053451Speter	  zeroset(ccl);
98153451Speter	  for (c2 = 0; c2 < NOTCHAR; ++c2)
98253472Sobrien	    if (IS_WORD_CONSTITUENT(c2))
98353451Speter	      setbit(c2, ccl);
98453451Speter	  if (c == 'W')
98553451Speter	    notset(ccl);
98653451Speter	  laststart = 0;
98753451Speter	  return lasttok = CSET + charclass_index(ccl);
98853451Speter
98953451Speter	case '[':
99053451Speter	  if (backslash)
99153451Speter	    goto normal_char;
992131557Stjr	  laststart = 0;
993131557Stjr#ifdef MBS_SUPPORT
994131557Stjr	  if (MB_CUR_MAX > 1)
995131557Stjr	    {
996131557Stjr	      /* In multibyte environment a bracket expression may contain
997131557Stjr		 multibyte characters, which must be treated as characters
998131557Stjr		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
999131557Stjr	      parse_bracket_exp_mb();
1000131557Stjr	      return lasttok = MBCSET;
1001131557Stjr	    }
1002131557Stjr#endif
100353451Speter	  zeroset(ccl);
100453472Sobrien	  FETCH(c, _("Unbalanced ["));
100553451Speter	  if (c == '^')
100653451Speter	    {
100753472Sobrien	      FETCH(c, _("Unbalanced ["));
100853451Speter	      invert = 1;
100953451Speter	    }
101053451Speter	  else
101153451Speter	    invert = 0;
101253451Speter	  do
101353451Speter	    {
101453451Speter	      /* Nobody ever said this had to be fast. :-)
101553451Speter		 Note that if we're looking at some other [:...:]
101653451Speter		 construct, we just treat it as a bunch of ordinary
101753451Speter		 characters.  We can do this because we assume
101853451Speter		 regex has checked for syntax errors before
101953451Speter		 dfa is ever called. */
102053451Speter	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
102153451Speter		for (c1 = 0; prednames[c1].name; ++c1)
102253451Speter		  if (looking_at(prednames[c1].name))
102353451Speter		    {
1024131557Stjr		      int (*pred) PARAMS ((int)) = prednames[c1].pred;
102553472Sobrien
102653451Speter		      for (c2 = 0; c2 < NOTCHAR; ++c2)
102753472Sobrien			if ((*pred)(c2))
1028131557Stjr			  setbit_case_fold (c2, ccl);
102953451Speter		      lexptr += strlen(prednames[c1].name);
103053451Speter		      lexleft -= strlen(prednames[c1].name);
103153472Sobrien		      FETCH(c1, _("Unbalanced ["));
103253451Speter		      goto skip;
103353451Speter		    }
103453451Speter	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
103553472Sobrien		FETCH(c, _("Unbalanced ["));
103653472Sobrien	      FETCH(c1, _("Unbalanced ["));
103753451Speter	      if (c1 == '-')
103853451Speter		{
103953472Sobrien		  FETCH(c2, _("Unbalanced ["));
104053451Speter		  if (c2 == ']')
104153451Speter		    {
104253451Speter		      /* In the case [x-], the - is an ordinary hyphen,
104353451Speter			 which is left in c1, the lookahead character. */
104453451Speter		      --lexptr;
104553451Speter		      ++lexleft;
104653451Speter		    }
104753451Speter		  else
104853451Speter		    {
104953451Speter		      if (c2 == '\\'
105053451Speter			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
105153472Sobrien			FETCH(c2, _("Unbalanced ["));
105253472Sobrien		      FETCH(c1, _("Unbalanced ["));
1053131557Stjr		      if (!hard_LC_COLLATE) {
1054131557Stjr		        for (; c <= c2; c++)
1055131557Stjr			  setbit_case_fold (c, ccl);
1056131557Stjr		      } else {
1057131557Stjr			/* POSIX locales are painful - leave the decision to libc */
1058131557Stjr			char expr[6] = { '[', c, '-', c2, ']', '\0' };
1059131557Stjr			regex_t re;
1060131557Stjr			if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1061131557Stjr			  for (c = 0; c < NOTCHAR; ++c) {
1062131557Stjr			    char buf[2] = { c, '\0' };
1063131557Stjr			    regmatch_t mat;
1064131557Stjr			    if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1065131557Stjr                               && mat.rm_so == 0 && mat.rm_eo == 1)
1066131557Stjr                              setbit_case_fold (c, ccl);
1067131557Stjr			  }
1068131557Stjr			  regfree (&re);
106956919Sru			}
1070131557Stjr		      }
1071131557Stjr		      continue;
107256919Sru		    }
107353451Speter		}
107456919Sru
1075131557Stjr	      setbit_case_fold (c, ccl);
1076131557Stjr
107753451Speter	    skip:
107853451Speter	      ;
107953451Speter	    }
108053451Speter	  while ((c = c1) != ']');
108153451Speter	  if (invert)
108253451Speter	    {
108353451Speter	      notset(ccl);
108453451Speter	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
108555379Sobrien		clrbit(eolbyte, ccl);
108653451Speter	    }
108753451Speter	  return lasttok = CSET + charclass_index(ccl);
108853451Speter
108953451Speter	default:
109053451Speter	normal_char:
109153451Speter	  laststart = 0;
109253451Speter	  if (case_fold && ISALPHA(c))
109353451Speter	    {
109453451Speter	      zeroset(ccl);
1095131557Stjr	      setbit_case_fold (c, ccl);
109653451Speter	      return lasttok = CSET + charclass_index(ccl);
109753451Speter	    }
109853451Speter	  return c;
109953451Speter	}
110053451Speter    }
110153451Speter
110253451Speter  /* The above loop should consume at most a backslash
110353451Speter     and some other character. */
110453451Speter  abort();
110553472Sobrien  return END;	/* keeps pedantic compilers happy. */
110653451Speter}
110753451Speter
110853451Speter/* Recursive descent parser for regular expressions. */
110953451Speter
111053451Speterstatic token tok;		/* Lookahead token. */
111153472Sobrienstatic int depth;		/* Current depth of a hypothetical stack
111253451Speter				   holding deferred productions.  This is
111353451Speter				   used to determine the depth that will be
111453451Speter				   required of the real stack later on in
111553451Speter				   dfaanalyze(). */
111653451Speter
111753451Speter/* Add the given token to the parse tree, maintaining the depth count and
111853451Speter   updating the maximum depth if necessary. */
111953451Speterstatic void
112056919Sruaddtok (token t)
112153451Speter{
1122131557Stjr#ifdef MBS_SUPPORT
1123131557Stjr  if (MB_CUR_MAX > 1)
1124131557Stjr    {
1125131557Stjr      REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1126131557Stjr			   dfa->tindex);
1127131557Stjr      /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
1128131557Stjr      if (t == MBCSET)
1129131557Stjr	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1130131557Stjr      else if (t < NOTCHAR)
1131131557Stjr	dfa->multibyte_prop[dfa->tindex]
1132131557Stjr	  = (cur_mb_len == 1)? 3 /* single-byte char */
1133131557Stjr	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1134131557Stjr	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1135131557Stjr      else
1136131557Stjr	/* It may be unnecesssary, but it is safer to treat other
1137131557Stjr	   symbols as singlebyte characters.  */
1138131557Stjr	dfa->multibyte_prop[dfa->tindex] = 3;
1139131557Stjr    }
1140131557Stjr#endif
1141131557Stjr
114253451Speter  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
114353451Speter  dfa->tokens[dfa->tindex++] = t;
114453451Speter
114553451Speter  switch (t)
114653451Speter    {
114753451Speter    case QMARK:
114853451Speter    case STAR:
114953451Speter    case PLUS:
115053451Speter      break;
115153451Speter
115253451Speter    case CAT:
115353451Speter    case OR:
115453451Speter    case ORTOP:
115553451Speter      --depth;
115653451Speter      break;
115753451Speter
115853451Speter    default:
115953451Speter      ++dfa->nleaves;
116053451Speter    case EMPTY:
116153451Speter      ++depth;
116253451Speter      break;
116353451Speter    }
116453451Speter  if (depth > dfa->depth)
116553451Speter    dfa->depth = depth;
116653451Speter}
116753451Speter
116853451Speter/* The grammar understood by the parser is as follows.
116953451Speter
117053451Speter   regexp:
117153451Speter     regexp OR branch
117253451Speter     branch
117353451Speter
117453451Speter   branch:
117553451Speter     branch closure
117653451Speter     closure
117753451Speter
117853451Speter   closure:
117953451Speter     closure QMARK
118053451Speter     closure STAR
118153451Speter     closure PLUS
1182131557Stjr     closure REPMN
118353451Speter     atom
118453451Speter
118553451Speter   atom:
118653451Speter     <normal character>
1187131557Stjr     <multibyte character>
1188131557Stjr     ANYCHAR
1189131557Stjr     MBCSET
119053451Speter     CSET
119153451Speter     BACKREF
119253451Speter     BEGLINE
119353451Speter     ENDLINE
119453451Speter     BEGWORD
119553451Speter     ENDWORD
119653451Speter     LIMWORD
119753451Speter     NOTLIMWORD
1198131557Stjr     CRANGE
1199131557Stjr     LPAREN regexp RPAREN
120053451Speter     <empty>
120153451Speter
120253451Speter   The parser builds a parse tree in postfix form in an array of tokens. */
120353451Speter
120453451Speterstatic void
120556919Sruatom (void)
120653451Speter{
120753451Speter  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
120853451Speter      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1209131557Stjr#ifdef MBS_SUPPORT
1210131557Stjr      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1211131557Stjr#endif /* MBS_SUPPORT */
121253451Speter      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
121353451Speter    {
121453451Speter      addtok(tok);
121553451Speter      tok = lex();
1216131557Stjr#ifdef MBS_SUPPORT
1217131557Stjr      /* We treat a multibyte character as a single atom, so that DFA
1218131557Stjr	 can treat a multibyte character as a single expression.
1219131557Stjr
1220131557Stjr         e.g. We construct following tree from "<mb1><mb2>".
1221131557Stjr              <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1222131557Stjr              <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1223131557Stjr      */
1224131557Stjr      if (MB_CUR_MAX > 1)
1225131557Stjr	{
1226131557Stjr	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1227131557Stjr	    {
1228131557Stjr	      addtok(tok);
1229131557Stjr	      addtok(CAT);
1230131557Stjr	      tok = lex();
1231131557Stjr	    }
1232131557Stjr	}
1233131557Stjr#endif /* MBS_SUPPORT  */
123453451Speter    }
1235131557Stjr  else if (tok == CRANGE)
1236131557Stjr    {
1237131557Stjr      /* A character range like "[a-z]" in a locale other than "C" or
1238131557Stjr	 "POSIX".  This range might any sequence of one or more
1239131557Stjr	 characters.  Unfortunately the POSIX locale primitives give
1240131557Stjr	 us no practical way to find what character sequences might be
1241131557Stjr	 matched.  Treat this approximately like "(.\1)" -- i.e. match
1242131557Stjr	 one character, and then punt to the full matcher.  */
1243131557Stjr      charclass ccl;
1244131557Stjr      zeroset (ccl);
1245131557Stjr      notset (ccl);
1246131557Stjr      addtok (CSET + charclass_index (ccl));
1247131557Stjr      addtok (BACKREF);
1248131557Stjr      addtok (CAT);
1249131557Stjr      tok = lex ();
1250131557Stjr    }
125153451Speter  else if (tok == LPAREN)
125253451Speter    {
125353451Speter      tok = lex();
125453451Speter      regexp(0);
125553451Speter      if (tok != RPAREN)
125653472Sobrien	dfaerror(_("Unbalanced ("));
125753451Speter      tok = lex();
125853451Speter    }
125953451Speter  else
126053451Speter    addtok(EMPTY);
126153451Speter}
126253451Speter
126353451Speter/* Return the number of tokens in the given subexpression. */
126453451Speterstatic int
126556919Srunsubtoks (int tindex)
126653451Speter{
126753451Speter  int ntoks1;
126853451Speter
126953451Speter  switch (dfa->tokens[tindex - 1])
127053451Speter    {
127153451Speter    default:
127253451Speter      return 1;
127353451Speter    case QMARK:
127453451Speter    case STAR:
127553451Speter    case PLUS:
127653451Speter      return 1 + nsubtoks(tindex - 1);
127753451Speter    case CAT:
127853451Speter    case OR:
127953451Speter    case ORTOP:
128053451Speter      ntoks1 = nsubtoks(tindex - 1);
128153451Speter      return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
128253451Speter    }
128353451Speter}
128453451Speter
128553451Speter/* Copy the given subexpression to the top of the tree. */
128653451Speterstatic void
128756919Srucopytoks (int tindex, int ntokens)
128853451Speter{
128953451Speter  int i;
129053451Speter
129153451Speter  for (i = 0; i < ntokens; ++i)
129253451Speter    addtok(dfa->tokens[tindex + i]);
129353451Speter}
129453451Speter
129553451Speterstatic void
129656919Sruclosure (void)
129753451Speter{
129853451Speter  int tindex, ntokens, i;
129953451Speter
130053451Speter  atom();
130153451Speter  while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
130253451Speter    if (tok == REPMN)
130353451Speter      {
130453451Speter	ntokens = nsubtoks(dfa->tindex);
130553451Speter	tindex = dfa->tindex - ntokens;
130655379Sobrien	if (maxrep < 0)
130753451Speter	  addtok(PLUS);
130853451Speter	if (minrep == 0)
130953451Speter	  addtok(QMARK);
131053451Speter	for (i = 1; i < minrep; ++i)
131153451Speter	  {
131253451Speter	    copytoks(tindex, ntokens);
131353451Speter	    addtok(CAT);
131453451Speter	  }
131553451Speter	for (; i < maxrep; ++i)
131653451Speter	  {
131753451Speter	    copytoks(tindex, ntokens);
131853451Speter	    addtok(QMARK);
131953451Speter	    addtok(CAT);
132053451Speter	  }
132153451Speter	tok = lex();
132253451Speter      }
132353451Speter    else
132453451Speter      {
132553451Speter	addtok(tok);
132653451Speter	tok = lex();
132753451Speter      }
132853451Speter}
132953451Speter
133053451Speterstatic void
133156919Srubranch (void)
133253451Speter{
133353451Speter  closure();
133453451Speter  while (tok != RPAREN && tok != OR && tok >= 0)
133553451Speter    {
133653451Speter      closure();
133753451Speter      addtok(CAT);
133853451Speter    }
133953451Speter}
134053451Speter
134153451Speterstatic void
134256919Sruregexp (int toplevel)
134353451Speter{
134453451Speter  branch();
134553451Speter  while (tok == OR)
134653451Speter    {
134753451Speter      tok = lex();
134853451Speter      branch();
134953451Speter      if (toplevel)
135053451Speter	addtok(ORTOP);
135153451Speter      else
135253451Speter	addtok(OR);
135353451Speter    }
135453451Speter}
135553451Speter
135653451Speter/* Main entry point for the parser.  S is a string to be parsed, len is the
135753451Speter   length of the string, so s can include NUL characters.  D is a pointer to
135853451Speter   the struct dfa to parse into. */
135953451Spetervoid
1360131557Stjrdfaparse (char const *s, size_t len, struct dfa *d)
136153451Speter{
136253451Speter  dfa = d;
136353451Speter  lexstart = lexptr = s;
136453451Speter  lexleft = len;
136553451Speter  lasttok = END;
136653451Speter  laststart = 1;
136753451Speter  parens = 0;
1368131557Stjr  hard_LC_COLLATE = hard_locale (LC_COLLATE);
1369131557Stjr#ifdef MBS_SUPPORT
1370131557Stjr  if (MB_CUR_MAX > 1)
1371131557Stjr    {
1372131557Stjr      cur_mb_index = 0;
1373131557Stjr      cur_mb_len = 0;
1374131557Stjr      memset(&mbs, 0, sizeof(mbstate_t));
1375131557Stjr    }
1376131557Stjr#endif /* MBS_SUPPORT  */
137753451Speter
137853451Speter  if (! syntax_bits_set)
137953472Sobrien    dfaerror(_("No syntax specified"));
138053451Speter
138153451Speter  tok = lex();
138253451Speter  depth = d->depth;
138353451Speter
138453451Speter  regexp(1);
138553451Speter
138653451Speter  if (tok != END)
138753472Sobrien    dfaerror(_("Unbalanced )"));
138853451Speter
138953451Speter  addtok(END - d->nregexps);
139053451Speter  addtok(CAT);
139153451Speter
139253451Speter  if (d->nregexps)
139353451Speter    addtok(ORTOP);
139453451Speter
139553451Speter  ++d->nregexps;
139653451Speter}
139753451Speter
139853451Speter/* Some primitives for operating on sets of positions. */
139953451Speter
140053451Speter/* Copy one set to another; the destination must be large enough. */
140153451Speterstatic void
1402131557Stjrcopy (position_set const *src, position_set *dst)
140353451Speter{
140453451Speter  int i;
140553451Speter
140653451Speter  for (i = 0; i < src->nelem; ++i)
140753451Speter    dst->elems[i] = src->elems[i];
140853451Speter  dst->nelem = src->nelem;
140953451Speter}
141053451Speter
141153451Speter/* Insert a position in a set.  Position sets are maintained in sorted
141253451Speter   order according to index.  If position already exists in the set with
141353451Speter   the same index then their constraints are logically or'd together.
141453451Speter   S->elems must point to an array large enough to hold the resulting set. */
141553451Speterstatic void
141656919Sruinsert (position p, position_set *s)
141753451Speter{
141853451Speter  int i;
141953451Speter  position t1, t2;
142053451Speter
142153451Speter  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
142253472Sobrien    continue;
142353451Speter  if (i < s->nelem && p.index == s->elems[i].index)
142453451Speter    s->elems[i].constraint |= p.constraint;
142553451Speter  else
142653451Speter    {
142753451Speter      t1 = p;
142853451Speter      ++s->nelem;
142953451Speter      while (i < s->nelem)
143053451Speter	{
143153451Speter	  t2 = s->elems[i];
143253451Speter	  s->elems[i++] = t1;
143353451Speter	  t1 = t2;
143453451Speter	}
143553451Speter    }
143653451Speter}
143753451Speter
143853451Speter/* Merge two sets of positions into a third.  The result is exactly as if
143953451Speter   the positions of both sets were inserted into an initially empty set. */
144053451Speterstatic void
1441131557Stjrmerge (position_set const *s1, position_set const *s2, position_set *m)
144253451Speter{
144353451Speter  int i = 0, j = 0;
144453451Speter
144553451Speter  m->nelem = 0;
144653451Speter  while (i < s1->nelem && j < s2->nelem)
144753451Speter    if (s1->elems[i].index > s2->elems[j].index)
144853451Speter      m->elems[m->nelem++] = s1->elems[i++];
144953451Speter    else if (s1->elems[i].index < s2->elems[j].index)
145053451Speter      m->elems[m->nelem++] = s2->elems[j++];
145153451Speter    else
145253451Speter      {
145353451Speter	m->elems[m->nelem] = s1->elems[i++];
145453451Speter	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
145553451Speter      }
145653451Speter  while (i < s1->nelem)
145753451Speter    m->elems[m->nelem++] = s1->elems[i++];
145853451Speter  while (j < s2->nelem)
145953451Speter    m->elems[m->nelem++] = s2->elems[j++];
146053451Speter}
146153451Speter
146253451Speter/* Delete a position from a set. */
146353451Speterstatic void
146456919Srudelete (position p, position_set *s)
146553451Speter{
146653451Speter  int i;
146753451Speter
146853451Speter  for (i = 0; i < s->nelem; ++i)
146953451Speter    if (p.index == s->elems[i].index)
147053451Speter      break;
147153451Speter  if (i < s->nelem)
147253451Speter    for (--s->nelem; i < s->nelem; ++i)
147353451Speter      s->elems[i] = s->elems[i + 1];
147453451Speter}
147553451Speter
147653451Speter/* Find the index of the state corresponding to the given position set with
147753451Speter   the given preceding context, or create a new state if there is no such
147853451Speter   state.  Newline and letter tell whether we got here on a newline or
147953451Speter   letter, respectively. */
148053451Speterstatic int
1481131557Stjrstate_index (struct dfa *d, position_set const *s, int newline, int letter)
148253451Speter{
148353451Speter  int hash = 0;
148453451Speter  int constraint;
148553451Speter  int i, j;
148653451Speter
148753451Speter  newline = newline ? 1 : 0;
148853451Speter  letter = letter ? 1 : 0;
148953451Speter
149053451Speter  for (i = 0; i < s->nelem; ++i)
149153451Speter    hash ^= s->elems[i].index + s->elems[i].constraint;
149253451Speter
149353451Speter  /* Try to find a state that exactly matches the proposed one. */
149453451Speter  for (i = 0; i < d->sindex; ++i)
149553451Speter    {
149653451Speter      if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
149753451Speter	  || newline != d->states[i].newline || letter != d->states[i].letter)
149853451Speter	continue;
149953451Speter      for (j = 0; j < s->nelem; ++j)
150053451Speter	if (s->elems[j].constraint
150153451Speter	    != d->states[i].elems.elems[j].constraint
150253451Speter	    || s->elems[j].index != d->states[i].elems.elems[j].index)
150353451Speter	  break;
150453451Speter      if (j == s->nelem)
150553451Speter	return i;
150653451Speter    }
150753451Speter
150853451Speter  /* We'll have to create a new state. */
150953451Speter  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
151053451Speter  d->states[i].hash = hash;
151153451Speter  MALLOC(d->states[i].elems.elems, position, s->nelem);
151253451Speter  copy(s, &d->states[i].elems);
151353451Speter  d->states[i].newline = newline;
151453451Speter  d->states[i].letter = letter;
151553451Speter  d->states[i].backref = 0;
151653451Speter  d->states[i].constraint = 0;
151753451Speter  d->states[i].first_end = 0;
1518131557Stjr#ifdef MBS_SUPPORT
1519131557Stjr  if (MB_CUR_MAX > 1)
1520131557Stjr    d->states[i].mbps.nelem = 0;
1521131557Stjr#endif
152253451Speter  for (j = 0; j < s->nelem; ++j)
152353451Speter    if (d->tokens[s->elems[j].index] < 0)
152453451Speter      {
152553451Speter	constraint = s->elems[j].constraint;
152653451Speter	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
152753451Speter	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
152853451Speter	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
152953451Speter	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
153053451Speter	  d->states[i].constraint |= constraint;
153153451Speter	if (! d->states[i].first_end)
153253451Speter	  d->states[i].first_end = d->tokens[s->elems[j].index];
153353451Speter      }
153453451Speter    else if (d->tokens[s->elems[j].index] == BACKREF)
153553451Speter      {
153653451Speter	d->states[i].constraint = NO_CONSTRAINT;
153753451Speter	d->states[i].backref = 1;
153853451Speter      }
153953451Speter
154053451Speter  ++d->sindex;
154153451Speter
154253451Speter  return i;
154353451Speter}
154453451Speter
154553451Speter/* Find the epsilon closure of a set of positions.  If any position of the set
154653451Speter   contains a symbol that matches the empty string in some context, replace
154753451Speter   that position with the elements of its follow labeled with an appropriate
154853451Speter   constraint.  Repeat exhaustively until no funny positions are left.
154953451Speter   S->elems must be large enough to hold the result. */
155053472Sobrienstatic void
1551131557Stjrepsclosure (position_set *s, struct dfa const *d)
155253451Speter{
155353451Speter  int i, j;
155453451Speter  int *visited;
155553451Speter  position p, old;
155653451Speter
155753451Speter  MALLOC(visited, int, d->tindex);
155853451Speter  for (i = 0; i < d->tindex; ++i)
155953451Speter    visited[i] = 0;
156053451Speter
156153451Speter  for (i = 0; i < s->nelem; ++i)
156253451Speter    if (d->tokens[s->elems[i].index] >= NOTCHAR
156353451Speter	&& d->tokens[s->elems[i].index] != BACKREF
1564131557Stjr#ifdef MBS_SUPPORT
1565131557Stjr	&& d->tokens[s->elems[i].index] != ANYCHAR
1566131557Stjr	&& d->tokens[s->elems[i].index] != MBCSET
1567131557Stjr#endif
156853451Speter	&& d->tokens[s->elems[i].index] < CSET)
156953451Speter      {
157053451Speter	old = s->elems[i];
157153451Speter	p.constraint = old.constraint;
157253451Speter	delete(s->elems[i], s);
157353451Speter	if (visited[old.index])
157453451Speter	  {
157553451Speter	    --i;
157653451Speter	    continue;
157753451Speter	  }
157853451Speter	visited[old.index] = 1;
157953451Speter	switch (d->tokens[old.index])
158053451Speter	  {
158153451Speter	  case BEGLINE:
158253451Speter	    p.constraint &= BEGLINE_CONSTRAINT;
158353451Speter	    break;
158453451Speter	  case ENDLINE:
158553451Speter	    p.constraint &= ENDLINE_CONSTRAINT;
158653451Speter	    break;
158753451Speter	  case BEGWORD:
158853451Speter	    p.constraint &= BEGWORD_CONSTRAINT;
158953451Speter	    break;
159053451Speter	  case ENDWORD:
159153451Speter	    p.constraint &= ENDWORD_CONSTRAINT;
159253451Speter	    break;
159353451Speter	  case LIMWORD:
159453451Speter	    p.constraint &= LIMWORD_CONSTRAINT;
159553451Speter	    break;
159653451Speter	  case NOTLIMWORD:
159753451Speter	    p.constraint &= NOTLIMWORD_CONSTRAINT;
159853451Speter	    break;
159953451Speter	  default:
160053451Speter	    break;
160153451Speter	  }
160253451Speter	for (j = 0; j < d->follows[old.index].nelem; ++j)
160353451Speter	  {
160453451Speter	    p.index = d->follows[old.index].elems[j].index;
160553451Speter	    insert(p, s);
160653451Speter	  }
160753451Speter	/* Force rescan to start at the beginning. */
160853451Speter	i = -1;
160953451Speter      }
161053451Speter
161153451Speter  free(visited);
161253451Speter}
161353451Speter
161453451Speter/* Perform bottom-up analysis on the parse tree, computing various functions.
161553451Speter   Note that at this point, we're pretending constructs like \< are real
161653451Speter   characters rather than constraints on what can follow them.
161753451Speter
161853451Speter   Nullable:  A node is nullable if it is at the root of a regexp that can
161953451Speter   match the empty string.
162053451Speter   *  EMPTY leaves are nullable.
162153451Speter   * No other leaf is nullable.
162253451Speter   * A QMARK or STAR node is nullable.
162353451Speter   * A PLUS node is nullable if its argument is nullable.
162453451Speter   * A CAT node is nullable if both its arguments are nullable.
162553451Speter   * An OR node is nullable if either argument is nullable.
162653451Speter
162753451Speter   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
162853451Speter   that could correspond to the first character of a string matching the
162953451Speter   regexp rooted at the given node.
163053451Speter   * EMPTY leaves have empty firstpos.
163153451Speter   * The firstpos of a nonempty leaf is that leaf itself.
163253451Speter   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
163353451Speter     argument.
163453451Speter   * The firstpos of a CAT node is the firstpos of the left argument, union
163553451Speter     the firstpos of the right if the left argument is nullable.
163653451Speter   * The firstpos of an OR node is the union of firstpos of each argument.
163753451Speter
163853451Speter   Lastpos:  The lastpos of a node is the set of positions that could
163953451Speter   correspond to the last character of a string matching the regexp at
164053451Speter   the given node.
164153451Speter   * EMPTY leaves have empty lastpos.
164253451Speter   * The lastpos of a nonempty leaf is that leaf itself.
164353451Speter   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
164453451Speter     argument.
164553451Speter   * The lastpos of a CAT node is the lastpos of its right argument, union
164653451Speter     the lastpos of the left if the right argument is nullable.
164753451Speter   * The lastpos of an OR node is the union of the lastpos of each argument.
164853451Speter
164953451Speter   Follow:  The follow of a position is the set of positions that could
165053451Speter   correspond to the character following a character matching the node in
165153451Speter   a string matching the regexp.  At this point we consider special symbols
165253451Speter   that match the empty string in some context to be just normal characters.
165353451Speter   Later, if we find that a special symbol is in a follow set, we will
165453451Speter   replace it with the elements of its follow, labeled with an appropriate
165553451Speter   constraint.
165653451Speter   * Every node in the firstpos of the argument of a STAR or PLUS node is in
165753451Speter     the follow of every node in the lastpos.
165853451Speter   * Every node in the firstpos of the second argument of a CAT node is in
165953451Speter     the follow of every node in the lastpos of the first argument.
166053451Speter
166153451Speter   Because of the postfix representation of the parse tree, the depth-first
166253451Speter   analysis is conveniently done by a linear scan with the aid of a stack.
166353451Speter   Sets are stored as arrays of the elements, obeying a stack-like allocation
166453451Speter   scheme; the number of elements in each set deeper in the stack can be
166553451Speter   used to determine the address of a particular set's array. */
166653451Spetervoid
166756919Srudfaanalyze (struct dfa *d, int searchflag)
166853451Speter{
166953451Speter  int *nullable;		/* Nullable stack. */
167053451Speter  int *nfirstpos;		/* Element count stack for firstpos sets. */
167153451Speter  position *firstpos;		/* Array where firstpos elements are stored. */
167253451Speter  int *nlastpos;		/* Element count stack for lastpos sets. */
167353451Speter  position *lastpos;		/* Array where lastpos elements are stored. */
167453451Speter  int *nalloc;			/* Sizes of arrays allocated to follow sets. */
167553451Speter  position_set tmp;		/* Temporary set for merging sets. */
167653451Speter  position_set merged;		/* Result of merging sets. */
167753451Speter  int wants_newline;		/* True if some position wants newline info. */
167853451Speter  int *o_nullable;
167953451Speter  int *o_nfirst, *o_nlast;
168053451Speter  position *o_firstpos, *o_lastpos;
168153451Speter  int i, j;
168253451Speter  position *pos;
168353451Speter
168453451Speter#ifdef DEBUG
168553451Speter  fprintf(stderr, "dfaanalyze:\n");
168653451Speter  for (i = 0; i < d->tindex; ++i)
168753451Speter    {
168853451Speter      fprintf(stderr, " %d:", i);
168953451Speter      prtok(d->tokens[i]);
169053451Speter    }
169153451Speter  putc('\n', stderr);
169253451Speter#endif
169353451Speter
169453451Speter  d->searchflag = searchflag;
169553451Speter
169653451Speter  MALLOC(nullable, int, d->depth);
169753451Speter  o_nullable = nullable;
169853451Speter  MALLOC(nfirstpos, int, d->depth);
169953451Speter  o_nfirst = nfirstpos;
170053451Speter  MALLOC(firstpos, position, d->nleaves);
170153451Speter  o_firstpos = firstpos, firstpos += d->nleaves;
170253451Speter  MALLOC(nlastpos, int, d->depth);
170353451Speter  o_nlast = nlastpos;
170453451Speter  MALLOC(lastpos, position, d->nleaves);
170553451Speter  o_lastpos = lastpos, lastpos += d->nleaves;
170653451Speter  MALLOC(nalloc, int, d->tindex);
170753451Speter  for (i = 0; i < d->tindex; ++i)
170853451Speter    nalloc[i] = 0;
170953451Speter  MALLOC(merged.elems, position, d->nleaves);
171053451Speter
171153451Speter  CALLOC(d->follows, position_set, d->tindex);
171253451Speter
171353451Speter  for (i = 0; i < d->tindex; ++i)
171453451Speter#ifdef DEBUG
171553451Speter    {				/* Nonsyntactic #ifdef goo... */
171653451Speter#endif
171753451Speter    switch (d->tokens[i])
171853451Speter      {
171953451Speter      case EMPTY:
172053451Speter	/* The empty set is nullable. */
172153451Speter	*nullable++ = 1;
172253451Speter
172353451Speter	/* The firstpos and lastpos of the empty leaf are both empty. */
172453451Speter	*nfirstpos++ = *nlastpos++ = 0;
172553451Speter	break;
172653451Speter
172753451Speter      case STAR:
172853451Speter      case PLUS:
172953451Speter	/* Every element in the firstpos of the argument is in the follow
173053451Speter	   of every element in the lastpos. */
173153451Speter	tmp.nelem = nfirstpos[-1];
173253451Speter	tmp.elems = firstpos;
173353451Speter	pos = lastpos;
173453451Speter	for (j = 0; j < nlastpos[-1]; ++j)
173553451Speter	  {
173653451Speter	    merge(&tmp, &d->follows[pos[j].index], &merged);
173753451Speter	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
173853451Speter				 nalloc[pos[j].index], merged.nelem - 1);
173953451Speter	    copy(&merged, &d->follows[pos[j].index]);
174053451Speter	  }
174153451Speter
174253451Speter      case QMARK:
174353451Speter	/* A QMARK or STAR node is automatically nullable. */
174453451Speter	if (d->tokens[i] != PLUS)
174553451Speter	  nullable[-1] = 1;
174653451Speter	break;
174753451Speter
174853451Speter      case CAT:
174953451Speter	/* Every element in the firstpos of the second argument is in the
175053451Speter	   follow of every element in the lastpos of the first argument. */
175153451Speter	tmp.nelem = nfirstpos[-1];
175253451Speter	tmp.elems = firstpos;
175353451Speter	pos = lastpos + nlastpos[-1];
175453451Speter	for (j = 0; j < nlastpos[-2]; ++j)
175553451Speter	  {
175653451Speter	    merge(&tmp, &d->follows[pos[j].index], &merged);
175753451Speter	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
175853451Speter				 nalloc[pos[j].index], merged.nelem - 1);
175953451Speter	    copy(&merged, &d->follows[pos[j].index]);
176053451Speter	  }
176153451Speter
176253451Speter	/* The firstpos of a CAT node is the firstpos of the first argument,
176353451Speter	   union that of the second argument if the first is nullable. */
176453451Speter	if (nullable[-2])
176553451Speter	  nfirstpos[-2] += nfirstpos[-1];
176653451Speter	else
176753451Speter	  firstpos += nfirstpos[-1];
176853451Speter	--nfirstpos;
176953451Speter
177053451Speter	/* The lastpos of a CAT node is the lastpos of the second argument,
177153451Speter	   union that of the first argument if the second is nullable. */
177253451Speter	if (nullable[-1])
177353451Speter	  nlastpos[-2] += nlastpos[-1];
177453451Speter	else
177553451Speter	  {
177653451Speter	    pos = lastpos + nlastpos[-2];
177753451Speter	    for (j = nlastpos[-1] - 1; j >= 0; --j)
177853451Speter	      pos[j] = lastpos[j];
177953451Speter	    lastpos += nlastpos[-2];
178053451Speter	    nlastpos[-2] = nlastpos[-1];
178153451Speter	  }
178253451Speter	--nlastpos;
178353451Speter
178453451Speter	/* A CAT node is nullable if both arguments are nullable. */
178553451Speter	nullable[-2] = nullable[-1] && nullable[-2];
178653451Speter	--nullable;
178753451Speter	break;
178853451Speter
178953451Speter      case OR:
179053451Speter      case ORTOP:
179153451Speter	/* The firstpos is the union of the firstpos of each argument. */
179253451Speter	nfirstpos[-2] += nfirstpos[-1];
179353451Speter	--nfirstpos;
179453451Speter
179553451Speter	/* The lastpos is the union of the lastpos of each argument. */
179653451Speter	nlastpos[-2] += nlastpos[-1];
179753451Speter	--nlastpos;
179853451Speter
179953451Speter	/* An OR node is nullable if either argument is nullable. */
180053451Speter	nullable[-2] = nullable[-1] || nullable[-2];
180153451Speter	--nullable;
180253451Speter	break;
180353451Speter
180453451Speter      default:
180553451Speter	/* Anything else is a nonempty position.  (Note that special
180653451Speter	   constructs like \< are treated as nonempty strings here;
180753451Speter	   an "epsilon closure" effectively makes them nullable later.
180853451Speter	   Backreferences have to get a real position so we can detect
180953451Speter	   transitions on them later.  But they are nullable. */
181053451Speter	*nullable++ = d->tokens[i] == BACKREF;
181153451Speter
181253451Speter	/* This position is in its own firstpos and lastpos. */
181353451Speter	*nfirstpos++ = *nlastpos++ = 1;
181453451Speter	--firstpos, --lastpos;
181553451Speter	firstpos->index = lastpos->index = i;
181653451Speter	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
181753451Speter
181853451Speter	/* Allocate the follow set for this position. */
181953451Speter	nalloc[i] = 1;
182053451Speter	MALLOC(d->follows[i].elems, position, nalloc[i]);
182153451Speter	break;
182253451Speter      }
182353451Speter#ifdef DEBUG
182453451Speter    /* ... balance the above nonsyntactic #ifdef goo... */
182553451Speter      fprintf(stderr, "node %d:", i);
182653451Speter      prtok(d->tokens[i]);
182753451Speter      putc('\n', stderr);
182853451Speter      fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
182953451Speter      fprintf(stderr, " firstpos:");
183053451Speter      for (j = nfirstpos[-1] - 1; j >= 0; --j)
183153451Speter	{
183253451Speter	  fprintf(stderr, " %d:", firstpos[j].index);
183353451Speter	  prtok(d->tokens[firstpos[j].index]);
183453451Speter	}
183553451Speter      fprintf(stderr, "\n lastpos:");
183653451Speter      for (j = nlastpos[-1] - 1; j >= 0; --j)
183753451Speter	{
183853451Speter	  fprintf(stderr, " %d:", lastpos[j].index);
183953451Speter	  prtok(d->tokens[lastpos[j].index]);
184053451Speter	}
184153451Speter      putc('\n', stderr);
184253451Speter    }
184353451Speter#endif
184453451Speter
184553451Speter  /* For each follow set that is the follow set of a real position, replace
184653451Speter     it with its epsilon closure. */
184753451Speter  for (i = 0; i < d->tindex; ++i)
184853451Speter    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1849131557Stjr#ifdef MBS_SUPPORT
1850131557Stjr        || d->tokens[i] == ANYCHAR
1851131557Stjr        || d->tokens[i] == MBCSET
1852131557Stjr#endif
185353451Speter	|| d->tokens[i] >= CSET)
185453451Speter      {
185553451Speter#ifdef DEBUG
185653451Speter	fprintf(stderr, "follows(%d:", i);
185753451Speter	prtok(d->tokens[i]);
185853451Speter	fprintf(stderr, "):");
185953451Speter	for (j = d->follows[i].nelem - 1; j >= 0; --j)
186053451Speter	  {
186153451Speter	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
186253451Speter	    prtok(d->tokens[d->follows[i].elems[j].index]);
186353451Speter	  }
186453451Speter	putc('\n', stderr);
186553451Speter#endif
186653451Speter	copy(&d->follows[i], &merged);
186753451Speter	epsclosure(&merged, d);
186853451Speter	if (d->follows[i].nelem < merged.nelem)
186953451Speter	  REALLOC(d->follows[i].elems, position, merged.nelem);
187053451Speter	copy(&merged, &d->follows[i]);
187153451Speter      }
187253451Speter
187353451Speter  /* Get the epsilon closure of the firstpos of the regexp.  The result will
187453451Speter     be the set of positions of state 0. */
187553451Speter  merged.nelem = 0;
187653451Speter  for (i = 0; i < nfirstpos[-1]; ++i)
187753451Speter    insert(firstpos[i], &merged);
187853451Speter  epsclosure(&merged, d);
187953451Speter
188053451Speter  /* Check if any of the positions of state 0 will want newline context. */
188153451Speter  wants_newline = 0;
188253451Speter  for (i = 0; i < merged.nelem; ++i)
188353451Speter    if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
188453451Speter      wants_newline = 1;
188553451Speter
188653451Speter  /* Build the initial state. */
188753451Speter  d->salloc = 1;
188853451Speter  d->sindex = 0;
188953451Speter  MALLOC(d->states, dfa_state, d->salloc);
189053451Speter  state_index(d, &merged, wants_newline, 0);
189153451Speter
189253451Speter  free(o_nullable);
189353451Speter  free(o_nfirst);
189453451Speter  free(o_firstpos);
189553451Speter  free(o_nlast);
189653451Speter  free(o_lastpos);
189753451Speter  free(nalloc);
189853451Speter  free(merged.elems);
189953451Speter}
190053451Speter
190153451Speter/* Find, for each character, the transition out of state s of d, and store
190253451Speter   it in the appropriate slot of trans.
190353451Speter
190453451Speter   We divide the positions of s into groups (positions can appear in more
190553451Speter   than one group).  Each group is labeled with a set of characters that
190653451Speter   every position in the group matches (taking into account, if necessary,
190753451Speter   preceding context information of s).  For each group, find the union
190853451Speter   of the its elements' follows.  This set is the set of positions of the
190953451Speter   new state.  For each character in the group's label, set the transition
191053451Speter   on this character to be to a state corresponding to the set's positions,
191153451Speter   and its associated backward context information, if necessary.
191253451Speter
191353451Speter   If we are building a searching matcher, we include the positions of state
191453451Speter   0 in every state.
191553451Speter
191653451Speter   The collection of groups is constructed by building an equivalence-class
191753451Speter   partition of the positions of s.
191853451Speter
191953451Speter   For each position, find the set of characters C that it matches.  Eliminate
192053451Speter   any characters from C that fail on grounds of backward context.
192153451Speter
192253451Speter   Search through the groups, looking for a group whose label L has nonempty
192353451Speter   intersection with C.  If L - C is nonempty, create a new group labeled
192453451Speter   L - C and having the same positions as the current group, and set L to
192553451Speter   the intersection of L and C.  Insert the position in this group, set
192653451Speter   C = C - L, and resume scanning.
192753451Speter
192853451Speter   If after comparing with every group there are characters remaining in C,
192953451Speter   create a new group labeled with the characters of C and insert this
193053451Speter   position in that group. */
193153451Spetervoid
193256919Srudfastate (int s, struct dfa *d, int trans[])
193353451Speter{
193453451Speter  position_set grps[NOTCHAR];	/* As many as will ever be needed. */
193553451Speter  charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
193653451Speter  int ngrps = 0;		/* Number of groups actually used. */
193753451Speter  position pos;			/* Current position being considered. */
193853451Speter  charclass matches;		/* Set of matching characters. */
193953451Speter  int matchesf;			/* True if matches is nonempty. */
194053451Speter  charclass intersect;		/* Intersection with some label set. */
194153451Speter  int intersectf;		/* True if intersect is nonempty. */
194253451Speter  charclass leftovers;		/* Stuff in the label that didn't match. */
194353451Speter  int leftoversf;		/* True if leftovers is nonempty. */
194453451Speter  static charclass letters;	/* Set of characters considered letters. */
194553451Speter  static charclass newline;	/* Set of characters that aren't newline. */
194653451Speter  position_set follows;		/* Union of the follows of some group. */
194753451Speter  position_set tmp;		/* Temporary space for merging sets. */
194853451Speter  int state;			/* New state. */
194953451Speter  int wants_newline;		/* New state wants to know newline context. */
195053451Speter  int state_newline;		/* New state on a newline transition. */
195153451Speter  int wants_letter;		/* New state wants to know letter context. */
195253451Speter  int state_letter;		/* New state on a letter transition. */
195353472Sobrien  static int initialized;	/* Flag for static initialization. */
1954131557Stjr#ifdef MBS_SUPPORT
1955131557Stjr  int next_isnt_1st_byte = 0;	/* Flag If we can't add state0.  */
1956131557Stjr#endif
195753451Speter  int i, j, k;
195853451Speter
195953451Speter  /* Initialize the set of letters, if necessary. */
196053451Speter  if (! initialized)
196153451Speter    {
196253451Speter      initialized = 1;
196353451Speter      for (i = 0; i < NOTCHAR; ++i)
196453472Sobrien	if (IS_WORD_CONSTITUENT(i))
196553451Speter	  setbit(i, letters);
196655379Sobrien      setbit(eolbyte, newline);
196753451Speter    }
196853451Speter
196953451Speter  zeroset(matches);
197053451Speter
197153451Speter  for (i = 0; i < d->states[s].elems.nelem; ++i)
197253451Speter    {
197353451Speter      pos = d->states[s].elems.elems[i];
197453451Speter      if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
197553451Speter	setbit(d->tokens[pos.index], matches);
197653451Speter      else if (d->tokens[pos.index] >= CSET)
197753451Speter	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1978131557Stjr#ifdef MBS_SUPPORT
1979131557Stjr      else if (d->tokens[pos.index] == ANYCHAR
1980131557Stjr               || d->tokens[pos.index] == MBCSET)
1981131557Stjr      /* MB_CUR_MAX > 1  */
1982131557Stjr	{
1983131557Stjr	  /* ANYCHAR and MBCSET must match with a single character, so we
1984131557Stjr	     must put it to d->states[s].mbps, which contains the positions
1985131557Stjr	     which can match with a single character not a byte.  */
1986131557Stjr	  if (d->states[s].mbps.nelem == 0)
1987131557Stjr	    {
1988131557Stjr	      MALLOC(d->states[s].mbps.elems, position,
1989131557Stjr		     d->states[s].elems.nelem);
1990131557Stjr	    }
1991131557Stjr	  insert(pos, &(d->states[s].mbps));
1992131557Stjr	  continue;
1993131557Stjr	}
1994131557Stjr#endif /* MBS_SUPPORT */
199553451Speter      else
199653451Speter	continue;
199753451Speter
199853451Speter      /* Some characters may need to be eliminated from matches because
199953451Speter	 they fail in the current context. */
200053451Speter      if (pos.constraint != 0xFF)
200153451Speter	{
200253451Speter	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
200353451Speter					 d->states[s].newline, 1))
200455379Sobrien	    clrbit(eolbyte, matches);
200553451Speter	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
200653451Speter					 d->states[s].newline, 0))
200753451Speter	    for (j = 0; j < CHARCLASS_INTS; ++j)
200853451Speter	      matches[j] &= newline[j];
200953451Speter	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
201053451Speter					d->states[s].letter, 1))
201153451Speter	    for (j = 0; j < CHARCLASS_INTS; ++j)
201253451Speter	      matches[j] &= ~letters[j];
201353451Speter	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
201453451Speter					d->states[s].letter, 0))
201553451Speter	    for (j = 0; j < CHARCLASS_INTS; ++j)
201653451Speter	      matches[j] &= letters[j];
201753451Speter
201853451Speter	  /* If there are no characters left, there's no point in going on. */
201953451Speter	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
202053472Sobrien	    continue;
202153451Speter	  if (j == CHARCLASS_INTS)
202253451Speter	    continue;
202353451Speter	}
202453451Speter
202553451Speter      for (j = 0; j < ngrps; ++j)
202653451Speter	{
202753451Speter	  /* If matches contains a single character only, and the current
202853451Speter	     group's label doesn't contain that character, go on to the
202953451Speter	     next group. */
203053451Speter	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
203153451Speter	      && !tstbit(d->tokens[pos.index], labels[j]))
203253451Speter	    continue;
203353451Speter
203453451Speter	  /* Check if this group's label has a nonempty intersection with
203553451Speter	     matches. */
203653451Speter	  intersectf = 0;
203753451Speter	  for (k = 0; k < CHARCLASS_INTS; ++k)
203853472Sobrien	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
203953451Speter	  if (! intersectf)
204053451Speter	    continue;
204153451Speter
204253451Speter	  /* It does; now find the set differences both ways. */
204353451Speter	  leftoversf = matchesf = 0;
204453451Speter	  for (k = 0; k < CHARCLASS_INTS; ++k)
204553451Speter	    {
204653451Speter	      /* Even an optimizing compiler can't know this for sure. */
204753451Speter	      int match = matches[k], label = labels[j][k];
204853451Speter
204953472Sobrien	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
205053472Sobrien	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
205153451Speter	    }
205253451Speter
205353451Speter	  /* If there were leftovers, create a new group labeled with them. */
205453451Speter	  if (leftoversf)
205553451Speter	    {
205653451Speter	      copyset(leftovers, labels[ngrps]);
205753451Speter	      copyset(intersect, labels[j]);
205853451Speter	      MALLOC(grps[ngrps].elems, position, d->nleaves);
205953451Speter	      copy(&grps[j], &grps[ngrps]);
206053451Speter	      ++ngrps;
206153451Speter	    }
206253451Speter
206353451Speter	  /* Put the position in the current group.  Note that there is no
206453451Speter	     reason to call insert() here. */
206553451Speter	  grps[j].elems[grps[j].nelem++] = pos;
206653451Speter
206753451Speter	  /* If every character matching the current position has been
206853451Speter	     accounted for, we're done. */
206953451Speter	  if (! matchesf)
207053451Speter	    break;
207153451Speter	}
207253451Speter
207353451Speter      /* If we've passed the last group, and there are still characters
207453451Speter	 unaccounted for, then we'll have to create a new group. */
207553451Speter      if (j == ngrps)
207653451Speter	{
207753451Speter	  copyset(matches, labels[ngrps]);
207853451Speter	  zeroset(matches);
207953451Speter	  MALLOC(grps[ngrps].elems, position, d->nleaves);
208053451Speter	  grps[ngrps].nelem = 1;
208153451Speter	  grps[ngrps].elems[0] = pos;
208253451Speter	  ++ngrps;
208353451Speter	}
208453451Speter    }
208553451Speter
208653451Speter  MALLOC(follows.elems, position, d->nleaves);
208753451Speter  MALLOC(tmp.elems, position, d->nleaves);
208853451Speter
208953451Speter  /* If we are a searching matcher, the default transition is to a state
209053451Speter     containing the positions of state 0, otherwise the default transition
209153451Speter     is to fail miserably. */
209253451Speter  if (d->searchflag)
209353451Speter    {
209453451Speter      wants_newline = 0;
209553451Speter      wants_letter = 0;
209653451Speter      for (i = 0; i < d->states[0].elems.nelem; ++i)
209753451Speter	{
209853451Speter	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
209953451Speter	    wants_newline = 1;
210053451Speter	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
210153451Speter	    wants_letter = 1;
210253451Speter	}
210353451Speter      copy(&d->states[0].elems, &follows);
210453451Speter      state = state_index(d, &follows, 0, 0);
210553451Speter      if (wants_newline)
210653451Speter	state_newline = state_index(d, &follows, 1, 0);
210753451Speter      else
210853451Speter	state_newline = state;
210953451Speter      if (wants_letter)
211053451Speter	state_letter = state_index(d, &follows, 0, 1);
211153451Speter      else
211253451Speter	state_letter = state;
211353451Speter      for (i = 0; i < NOTCHAR; ++i)
211453472Sobrien	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
211555379Sobrien      trans[eolbyte] = state_newline;
211653451Speter    }
211753451Speter  else
211853451Speter    for (i = 0; i < NOTCHAR; ++i)
211953451Speter      trans[i] = -1;
212053451Speter
212153451Speter  for (i = 0; i < ngrps; ++i)
212253451Speter    {
212353451Speter      follows.nelem = 0;
212453451Speter
212553451Speter      /* Find the union of the follows of the positions of the group.
212653451Speter	 This is a hideously inefficient loop.  Fix it someday. */
212753451Speter      for (j = 0; j < grps[i].nelem; ++j)
212853451Speter	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
212953451Speter	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
213053451Speter
2131131557Stjr#ifdef MBS_SUPPORT
2132131557Stjr      if (MB_CUR_MAX > 1)
2133131557Stjr	{
2134131557Stjr	  /* If a token in follows.elems is not 1st byte of a multibyte
2135131557Stjr	     character, or the states of follows must accept the bytes
2136131557Stjr	     which are not 1st byte of the multibyte character.
2137131557Stjr	     Then, if a state of follows encounter a byte, it must not be
2138131557Stjr	     a 1st byte of a multibyte character nor singlebyte character.
2139131557Stjr	     We cansel to add state[0].follows to next state, because
2140131557Stjr	     state[0] must accept 1st-byte
2141131557Stjr
2142131557Stjr	     For example, we assume <sb a> is a certain singlebyte
2143131557Stjr	     character, <mb A> is a certain multibyte character, and the
2144131557Stjr	     codepoint of <sb a> equals the 2nd byte of the codepoint of
2145131557Stjr	     <mb A>.
2146131557Stjr	     When state[0] accepts <sb a>, state[i] transit to state[i+1]
2147131557Stjr	     by accepting accepts 1st byte of <mb A>, and state[i+1]
2148131557Stjr	     accepts 2nd byte of <mb A>, if state[i+1] encounter the
2149131557Stjr	     codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2150131557Stjr	     <mb A>, so we can not add state[0].  */
2151131557Stjr
2152131557Stjr	  next_isnt_1st_byte = 0;
2153131557Stjr	  for (j = 0; j < follows.nelem; ++j)
2154131557Stjr	    {
2155131557Stjr	      if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2156131557Stjr		{
2157131557Stjr		  next_isnt_1st_byte = 1;
2158131557Stjr		  break;
2159131557Stjr		}
2160131557Stjr	    }
2161131557Stjr	}
2162131557Stjr#endif
2163131557Stjr
216453451Speter      /* If we are building a searching matcher, throw in the positions
216553451Speter	 of state 0 as well. */
2166131557Stjr#ifdef MBS_SUPPORT
2167131557Stjr      if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2168131557Stjr#else
216953451Speter      if (d->searchflag)
2170131557Stjr#endif
217153451Speter	for (j = 0; j < d->states[0].elems.nelem; ++j)
217253451Speter	  insert(d->states[0].elems.elems[j], &follows);
217353451Speter
217453451Speter      /* Find out if the new state will want any context information. */
217553451Speter      wants_newline = 0;
217655379Sobrien      if (tstbit(eolbyte, labels[i]))
217753451Speter	for (j = 0; j < follows.nelem; ++j)
217853451Speter	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
217953451Speter	    wants_newline = 1;
218053451Speter
218153451Speter      wants_letter = 0;
218253451Speter      for (j = 0; j < CHARCLASS_INTS; ++j)
218353451Speter	if (labels[i][j] & letters[j])
218453451Speter	  break;
218553451Speter      if (j < CHARCLASS_INTS)
218653451Speter	for (j = 0; j < follows.nelem; ++j)
218753451Speter	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
218853451Speter	    wants_letter = 1;
218953451Speter
219053451Speter      /* Find the state(s) corresponding to the union of the follows. */
219153451Speter      state = state_index(d, &follows, 0, 0);
219253451Speter      if (wants_newline)
219353451Speter	state_newline = state_index(d, &follows, 1, 0);
219453451Speter      else
219553451Speter	state_newline = state;
219653451Speter      if (wants_letter)
219753451Speter	state_letter = state_index(d, &follows, 0, 1);
219853451Speter      else
219953451Speter	state_letter = state;
220053451Speter
220153451Speter      /* Set the transitions for each character in the current label. */
220253451Speter      for (j = 0; j < CHARCLASS_INTS; ++j)
220353451Speter	for (k = 0; k < INTBITS; ++k)
220453451Speter	  if (labels[i][j] & 1 << k)
220553451Speter	    {
220653451Speter	      int c = j * INTBITS + k;
220753451Speter
220855379Sobrien	      if (c == eolbyte)
220953451Speter		trans[c] = state_newline;
221053472Sobrien	      else if (IS_WORD_CONSTITUENT(c))
221153451Speter		trans[c] = state_letter;
221253451Speter	      else if (c < NOTCHAR)
221353451Speter		trans[c] = state;
221453451Speter	    }
221553451Speter    }
221653451Speter
221753451Speter  for (i = 0; i < ngrps; ++i)
221853451Speter    free(grps[i].elems);
221953451Speter  free(follows.elems);
222053451Speter  free(tmp.elems);
222153451Speter}
222253451Speter
222353451Speter/* Some routines for manipulating a compiled dfa's transition tables.
222453451Speter   Each state may or may not have a transition table; if it does, and it
222553451Speter   is a non-accepting state, then d->trans[state] points to its table.
222653451Speter   If it is an accepting state then d->fails[state] points to its table.
222753451Speter   If it has no table at all, then d->trans[state] is NULL.
222853451Speter   TODO: Improve this comment, get rid of the unnecessary redundancy. */
222953451Speter
223053451Speterstatic void
223156919Srubuild_state (int s, struct dfa *d)
223253451Speter{
223353451Speter  int *trans;			/* The new transition table. */
223453451Speter  int i;
223553451Speter
223653451Speter  /* Set an upper limit on the number of transition tables that will ever
223753451Speter     exist at once.  1024 is arbitrary.  The idea is that the frequently
223853451Speter     used transition tables will be quickly rebuilt, whereas the ones that
223953451Speter     were only needed once or twice will be cleared away. */
224053451Speter  if (d->trcount >= 1024)
224153451Speter    {
224253451Speter      for (i = 0; i < d->tralloc; ++i)
224353451Speter	if (d->trans[i])
224453451Speter	  {
224553451Speter	    free((ptr_t) d->trans[i]);
224653451Speter	    d->trans[i] = NULL;
224753451Speter	  }
224853451Speter	else if (d->fails[i])
224953451Speter	  {
225053451Speter	    free((ptr_t) d->fails[i]);
225153451Speter	    d->fails[i] = NULL;
225253451Speter	  }
225353451Speter      d->trcount = 0;
225453451Speter    }
225553451Speter
225653451Speter  ++d->trcount;
225753451Speter
225853451Speter  /* Set up the success bits for this state. */
225953451Speter  d->success[s] = 0;
226053451Speter  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
226153451Speter      s, *d))
226253451Speter    d->success[s] |= 4;
226353451Speter  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
226453451Speter      s, *d))
226553451Speter    d->success[s] |= 2;
226653451Speter  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
226753451Speter      s, *d))
226853451Speter    d->success[s] |= 1;
226953451Speter
227053451Speter  MALLOC(trans, int, NOTCHAR);
227153451Speter  dfastate(s, d, trans);
227253451Speter
227353451Speter  /* Now go through the new transition table, and make sure that the trans
227453451Speter     and fail arrays are allocated large enough to hold a pointer for the
227553451Speter     largest state mentioned in the table. */
227653451Speter  for (i = 0; i < NOTCHAR; ++i)
227753451Speter    if (trans[i] >= d->tralloc)
227853451Speter      {
227953451Speter	int oldalloc = d->tralloc;
228053451Speter
228153451Speter	while (trans[i] >= d->tralloc)
228253451Speter	  d->tralloc *= 2;
228353451Speter	REALLOC(d->realtrans, int *, d->tralloc + 1);
228453451Speter	d->trans = d->realtrans + 1;
228553451Speter	REALLOC(d->fails, int *, d->tralloc);
228653451Speter	REALLOC(d->success, int, d->tralloc);
228753451Speter	while (oldalloc < d->tralloc)
228853451Speter	  {
228953451Speter	    d->trans[oldalloc] = NULL;
229053451Speter	    d->fails[oldalloc++] = NULL;
229153451Speter	  }
229253451Speter      }
229353451Speter
2294131557Stjr  /* Newline is a sentinel.  */
229555379Sobrien  trans[eolbyte] = -1;
229653451Speter
229753451Speter  if (ACCEPTING(s, *d))
229853451Speter    d->fails[s] = trans;
229953451Speter  else
230053451Speter    d->trans[s] = trans;
230153451Speter}
230253451Speter
230353451Speterstatic void
230456919Srubuild_state_zero (struct dfa *d)
230553451Speter{
230653451Speter  d->tralloc = 1;
230753451Speter  d->trcount = 0;
230853451Speter  CALLOC(d->realtrans, int *, d->tralloc + 1);
230953451Speter  d->trans = d->realtrans + 1;
231053451Speter  CALLOC(d->fails, int *, d->tralloc);
231153451Speter  MALLOC(d->success, int, d->tralloc);
231253451Speter  build_state(0, d);
231353451Speter}
231453451Speter
2315131557Stjr#ifdef MBS_SUPPORT
2316131557Stjr/* Multibyte character handling sub-routins for dfaexec.  */
2317131557Stjr
2318131557Stjr/* Initial state may encounter the byte which is not a singlebyte character
2319131557Stjr   nor 1st byte of a multibyte character.  But it is incorrect for initial
2320131557Stjr   state to accept such a byte.
2321131557Stjr   For example, in sjis encoding the regular expression like "\\" accepts
2322131557Stjr   the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2323131557Stjr   0x815c. Then Initial state must skip the bytes which are not a singlebyte
2324131557Stjr   character nor 1st byte of a multibyte character.  */
2325131557Stjr#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)		\
2326131557Stjr  if (s == 0)						\
2327131557Stjr    {							\
2328131557Stjr      while (inputwcs[p - buf_begin] == 0		\
2329131557Stjr            && mblen_buf[p - buf_begin] > 0		\
2330131557Stjr	    && p < buf_end)				\
2331131557Stjr        ++p;						\
2332131557Stjr      if (p >= end)					\
2333131557Stjr	{						\
2334131557Stjr          free(mblen_buf);				\
2335131557Stjr          free(inputwcs);				\
2336131557Stjr	  return (size_t) -1;				\
2337131557Stjr	}						\
2338131557Stjr    }
2339131557Stjr
2340131557Stjrstatic void
2341131557Stjrrealloc_trans_if_necessary(struct dfa *d, int new_state)
2342131557Stjr{
2343131557Stjr  /* Make sure that the trans and fail arrays are allocated large enough
2344131557Stjr     to hold a pointer for the new state. */
2345131557Stjr  if (new_state >= d->tralloc)
2346131557Stjr    {
2347131557Stjr      int oldalloc = d->tralloc;
2348131557Stjr
2349131557Stjr      while (new_state >= d->tralloc)
2350131557Stjr	d->tralloc *= 2;
2351131557Stjr      REALLOC(d->realtrans, int *, d->tralloc + 1);
2352131557Stjr      d->trans = d->realtrans + 1;
2353131557Stjr      REALLOC(d->fails, int *, d->tralloc);
2354131557Stjr      REALLOC(d->success, int, d->tralloc);
2355131557Stjr      while (oldalloc < d->tralloc)
2356131557Stjr	{
2357131557Stjr	  d->trans[oldalloc] = NULL;
2358131557Stjr	  d->fails[oldalloc++] = NULL;
2359131557Stjr	}
2360131557Stjr    }
2361131557Stjr}
2362131557Stjr
2363131557Stjr/* Return values of transit_state_singlebyte(), and
2364131557Stjr   transit_state_consume_1char.  */
2365131557Stjrtypedef enum
2366131557Stjr{
2367131557Stjr  TRANSIT_STATE_IN_PROGRESS,	/* State transition has not finished.  */
2368131557Stjr  TRANSIT_STATE_DONE,		/* State transition has finished.  */
2369131557Stjr  TRANSIT_STATE_END_BUFFER	/* Reach the end of the buffer.  */
2370131557Stjr} status_transit_state;
2371131557Stjr
2372131557Stjr/* Consume a single byte and transit state from 's' to '*next_state'.
2373131557Stjr   This function is almost same as the state transition routin in dfaexec().
2374131557Stjr   But state transition is done just once, otherwise matching succeed or
2375131557Stjr   reach the end of the buffer.  */
2376131557Stjrstatic status_transit_state
2377131557Stjrtransit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2378131557Stjr				  int *next_state)
2379131557Stjr{
2380131557Stjr  int *t;
2381131557Stjr  int works = s;
2382131557Stjr
2383131557Stjr  status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2384131557Stjr
2385131557Stjr  while (rval == TRANSIT_STATE_IN_PROGRESS)
2386131557Stjr    {
2387131557Stjr      if ((t = d->trans[works]) != NULL)
2388131557Stjr	{
2389131557Stjr	  works = t[*p];
2390131557Stjr	  rval = TRANSIT_STATE_DONE;
2391131557Stjr	  if (works < 0)
2392131557Stjr	    works = 0;
2393131557Stjr	}
2394131557Stjr      else if (works < 0)
2395131557Stjr	{
2396131557Stjr	  if (p == buf_end)
2397131557Stjr	    /* At the moment, it must not happen.  */
2398131557Stjr	    return TRANSIT_STATE_END_BUFFER;
2399131557Stjr	  works = 0;
2400131557Stjr	}
2401131557Stjr      else if (d->fails[works])
2402131557Stjr	{
2403131557Stjr	  works = d->fails[works][*p];
2404131557Stjr	  rval = TRANSIT_STATE_DONE;
2405131557Stjr	}
2406131557Stjr      else
2407131557Stjr	{
2408131557Stjr	  build_state(works, d);
2409131557Stjr	}
2410131557Stjr    }
2411131557Stjr  *next_state = works;
2412131557Stjr  return rval;
2413131557Stjr}
2414131557Stjr
2415131557Stjr/* Check whether period can match or not in the current context.  If it can,
2416131557Stjr   return the amount of the bytes with which period can match, otherwise
2417131557Stjr   return 0.
2418131557Stjr   `pos' is the position of the period.  `index' is the index from the
2419131557Stjr   buf_begin, and it is the current position in the buffer.  */
2420131557Stjrstatic int
2421131557Stjrmatch_anychar (struct dfa *d, int s, position pos, int index)
2422131557Stjr{
2423131557Stjr  int newline = 0;
2424131557Stjr  int letter = 0;
2425131557Stjr  wchar_t wc;
2426131557Stjr  int mbclen;
2427131557Stjr
2428131557Stjr  wc = inputwcs[index];
2429131557Stjr  mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2430131557Stjr
2431131557Stjr  /* Check context.  */
2432131557Stjr  if (wc == (wchar_t)eolbyte)
2433131557Stjr    {
2434131557Stjr      if (!(syntax_bits & RE_DOT_NEWLINE))
2435131557Stjr	return 0;
2436131557Stjr      newline = 1;
2437131557Stjr    }
2438131557Stjr  else if (wc == (wchar_t)'\0')
2439131557Stjr    {
2440131557Stjr      if (syntax_bits & RE_DOT_NOT_NULL)
2441131557Stjr	return 0;
2442131557Stjr      newline = 1;
2443131557Stjr    }
2444131557Stjr
2445131557Stjr  if (iswalnum(wc) || wc == L'_')
2446131557Stjr    letter = 1;
2447131557Stjr
2448131557Stjr  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2449131557Stjr			   newline, d->states[s].letter, letter))
2450131557Stjr    return 0;
2451131557Stjr
2452131557Stjr  return mbclen;
2453131557Stjr}
2454131557Stjr
2455131557Stjr/* Check whether bracket expression can match or not in the current context.
2456131557Stjr   If it can, return the amount of the bytes with which expression can match,
2457131557Stjr   otherwise return 0.
2458131557Stjr   `pos' is the position of the bracket expression.  `index' is the index
2459131557Stjr   from the buf_begin, and it is the current position in the buffer.  */
2460131557Stjrint
2461131557Stjrmatch_mb_charset (struct dfa *d, int s, position pos, int index)
2462131557Stjr{
2463131557Stjr  int i;
2464131557Stjr  int match;		/* Flag which represent that matching succeed.  */
2465131557Stjr  int match_len;	/* Length of the character (or collating element)
2466131557Stjr			   with which this operator match.  */
2467250823Spfg  size_t op_len;	/* Length of the operator.  */
2468131557Stjr  char buffer[128];
2469131557Stjr  wchar_t wcbuf[6];
2470131557Stjr
2471131557Stjr  /* Pointer to the structure to which we are currently reffering.  */
2472131557Stjr  struct mb_char_classes *work_mbc;
2473131557Stjr
2474131557Stjr  int newline = 0;
2475131557Stjr  int letter = 0;
2476131557Stjr  wchar_t wc;		/* Current reffering character.  */
2477131557Stjr
2478131557Stjr  wc = inputwcs[index];
2479131557Stjr
2480131557Stjr  /* Check context.  */
2481131557Stjr  if (wc == (wchar_t)eolbyte)
2482131557Stjr    {
2483131557Stjr      if (!(syntax_bits & RE_DOT_NEWLINE))
2484131557Stjr	return 0;
2485131557Stjr      newline = 1;
2486131557Stjr    }
2487131557Stjr  else if (wc == (wchar_t)'\0')
2488131557Stjr    {
2489131557Stjr      if (syntax_bits & RE_DOT_NOT_NULL)
2490131557Stjr	return 0;
2491131557Stjr      newline = 1;
2492131557Stjr    }
2493131557Stjr  if (iswalnum(wc) || wc == L'_')
2494131557Stjr    letter = 1;
2495131557Stjr  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2496131557Stjr			   newline, d->states[s].letter, letter))
2497131557Stjr    return 0;
2498131557Stjr
2499131557Stjr  /* Assign the current reffering operator to work_mbc.  */
2500131557Stjr  work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2501131557Stjr  match = !work_mbc->invert;
2502131557Stjr  match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2503131557Stjr
2504131557Stjr  /* match with a character class?  */
2505131557Stjr  for (i = 0; i<work_mbc->nch_classes; i++)
2506131557Stjr    {
2507131557Stjr      if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2508131557Stjr	goto charset_matched;
2509131557Stjr    }
2510131557Stjr
2511131557Stjr  strncpy(buffer, buf_begin + index, match_len);
2512131557Stjr  buffer[match_len] = '\0';
2513131557Stjr
2514131557Stjr  /* match with an equivalent class?  */
2515131557Stjr  for (i = 0; i<work_mbc->nequivs; i++)
2516131557Stjr    {
2517131557Stjr      op_len = strlen(work_mbc->equivs[i]);
2518131557Stjr      strncpy(buffer, buf_begin + index, op_len);
2519131557Stjr      buffer[op_len] = '\0';
2520131557Stjr      if (strcoll(work_mbc->equivs[i], buffer) == 0)
2521131557Stjr	{
2522131557Stjr	  match_len = op_len;
2523131557Stjr	  goto charset_matched;
2524131557Stjr	}
2525131557Stjr    }
2526131557Stjr
2527131557Stjr  /* match with a collating element?  */
2528131557Stjr  for (i = 0; i<work_mbc->ncoll_elems; i++)
2529131557Stjr    {
2530131557Stjr      op_len = strlen(work_mbc->coll_elems[i]);
2531131557Stjr      strncpy(buffer, buf_begin + index, op_len);
2532131557Stjr      buffer[op_len] = '\0';
2533131557Stjr
2534131557Stjr      if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2535131557Stjr	{
2536131557Stjr	  match_len = op_len;
2537131557Stjr	  goto charset_matched;
2538131557Stjr	}
2539131557Stjr    }
2540131557Stjr
2541131557Stjr  wcbuf[0] = wc;
2542131557Stjr  wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2543131557Stjr
2544131557Stjr  /* match with a range?  */
2545131557Stjr  for (i = 0; i<work_mbc->nranges; i++)
2546131557Stjr    {
2547131557Stjr      wcbuf[2] = work_mbc->range_sts[i];
2548131557Stjr      wcbuf[4] = work_mbc->range_ends[i];
2549131557Stjr
2550131557Stjr      if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2551131557Stjr	  wcscoll(wcbuf+4, wcbuf) >= 0)
2552131557Stjr	goto charset_matched;
2553131557Stjr    }
2554131557Stjr
2555131557Stjr  /* match with a character?  */
2556146200Stjr  if (case_fold)
2557146200Stjr    wc = towlower (wc);
2558131557Stjr  for (i = 0; i<work_mbc->nchars; i++)
2559131557Stjr    {
2560131557Stjr      if (wc == work_mbc->chars[i])
2561131557Stjr	goto charset_matched;
2562131557Stjr    }
2563131557Stjr
2564131557Stjr  match = !match;
2565131557Stjr
2566131557Stjr charset_matched:
2567131557Stjr  return match ? match_len : 0;
2568131557Stjr}
2569131557Stjr
2570131557Stjr/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2571131557Stjr   array which corresponds to `d->states[s].mbps.elem' and each element of
2572131557Stjr   the array contains the amount of the bytes with which the element can
2573131557Stjr   match.
2574131557Stjr   `index' is the index from the buf_begin, and it is the current position
2575131557Stjr   in the buffer.
2576131557Stjr   Caller MUST free the array which this function return.  */
2577131557Stjrstatic int*
2578131557Stjrcheck_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2579131557Stjr{
2580131557Stjr  int i;
2581131557Stjr  int* rarray;
2582131557Stjr
2583131557Stjr  MALLOC(rarray, int, d->states[s].mbps.nelem);
2584131557Stjr  for (i = 0; i < d->states[s].mbps.nelem; ++i)
2585131557Stjr    {
2586131557Stjr      position pos = d->states[s].mbps.elems[i];
2587131557Stjr      switch(d->tokens[pos.index])
2588131557Stjr	{
2589131557Stjr	case ANYCHAR:
2590131557Stjr	  rarray[i] = match_anychar(d, s, pos, index);
2591131557Stjr	  break;
2592131557Stjr	case MBCSET:
2593131557Stjr	  rarray[i] = match_mb_charset(d, s, pos, index);
2594131557Stjr	  break;
2595131557Stjr	default:
2596131557Stjr	  break; /* can not happen.  */
2597131557Stjr	}
2598131557Stjr    }
2599131557Stjr  return rarray;
2600131557Stjr}
2601131557Stjr
2602131557Stjr/* Consume a single character and enumerate all of the positions which can
2603131557Stjr   be next position from the state `s'.
2604131557Stjr   `match_lens' is the input. It can be NULL, but it can also be the output
2605131557Stjr   of check_matching_with_multibyte_ops() for optimization.
2606131557Stjr   `mbclen' and `pps' are the output.  `mbclen' is the length of the
2607131557Stjr   character consumed, and `pps' is the set this function enumerate.  */
2608131557Stjrstatic status_transit_state
2609131557Stjrtransit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2610131557Stjr			     int *match_lens, int *mbclen, position_set *pps)
2611131557Stjr{
2612131557Stjr  int i, j;
2613131557Stjr  int s1, s2;
2614131557Stjr  int* work_mbls;
2615131557Stjr  status_transit_state rs = TRANSIT_STATE_DONE;
2616131557Stjr
2617131557Stjr  /* Calculate the length of the (single/multi byte) character
2618131557Stjr     to which p points.  */
2619131557Stjr  *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2620131557Stjr    : mblen_buf[*pp - buf_begin];
2621131557Stjr
2622131557Stjr  /* Calculate the state which can be reached from the state `s' by
2623131557Stjr     consuming `*mbclen' single bytes from the buffer.  */
2624131557Stjr  s1 = s;
2625131557Stjr  for (i = 0; i < *mbclen; i++)
2626131557Stjr    {
2627131557Stjr      s2 = s1;
2628131557Stjr      rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2629131557Stjr    }
2630131557Stjr  /* Copy the positions contained by `s1' to the set `pps'.  */
2631131557Stjr  copy(&(d->states[s1].elems), pps);
2632131557Stjr
2633131557Stjr  /* Check (inputed)match_lens, and initialize if it is NULL.  */
2634131557Stjr  if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2635131557Stjr    work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2636131557Stjr  else
2637131557Stjr    work_mbls = match_lens;
2638131557Stjr
2639131557Stjr  /* Add all of the positions which can be reached from `s' by consuming
2640131557Stjr     a single character.  */
2641131557Stjr  for (i = 0; i < d->states[s].mbps.nelem ; i++)
2642131557Stjr   {
2643131557Stjr      if (work_mbls[i] == *mbclen)
2644131557Stjr	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2645131557Stjr	     j++)
2646131557Stjr	  insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2647131557Stjr		 pps);
2648131557Stjr    }
2649131557Stjr
2650131557Stjr  if (match_lens == NULL && work_mbls != NULL)
2651131557Stjr    free(work_mbls);
2652131557Stjr  return rs;
2653131557Stjr}
2654131557Stjr
2655131557Stjr/* Transit state from s, then return new state and update the pointer of the
2656131557Stjr   buffer.  This function is for some operator which can match with a multi-
2657131557Stjr   byte character or a collating element(which may be multi characters).  */
2658131557Stjrstatic int
2659131557Stjrtransit_state (struct dfa *d, int s, unsigned char const **pp)
2660131557Stjr{
2661131557Stjr  int s1;
2662131557Stjr  int mbclen;		/* The length of current input multibyte character. */
2663131557Stjr  int maxlen = 0;
2664131557Stjr  int i, j;
2665131557Stjr  int *match_lens = NULL;
2666131557Stjr  int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2667131557Stjr  position_set follows;
2668131557Stjr  unsigned char const *p1 = *pp;
2669131557Stjr  status_transit_state rs;
2670131557Stjr  wchar_t wc;
2671131557Stjr
2672131557Stjr  if (nelem > 0)
2673131557Stjr    /* This state has (a) multibyte operator(s).
2674131557Stjr       We check whether each of them can match or not.  */
2675131557Stjr    {
2676131557Stjr      /* Note: caller must free the return value of this function.  */
2677131557Stjr      match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2678131557Stjr
2679131557Stjr      for (i = 0; i < nelem; i++)
2680131557Stjr	/* Search the operator which match the longest string,
2681131557Stjr	   in this state.  */
2682131557Stjr	{
2683131557Stjr	  if (match_lens[i] > maxlen)
2684131557Stjr	    maxlen = match_lens[i];
2685131557Stjr	}
2686131557Stjr    }
2687131557Stjr
2688131557Stjr  if (nelem == 0 || maxlen == 0)
2689131557Stjr    /* This state has no multibyte operator which can match.
2690131557Stjr       We need to  check only one singlebyte character.  */
2691131557Stjr    {
2692131557Stjr      status_transit_state rs;
2693131557Stjr      rs = transit_state_singlebyte(d, s, *pp, &s1);
2694131557Stjr
2695131557Stjr      /* We must update the pointer if state transition succeeded.  */
2696131557Stjr      if (rs == TRANSIT_STATE_DONE)
2697131557Stjr	++*pp;
2698131557Stjr
2699131557Stjr      if (match_lens != NULL)
2700131557Stjr	free(match_lens);
2701131557Stjr      return s1;
2702131557Stjr    }
2703131557Stjr
2704131557Stjr  /* This state has some operators which can match a multibyte character.  */
2705131557Stjr  follows.nelem = 0;
2706131557Stjr  MALLOC(follows.elems, position, d->nleaves);
2707131557Stjr
2708131557Stjr  /* `maxlen' may be longer than the length of a character, because it may
2709131557Stjr     not be a character but a (multi character) collating element.
2710131557Stjr     We enumerate all of the positions which `s' can reach by consuming
2711131557Stjr     `maxlen' bytes.  */
2712131557Stjr  rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2713131557Stjr
2714131557Stjr  wc = inputwcs[*pp - mbclen - buf_begin];
2715131557Stjr  s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2716131557Stjr  realloc_trans_if_necessary(d, s1);
2717131557Stjr
2718131557Stjr  while (*pp - p1 < maxlen)
2719131557Stjr    {
2720131557Stjr      follows.nelem = 0;
2721131557Stjr      rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2722131557Stjr
2723131557Stjr      for (i = 0; i < nelem ; i++)
2724131557Stjr	{
2725131557Stjr	  if (match_lens[i] == *pp - p1)
2726131557Stjr	    for (j = 0;
2727131557Stjr		 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2728131557Stjr	      insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2729131557Stjr		     &follows);
2730131557Stjr	}
2731131557Stjr
2732131557Stjr      wc = inputwcs[*pp - mbclen - buf_begin];
2733131557Stjr      s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2734131557Stjr      realloc_trans_if_necessary(d, s1);
2735131557Stjr    }
2736131557Stjr  free(match_lens);
2737131557Stjr  free(follows.elems);
2738131557Stjr  return s1;
2739131557Stjr}
2740131557Stjr
2741131557Stjr#endif
2742131557Stjr
274353451Speter/* Search through a buffer looking for a match to the given struct dfa.
274453451Speter   Find the first occurrence of a string matching the regexp in the buffer,
2745131557Stjr   and the shortest possible version thereof.  Return the offset of the first
2746131557Stjr   character after the match, or (size_t) -1 if none is found.  BEGIN points to
2747131557Stjr   the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
2748131557Stjr   is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
274953451Speter   where we're supposed to store a 1 if backreferencing happened and the
275053451Speter   match needs to be verified by a backtracking matcher.  Otherwise
275153451Speter   we store a 0 in *backref. */
2752131557Stjrsize_t
2753146199Stjrdfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
275453451Speter{
2755131557Stjr  register int s;	/* Current state. */
2756131557Stjr  register unsigned char const *p; /* Current input character. */
2757131557Stjr  register unsigned char const *end; /* One past the last input character.  */
275853472Sobrien  register int **trans, *t;	/* Copy of d->trans so it can be optimized
275953451Speter				   into a register. */
276055379Sobrien  register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
276153472Sobrien  static int sbit[NOTCHAR];	/* Table for anding with d->success. */
276253472Sobrien  static int sbit_init;
276353451Speter
276453451Speter  if (! sbit_init)
276553451Speter    {
276653451Speter      int i;
276753451Speter
276853451Speter      sbit_init = 1;
276953451Speter      for (i = 0; i < NOTCHAR; ++i)
277053472Sobrien	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
277155379Sobrien      sbit[eol] = 4;
277253451Speter    }
277353451Speter
277453451Speter  if (! d->tralloc)
277553451Speter    build_state_zero(d);
277653451Speter
2777131557Stjr  s = 0;
2778131557Stjr  p = (unsigned char const *) begin;
2779131557Stjr  end = p + size;
278053451Speter  trans = d->trans;
278153451Speter
2782131557Stjr#ifdef MBS_SUPPORT
2783131557Stjr  if (MB_CUR_MAX > 1)
2784131557Stjr    {
2785146199Stjr      int remain_bytes, i;
2786131557Stjr      buf_begin = begin;
2787131557Stjr      buf_end = end;
2788146199Stjr
2789146199Stjr      /* initialize mblen_buf, and inputwcs.  */
2790146199Stjr      MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2791146199Stjr      MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2792146199Stjr      memset(&mbs, 0, sizeof(mbstate_t));
2793146199Stjr      remain_bytes = 0;
2794146199Stjr      for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2795131576Stjr	{
2796146199Stjr	  if (remain_bytes == 0)
2797131557Stjr	    {
2798146199Stjr	      remain_bytes
2799146199Stjr		= mbrtowc(inputwcs + i, begin + i,
2800146199Stjr			  end - (unsigned char const *)begin - i + 1, &mbs);
2801146199Stjr	      if (remain_bytes <= 1)
2802131557Stjr		{
2803146199Stjr		  remain_bytes = 0;
2804146199Stjr		  inputwcs[i] = (wchar_t)begin[i];
2805146199Stjr		  mblen_buf[i] = 0;
2806131557Stjr		}
2807131557Stjr	      else
2808131557Stjr		{
2809131557Stjr		  mblen_buf[i] = remain_bytes;
2810131557Stjr		  remain_bytes--;
2811131557Stjr		}
2812131557Stjr	    }
2813146199Stjr	  else
2814131557Stjr	    {
2815146199Stjr	      mblen_buf[i] = remain_bytes;
2816146199Stjr	      inputwcs[i] = 0;
2817146199Stjr	      remain_bytes--;
2818131557Stjr	    }
2819131557Stjr	}
2820146199Stjr      mblen_buf[i] = 0;
2821146199Stjr      inputwcs[i] = 0; /* sentinel */
2822131557Stjr    }
2823131557Stjr#endif /* MBS_SUPPORT */
2824131557Stjr
282553451Speter  for (;;)
282653451Speter    {
2827131557Stjr#ifdef MBS_SUPPORT
2828131557Stjr      if (MB_CUR_MAX > 1)
2829131557Stjr	while ((t = trans[s]))
2830131557Stjr	  {
2831131557Stjr	    if (d->states[s].mbps.nelem != 0)
2832131557Stjr	      {
2833131557Stjr		/* Can match with a multibyte character( and multi character
2834131557Stjr		   collating element).  */
2835131557Stjr		unsigned char const *nextp;
283653451Speter
2837131557Stjr		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2838131557Stjr
2839131557Stjr		nextp = p;
2840131557Stjr		s = transit_state(d, s, &nextp);
2841131557Stjr		p = nextp;
2842131557Stjr
2843131557Stjr		/* Trans table might be updated.  */
2844131557Stjr		trans = d->trans;
2845131557Stjr	      }
2846131557Stjr	    else
2847131557Stjr	      {
2848131557Stjr		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2849131557Stjr		s = t[*p++];
2850131557Stjr	      }
2851131557Stjr	  }
2852131557Stjr      else
2853131557Stjr#endif /* MBS_SUPPORT */
2854131557Stjr        while ((t = trans[s]))
2855131557Stjr	  s = t[*p++];
2856131557Stjr
2857131557Stjr      if (s < 0)
285853451Speter	{
2859131557Stjr	  if (p == end)
2860131557Stjr	    {
2861131557Stjr#ifdef MBS_SUPPORT
2862131557Stjr	      if (MB_CUR_MAX > 1)
2863131557Stjr		{
2864131557Stjr		  free(mblen_buf);
2865131557Stjr		  free(inputwcs);
2866131557Stjr		}
2867131557Stjr#endif /* MBS_SUPPORT */
2868131557Stjr	      return (size_t) -1;
2869131557Stjr	    }
2870131557Stjr	  s = 0;
2871131557Stjr	}
2872131557Stjr      else if ((t = d->fails[s]))
2873131557Stjr	{
287453451Speter	  if (d->success[s] & sbit[*p])
287553451Speter	    {
287653451Speter	      if (backref)
287753472Sobrien		*backref = (d->states[s].backref != 0);
2878131557Stjr#ifdef MBS_SUPPORT
2879131557Stjr	      if (MB_CUR_MAX > 1)
2880131557Stjr		{
2881131557Stjr		  free(mblen_buf);
2882131557Stjr		  free(inputwcs);
2883131557Stjr		}
2884131557Stjr#endif /* MBS_SUPPORT */
2885131557Stjr	      return (char const *) p - begin;
288653451Speter	    }
288753451Speter
2888131557Stjr#ifdef MBS_SUPPORT
2889131557Stjr	  if (MB_CUR_MAX > 1)
2890131557Stjr	    {
2891131557Stjr		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2892131557Stjr		if (d->states[s].mbps.nelem != 0)
2893131557Stjr		  {
2894131557Stjr		    /* Can match with a multibyte character( and multi
2895131557Stjr		       character collating element).  */
2896131557Stjr		    unsigned char const *nextp;
2897131557Stjr		    nextp = p;
2898131557Stjr		    s = transit_state(d, s, &nextp);
2899131557Stjr		    p = nextp;
2900131557Stjr
2901131557Stjr		    /* Trans table might be updated.  */
2902131557Stjr		    trans = d->trans;
2903131557Stjr		  }
2904131557Stjr		else
2905131557Stjr		s = t[*p++];
2906131557Stjr	    }
2907131557Stjr	  else
2908131557Stjr#endif /* MBS_SUPPORT */
2909131557Stjr	  s = t[*p++];
291053451Speter	}
2911131557Stjr      else
291253451Speter	{
291353451Speter	  build_state(s, d);
291453451Speter	  trans = d->trans;
291553451Speter	}
291653451Speter    }
291753451Speter}
291853451Speter
291953451Speter/* Initialize the components of a dfa that the other routines don't
292053451Speter   initialize for themselves. */
292153451Spetervoid
292256919Srudfainit (struct dfa *d)
292353451Speter{
292453451Speter  d->calloc = 1;
292553451Speter  MALLOC(d->charclasses, charclass, d->calloc);
292653451Speter  d->cindex = 0;
292753451Speter
292853451Speter  d->talloc = 1;
292953451Speter  MALLOC(d->tokens, token, d->talloc);
293053451Speter  d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2931131557Stjr#ifdef MBS_SUPPORT
2932131557Stjr  if (MB_CUR_MAX > 1)
2933131557Stjr    {
2934131557Stjr      d->nmultibyte_prop = 1;
2935131557Stjr      MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2936131557Stjr      d->nmbcsets = 0;
2937131557Stjr      d->mbcsets_alloc = 1;
2938131557Stjr      MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2939131557Stjr    }
2940131557Stjr#endif
294153451Speter
294253451Speter  d->searchflag = 0;
294353451Speter  d->tralloc = 0;
294453451Speter
294553451Speter  d->musts = 0;
294653451Speter}
294753451Speter
294853451Speter/* Parse and analyze a single string of the given length. */
294953451Spetervoid
2950131557Stjrdfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
295153451Speter{
295253451Speter  if (case_fold)	/* dummy folding in service of dfamust() */
295353451Speter    {
295453472Sobrien      char *lcopy;
295553451Speter      int i;
295653451Speter
295753472Sobrien      lcopy = malloc(len);
295853472Sobrien      if (!lcopy)
295953472Sobrien	dfaerror(_("out of memory"));
296053451Speter
296153451Speter      /* This is a kludge. */
296253451Speter      case_fold = 0;
296353451Speter      for (i = 0; i < len; ++i)
296453472Sobrien	if (ISUPPER ((unsigned char) s[i]))
296553472Sobrien	  lcopy[i] = tolower ((unsigned char) s[i]);
296653451Speter	else
296753472Sobrien	  lcopy[i] = s[i];
296853451Speter
296953451Speter      dfainit(d);
297053472Sobrien      dfaparse(lcopy, len, d);
297153472Sobrien      free(lcopy);
297253451Speter      dfamust(d);
297353451Speter      d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
297453451Speter      case_fold = 1;
297553451Speter      dfaparse(s, len, d);
297653451Speter      dfaanalyze(d, searchflag);
297753451Speter    }
297853451Speter  else
297953451Speter    {
298053451Speter        dfainit(d);
298153451Speter        dfaparse(s, len, d);
298253451Speter	dfamust(d);
298353451Speter        dfaanalyze(d, searchflag);
298453451Speter    }
298553451Speter}
298653451Speter
298753451Speter/* Free the storage held by the components of a dfa. */
298853451Spetervoid
298956919Srudfafree (struct dfa *d)
299053451Speter{
299153451Speter  int i;
299253451Speter  struct dfamust *dm, *ndm;
299353451Speter
299453451Speter  free((ptr_t) d->charclasses);
299553451Speter  free((ptr_t) d->tokens);
2996131557Stjr
2997131557Stjr#ifdef MBS_SUPPORT
2998131557Stjr  if (MB_CUR_MAX > 1)
2999131557Stjr    {
3000131557Stjr      free((ptr_t) d->multibyte_prop);
3001131557Stjr      for (i = 0; i < d->nmbcsets; ++i)
3002131557Stjr	{
3003131557Stjr	  int j;
3004131557Stjr	  struct mb_char_classes *p = &(d->mbcsets[i]);
3005131557Stjr	  if (p->chars != NULL)
3006131557Stjr	    free(p->chars);
3007131557Stjr	  if (p->ch_classes != NULL)
3008131557Stjr	    free(p->ch_classes);
3009131557Stjr	  if (p->range_sts != NULL)
3010131557Stjr	    free(p->range_sts);
3011131557Stjr	  if (p->range_ends != NULL)
3012131557Stjr	    free(p->range_ends);
3013131557Stjr
3014131557Stjr	  for (j = 0; j < p->nequivs; ++j)
3015131557Stjr	    free(p->equivs[j]);
3016131557Stjr	  if (p->equivs != NULL)
3017131557Stjr	    free(p->equivs);
3018131557Stjr
3019131557Stjr	  for (j = 0; j < p->ncoll_elems; ++j)
3020131557Stjr	    free(p->coll_elems[j]);
3021131557Stjr	  if (p->coll_elems != NULL)
3022131557Stjr	    free(p->coll_elems);
3023131557Stjr	}
3024131557Stjr      free((ptr_t) d->mbcsets);
3025131557Stjr    }
3026131557Stjr#endif /* MBS_SUPPORT */
3027131557Stjr
302853451Speter  for (i = 0; i < d->sindex; ++i)
302953451Speter    free((ptr_t) d->states[i].elems.elems);
303053451Speter  free((ptr_t) d->states);
303153451Speter  for (i = 0; i < d->tindex; ++i)
303253451Speter    if (d->follows[i].elems)
303353451Speter      free((ptr_t) d->follows[i].elems);
303453451Speter  free((ptr_t) d->follows);
303553451Speter  for (i = 0; i < d->tralloc; ++i)
303653451Speter    if (d->trans[i])
303753451Speter      free((ptr_t) d->trans[i]);
303853451Speter    else if (d->fails[i])
303953451Speter      free((ptr_t) d->fails[i]);
304053472Sobrien  if (d->realtrans) free((ptr_t) d->realtrans);
304153472Sobrien  if (d->fails) free((ptr_t) d->fails);
304253472Sobrien  if (d->success) free((ptr_t) d->success);
304353451Speter  for (dm = d->musts; dm; dm = ndm)
304453451Speter    {
304553451Speter      ndm = dm->next;
304653451Speter      free(dm->must);
304753451Speter      free((ptr_t) dm);
304853451Speter    }
304953451Speter}
305053451Speter
305153451Speter/* Having found the postfix representation of the regular expression,
305253451Speter   try to find a long sequence of characters that must appear in any line
305353451Speter   containing the r.e.
305453451Speter   Finding a "longest" sequence is beyond the scope here;
305553451Speter   we take an easy way out and hope for the best.
305653451Speter   (Take "(ab|a)b"--please.)
305753451Speter
305853451Speter   We do a bottom-up calculation of sequences of characters that must appear
305953451Speter   in matches of r.e.'s represented by trees rooted at the nodes of the postfix
306053451Speter   representation:
306153451Speter	sequences that must appear at the left of the match ("left")
306253451Speter	sequences that must appear at the right of the match ("right")
306353451Speter	lists of sequences that must appear somewhere in the match ("in")
306453451Speter	sequences that must constitute the match ("is")
306553451Speter
306653451Speter   When we get to the root of the tree, we use one of the longest of its
306753451Speter   calculated "in" sequences as our answer.  The sequence we find is returned in
306853451Speter   d->must (where "d" is the single argument passed to "dfamust");
306953451Speter   the length of the sequence is returned in d->mustn.
307053451Speter
307153451Speter   The sequences calculated for the various types of node (in pseudo ANSI c)
307253451Speter   are shown below.  "p" is the operand of unary operators (and the left-hand
307353451Speter   operand of binary operators); "q" is the right-hand operand of binary
307453451Speter   operators.
307553451Speter
307653451Speter   "ZERO" means "a zero-length sequence" below.
307753451Speter
307853451Speter	Type	left		right		is		in
307953451Speter	----	----		-----		--		--
308053451Speter	char c	# c		# c		# c		# c
308153451Speter
3082131557Stjr	ANYCHAR	ZERO		ZERO		ZERO		ZERO
3083131557Stjr
3084131557Stjr	MBCSET	ZERO		ZERO		ZERO		ZERO
3085131557Stjr
308653451Speter	CSET	ZERO		ZERO		ZERO		ZERO
308753451Speter
308853451Speter	STAR	ZERO		ZERO		ZERO		ZERO
308953451Speter
309053451Speter	QMARK	ZERO		ZERO		ZERO		ZERO
309153451Speter
309253451Speter	PLUS	p->left		p->right	ZERO		p->in
309353451Speter
309453451Speter	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
309553451Speter		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
309653451Speter		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
309753451Speter						ZERO
309853451Speter
309953451Speter	OR	longest common	longest common	(do p->is and	substrings common to
310053451Speter		leading		trailing	q->is have same	p->in and q->in
310153451Speter		(sub)sequence	(sub)sequence	length and
310253451Speter		of p->left	of p->right	content) ?
310353451Speter		and q->left	and q->right	p->is : NULL
310453451Speter
310553451Speter   If there's anything else we recognize in the tree, all four sequences get set
310653451Speter   to zero-length sequences.  If there's something we don't recognize in the tree,
310753451Speter   we just return a zero-length sequence.
310853451Speter
310953451Speter   Break ties in favor of infrequent letters (choosing 'zzz' in preference to
311053451Speter   'aaa')?
311153451Speter
311253451Speter   And. . .is it here or someplace that we might ponder "optimizations" such as
311353451Speter	egrep 'psi|epsilon'	->	egrep 'psi'
311453451Speter	egrep 'pepsi|epsilon'	->	egrep 'epsi'
311553451Speter					(Yes, we now find "epsi" as a "string
311653451Speter					that must occur", but we might also
311753451Speter					simplify the *entire* r.e. being sought)
311853451Speter	grep '[c]'		->	grep 'c'
311953451Speter	grep '(ab|a)b'		->	grep 'ab'
312053451Speter	grep 'ab*'		->	grep 'a'
312153451Speter	grep 'a*b'		->	grep 'b'
312253451Speter
312353451Speter   There are several issues:
312453451Speter
312553451Speter   Is optimization easy (enough)?
312653451Speter
312753451Speter   Does optimization actually accomplish anything,
312853451Speter   or is the automaton you get from "psi|epsilon" (for example)
312953451Speter   the same as the one you get from "psi" (for example)?
313053451Speter
313153451Speter   Are optimizable r.e.'s likely to be used in real-life situations
313253451Speter   (something like 'ab*' is probably unlikely; something like is
313353451Speter   'psi|epsilon' is likelier)? */
313453451Speter
313553451Speterstatic char *
313656919Sruicatalloc (char *old, char *new)
313753451Speter{
313853451Speter  char *result;
313953472Sobrien  size_t oldsize, newsize;
314053451Speter
314153451Speter  newsize = (new == NULL) ? 0 : strlen(new);
314253451Speter  if (old == NULL)
314353451Speter    oldsize = 0;
314453451Speter  else if (newsize == 0)
314553451Speter    return old;
314653451Speter  else	oldsize = strlen(old);
314753451Speter  if (old == NULL)
314853451Speter    result = (char *) malloc(newsize + 1);
314953451Speter  else
315053451Speter    result = (char *) realloc((void *) old, oldsize + newsize + 1);
315153451Speter  if (result != NULL && new != NULL)
315253451Speter    (void) strcpy(result + oldsize, new);
315353451Speter  return result;
315453451Speter}
315553451Speter
315653451Speterstatic char *
315756919Sruicpyalloc (char *string)
315853451Speter{
315953451Speter  return icatalloc((char *) NULL, string);
316053451Speter}
316153451Speter
316253451Speterstatic char *
316356919Sruistrstr (char *lookin, char *lookfor)
316453451Speter{
316553451Speter  char *cp;
316653472Sobrien  size_t len;
316753451Speter
316853451Speter  len = strlen(lookfor);
316953451Speter  for (cp = lookin; *cp != '\0'; ++cp)
317053451Speter    if (strncmp(cp, lookfor, len) == 0)
317153451Speter      return cp;
317253451Speter  return NULL;
317353451Speter}
317453451Speter
317553451Speterstatic void
317656919Sruifree (char *cp)
317753451Speter{
317853451Speter  if (cp != NULL)
317953451Speter    free(cp);
318053451Speter}
318153451Speter
318253451Speterstatic void
318356919Srufreelist (char **cpp)
318453451Speter{
318553451Speter  int i;
318653451Speter
318753451Speter  if (cpp == NULL)
318853451Speter    return;
318953451Speter  for (i = 0; cpp[i] != NULL; ++i)
319053451Speter    {
319153451Speter      free(cpp[i]);
319253451Speter      cpp[i] = NULL;
319353451Speter    }
319453451Speter}
319553451Speter
319653451Speterstatic char **
319756919Sruenlist (char **cpp, char *new, size_t len)
319853451Speter{
319953451Speter  int i, j;
320053451Speter
320153451Speter  if (cpp == NULL)
320253451Speter    return NULL;
320353451Speter  if ((new = icpyalloc(new)) == NULL)
320453451Speter    {
320553451Speter      freelist(cpp);
320653451Speter      return NULL;
320753451Speter    }
320853451Speter  new[len] = '\0';
320953451Speter  /* Is there already something in the list that's new (or longer)? */
321053451Speter  for (i = 0; cpp[i] != NULL; ++i)
321153451Speter    if (istrstr(cpp[i], new) != NULL)
321253451Speter      {
321353451Speter	free(new);
321453451Speter	return cpp;
321553451Speter      }
321653451Speter  /* Eliminate any obsoleted strings. */
321753451Speter  j = 0;
321853451Speter  while (cpp[j] != NULL)
321953451Speter    if (istrstr(new, cpp[j]) == NULL)
322053451Speter      ++j;
322153451Speter    else
322253451Speter      {
322353451Speter	free(cpp[j]);
322453451Speter	if (--i == j)
322553451Speter	  break;
322653451Speter	cpp[j] = cpp[i];
322753451Speter	cpp[i] = NULL;
322853451Speter      }
322953451Speter  /* Add the new string. */
323053451Speter  cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
323153451Speter  if (cpp == NULL)
323253451Speter    return NULL;
323353451Speter  cpp[i] = new;
323453451Speter  cpp[i + 1] = NULL;
323553451Speter  return cpp;
323653451Speter}
323753451Speter
323853451Speter/* Given pointers to two strings, return a pointer to an allocated
323953451Speter   list of their distinct common substrings. Return NULL if something
324053451Speter   seems wild. */
324153451Speterstatic char **
324256919Srucomsubs (char *left, char *right)
324353451Speter{
324453451Speter  char **cpp;
324553451Speter  char *lcp;
324653451Speter  char *rcp;
324753472Sobrien  size_t i, len;
324853451Speter
324953451Speter  if (left == NULL || right == NULL)
325053451Speter    return NULL;
325153451Speter  cpp = (char **) malloc(sizeof *cpp);
325253451Speter  if (cpp == NULL)
325353451Speter    return NULL;
325453451Speter  cpp[0] = NULL;
325553451Speter  for (lcp = left; *lcp != '\0'; ++lcp)
325653451Speter    {
325753451Speter      len = 0;
3258131557Stjr      rcp = strchr (right, *lcp);
325953451Speter      while (rcp != NULL)
326053451Speter	{
326153451Speter	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
326253472Sobrien	    continue;
326353451Speter	  if (i > len)
326453451Speter	    len = i;
3265131557Stjr	  rcp = strchr (rcp + 1, *lcp);
326653451Speter	}
326753451Speter      if (len == 0)
326853451Speter	continue;
326953451Speter      if ((cpp = enlist(cpp, lcp, len)) == NULL)
327053451Speter	break;
327153451Speter    }
327253451Speter  return cpp;
327353451Speter}
327453451Speter
327553451Speterstatic char **
327656919Sruaddlists (char **old, char **new)
327753451Speter{
327853451Speter  int i;
327953451Speter
328053451Speter  if (old == NULL || new == NULL)
328153451Speter    return NULL;
328253451Speter  for (i = 0; new[i] != NULL; ++i)
328353451Speter    {
328453451Speter      old = enlist(old, new[i], strlen(new[i]));
328553451Speter      if (old == NULL)
328653451Speter	break;
328753451Speter    }
328853451Speter  return old;
328953451Speter}
329053451Speter
329153451Speter/* Given two lists of substrings, return a new list giving substrings
329253451Speter   common to both. */
329353451Speterstatic char **
329456919Sruinboth (char **left, char **right)
329553451Speter{
329653451Speter  char **both;
329753451Speter  char **temp;
329853451Speter  int lnum, rnum;
329953451Speter
330053451Speter  if (left == NULL || right == NULL)
330153451Speter    return NULL;
330253451Speter  both = (char **) malloc(sizeof *both);
330353451Speter  if (both == NULL)
330453451Speter    return NULL;
330553451Speter  both[0] = NULL;
330653451Speter  for (lnum = 0; left[lnum] != NULL; ++lnum)
330753451Speter    {
330853451Speter      for (rnum = 0; right[rnum] != NULL; ++rnum)
330953451Speter	{
331053451Speter	  temp = comsubs(left[lnum], right[rnum]);
331153451Speter	  if (temp == NULL)
331253451Speter	    {
331353451Speter	      freelist(both);
331453451Speter	      return NULL;
331553451Speter	    }
331653451Speter	  both = addlists(both, temp);
331753451Speter	  freelist(temp);
331853472Sobrien	  free(temp);
331953451Speter	  if (both == NULL)
332053451Speter	    return NULL;
332153451Speter	}
332253451Speter    }
332353451Speter  return both;
332453451Speter}
332553451Speter
332653451Spetertypedef struct
332753451Speter{
332853451Speter  char **in;
332953451Speter  char *left;
333053451Speter  char *right;
333153451Speter  char *is;
333253451Speter} must;
333353451Speter
333453451Speterstatic void
333556919Sruresetmust (must *mp)
333653451Speter{
333753451Speter  mp->left[0] = mp->right[0] = mp->is[0] = '\0';
333853451Speter  freelist(mp->in);
333953451Speter}
334053451Speter
334153451Speterstatic void
334256919Srudfamust (struct dfa *dfa)
334353451Speter{
334453451Speter  must *musts;
334553451Speter  must *mp;
334653451Speter  char *result;
334753451Speter  int ri;
334853451Speter  int i;
334953451Speter  int exact;
335053451Speter  token t;
335153451Speter  static must must0;
335253451Speter  struct dfamust *dm;
335353472Sobrien  static char empty_string[] = "";
335453451Speter
335553472Sobrien  result = empty_string;
335653451Speter  exact = 0;
335753451Speter  musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
335853451Speter  if (musts == NULL)
335953451Speter    return;
336053451Speter  mp = musts;
336153451Speter  for (i = 0; i <= dfa->tindex; ++i)
336253451Speter    mp[i] = must0;
336353451Speter  for (i = 0; i <= dfa->tindex; ++i)
336453451Speter    {
336553451Speter      mp[i].in = (char **) malloc(sizeof *mp[i].in);
336653451Speter      mp[i].left = malloc(2);
336753451Speter      mp[i].right = malloc(2);
336853451Speter      mp[i].is = malloc(2);
336953451Speter      if (mp[i].in == NULL || mp[i].left == NULL ||
337053451Speter	  mp[i].right == NULL || mp[i].is == NULL)
337153451Speter	goto done;
337253451Speter      mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
337353451Speter      mp[i].in[0] = NULL;
337453451Speter    }
337553451Speter#ifdef DEBUG
337653451Speter  fprintf(stderr, "dfamust:\n");
337753451Speter  for (i = 0; i < dfa->tindex; ++i)
337853451Speter    {
337953451Speter      fprintf(stderr, " %d:", i);
338053451Speter      prtok(dfa->tokens[i]);
338153451Speter    }
338253451Speter  putc('\n', stderr);
338353451Speter#endif
338453451Speter  for (ri = 0; ri < dfa->tindex; ++ri)
338553451Speter    {
338653451Speter      switch (t = dfa->tokens[ri])
338753451Speter	{
338853451Speter	case LPAREN:
338953451Speter	case RPAREN:
339053451Speter	  goto done;		/* "cannot happen" */
339153451Speter	case EMPTY:
339253451Speter	case BEGLINE:
339353451Speter	case ENDLINE:
339453451Speter	case BEGWORD:
339553451Speter	case ENDWORD:
339653451Speter	case LIMWORD:
339753451Speter	case NOTLIMWORD:
339853451Speter	case BACKREF:
339953451Speter	  resetmust(mp);
340053451Speter	  break;
340153451Speter	case STAR:
340253451Speter	case QMARK:
340353451Speter	  if (mp <= musts)
340453451Speter	    goto done;		/* "cannot happen" */
340553451Speter	  --mp;
340653451Speter	  resetmust(mp);
340753451Speter	  break;
340853451Speter	case OR:
340953451Speter	case ORTOP:
341053451Speter	  if (mp < &musts[2])
341153451Speter	    goto done;		/* "cannot happen" */
341253451Speter	  {
341353451Speter	    char **new;
341453451Speter	    must *lmp;
341553451Speter	    must *rmp;
341653451Speter	    int j, ln, rn, n;
341753451Speter
341853451Speter	    rmp = --mp;
341953451Speter	    lmp = --mp;
342053451Speter	    /* Guaranteed to be.  Unlikely, but. . . */
342153451Speter	    if (strcmp(lmp->is, rmp->is) != 0)
342253451Speter	      lmp->is[0] = '\0';
342353451Speter	    /* Left side--easy */
342453451Speter	    i = 0;
342553451Speter	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
342653451Speter	      ++i;
342753451Speter	    lmp->left[i] = '\0';
342853451Speter	    /* Right side */
342953451Speter	    ln = strlen(lmp->right);
343053451Speter	    rn = strlen(rmp->right);
343153451Speter	    n = ln;
343253451Speter	    if (n > rn)
343353451Speter	      n = rn;
343453451Speter	    for (i = 0; i < n; ++i)
343553451Speter	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
343653451Speter		break;
343753451Speter	    for (j = 0; j < i; ++j)
343853451Speter	      lmp->right[j] = lmp->right[(ln - i) + j];
343953451Speter	    lmp->right[j] = '\0';
344053451Speter	    new = inboth(lmp->in, rmp->in);
344153451Speter	    if (new == NULL)
344253451Speter	      goto done;
344353451Speter	    freelist(lmp->in);
344453451Speter	    free((char *) lmp->in);
344553451Speter	    lmp->in = new;
344653451Speter	  }
344753451Speter	  break;
344853451Speter	case PLUS:
344953451Speter	  if (mp <= musts)
345053451Speter	    goto done;		/* "cannot happen" */
345153451Speter	  --mp;
345253451Speter	  mp->is[0] = '\0';
345353451Speter	  break;
345453451Speter	case END:
345553451Speter	  if (mp != &musts[1])
345653451Speter	    goto done;		/* "cannot happen" */
345753451Speter	  for (i = 0; musts[0].in[i] != NULL; ++i)
345853451Speter	    if (strlen(musts[0].in[i]) > strlen(result))
345953451Speter	      result = musts[0].in[i];
346053451Speter	  if (strcmp(result, musts[0].is) == 0)
346153451Speter	    exact = 1;
346253451Speter	  goto done;
346353451Speter	case CAT:
346453451Speter	  if (mp < &musts[2])
346553451Speter	    goto done;		/* "cannot happen" */
346653451Speter	  {
346753451Speter	    must *lmp;
346853451Speter	    must *rmp;
346953451Speter
347053451Speter	    rmp = --mp;
347153451Speter	    lmp = --mp;
347253451Speter	    /* In.  Everything in left, plus everything in
347353451Speter	       right, plus catenation of
347453451Speter	       left's right and right's left. */
347553451Speter	    lmp->in = addlists(lmp->in, rmp->in);
347653451Speter	    if (lmp->in == NULL)
347753451Speter	      goto done;
347853451Speter	    if (lmp->right[0] != '\0' &&
347953451Speter		rmp->left[0] != '\0')
348053451Speter	      {
348153451Speter		char *tp;
348253451Speter
348353451Speter		tp = icpyalloc(lmp->right);
348453451Speter		if (tp == NULL)
348553451Speter		  goto done;
348653451Speter		tp = icatalloc(tp, rmp->left);
348753451Speter		if (tp == NULL)
348853451Speter		  goto done;
348953451Speter		lmp->in = enlist(lmp->in, tp,
349053451Speter				 strlen(tp));
349153451Speter		free(tp);
349253451Speter		if (lmp->in == NULL)
349353451Speter		  goto done;
349453451Speter	      }
349553451Speter	    /* Left-hand */
349653451Speter	    if (lmp->is[0] != '\0')
349753451Speter	      {
349853451Speter		lmp->left = icatalloc(lmp->left,
349953451Speter				      rmp->left);
350053451Speter		if (lmp->left == NULL)
350153451Speter		  goto done;
350253451Speter	      }
350353451Speter	    /* Right-hand */
350453451Speter	    if (rmp->is[0] == '\0')
350553451Speter	      lmp->right[0] = '\0';
350653451Speter	    lmp->right = icatalloc(lmp->right, rmp->right);
350753451Speter	    if (lmp->right == NULL)
350853451Speter	      goto done;
350953451Speter	    /* Guaranteed to be */
351053451Speter	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
351153451Speter	      {
351253451Speter		lmp->is = icatalloc(lmp->is, rmp->is);
351353451Speter		if (lmp->is == NULL)
351453451Speter		  goto done;
351553451Speter	      }
351653451Speter	    else
351753451Speter	      lmp->is[0] = '\0';
351853451Speter	  }
351953451Speter	  break;
352053451Speter	default:
352153451Speter	  if (t < END)
352253451Speter	    {
352353451Speter	      /* "cannot happen" */
352453451Speter	      goto done;
352553451Speter	    }
352653451Speter	  else if (t == '\0')
352753451Speter	    {
352853451Speter	      /* not on *my* shift */
352953451Speter	      goto done;
353053451Speter	    }
3531131557Stjr	  else if (t >= CSET
3532131557Stjr#ifdef MBS_SUPPORT
3533131557Stjr		   || t == ANYCHAR
3534131557Stjr		   || t == MBCSET
3535131557Stjr#endif /* MBS_SUPPORT */
3536131557Stjr		   )
353753451Speter	    {
353853451Speter	      /* easy enough */
353953451Speter	      resetmust(mp);
354053451Speter	    }
354153451Speter	  else
354253451Speter	    {
354353451Speter	      /* plain character */
354453451Speter	      resetmust(mp);
354553451Speter	      mp->is[0] = mp->left[0] = mp->right[0] = t;
354653451Speter	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
354753472Sobrien	      mp->in = enlist(mp->in, mp->is, (size_t)1);
354853451Speter	      if (mp->in == NULL)
354953451Speter		goto done;
355053451Speter	    }
355153451Speter	  break;
355253451Speter	}
355353451Speter#ifdef DEBUG
355453451Speter      fprintf(stderr, " node: %d:", ri);
355553451Speter      prtok(dfa->tokens[ri]);
355653451Speter      fprintf(stderr, "\n  in:");
355753451Speter      for (i = 0; mp->in[i]; ++i)
355853451Speter	fprintf(stderr, " \"%s\"", mp->in[i]);
355953451Speter      fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
356053451Speter      fprintf(stderr, "  left: \"%s\"\n", mp->left);
356153451Speter      fprintf(stderr, "  right: \"%s\"\n", mp->right);
356253451Speter#endif
356353451Speter      ++mp;
356453451Speter    }
356553451Speter done:
356653451Speter  if (strlen(result))
356753451Speter    {
356853451Speter      dm = (struct dfamust *) malloc(sizeof (struct dfamust));
356953451Speter      dm->exact = exact;
357053451Speter      dm->must = malloc(strlen(result) + 1);
357153451Speter      strcpy(dm->must, result);
357253451Speter      dm->next = dfa->musts;
357353451Speter      dfa->musts = dm;
357453451Speter    }
357553451Speter  mp = musts;
357653451Speter  for (i = 0; i <= dfa->tindex; ++i)
357753451Speter    {
357853451Speter      freelist(mp[i].in);
357953451Speter      ifree((char *) mp[i].in);
358053451Speter      ifree(mp[i].left);
358153451Speter      ifree(mp[i].right);
358253451Speter      ifree(mp[i].is);
358353451Speter    }
358453451Speter  free((char *) mp);
358553451Speter}
3586131557Stjr/* vim:set shiftwidth=2: */
3587