153451Speter/* dfa.c - deterministic extended regexp routines for GNU 2126435Sache Copyright 1988, 1998, 2000 Free Software Foundation, Inc. 353451Speter 453451Speter This program is free software; you can redistribute it and/or modify 553451Speter it under the terms of the GNU General Public License as published by 653451Speter the Free Software Foundation; either version 2, or (at your option) 753451Speter any later version. 853451Speter 953451Speter This program is distributed in the hope that it will be useful, 1053451Speter but WITHOUT ANY WARRANTY; without even the implied warranty of 1153451Speter MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1253451Speter GNU General Public License for more details. 1353451Speter 1453451Speter You should have received a copy of the GNU General Public License 1553451Speter along with this program; if not, write to the Free Software 1653472Sobrien Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ 1753451Speter 1853451Speter/* Written June, 1988 by Mike Haertel 1953451Speter Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 2053451Speter 2153472Sobrien/* $FreeBSD$ */ 2253472Sobrien 2353472Sobrien#ifdef HAVE_CONFIG_H 2453472Sobrien#include <config.h> 2553472Sobrien#endif 2653472Sobrien 2753451Speter#include <assert.h> 2853451Speter#include <ctype.h> 2953451Speter#include <stdio.h> 3053451Speter 3153472Sobrien#include <sys/types.h> 3253451Speter#ifdef STDC_HEADERS 3353451Speter#include <stdlib.h> 3453451Speter#else 3553451Speterextern char *calloc(), *malloc(), *realloc(); 3653451Speterextern void free(); 3753451Speter#endif 3853451Speter 3953451Speter#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) 4053451Speter#include <string.h> 4153451Speter#else 4253451Speter#include <strings.h> 4353451Speter#endif 4453451Speter 45131557Stjr#if HAVE_SETLOCALE 46131557Stjr# include <locale.h> 47131557Stjr#endif 48131557Stjr 49131557Stjr#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 50131557Stjr/* We can handle multibyte string. */ 51131557Stjr# define MBS_SUPPORT 52131557Stjr#endif 53131557Stjr 54131557Stjr#ifdef MBS_SUPPORT 55131557Stjr# include <wchar.h> 56131557Stjr# include <wctype.h> 57131557Stjr#endif 58131557Stjr 5953472Sobrien#ifndef DEBUG /* use the same approach as regex.c */ 6053472Sobrien#undef assert 6153472Sobrien#define assert(e) 6253472Sobrien#endif /* DEBUG */ 6353472Sobrien 6453451Speter#ifndef isgraph 6553472Sobrien#define isgraph(C) (isprint(C) && !isspace(C)) 6653451Speter#endif 6753451Speter 6853472Sobrien#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 6953472Sobrien#define ISALPHA(C) isalpha(C) 7053472Sobrien#define ISUPPER(C) isupper(C) 7153472Sobrien#define ISLOWER(C) islower(C) 7253472Sobrien#define ISDIGIT(C) isdigit(C) 7353472Sobrien#define ISXDIGIT(C) isxdigit(C) 7453472Sobrien#define ISSPACE(C) isspace(C) 7553472Sobrien#define ISPUNCT(C) ispunct(C) 7653472Sobrien#define ISALNUM(C) isalnum(C) 7753472Sobrien#define ISPRINT(C) isprint(C) 7853472Sobrien#define ISGRAPH(C) isgraph(C) 7953472Sobrien#define ISCNTRL(C) iscntrl(C) 8053472Sobrien#else 8153472Sobrien#define ISALPHA(C) (isascii(C) && isalpha(C)) 8253472Sobrien#define ISUPPER(C) (isascii(C) && isupper(C)) 8353472Sobrien#define ISLOWER(C) (isascii(C) && islower(C)) 8453472Sobrien#define ISDIGIT(C) (isascii(C) && isdigit(C)) 8553472Sobrien#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 8653472Sobrien#define ISSPACE(C) (isascii(C) && isspace(C)) 8753472Sobrien#define ISPUNCT(C) (isascii(C) && ispunct(C)) 8853472Sobrien#define ISALNUM(C) (isascii(C) && isalnum(C)) 8953472Sobrien#define ISPRINT(C) (isascii(C) && isprint(C)) 9053472Sobrien#define ISGRAPH(C) (isascii(C) && isgraph(C)) 9153472Sobrien#define ISCNTRL(C) (isascii(C) && iscntrl(C)) 9253472Sobrien#endif 9353451Speter 9456919Sru/* ISASCIIDIGIT differs from ISDIGIT, as follows: 9556919Sru - Its arg may be any int or unsigned int; it need not be an unsigned char. 9656919Sru - It's guaranteed to evaluate its argument exactly once. 9756919Sru - It's typically faster. 9856919Sru Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that 9956919Sru only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless 10056919Sru it's important to use the locale's definition of `digit' even when the 10156919Sru host does not conform to Posix. */ 10256919Sru#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) 10356919Sru 10453472Sobrien/* If we (don't) have I18N. */ 10553472Sobrien/* glibc defines _ */ 10653472Sobrien#ifndef _ 10753472Sobrien# ifdef HAVE_LIBINTL_H 10853472Sobrien# include <libintl.h> 10953472Sobrien# ifndef _ 11053472Sobrien# define _(Str) gettext (Str) 11153472Sobrien# endif 11253472Sobrien# else 11353472Sobrien# define _(Str) (Str) 11453472Sobrien# endif 11553472Sobrien#endif 11653472Sobrien 11753472Sobrien#include "regex.h" 11853472Sobrien#include "dfa.h" 119131557Stjr#include "hard-locale.h" 12053451Speter 12153472Sobrien/* HPUX, define those as macros in sys/param.h */ 12253472Sobrien#ifdef setbit 12353472Sobrien# undef setbit 12453472Sobrien#endif 12553472Sobrien#ifdef clrbit 12653472Sobrien# undef clrbit 12753472Sobrien#endif 12853451Speter 12953472Sobrienstatic void dfamust PARAMS ((struct dfa *dfa)); 13053472Sobrienstatic void regexp PARAMS ((int toplevel)); 13153472Sobrien 13253451Speterstatic ptr_t 13356919Sruxcalloc (size_t n, size_t s) 13453451Speter{ 13553451Speter ptr_t r = calloc(n, s); 13653451Speter 13753451Speter if (!r) 13853472Sobrien dfaerror(_("Memory exhausted")); 13953451Speter return r; 14053451Speter} 14153451Speter 14253451Speterstatic ptr_t 14356919Sruxmalloc (size_t n) 14453451Speter{ 14553451Speter ptr_t r = malloc(n); 14653451Speter 14753451Speter assert(n != 0); 14853451Speter if (!r) 14953472Sobrien dfaerror(_("Memory exhausted")); 15053451Speter return r; 15153451Speter} 15253451Speter 15353451Speterstatic ptr_t 15456919Sruxrealloc (ptr_t p, size_t n) 15553451Speter{ 15653451Speter ptr_t r = realloc(p, n); 15753451Speter 15853451Speter assert(n != 0); 15953451Speter if (!r) 16053472Sobrien dfaerror(_("Memory exhausted")); 16153451Speter return r; 16253451Speter} 16353451Speter 16453472Sobrien#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 16553451Speter#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 16653451Speter#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 16753451Speter 16853451Speter/* Reallocate an array of type t if nalloc is too small for index. */ 16953451Speter#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 17053451Speter if ((index) >= (nalloc)) \ 17153451Speter { \ 172131557Stjr do \ 17353451Speter (nalloc) *= 2; \ 174131557Stjr while ((index) >= (nalloc)); \ 17553451Speter REALLOC(p, t, nalloc); \ 17653451Speter } 17753451Speter 17853451Speter#ifdef DEBUG 17953451Speter 18053451Speterstatic void 18156919Sruprtok (token t) 18253451Speter{ 183131557Stjr char const *s; 18453451Speter 18553451Speter if (t < 0) 18653451Speter fprintf(stderr, "END"); 18753451Speter else if (t < NOTCHAR) 18853451Speter fprintf(stderr, "%c", t); 18953451Speter else 19053451Speter { 19153451Speter switch (t) 19253451Speter { 19353451Speter case EMPTY: s = "EMPTY"; break; 19453451Speter case BACKREF: s = "BACKREF"; break; 19553451Speter case BEGLINE: s = "BEGLINE"; break; 19653451Speter case ENDLINE: s = "ENDLINE"; break; 19753451Speter case BEGWORD: s = "BEGWORD"; break; 19853451Speter case ENDWORD: s = "ENDWORD"; break; 19953451Speter case LIMWORD: s = "LIMWORD"; break; 20053451Speter case NOTLIMWORD: s = "NOTLIMWORD"; break; 20153451Speter case QMARK: s = "QMARK"; break; 20253451Speter case STAR: s = "STAR"; break; 20353451Speter case PLUS: s = "PLUS"; break; 20453451Speter case CAT: s = "CAT"; break; 20553451Speter case OR: s = "OR"; break; 20653451Speter case ORTOP: s = "ORTOP"; break; 20753451Speter case LPAREN: s = "LPAREN"; break; 20853451Speter case RPAREN: s = "RPAREN"; break; 209131557Stjr case CRANGE: s = "CRANGE"; break; 210131557Stjr#ifdef MBS_SUPPORT 211131557Stjr case ANYCHAR: s = "ANYCHAR"; break; 212131557Stjr case MBCSET: s = "MBCSET"; break; 213131557Stjr#endif /* MBS_SUPPORT */ 21453451Speter default: s = "CSET"; break; 21553451Speter } 21653451Speter fprintf(stderr, "%s", s); 21753451Speter } 21853451Speter} 21953451Speter#endif /* DEBUG */ 22053451Speter 22153451Speter/* Stuff pertaining to charclasses. */ 22253451Speter 22353451Speterstatic int 224131557Stjrtstbit (unsigned b, charclass c) 22553451Speter{ 22653451Speter return c[b / INTBITS] & 1 << b % INTBITS; 22753451Speter} 22853451Speter 22953451Speterstatic void 230131557Stjrsetbit (unsigned b, charclass c) 23153451Speter{ 23253451Speter c[b / INTBITS] |= 1 << b % INTBITS; 23353451Speter} 23453451Speter 23553451Speterstatic void 236131557Stjrclrbit (unsigned b, charclass c) 23753451Speter{ 23853451Speter c[b / INTBITS] &= ~(1 << b % INTBITS); 23953451Speter} 24053451Speter 24153451Speterstatic void 24256919Srucopyset (charclass src, charclass dst) 24353451Speter{ 244131557Stjr memcpy (dst, src, sizeof (charclass)); 24553451Speter} 24653451Speter 24753451Speterstatic void 24856919Sruzeroset (charclass s) 24953451Speter{ 250131557Stjr memset (s, 0, sizeof (charclass)); 25153451Speter} 25253451Speter 25353451Speterstatic void 25456919Srunotset (charclass s) 25553451Speter{ 25653451Speter int i; 25753451Speter 25853451Speter for (i = 0; i < CHARCLASS_INTS; ++i) 25953451Speter s[i] = ~s[i]; 26053451Speter} 26153451Speter 26253451Speterstatic int 26356919Sruequal (charclass s1, charclass s2) 26453451Speter{ 265131557Stjr return memcmp (s1, s2, sizeof (charclass)) == 0; 26653451Speter} 26753451Speter 26853451Speter/* A pointer to the current dfa is kept here during parsing. */ 26953451Speterstatic struct dfa *dfa; 27053451Speter 27153451Speter/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 27253451Speterstatic int 27356919Srucharclass_index (charclass s) 27453451Speter{ 27553451Speter int i; 27653451Speter 27753451Speter for (i = 0; i < dfa->cindex; ++i) 27853451Speter if (equal(s, dfa->charclasses[i])) 27953451Speter return i; 28053451Speter REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 28153451Speter ++dfa->cindex; 28253451Speter copyset(s, dfa->charclasses[i]); 28353451Speter return i; 28453451Speter} 28553451Speter 28653451Speter/* Syntax bits controlling the behavior of the lexical analyzer. */ 28753472Sobrienstatic reg_syntax_t syntax_bits, syntax_bits_set; 28853451Speter 28953451Speter/* Flag for case-folding letters into sets. */ 29053451Speterstatic int case_fold; 29153451Speter 29255379Sobrien/* End-of-line byte in data. */ 29355379Sobrienstatic unsigned char eolbyte; 29455379Sobrien 29553451Speter/* Entry point to set syntax options. */ 29653451Spetervoid 297131557Stjrdfasyntax (reg_syntax_t bits, int fold, unsigned char eol) 29853451Speter{ 29953451Speter syntax_bits_set = 1; 30053451Speter syntax_bits = bits; 30153451Speter case_fold = fold; 30255379Sobrien eolbyte = eol; 30353451Speter} 30453451Speter 305131557Stjr/* Like setbit, but if case is folded, set both cases of a letter. */ 306131557Stjrstatic void 307131557Stjrsetbit_case_fold (unsigned b, charclass c) 308131557Stjr{ 309131557Stjr setbit (b, c); 310131557Stjr if (case_fold) 311131557Stjr { 312131557Stjr if (ISUPPER (b)) 313131557Stjr setbit (tolower (b), c); 314131557Stjr else if (ISLOWER (b)) 315131557Stjr setbit (toupper (b), c); 316131557Stjr } 317131557Stjr} 318131557Stjr 31953451Speter/* Lexical analyzer. All the dross that deals with the obnoxious 32053451Speter GNU Regex syntax bits is located here. The poor, suffering 32153451Speter reader is referred to the GNU Regex documentation for the 32253451Speter meaning of the @#%!@#%^!@ syntax bits. */ 32353451Speter 324131557Stjrstatic char const *lexstart; /* Pointer to beginning of input string. */ 325131557Stjrstatic char const *lexptr; /* Pointer to next input character. */ 32653472Sobrienstatic int lexleft; /* Number of characters remaining. */ 32753451Speterstatic token lasttok; /* Previous token returned; initially END. */ 32853451Speterstatic int laststart; /* True if we're separated from beginning or (, | 32953451Speter only by zero-width characters. */ 33053451Speterstatic int parens; /* Count of outstanding left parens. */ 33153451Speterstatic int minrep, maxrep; /* Repeat counts for {m,n}. */ 332131557Stjrstatic int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ 33353451Speter 334131557Stjr#ifdef MBS_SUPPORT 335131557Stjr/* These variables are used only if (MB_CUR_MAX > 1). */ 336131557Stjrstatic mbstate_t mbs; /* Mbstate for mbrlen(). */ 337250823Spfgstatic ssize_t cur_mb_len; /* Byte length of the current scanning 338250823Spfg multibyte character. Must also handle 339250823Spfg negative result from mbrlen(). */ 340250823Spfgstatic ssize_t cur_mb_index; /* Byte index of the current scanning multibyte 341131557Stjr character. 342131557Stjr 343131557Stjr singlebyte character : cur_mb_index = 0 344131557Stjr multibyte character 345131557Stjr 1st byte : cur_mb_index = 1 346131557Stjr 2nd byte : cur_mb_index = 2 347131557Stjr ... 348131557Stjr nth byte : cur_mb_index = n */ 349131557Stjrstatic unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec(). 350131557Stjr Each element store the amount of remain 351131557Stjr byte of corresponding multibyte character 352131557Stjr in the input string. A element's value 353131557Stjr is 0 if corresponding character is a 354131557Stjr singlebyte chracter. 355131557Stjr e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)> 356131557Stjr mblen_buf : 0, 3, 2, 1 357131557Stjr */ 358131557Stjrstatic wchar_t *inputwcs; /* Wide character representation of input 359131557Stjr string in dfaexec(). 360131557Stjr The length of this array is same as 361131557Stjr the length of input string(char array). 362131557Stjr inputstring[i] is a single-byte char, 363131557Stjr or 1st byte of a multibyte char. 364131557Stjr And inputwcs[i] is the codepoint. */ 365131557Stjrstatic unsigned char const *buf_begin;/* refference to begin in dfaexec(). */ 366131557Stjrstatic unsigned char const *buf_end; /* refference to end in dfaexec(). */ 367131557Stjr#endif /* MBS_SUPPORT */ 368131557Stjr 369131557Stjr#ifdef MBS_SUPPORT 370131557Stjr/* This function update cur_mb_len, and cur_mb_index. 371131557Stjr p points current lexptr, len is the remaining buffer length. */ 372131557Stjrstatic void 373250823Spfgupdate_mb_len_index (unsigned char const *p, size_t len) 374131557Stjr{ 375131557Stjr /* If last character is a part of a multibyte character, 376131557Stjr we update cur_mb_index. */ 377131557Stjr if (cur_mb_index) 378131557Stjr cur_mb_index = (cur_mb_index >= cur_mb_len)? 0 379131557Stjr : cur_mb_index + 1; 380131557Stjr 381131557Stjr /* If last character is a single byte character, or the 382131557Stjr last portion of a multibyte character, we check whether 383131557Stjr next character is a multibyte character or not. */ 384131557Stjr if (! cur_mb_index) 385131557Stjr { 386131557Stjr cur_mb_len = mbrlen(p, len, &mbs); 387131557Stjr if (cur_mb_len > 1) 388131557Stjr /* It is a multibyte character. 389131557Stjr cur_mb_len was already set by mbrlen(). */ 390131557Stjr cur_mb_index = 1; 391131557Stjr else if (cur_mb_len < 1) 392131557Stjr /* Invalid sequence. We treat it as a singlebyte character. 393131557Stjr cur_mb_index is aleady 0. */ 394131557Stjr cur_mb_len = 1; 395131557Stjr /* Otherwise, cur_mb_len == 1, it is a singlebyte character. 396131557Stjr cur_mb_index is aleady 0. */ 397131557Stjr } 398131557Stjr} 399131557Stjr#endif /* MBS_SUPPORT */ 400131557Stjr 401131557Stjr#ifdef MBS_SUPPORT 40253451Speter/* Note that characters become unsigned here. */ 403131557Stjr# define FETCH(c, eoferr) \ 404131557Stjr { \ 405131557Stjr if (! lexleft) \ 406131557Stjr { \ 407131557Stjr if (eoferr != 0) \ 408131557Stjr dfaerror (eoferr); \ 409131557Stjr else \ 410131557Stjr return lasttok = END; \ 411131557Stjr } \ 412131557Stjr if (MB_CUR_MAX > 1) \ 413131557Stjr update_mb_len_index(lexptr, lexleft); \ 414131557Stjr (c) = (unsigned char) *lexptr++; \ 415131557Stjr --lexleft; \ 416131557Stjr } 417131557Stjr 418131557Stjr/* This function fetch a wide character, and update cur_mb_len, 419131557Stjr used only if the current locale is a multibyte environment. */ 420131564Stjrstatic wint_t 421131557Stjrfetch_wc (char const *eoferr) 422131557Stjr{ 423131557Stjr wchar_t wc; 424131557Stjr if (! lexleft) 425131557Stjr { 426131557Stjr if (eoferr != 0) 427131557Stjr dfaerror (eoferr); 428131557Stjr else 429131564Stjr return WEOF; 430131557Stjr } 431131557Stjr 432131557Stjr cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); 433131557Stjr if (cur_mb_len <= 0) 434131557Stjr { 435131557Stjr cur_mb_len = 1; 436131557Stjr wc = *lexptr; 437131557Stjr } 438131557Stjr lexptr += cur_mb_len; 439131557Stjr lexleft -= cur_mb_len; 440131557Stjr return wc; 441131557Stjr} 442131557Stjr#else 443131557Stjr/* Note that characters become unsigned here. */ 444131557Stjr# define FETCH(c, eoferr) \ 44553451Speter { \ 44653451Speter if (! lexleft) \ 44756919Sru { \ 44856919Sru if (eoferr != 0) \ 44956919Sru dfaerror (eoferr); \ 45056919Sru else \ 45156919Sru return lasttok = END; \ 45256919Sru } \ 45353451Speter (c) = (unsigned char) *lexptr++; \ 45453451Speter --lexleft; \ 45553451Speter } 456131557Stjr#endif /* MBS_SUPPORT */ 45753451Speter 458131557Stjr#ifdef MBS_SUPPORT 459131557Stjr/* Multibyte character handling sub-routin for lex. 460131557Stjr This function parse a bracket expression and build a struct 461131557Stjr mb_char_classes. */ 462131557Stjrstatic void 463131557Stjrparse_bracket_exp_mb () 464131557Stjr{ 465131564Stjr wint_t wc, wc1, wc2; 466131557Stjr 467131557Stjr /* Work area to build a mb_char_classes. */ 468131557Stjr struct mb_char_classes *work_mbc; 469131557Stjr int chars_al, range_sts_al, range_ends_al, ch_classes_al, 470131557Stjr equivs_al, coll_elems_al; 471131557Stjr 472131557Stjr REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, 473131557Stjr dfa->mbcsets_alloc, dfa->nmbcsets + 1); 474131557Stjr /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. 475131557Stjr We will update dfa->multibyte_prop in addtok(), because we can't 476131557Stjr decide the index in dfa->tokens[]. */ 477131557Stjr 478131557Stjr /* Initialize work are */ 479131557Stjr work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); 480131557Stjr 481131557Stjr chars_al = 1; 482131557Stjr range_sts_al = range_ends_al = 0; 483131557Stjr ch_classes_al = equivs_al = coll_elems_al = 0; 484131557Stjr MALLOC(work_mbc->chars, wchar_t, chars_al); 485131557Stjr 486131557Stjr work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; 487131557Stjr work_mbc->nequivs = work_mbc->ncoll_elems = 0; 488131557Stjr work_mbc->chars = work_mbc->ch_classes = NULL; 489131557Stjr work_mbc->range_sts = work_mbc->range_ends = NULL; 490131557Stjr work_mbc->equivs = work_mbc->coll_elems = NULL; 491131557Stjr 492131557Stjr wc = fetch_wc(_("Unbalanced [")); 493131557Stjr if (wc == L'^') 494131557Stjr { 495131557Stjr wc = fetch_wc(_("Unbalanced [")); 496131557Stjr work_mbc->invert = 1; 497131557Stjr } 498131557Stjr else 499131557Stjr work_mbc->invert = 0; 500131557Stjr do 501131557Stjr { 502131564Stjr wc1 = WEOF; /* mark wc1 is not initialized". */ 503131557Stjr 504131557Stjr /* Note that if we're looking at some other [:...:] construct, 505131557Stjr we just treat it as a bunch of ordinary characters. We can do 506131557Stjr this because we assume regex has checked for syntax errors before 507131557Stjr dfa is ever called. */ 508131557Stjr if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) 509131557Stjr { 510131557Stjr#define BRACKET_BUFFER_SIZE 128 511131557Stjr char str[BRACKET_BUFFER_SIZE]; 512131557Stjr wc1 = wc; 513131557Stjr wc = fetch_wc(_("Unbalanced [")); 514131557Stjr 515131557Stjr /* If pattern contains `[[:', `[[.', or `[[='. */ 516131557Stjr if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) 517131557Stjr { 518131557Stjr unsigned char c; 519131557Stjr unsigned char delim = (unsigned char)wc; 520131557Stjr int len = 0; 521131557Stjr for (;;) 522131557Stjr { 523131557Stjr if (! lexleft) 524131557Stjr dfaerror (_("Unbalanced [")); 525131557Stjr c = (unsigned char) *lexptr++; 526131557Stjr --lexleft; 527131557Stjr 528131557Stjr if ((c == delim && *lexptr == ']') || lexleft == 0) 529131557Stjr break; 530131557Stjr if (len < BRACKET_BUFFER_SIZE) 531131557Stjr str[len++] = c; 532131557Stjr else 533131557Stjr /* This is in any case an invalid class name. */ 534131557Stjr str[0] = '\0'; 535131557Stjr } 536131557Stjr str[len] = '\0'; 537131557Stjr 538131557Stjr if (lexleft == 0) 539131557Stjr { 540131557Stjr REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 541131557Stjr work_mbc->nchars + 2); 542131557Stjr work_mbc->chars[work_mbc->nchars++] = L'['; 543131557Stjr work_mbc->chars[work_mbc->nchars++] = delim; 544131557Stjr break; 545131557Stjr } 546131557Stjr 547131557Stjr if (--lexleft, *lexptr++ != ']') 548131557Stjr dfaerror (_("Unbalanced [")); 549131557Stjr if (delim == ':') 550131557Stjr /* build character class. */ 551131557Stjr { 552131557Stjr wctype_t wt; 553131557Stjr /* Query the character class as wctype_t. */ 554131557Stjr wt = wctype (str); 555131557Stjr 556131557Stjr if (ch_classes_al == 0) 557131557Stjr MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al); 558131557Stjr REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, 559131557Stjr ch_classes_al, 560131557Stjr work_mbc->nch_classes + 1); 561131557Stjr work_mbc->ch_classes[work_mbc->nch_classes++] = wt; 562131557Stjr 563131557Stjr } 564131557Stjr else if (delim == '=' || delim == '.') 565131557Stjr { 566131557Stjr char *elem; 567131557Stjr MALLOC(elem, char, len + 1); 568131557Stjr strncpy(elem, str, len + 1); 569131557Stjr 570131557Stjr if (delim == '=') 571131557Stjr /* build equivalent class. */ 572131557Stjr { 573131557Stjr if (equivs_al == 0) 574131557Stjr MALLOC(work_mbc->equivs, char*, ++equivs_al); 575131557Stjr REALLOC_IF_NECESSARY(work_mbc->equivs, char*, 576131557Stjr equivs_al, 577131557Stjr work_mbc->nequivs + 1); 578131557Stjr work_mbc->equivs[work_mbc->nequivs++] = elem; 579131557Stjr } 580131557Stjr 581131557Stjr if (delim == '.') 582131557Stjr /* build collating element. */ 583131557Stjr { 584131557Stjr if (coll_elems_al == 0) 585131557Stjr MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); 586131557Stjr REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, 587131557Stjr coll_elems_al, 588131557Stjr work_mbc->ncoll_elems + 1); 589131557Stjr work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; 590131557Stjr } 591131557Stjr } 592131578Stjr wc1 = wc = WEOF; 593131557Stjr } 594131557Stjr else 595131557Stjr /* We treat '[' as a normal character here. */ 596131557Stjr { 597131557Stjr wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ 598131557Stjr } 599131557Stjr } 600131557Stjr else 601131557Stjr { 602131557Stjr if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 603131557Stjr wc = fetch_wc(("Unbalanced [")); 604131557Stjr } 605131557Stjr 606131564Stjr if (wc1 == WEOF) 607131557Stjr wc1 = fetch_wc(_("Unbalanced [")); 608131557Stjr 609131557Stjr if (wc1 == L'-') 610131557Stjr /* build range characters. */ 611131557Stjr { 612131557Stjr wc2 = fetch_wc(_("Unbalanced [")); 613131557Stjr if (wc2 == L']') 614131557Stjr { 615131557Stjr /* In the case [x-], the - is an ordinary hyphen, 616131557Stjr which is left in c1, the lookahead character. */ 617131557Stjr lexptr -= cur_mb_len; 618131557Stjr lexleft += cur_mb_len; 619131557Stjr wc2 = wc; 620131557Stjr } 621131557Stjr else 622131557Stjr { 623131557Stjr if (wc2 == L'\\' 624131557Stjr && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 625131557Stjr wc2 = fetch_wc(_("Unbalanced [")); 626131557Stjr wc1 = fetch_wc(_("Unbalanced [")); 627131557Stjr } 628131557Stjr 629131557Stjr if (range_sts_al == 0) 630131557Stjr { 631131557Stjr MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); 632131557Stjr MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); 633131557Stjr } 634131557Stjr REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, 635131557Stjr range_sts_al, work_mbc->nranges + 1); 636131564Stjr work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; 637131557Stjr REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, 638131557Stjr range_ends_al, work_mbc->nranges + 1); 639131564Stjr work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; 640131557Stjr } 641131564Stjr else if (wc != WEOF) 642131557Stjr /* build normal characters. */ 643131557Stjr { 644131557Stjr REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 645131557Stjr work_mbc->nchars + 1); 646131564Stjr work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; 647131557Stjr } 648131557Stjr } 649131557Stjr while ((wc = wc1) != L']'); 650131557Stjr} 651131557Stjr#endif /* MBS_SUPPORT */ 652131557Stjr 65353472Sobrien#ifdef __STDC__ 65453472Sobrien#define FUNC(F, P) static int F(int c) { return P(c); } 65553472Sobrien#else 65653451Speter#define FUNC(F, P) static int F(c) int c; { return P(c); } 65753472Sobrien#endif 65853451Speter 65953451SpeterFUNC(is_alpha, ISALPHA) 66053451SpeterFUNC(is_upper, ISUPPER) 66153451SpeterFUNC(is_lower, ISLOWER) 66253451SpeterFUNC(is_digit, ISDIGIT) 66353451SpeterFUNC(is_xdigit, ISXDIGIT) 66453451SpeterFUNC(is_space, ISSPACE) 66553451SpeterFUNC(is_punct, ISPUNCT) 66653451SpeterFUNC(is_alnum, ISALNUM) 66753451SpeterFUNC(is_print, ISPRINT) 66853451SpeterFUNC(is_graph, ISGRAPH) 66953451SpeterFUNC(is_cntrl, ISCNTRL) 67053451Speter 67156919Srustatic int 67256919Sruis_blank (int c) 67353472Sobrien{ 67453472Sobrien return (c == ' ' || c == '\t'); 67553472Sobrien} 67653472Sobrien 67753451Speter/* The following list maps the names of the Posix named character classes 67853451Speter to predicate functions that determine whether a given character is in 67953451Speter the class. The leading [ has already been eaten by the lexical analyzer. */ 68053451Speterstatic struct { 68153472Sobrien const char *name; 68253472Sobrien int (*pred) PARAMS ((int)); 683131557Stjr} const prednames[] = { 68453472Sobrien { ":alpha:]", is_alpha }, 68553472Sobrien { ":upper:]", is_upper }, 68653472Sobrien { ":lower:]", is_lower }, 68753472Sobrien { ":digit:]", is_digit }, 68853472Sobrien { ":xdigit:]", is_xdigit }, 68953472Sobrien { ":space:]", is_space }, 69053472Sobrien { ":punct:]", is_punct }, 69153472Sobrien { ":alnum:]", is_alnum }, 69253472Sobrien { ":print:]", is_print }, 69353472Sobrien { ":graph:]", is_graph }, 69453472Sobrien { ":cntrl:]", is_cntrl }, 69553472Sobrien { ":blank:]", is_blank }, 69653472Sobrien { 0 } 69753451Speter}; 69853451Speter 69953472Sobrien/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 70053472Sobrien#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 70153472Sobrien 70253451Speterstatic int 70356919Srulooking_at (char const *s) 70453451Speter{ 70553472Sobrien size_t len; 70653451Speter 70753451Speter len = strlen(s); 70853451Speter if (lexleft < len) 70953451Speter return 0; 71053451Speter return strncmp(s, lexptr, len) == 0; 71153451Speter} 71253451Speter 71353451Speterstatic token 71456919Srulex (void) 71553451Speter{ 716131557Stjr unsigned c, c1, c2; 71753451Speter int backslash = 0, invert; 71853451Speter charclass ccl; 71953451Speter int i; 72053451Speter 72153451Speter /* Basic plan: We fetch a character. If it's a backslash, 72253451Speter we set the backslash flag and go through the loop again. 72353451Speter On the plus side, this avoids having a duplicate of the 72453451Speter main switch inside the backslash case. On the minus side, 72553451Speter it means that just about every case begins with 72653451Speter "if (backslash) ...". */ 72753451Speter for (i = 0; i < 2; ++i) 72853451Speter { 72953451Speter FETCH(c, 0); 730131557Stjr#ifdef MBS_SUPPORT 731131557Stjr if (MB_CUR_MAX > 1 && cur_mb_index) 732131557Stjr /* If this is a part of a multi-byte character, we must treat 733131557Stjr this byte data as a normal character. 734131557Stjr e.g. In case of SJIS encoding, some character contains '\', 735131557Stjr but they must not be backslash. */ 736131557Stjr goto normal_char; 737131557Stjr#endif /* MBS_SUPPORT */ 73853451Speter switch (c) 73953451Speter { 74053451Speter case '\\': 74153451Speter if (backslash) 74253451Speter goto normal_char; 74353451Speter if (lexleft == 0) 74453472Sobrien dfaerror(_("Unfinished \\ escape")); 74553451Speter backslash = 1; 74653451Speter break; 74753451Speter 74853451Speter case '^': 74953451Speter if (backslash) 75053451Speter goto normal_char; 75153451Speter if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 75253451Speter || lasttok == END 75353451Speter || lasttok == LPAREN 75453451Speter || lasttok == OR) 75553451Speter return lasttok = BEGLINE; 75653451Speter goto normal_char; 75753451Speter 75853451Speter case '$': 75953451Speter if (backslash) 76053451Speter goto normal_char; 76153451Speter if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 76253451Speter || lexleft == 0 76353451Speter || (syntax_bits & RE_NO_BK_PARENS 76453451Speter ? lexleft > 0 && *lexptr == ')' 76553451Speter : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 76653451Speter || (syntax_bits & RE_NO_BK_VBAR 76753451Speter ? lexleft > 0 && *lexptr == '|' 76853451Speter : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 76953451Speter || ((syntax_bits & RE_NEWLINE_ALT) 77053451Speter && lexleft > 0 && *lexptr == '\n')) 77153451Speter return lasttok = ENDLINE; 77253451Speter goto normal_char; 77353451Speter 77453451Speter case '1': 77553451Speter case '2': 77653451Speter case '3': 77753451Speter case '4': 77853451Speter case '5': 77953451Speter case '6': 78053451Speter case '7': 78153451Speter case '8': 78253451Speter case '9': 78353451Speter if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 78453451Speter { 78553451Speter laststart = 0; 78653451Speter return lasttok = BACKREF; 78753451Speter } 78853451Speter goto normal_char; 78953451Speter 79053472Sobrien case '`': 79153472Sobrien if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 79253472Sobrien return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 79353472Sobrien goto normal_char; 79453472Sobrien 79553472Sobrien case '\'': 79653472Sobrien if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 79753472Sobrien return lasttok = ENDLINE; /* FIXME: should be end of string */ 79853472Sobrien goto normal_char; 79953472Sobrien 80053451Speter case '<': 80153472Sobrien if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 80253451Speter return lasttok = BEGWORD; 80353451Speter goto normal_char; 80453451Speter 80553451Speter case '>': 80653472Sobrien if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 80753451Speter return lasttok = ENDWORD; 80853451Speter goto normal_char; 80953451Speter 81053451Speter case 'b': 81153472Sobrien if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 81253451Speter return lasttok = LIMWORD; 81353451Speter goto normal_char; 81453451Speter 81553451Speter case 'B': 81653472Sobrien if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 81753451Speter return lasttok = NOTLIMWORD; 81853451Speter goto normal_char; 81953451Speter 82053451Speter case '?': 82153451Speter if (syntax_bits & RE_LIMITED_OPS) 82253451Speter goto normal_char; 82353451Speter if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 82453451Speter goto normal_char; 82553451Speter if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 82653451Speter goto normal_char; 82753451Speter return lasttok = QMARK; 82853451Speter 82953451Speter case '*': 83053451Speter if (backslash) 83153451Speter goto normal_char; 83253451Speter if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 83353451Speter goto normal_char; 83453451Speter return lasttok = STAR; 83553451Speter 83653451Speter case '+': 83753451Speter if (syntax_bits & RE_LIMITED_OPS) 83853451Speter goto normal_char; 83953451Speter if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 84053451Speter goto normal_char; 84153451Speter if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 84253451Speter goto normal_char; 84353451Speter return lasttok = PLUS; 84453451Speter 84553451Speter case '{': 84653451Speter if (!(syntax_bits & RE_INTERVALS)) 84753451Speter goto normal_char; 84853451Speter if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 84953451Speter goto normal_char; 85055379Sobrien if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 85155379Sobrien goto normal_char; 85255379Sobrien 85355379Sobrien if (syntax_bits & RE_NO_BK_BRACES) 85455379Sobrien { 85555379Sobrien /* Scan ahead for a valid interval; if it's not valid, 85655379Sobrien treat it as a literal '{'. */ 85755379Sobrien int lo = -1, hi = -1; 85855379Sobrien char const *p = lexptr; 85955379Sobrien char const *lim = p + lexleft; 86056919Sru for (; p != lim && ISASCIIDIGIT (*p); p++) 86155379Sobrien lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 86255379Sobrien if (p != lim && *p == ',') 86356919Sru while (++p != lim && ISASCIIDIGIT (*p)) 86455379Sobrien hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 86555379Sobrien else 86655379Sobrien hi = lo; 86755379Sobrien if (p == lim || *p != '}' 86855379Sobrien || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 86955379Sobrien goto normal_char; 87055379Sobrien } 87155379Sobrien 87255379Sobrien minrep = 0; 87353451Speter /* Cases: 87453451Speter {M} - exact count 87553451Speter {M,} - minimum count, maximum is infinity 87653451Speter {M,N} - M through N */ 87753472Sobrien FETCH(c, _("unfinished repeat count")); 87856919Sru if (ISASCIIDIGIT (c)) 87953451Speter { 88053451Speter minrep = c - '0'; 88153451Speter for (;;) 88253451Speter { 88353472Sobrien FETCH(c, _("unfinished repeat count")); 88456919Sru if (! ISASCIIDIGIT (c)) 88553451Speter break; 88653451Speter minrep = 10 * minrep + c - '0'; 88753451Speter } 88853451Speter } 88955379Sobrien else 89053472Sobrien dfaerror(_("malformed repeat count")); 89153451Speter if (c == ',') 89255379Sobrien { 89355379Sobrien FETCH (c, _("unfinished repeat count")); 89456919Sru if (! ISASCIIDIGIT (c)) 89555379Sobrien maxrep = -1; 89655379Sobrien else 89755379Sobrien { 89855379Sobrien maxrep = c - '0'; 89955379Sobrien for (;;) 90055379Sobrien { 90155379Sobrien FETCH (c, _("unfinished repeat count")); 90256919Sru if (! ISASCIIDIGIT (c)) 90355379Sobrien break; 90455379Sobrien maxrep = 10 * maxrep + c - '0'; 90555379Sobrien } 90655379Sobrien if (0 <= maxrep && maxrep < minrep) 90755379Sobrien dfaerror (_("malformed repeat count")); 90855379Sobrien } 90955379Sobrien } 91053451Speter else 91153451Speter maxrep = minrep; 91253451Speter if (!(syntax_bits & RE_NO_BK_BRACES)) 91353451Speter { 91453451Speter if (c != '\\') 91553472Sobrien dfaerror(_("malformed repeat count")); 91653472Sobrien FETCH(c, _("unfinished repeat count")); 91753451Speter } 91853451Speter if (c != '}') 91953472Sobrien dfaerror(_("malformed repeat count")); 92053451Speter laststart = 0; 92153451Speter return lasttok = REPMN; 92253451Speter 92353451Speter case '|': 92453451Speter if (syntax_bits & RE_LIMITED_OPS) 92553451Speter goto normal_char; 92653451Speter if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 92753451Speter goto normal_char; 92853451Speter laststart = 1; 92953451Speter return lasttok = OR; 93053451Speter 93153451Speter case '\n': 93253451Speter if (syntax_bits & RE_LIMITED_OPS 93353451Speter || backslash 93453451Speter || !(syntax_bits & RE_NEWLINE_ALT)) 93553451Speter goto normal_char; 93653451Speter laststart = 1; 93753451Speter return lasttok = OR; 93853451Speter 93953451Speter case '(': 94053451Speter if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 94153451Speter goto normal_char; 94253451Speter ++parens; 94353451Speter laststart = 1; 94453451Speter return lasttok = LPAREN; 94553451Speter 94653451Speter case ')': 94753451Speter if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 94853451Speter goto normal_char; 94953451Speter if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 95053451Speter goto normal_char; 95153451Speter --parens; 95253451Speter laststart = 0; 95353451Speter return lasttok = RPAREN; 95453451Speter 95553451Speter case '.': 95653451Speter if (backslash) 95753451Speter goto normal_char; 958131557Stjr#ifdef MBS_SUPPORT 959131557Stjr if (MB_CUR_MAX > 1) 960131557Stjr { 961131557Stjr /* In multibyte environment period must match with a single 962131557Stjr character not a byte. So we use ANYCHAR. */ 963131557Stjr laststart = 0; 964131557Stjr return lasttok = ANYCHAR; 965131557Stjr } 966131557Stjr#endif /* MBS_SUPPORT */ 96753451Speter zeroset(ccl); 96853451Speter notset(ccl); 96953451Speter if (!(syntax_bits & RE_DOT_NEWLINE)) 97055379Sobrien clrbit(eolbyte, ccl); 97153451Speter if (syntax_bits & RE_DOT_NOT_NULL) 97253451Speter clrbit('\0', ccl); 97353451Speter laststart = 0; 97453451Speter return lasttok = CSET + charclass_index(ccl); 97553451Speter 97653451Speter case 'w': 97753451Speter case 'W': 97853472Sobrien if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 97953451Speter goto normal_char; 98053451Speter zeroset(ccl); 98153451Speter for (c2 = 0; c2 < NOTCHAR; ++c2) 98253472Sobrien if (IS_WORD_CONSTITUENT(c2)) 98353451Speter setbit(c2, ccl); 98453451Speter if (c == 'W') 98553451Speter notset(ccl); 98653451Speter laststart = 0; 98753451Speter return lasttok = CSET + charclass_index(ccl); 98853451Speter 98953451Speter case '[': 99053451Speter if (backslash) 99153451Speter goto normal_char; 992131557Stjr laststart = 0; 993131557Stjr#ifdef MBS_SUPPORT 994131557Stjr if (MB_CUR_MAX > 1) 995131557Stjr { 996131557Stjr /* In multibyte environment a bracket expression may contain 997131557Stjr multibyte characters, which must be treated as characters 998131557Stjr (not bytes). So we parse it by parse_bracket_exp_mb(). */ 999131557Stjr parse_bracket_exp_mb(); 1000131557Stjr return lasttok = MBCSET; 1001131557Stjr } 1002131557Stjr#endif 100353451Speter zeroset(ccl); 100453472Sobrien FETCH(c, _("Unbalanced [")); 100553451Speter if (c == '^') 100653451Speter { 100753472Sobrien FETCH(c, _("Unbalanced [")); 100853451Speter invert = 1; 100953451Speter } 101053451Speter else 101153451Speter invert = 0; 101253451Speter do 101353451Speter { 101453451Speter /* Nobody ever said this had to be fast. :-) 101553451Speter Note that if we're looking at some other [:...:] 101653451Speter construct, we just treat it as a bunch of ordinary 101753451Speter characters. We can do this because we assume 101853451Speter regex has checked for syntax errors before 101953451Speter dfa is ever called. */ 102053451Speter if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 102153451Speter for (c1 = 0; prednames[c1].name; ++c1) 102253451Speter if (looking_at(prednames[c1].name)) 102353451Speter { 1024131557Stjr int (*pred) PARAMS ((int)) = prednames[c1].pred; 102553472Sobrien 102653451Speter for (c2 = 0; c2 < NOTCHAR; ++c2) 102753472Sobrien if ((*pred)(c2)) 1028131557Stjr setbit_case_fold (c2, ccl); 102953451Speter lexptr += strlen(prednames[c1].name); 103053451Speter lexleft -= strlen(prednames[c1].name); 103153472Sobrien FETCH(c1, _("Unbalanced [")); 103253451Speter goto skip; 103353451Speter } 103453451Speter if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 103553472Sobrien FETCH(c, _("Unbalanced [")); 103653472Sobrien FETCH(c1, _("Unbalanced [")); 103753451Speter if (c1 == '-') 103853451Speter { 103953472Sobrien FETCH(c2, _("Unbalanced [")); 104053451Speter if (c2 == ']') 104153451Speter { 104253451Speter /* In the case [x-], the - is an ordinary hyphen, 104353451Speter which is left in c1, the lookahead character. */ 104453451Speter --lexptr; 104553451Speter ++lexleft; 104653451Speter } 104753451Speter else 104853451Speter { 104953451Speter if (c2 == '\\' 105053451Speter && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 105153472Sobrien FETCH(c2, _("Unbalanced [")); 105253472Sobrien FETCH(c1, _("Unbalanced [")); 1053131557Stjr if (!hard_LC_COLLATE) { 1054131557Stjr for (; c <= c2; c++) 1055131557Stjr setbit_case_fold (c, ccl); 1056131557Stjr } else { 1057131557Stjr /* POSIX locales are painful - leave the decision to libc */ 1058131557Stjr char expr[6] = { '[', c, '-', c2, ']', '\0' }; 1059131557Stjr regex_t re; 1060131557Stjr if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) { 1061131557Stjr for (c = 0; c < NOTCHAR; ++c) { 1062131557Stjr char buf[2] = { c, '\0' }; 1063131557Stjr regmatch_t mat; 1064131557Stjr if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR 1065131557Stjr && mat.rm_so == 0 && mat.rm_eo == 1) 1066131557Stjr setbit_case_fold (c, ccl); 1067131557Stjr } 1068131557Stjr regfree (&re); 106956919Sru } 1070131557Stjr } 1071131557Stjr continue; 107256919Sru } 107353451Speter } 107456919Sru 1075131557Stjr setbit_case_fold (c, ccl); 1076131557Stjr 107753451Speter skip: 107853451Speter ; 107953451Speter } 108053451Speter while ((c = c1) != ']'); 108153451Speter if (invert) 108253451Speter { 108353451Speter notset(ccl); 108453451Speter if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 108555379Sobrien clrbit(eolbyte, ccl); 108653451Speter } 108753451Speter return lasttok = CSET + charclass_index(ccl); 108853451Speter 108953451Speter default: 109053451Speter normal_char: 109153451Speter laststart = 0; 109253451Speter if (case_fold && ISALPHA(c)) 109353451Speter { 109453451Speter zeroset(ccl); 1095131557Stjr setbit_case_fold (c, ccl); 109653451Speter return lasttok = CSET + charclass_index(ccl); 109753451Speter } 109853451Speter return c; 109953451Speter } 110053451Speter } 110153451Speter 110253451Speter /* The above loop should consume at most a backslash 110353451Speter and some other character. */ 110453451Speter abort(); 110553472Sobrien return END; /* keeps pedantic compilers happy. */ 110653451Speter} 110753451Speter 110853451Speter/* Recursive descent parser for regular expressions. */ 110953451Speter 111053451Speterstatic token tok; /* Lookahead token. */ 111153472Sobrienstatic int depth; /* Current depth of a hypothetical stack 111253451Speter holding deferred productions. This is 111353451Speter used to determine the depth that will be 111453451Speter required of the real stack later on in 111553451Speter dfaanalyze(). */ 111653451Speter 111753451Speter/* Add the given token to the parse tree, maintaining the depth count and 111853451Speter updating the maximum depth if necessary. */ 111953451Speterstatic void 112056919Sruaddtok (token t) 112153451Speter{ 1122131557Stjr#ifdef MBS_SUPPORT 1123131557Stjr if (MB_CUR_MAX > 1) 1124131557Stjr { 1125131557Stjr REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, 1126131557Stjr dfa->tindex); 1127131557Stjr /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ 1128131557Stjr if (t == MBCSET) 1129131557Stjr dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; 1130131557Stjr else if (t < NOTCHAR) 1131131557Stjr dfa->multibyte_prop[dfa->tindex] 1132131557Stjr = (cur_mb_len == 1)? 3 /* single-byte char */ 1133131557Stjr : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ 1134131557Stjr + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ 1135131557Stjr else 1136131557Stjr /* It may be unnecesssary, but it is safer to treat other 1137131557Stjr symbols as singlebyte characters. */ 1138131557Stjr dfa->multibyte_prop[dfa->tindex] = 3; 1139131557Stjr } 1140131557Stjr#endif 1141131557Stjr 114253451Speter REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 114353451Speter dfa->tokens[dfa->tindex++] = t; 114453451Speter 114553451Speter switch (t) 114653451Speter { 114753451Speter case QMARK: 114853451Speter case STAR: 114953451Speter case PLUS: 115053451Speter break; 115153451Speter 115253451Speter case CAT: 115353451Speter case OR: 115453451Speter case ORTOP: 115553451Speter --depth; 115653451Speter break; 115753451Speter 115853451Speter default: 115953451Speter ++dfa->nleaves; 116053451Speter case EMPTY: 116153451Speter ++depth; 116253451Speter break; 116353451Speter } 116453451Speter if (depth > dfa->depth) 116553451Speter dfa->depth = depth; 116653451Speter} 116753451Speter 116853451Speter/* The grammar understood by the parser is as follows. 116953451Speter 117053451Speter regexp: 117153451Speter regexp OR branch 117253451Speter branch 117353451Speter 117453451Speter branch: 117553451Speter branch closure 117653451Speter closure 117753451Speter 117853451Speter closure: 117953451Speter closure QMARK 118053451Speter closure STAR 118153451Speter closure PLUS 1182131557Stjr closure REPMN 118353451Speter atom 118453451Speter 118553451Speter atom: 118653451Speter <normal character> 1187131557Stjr <multibyte character> 1188131557Stjr ANYCHAR 1189131557Stjr MBCSET 119053451Speter CSET 119153451Speter BACKREF 119253451Speter BEGLINE 119353451Speter ENDLINE 119453451Speter BEGWORD 119553451Speter ENDWORD 119653451Speter LIMWORD 119753451Speter NOTLIMWORD 1198131557Stjr CRANGE 1199131557Stjr LPAREN regexp RPAREN 120053451Speter <empty> 120153451Speter 120253451Speter The parser builds a parse tree in postfix form in an array of tokens. */ 120353451Speter 120453451Speterstatic void 120556919Sruatom (void) 120653451Speter{ 120753451Speter if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 120853451Speter || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 1209131557Stjr#ifdef MBS_SUPPORT 1210131557Stjr || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ 1211131557Stjr#endif /* MBS_SUPPORT */ 121253451Speter || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 121353451Speter { 121453451Speter addtok(tok); 121553451Speter tok = lex(); 1216131557Stjr#ifdef MBS_SUPPORT 1217131557Stjr /* We treat a multibyte character as a single atom, so that DFA 1218131557Stjr can treat a multibyte character as a single expression. 1219131557Stjr 1220131557Stjr e.g. We construct following tree from "<mb1><mb2>". 1221131557Stjr <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> 1222131557Stjr <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> 1223131557Stjr */ 1224131557Stjr if (MB_CUR_MAX > 1) 1225131557Stjr { 1226131557Stjr while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) 1227131557Stjr { 1228131557Stjr addtok(tok); 1229131557Stjr addtok(CAT); 1230131557Stjr tok = lex(); 1231131557Stjr } 1232131557Stjr } 1233131557Stjr#endif /* MBS_SUPPORT */ 123453451Speter } 1235131557Stjr else if (tok == CRANGE) 1236131557Stjr { 1237131557Stjr /* A character range like "[a-z]" in a locale other than "C" or 1238131557Stjr "POSIX". This range might any sequence of one or more 1239131557Stjr characters. Unfortunately the POSIX locale primitives give 1240131557Stjr us no practical way to find what character sequences might be 1241131557Stjr matched. Treat this approximately like "(.\1)" -- i.e. match 1242131557Stjr one character, and then punt to the full matcher. */ 1243131557Stjr charclass ccl; 1244131557Stjr zeroset (ccl); 1245131557Stjr notset (ccl); 1246131557Stjr addtok (CSET + charclass_index (ccl)); 1247131557Stjr addtok (BACKREF); 1248131557Stjr addtok (CAT); 1249131557Stjr tok = lex (); 1250131557Stjr } 125153451Speter else if (tok == LPAREN) 125253451Speter { 125353451Speter tok = lex(); 125453451Speter regexp(0); 125553451Speter if (tok != RPAREN) 125653472Sobrien dfaerror(_("Unbalanced (")); 125753451Speter tok = lex(); 125853451Speter } 125953451Speter else 126053451Speter addtok(EMPTY); 126153451Speter} 126253451Speter 126353451Speter/* Return the number of tokens in the given subexpression. */ 126453451Speterstatic int 126556919Srunsubtoks (int tindex) 126653451Speter{ 126753451Speter int ntoks1; 126853451Speter 126953451Speter switch (dfa->tokens[tindex - 1]) 127053451Speter { 127153451Speter default: 127253451Speter return 1; 127353451Speter case QMARK: 127453451Speter case STAR: 127553451Speter case PLUS: 127653451Speter return 1 + nsubtoks(tindex - 1); 127753451Speter case CAT: 127853451Speter case OR: 127953451Speter case ORTOP: 128053451Speter ntoks1 = nsubtoks(tindex - 1); 128153451Speter return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 128253451Speter } 128353451Speter} 128453451Speter 128553451Speter/* Copy the given subexpression to the top of the tree. */ 128653451Speterstatic void 128756919Srucopytoks (int tindex, int ntokens) 128853451Speter{ 128953451Speter int i; 129053451Speter 129153451Speter for (i = 0; i < ntokens; ++i) 129253451Speter addtok(dfa->tokens[tindex + i]); 129353451Speter} 129453451Speter 129553451Speterstatic void 129656919Sruclosure (void) 129753451Speter{ 129853451Speter int tindex, ntokens, i; 129953451Speter 130053451Speter atom(); 130153451Speter while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 130253451Speter if (tok == REPMN) 130353451Speter { 130453451Speter ntokens = nsubtoks(dfa->tindex); 130553451Speter tindex = dfa->tindex - ntokens; 130655379Sobrien if (maxrep < 0) 130753451Speter addtok(PLUS); 130853451Speter if (minrep == 0) 130953451Speter addtok(QMARK); 131053451Speter for (i = 1; i < minrep; ++i) 131153451Speter { 131253451Speter copytoks(tindex, ntokens); 131353451Speter addtok(CAT); 131453451Speter } 131553451Speter for (; i < maxrep; ++i) 131653451Speter { 131753451Speter copytoks(tindex, ntokens); 131853451Speter addtok(QMARK); 131953451Speter addtok(CAT); 132053451Speter } 132153451Speter tok = lex(); 132253451Speter } 132353451Speter else 132453451Speter { 132553451Speter addtok(tok); 132653451Speter tok = lex(); 132753451Speter } 132853451Speter} 132953451Speter 133053451Speterstatic void 133156919Srubranch (void) 133253451Speter{ 133353451Speter closure(); 133453451Speter while (tok != RPAREN && tok != OR && tok >= 0) 133553451Speter { 133653451Speter closure(); 133753451Speter addtok(CAT); 133853451Speter } 133953451Speter} 134053451Speter 134153451Speterstatic void 134256919Sruregexp (int toplevel) 134353451Speter{ 134453451Speter branch(); 134553451Speter while (tok == OR) 134653451Speter { 134753451Speter tok = lex(); 134853451Speter branch(); 134953451Speter if (toplevel) 135053451Speter addtok(ORTOP); 135153451Speter else 135253451Speter addtok(OR); 135353451Speter } 135453451Speter} 135553451Speter 135653451Speter/* Main entry point for the parser. S is a string to be parsed, len is the 135753451Speter length of the string, so s can include NUL characters. D is a pointer to 135853451Speter the struct dfa to parse into. */ 135953451Spetervoid 1360131557Stjrdfaparse (char const *s, size_t len, struct dfa *d) 136153451Speter{ 136253451Speter dfa = d; 136353451Speter lexstart = lexptr = s; 136453451Speter lexleft = len; 136553451Speter lasttok = END; 136653451Speter laststart = 1; 136753451Speter parens = 0; 1368131557Stjr hard_LC_COLLATE = hard_locale (LC_COLLATE); 1369131557Stjr#ifdef MBS_SUPPORT 1370131557Stjr if (MB_CUR_MAX > 1) 1371131557Stjr { 1372131557Stjr cur_mb_index = 0; 1373131557Stjr cur_mb_len = 0; 1374131557Stjr memset(&mbs, 0, sizeof(mbstate_t)); 1375131557Stjr } 1376131557Stjr#endif /* MBS_SUPPORT */ 137753451Speter 137853451Speter if (! syntax_bits_set) 137953472Sobrien dfaerror(_("No syntax specified")); 138053451Speter 138153451Speter tok = lex(); 138253451Speter depth = d->depth; 138353451Speter 138453451Speter regexp(1); 138553451Speter 138653451Speter if (tok != END) 138753472Sobrien dfaerror(_("Unbalanced )")); 138853451Speter 138953451Speter addtok(END - d->nregexps); 139053451Speter addtok(CAT); 139153451Speter 139253451Speter if (d->nregexps) 139353451Speter addtok(ORTOP); 139453451Speter 139553451Speter ++d->nregexps; 139653451Speter} 139753451Speter 139853451Speter/* Some primitives for operating on sets of positions. */ 139953451Speter 140053451Speter/* Copy one set to another; the destination must be large enough. */ 140153451Speterstatic void 1402131557Stjrcopy (position_set const *src, position_set *dst) 140353451Speter{ 140453451Speter int i; 140553451Speter 140653451Speter for (i = 0; i < src->nelem; ++i) 140753451Speter dst->elems[i] = src->elems[i]; 140853451Speter dst->nelem = src->nelem; 140953451Speter} 141053451Speter 141153451Speter/* Insert a position in a set. Position sets are maintained in sorted 141253451Speter order according to index. If position already exists in the set with 141353451Speter the same index then their constraints are logically or'd together. 141453451Speter S->elems must point to an array large enough to hold the resulting set. */ 141553451Speterstatic void 141656919Sruinsert (position p, position_set *s) 141753451Speter{ 141853451Speter int i; 141953451Speter position t1, t2; 142053451Speter 142153451Speter for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 142253472Sobrien continue; 142353451Speter if (i < s->nelem && p.index == s->elems[i].index) 142453451Speter s->elems[i].constraint |= p.constraint; 142553451Speter else 142653451Speter { 142753451Speter t1 = p; 142853451Speter ++s->nelem; 142953451Speter while (i < s->nelem) 143053451Speter { 143153451Speter t2 = s->elems[i]; 143253451Speter s->elems[i++] = t1; 143353451Speter t1 = t2; 143453451Speter } 143553451Speter } 143653451Speter} 143753451Speter 143853451Speter/* Merge two sets of positions into a third. The result is exactly as if 143953451Speter the positions of both sets were inserted into an initially empty set. */ 144053451Speterstatic void 1441131557Stjrmerge (position_set const *s1, position_set const *s2, position_set *m) 144253451Speter{ 144353451Speter int i = 0, j = 0; 144453451Speter 144553451Speter m->nelem = 0; 144653451Speter while (i < s1->nelem && j < s2->nelem) 144753451Speter if (s1->elems[i].index > s2->elems[j].index) 144853451Speter m->elems[m->nelem++] = s1->elems[i++]; 144953451Speter else if (s1->elems[i].index < s2->elems[j].index) 145053451Speter m->elems[m->nelem++] = s2->elems[j++]; 145153451Speter else 145253451Speter { 145353451Speter m->elems[m->nelem] = s1->elems[i++]; 145453451Speter m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 145553451Speter } 145653451Speter while (i < s1->nelem) 145753451Speter m->elems[m->nelem++] = s1->elems[i++]; 145853451Speter while (j < s2->nelem) 145953451Speter m->elems[m->nelem++] = s2->elems[j++]; 146053451Speter} 146153451Speter 146253451Speter/* Delete a position from a set. */ 146353451Speterstatic void 146456919Srudelete (position p, position_set *s) 146553451Speter{ 146653451Speter int i; 146753451Speter 146853451Speter for (i = 0; i < s->nelem; ++i) 146953451Speter if (p.index == s->elems[i].index) 147053451Speter break; 147153451Speter if (i < s->nelem) 147253451Speter for (--s->nelem; i < s->nelem; ++i) 147353451Speter s->elems[i] = s->elems[i + 1]; 147453451Speter} 147553451Speter 147653451Speter/* Find the index of the state corresponding to the given position set with 147753451Speter the given preceding context, or create a new state if there is no such 147853451Speter state. Newline and letter tell whether we got here on a newline or 147953451Speter letter, respectively. */ 148053451Speterstatic int 1481131557Stjrstate_index (struct dfa *d, position_set const *s, int newline, int letter) 148253451Speter{ 148353451Speter int hash = 0; 148453451Speter int constraint; 148553451Speter int i, j; 148653451Speter 148753451Speter newline = newline ? 1 : 0; 148853451Speter letter = letter ? 1 : 0; 148953451Speter 149053451Speter for (i = 0; i < s->nelem; ++i) 149153451Speter hash ^= s->elems[i].index + s->elems[i].constraint; 149253451Speter 149353451Speter /* Try to find a state that exactly matches the proposed one. */ 149453451Speter for (i = 0; i < d->sindex; ++i) 149553451Speter { 149653451Speter if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 149753451Speter || newline != d->states[i].newline || letter != d->states[i].letter) 149853451Speter continue; 149953451Speter for (j = 0; j < s->nelem; ++j) 150053451Speter if (s->elems[j].constraint 150153451Speter != d->states[i].elems.elems[j].constraint 150253451Speter || s->elems[j].index != d->states[i].elems.elems[j].index) 150353451Speter break; 150453451Speter if (j == s->nelem) 150553451Speter return i; 150653451Speter } 150753451Speter 150853451Speter /* We'll have to create a new state. */ 150953451Speter REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 151053451Speter d->states[i].hash = hash; 151153451Speter MALLOC(d->states[i].elems.elems, position, s->nelem); 151253451Speter copy(s, &d->states[i].elems); 151353451Speter d->states[i].newline = newline; 151453451Speter d->states[i].letter = letter; 151553451Speter d->states[i].backref = 0; 151653451Speter d->states[i].constraint = 0; 151753451Speter d->states[i].first_end = 0; 1518131557Stjr#ifdef MBS_SUPPORT 1519131557Stjr if (MB_CUR_MAX > 1) 1520131557Stjr d->states[i].mbps.nelem = 0; 1521131557Stjr#endif 152253451Speter for (j = 0; j < s->nelem; ++j) 152353451Speter if (d->tokens[s->elems[j].index] < 0) 152453451Speter { 152553451Speter constraint = s->elems[j].constraint; 152653451Speter if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 152753451Speter || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 152853451Speter || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 152953451Speter || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 153053451Speter d->states[i].constraint |= constraint; 153153451Speter if (! d->states[i].first_end) 153253451Speter d->states[i].first_end = d->tokens[s->elems[j].index]; 153353451Speter } 153453451Speter else if (d->tokens[s->elems[j].index] == BACKREF) 153553451Speter { 153653451Speter d->states[i].constraint = NO_CONSTRAINT; 153753451Speter d->states[i].backref = 1; 153853451Speter } 153953451Speter 154053451Speter ++d->sindex; 154153451Speter 154253451Speter return i; 154353451Speter} 154453451Speter 154553451Speter/* Find the epsilon closure of a set of positions. If any position of the set 154653451Speter contains a symbol that matches the empty string in some context, replace 154753451Speter that position with the elements of its follow labeled with an appropriate 154853451Speter constraint. Repeat exhaustively until no funny positions are left. 154953451Speter S->elems must be large enough to hold the result. */ 155053472Sobrienstatic void 1551131557Stjrepsclosure (position_set *s, struct dfa const *d) 155253451Speter{ 155353451Speter int i, j; 155453451Speter int *visited; 155553451Speter position p, old; 155653451Speter 155753451Speter MALLOC(visited, int, d->tindex); 155853451Speter for (i = 0; i < d->tindex; ++i) 155953451Speter visited[i] = 0; 156053451Speter 156153451Speter for (i = 0; i < s->nelem; ++i) 156253451Speter if (d->tokens[s->elems[i].index] >= NOTCHAR 156353451Speter && d->tokens[s->elems[i].index] != BACKREF 1564131557Stjr#ifdef MBS_SUPPORT 1565131557Stjr && d->tokens[s->elems[i].index] != ANYCHAR 1566131557Stjr && d->tokens[s->elems[i].index] != MBCSET 1567131557Stjr#endif 156853451Speter && d->tokens[s->elems[i].index] < CSET) 156953451Speter { 157053451Speter old = s->elems[i]; 157153451Speter p.constraint = old.constraint; 157253451Speter delete(s->elems[i], s); 157353451Speter if (visited[old.index]) 157453451Speter { 157553451Speter --i; 157653451Speter continue; 157753451Speter } 157853451Speter visited[old.index] = 1; 157953451Speter switch (d->tokens[old.index]) 158053451Speter { 158153451Speter case BEGLINE: 158253451Speter p.constraint &= BEGLINE_CONSTRAINT; 158353451Speter break; 158453451Speter case ENDLINE: 158553451Speter p.constraint &= ENDLINE_CONSTRAINT; 158653451Speter break; 158753451Speter case BEGWORD: 158853451Speter p.constraint &= BEGWORD_CONSTRAINT; 158953451Speter break; 159053451Speter case ENDWORD: 159153451Speter p.constraint &= ENDWORD_CONSTRAINT; 159253451Speter break; 159353451Speter case LIMWORD: 159453451Speter p.constraint &= LIMWORD_CONSTRAINT; 159553451Speter break; 159653451Speter case NOTLIMWORD: 159753451Speter p.constraint &= NOTLIMWORD_CONSTRAINT; 159853451Speter break; 159953451Speter default: 160053451Speter break; 160153451Speter } 160253451Speter for (j = 0; j < d->follows[old.index].nelem; ++j) 160353451Speter { 160453451Speter p.index = d->follows[old.index].elems[j].index; 160553451Speter insert(p, s); 160653451Speter } 160753451Speter /* Force rescan to start at the beginning. */ 160853451Speter i = -1; 160953451Speter } 161053451Speter 161153451Speter free(visited); 161253451Speter} 161353451Speter 161453451Speter/* Perform bottom-up analysis on the parse tree, computing various functions. 161553451Speter Note that at this point, we're pretending constructs like \< are real 161653451Speter characters rather than constraints on what can follow them. 161753451Speter 161853451Speter Nullable: A node is nullable if it is at the root of a regexp that can 161953451Speter match the empty string. 162053451Speter * EMPTY leaves are nullable. 162153451Speter * No other leaf is nullable. 162253451Speter * A QMARK or STAR node is nullable. 162353451Speter * A PLUS node is nullable if its argument is nullable. 162453451Speter * A CAT node is nullable if both its arguments are nullable. 162553451Speter * An OR node is nullable if either argument is nullable. 162653451Speter 162753451Speter Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 162853451Speter that could correspond to the first character of a string matching the 162953451Speter regexp rooted at the given node. 163053451Speter * EMPTY leaves have empty firstpos. 163153451Speter * The firstpos of a nonempty leaf is that leaf itself. 163253451Speter * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 163353451Speter argument. 163453451Speter * The firstpos of a CAT node is the firstpos of the left argument, union 163553451Speter the firstpos of the right if the left argument is nullable. 163653451Speter * The firstpos of an OR node is the union of firstpos of each argument. 163753451Speter 163853451Speter Lastpos: The lastpos of a node is the set of positions that could 163953451Speter correspond to the last character of a string matching the regexp at 164053451Speter the given node. 164153451Speter * EMPTY leaves have empty lastpos. 164253451Speter * The lastpos of a nonempty leaf is that leaf itself. 164353451Speter * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 164453451Speter argument. 164553451Speter * The lastpos of a CAT node is the lastpos of its right argument, union 164653451Speter the lastpos of the left if the right argument is nullable. 164753451Speter * The lastpos of an OR node is the union of the lastpos of each argument. 164853451Speter 164953451Speter Follow: The follow of a position is the set of positions that could 165053451Speter correspond to the character following a character matching the node in 165153451Speter a string matching the regexp. At this point we consider special symbols 165253451Speter that match the empty string in some context to be just normal characters. 165353451Speter Later, if we find that a special symbol is in a follow set, we will 165453451Speter replace it with the elements of its follow, labeled with an appropriate 165553451Speter constraint. 165653451Speter * Every node in the firstpos of the argument of a STAR or PLUS node is in 165753451Speter the follow of every node in the lastpos. 165853451Speter * Every node in the firstpos of the second argument of a CAT node is in 165953451Speter the follow of every node in the lastpos of the first argument. 166053451Speter 166153451Speter Because of the postfix representation of the parse tree, the depth-first 166253451Speter analysis is conveniently done by a linear scan with the aid of a stack. 166353451Speter Sets are stored as arrays of the elements, obeying a stack-like allocation 166453451Speter scheme; the number of elements in each set deeper in the stack can be 166553451Speter used to determine the address of a particular set's array. */ 166653451Spetervoid 166756919Srudfaanalyze (struct dfa *d, int searchflag) 166853451Speter{ 166953451Speter int *nullable; /* Nullable stack. */ 167053451Speter int *nfirstpos; /* Element count stack for firstpos sets. */ 167153451Speter position *firstpos; /* Array where firstpos elements are stored. */ 167253451Speter int *nlastpos; /* Element count stack for lastpos sets. */ 167353451Speter position *lastpos; /* Array where lastpos elements are stored. */ 167453451Speter int *nalloc; /* Sizes of arrays allocated to follow sets. */ 167553451Speter position_set tmp; /* Temporary set for merging sets. */ 167653451Speter position_set merged; /* Result of merging sets. */ 167753451Speter int wants_newline; /* True if some position wants newline info. */ 167853451Speter int *o_nullable; 167953451Speter int *o_nfirst, *o_nlast; 168053451Speter position *o_firstpos, *o_lastpos; 168153451Speter int i, j; 168253451Speter position *pos; 168353451Speter 168453451Speter#ifdef DEBUG 168553451Speter fprintf(stderr, "dfaanalyze:\n"); 168653451Speter for (i = 0; i < d->tindex; ++i) 168753451Speter { 168853451Speter fprintf(stderr, " %d:", i); 168953451Speter prtok(d->tokens[i]); 169053451Speter } 169153451Speter putc('\n', stderr); 169253451Speter#endif 169353451Speter 169453451Speter d->searchflag = searchflag; 169553451Speter 169653451Speter MALLOC(nullable, int, d->depth); 169753451Speter o_nullable = nullable; 169853451Speter MALLOC(nfirstpos, int, d->depth); 169953451Speter o_nfirst = nfirstpos; 170053451Speter MALLOC(firstpos, position, d->nleaves); 170153451Speter o_firstpos = firstpos, firstpos += d->nleaves; 170253451Speter MALLOC(nlastpos, int, d->depth); 170353451Speter o_nlast = nlastpos; 170453451Speter MALLOC(lastpos, position, d->nleaves); 170553451Speter o_lastpos = lastpos, lastpos += d->nleaves; 170653451Speter MALLOC(nalloc, int, d->tindex); 170753451Speter for (i = 0; i < d->tindex; ++i) 170853451Speter nalloc[i] = 0; 170953451Speter MALLOC(merged.elems, position, d->nleaves); 171053451Speter 171153451Speter CALLOC(d->follows, position_set, d->tindex); 171253451Speter 171353451Speter for (i = 0; i < d->tindex; ++i) 171453451Speter#ifdef DEBUG 171553451Speter { /* Nonsyntactic #ifdef goo... */ 171653451Speter#endif 171753451Speter switch (d->tokens[i]) 171853451Speter { 171953451Speter case EMPTY: 172053451Speter /* The empty set is nullable. */ 172153451Speter *nullable++ = 1; 172253451Speter 172353451Speter /* The firstpos and lastpos of the empty leaf are both empty. */ 172453451Speter *nfirstpos++ = *nlastpos++ = 0; 172553451Speter break; 172653451Speter 172753451Speter case STAR: 172853451Speter case PLUS: 172953451Speter /* Every element in the firstpos of the argument is in the follow 173053451Speter of every element in the lastpos. */ 173153451Speter tmp.nelem = nfirstpos[-1]; 173253451Speter tmp.elems = firstpos; 173353451Speter pos = lastpos; 173453451Speter for (j = 0; j < nlastpos[-1]; ++j) 173553451Speter { 173653451Speter merge(&tmp, &d->follows[pos[j].index], &merged); 173753451Speter REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 173853451Speter nalloc[pos[j].index], merged.nelem - 1); 173953451Speter copy(&merged, &d->follows[pos[j].index]); 174053451Speter } 174153451Speter 174253451Speter case QMARK: 174353451Speter /* A QMARK or STAR node is automatically nullable. */ 174453451Speter if (d->tokens[i] != PLUS) 174553451Speter nullable[-1] = 1; 174653451Speter break; 174753451Speter 174853451Speter case CAT: 174953451Speter /* Every element in the firstpos of the second argument is in the 175053451Speter follow of every element in the lastpos of the first argument. */ 175153451Speter tmp.nelem = nfirstpos[-1]; 175253451Speter tmp.elems = firstpos; 175353451Speter pos = lastpos + nlastpos[-1]; 175453451Speter for (j = 0; j < nlastpos[-2]; ++j) 175553451Speter { 175653451Speter merge(&tmp, &d->follows[pos[j].index], &merged); 175753451Speter REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 175853451Speter nalloc[pos[j].index], merged.nelem - 1); 175953451Speter copy(&merged, &d->follows[pos[j].index]); 176053451Speter } 176153451Speter 176253451Speter /* The firstpos of a CAT node is the firstpos of the first argument, 176353451Speter union that of the second argument if the first is nullable. */ 176453451Speter if (nullable[-2]) 176553451Speter nfirstpos[-2] += nfirstpos[-1]; 176653451Speter else 176753451Speter firstpos += nfirstpos[-1]; 176853451Speter --nfirstpos; 176953451Speter 177053451Speter /* The lastpos of a CAT node is the lastpos of the second argument, 177153451Speter union that of the first argument if the second is nullable. */ 177253451Speter if (nullable[-1]) 177353451Speter nlastpos[-2] += nlastpos[-1]; 177453451Speter else 177553451Speter { 177653451Speter pos = lastpos + nlastpos[-2]; 177753451Speter for (j = nlastpos[-1] - 1; j >= 0; --j) 177853451Speter pos[j] = lastpos[j]; 177953451Speter lastpos += nlastpos[-2]; 178053451Speter nlastpos[-2] = nlastpos[-1]; 178153451Speter } 178253451Speter --nlastpos; 178353451Speter 178453451Speter /* A CAT node is nullable if both arguments are nullable. */ 178553451Speter nullable[-2] = nullable[-1] && nullable[-2]; 178653451Speter --nullable; 178753451Speter break; 178853451Speter 178953451Speter case OR: 179053451Speter case ORTOP: 179153451Speter /* The firstpos is the union of the firstpos of each argument. */ 179253451Speter nfirstpos[-2] += nfirstpos[-1]; 179353451Speter --nfirstpos; 179453451Speter 179553451Speter /* The lastpos is the union of the lastpos of each argument. */ 179653451Speter nlastpos[-2] += nlastpos[-1]; 179753451Speter --nlastpos; 179853451Speter 179953451Speter /* An OR node is nullable if either argument is nullable. */ 180053451Speter nullable[-2] = nullable[-1] || nullable[-2]; 180153451Speter --nullable; 180253451Speter break; 180353451Speter 180453451Speter default: 180553451Speter /* Anything else is a nonempty position. (Note that special 180653451Speter constructs like \< are treated as nonempty strings here; 180753451Speter an "epsilon closure" effectively makes them nullable later. 180853451Speter Backreferences have to get a real position so we can detect 180953451Speter transitions on them later. But they are nullable. */ 181053451Speter *nullable++ = d->tokens[i] == BACKREF; 181153451Speter 181253451Speter /* This position is in its own firstpos and lastpos. */ 181353451Speter *nfirstpos++ = *nlastpos++ = 1; 181453451Speter --firstpos, --lastpos; 181553451Speter firstpos->index = lastpos->index = i; 181653451Speter firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 181753451Speter 181853451Speter /* Allocate the follow set for this position. */ 181953451Speter nalloc[i] = 1; 182053451Speter MALLOC(d->follows[i].elems, position, nalloc[i]); 182153451Speter break; 182253451Speter } 182353451Speter#ifdef DEBUG 182453451Speter /* ... balance the above nonsyntactic #ifdef goo... */ 182553451Speter fprintf(stderr, "node %d:", i); 182653451Speter prtok(d->tokens[i]); 182753451Speter putc('\n', stderr); 182853451Speter fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 182953451Speter fprintf(stderr, " firstpos:"); 183053451Speter for (j = nfirstpos[-1] - 1; j >= 0; --j) 183153451Speter { 183253451Speter fprintf(stderr, " %d:", firstpos[j].index); 183353451Speter prtok(d->tokens[firstpos[j].index]); 183453451Speter } 183553451Speter fprintf(stderr, "\n lastpos:"); 183653451Speter for (j = nlastpos[-1] - 1; j >= 0; --j) 183753451Speter { 183853451Speter fprintf(stderr, " %d:", lastpos[j].index); 183953451Speter prtok(d->tokens[lastpos[j].index]); 184053451Speter } 184153451Speter putc('\n', stderr); 184253451Speter } 184353451Speter#endif 184453451Speter 184553451Speter /* For each follow set that is the follow set of a real position, replace 184653451Speter it with its epsilon closure. */ 184753451Speter for (i = 0; i < d->tindex; ++i) 184853451Speter if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1849131557Stjr#ifdef MBS_SUPPORT 1850131557Stjr || d->tokens[i] == ANYCHAR 1851131557Stjr || d->tokens[i] == MBCSET 1852131557Stjr#endif 185353451Speter || d->tokens[i] >= CSET) 185453451Speter { 185553451Speter#ifdef DEBUG 185653451Speter fprintf(stderr, "follows(%d:", i); 185753451Speter prtok(d->tokens[i]); 185853451Speter fprintf(stderr, "):"); 185953451Speter for (j = d->follows[i].nelem - 1; j >= 0; --j) 186053451Speter { 186153451Speter fprintf(stderr, " %d:", d->follows[i].elems[j].index); 186253451Speter prtok(d->tokens[d->follows[i].elems[j].index]); 186353451Speter } 186453451Speter putc('\n', stderr); 186553451Speter#endif 186653451Speter copy(&d->follows[i], &merged); 186753451Speter epsclosure(&merged, d); 186853451Speter if (d->follows[i].nelem < merged.nelem) 186953451Speter REALLOC(d->follows[i].elems, position, merged.nelem); 187053451Speter copy(&merged, &d->follows[i]); 187153451Speter } 187253451Speter 187353451Speter /* Get the epsilon closure of the firstpos of the regexp. The result will 187453451Speter be the set of positions of state 0. */ 187553451Speter merged.nelem = 0; 187653451Speter for (i = 0; i < nfirstpos[-1]; ++i) 187753451Speter insert(firstpos[i], &merged); 187853451Speter epsclosure(&merged, d); 187953451Speter 188053451Speter /* Check if any of the positions of state 0 will want newline context. */ 188153451Speter wants_newline = 0; 188253451Speter for (i = 0; i < merged.nelem; ++i) 188353451Speter if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 188453451Speter wants_newline = 1; 188553451Speter 188653451Speter /* Build the initial state. */ 188753451Speter d->salloc = 1; 188853451Speter d->sindex = 0; 188953451Speter MALLOC(d->states, dfa_state, d->salloc); 189053451Speter state_index(d, &merged, wants_newline, 0); 189153451Speter 189253451Speter free(o_nullable); 189353451Speter free(o_nfirst); 189453451Speter free(o_firstpos); 189553451Speter free(o_nlast); 189653451Speter free(o_lastpos); 189753451Speter free(nalloc); 189853451Speter free(merged.elems); 189953451Speter} 190053451Speter 190153451Speter/* Find, for each character, the transition out of state s of d, and store 190253451Speter it in the appropriate slot of trans. 190353451Speter 190453451Speter We divide the positions of s into groups (positions can appear in more 190553451Speter than one group). Each group is labeled with a set of characters that 190653451Speter every position in the group matches (taking into account, if necessary, 190753451Speter preceding context information of s). For each group, find the union 190853451Speter of the its elements' follows. This set is the set of positions of the 190953451Speter new state. For each character in the group's label, set the transition 191053451Speter on this character to be to a state corresponding to the set's positions, 191153451Speter and its associated backward context information, if necessary. 191253451Speter 191353451Speter If we are building a searching matcher, we include the positions of state 191453451Speter 0 in every state. 191553451Speter 191653451Speter The collection of groups is constructed by building an equivalence-class 191753451Speter partition of the positions of s. 191853451Speter 191953451Speter For each position, find the set of characters C that it matches. Eliminate 192053451Speter any characters from C that fail on grounds of backward context. 192153451Speter 192253451Speter Search through the groups, looking for a group whose label L has nonempty 192353451Speter intersection with C. If L - C is nonempty, create a new group labeled 192453451Speter L - C and having the same positions as the current group, and set L to 192553451Speter the intersection of L and C. Insert the position in this group, set 192653451Speter C = C - L, and resume scanning. 192753451Speter 192853451Speter If after comparing with every group there are characters remaining in C, 192953451Speter create a new group labeled with the characters of C and insert this 193053451Speter position in that group. */ 193153451Spetervoid 193256919Srudfastate (int s, struct dfa *d, int trans[]) 193353451Speter{ 193453451Speter position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 193553451Speter charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 193653451Speter int ngrps = 0; /* Number of groups actually used. */ 193753451Speter position pos; /* Current position being considered. */ 193853451Speter charclass matches; /* Set of matching characters. */ 193953451Speter int matchesf; /* True if matches is nonempty. */ 194053451Speter charclass intersect; /* Intersection with some label set. */ 194153451Speter int intersectf; /* True if intersect is nonempty. */ 194253451Speter charclass leftovers; /* Stuff in the label that didn't match. */ 194353451Speter int leftoversf; /* True if leftovers is nonempty. */ 194453451Speter static charclass letters; /* Set of characters considered letters. */ 194553451Speter static charclass newline; /* Set of characters that aren't newline. */ 194653451Speter position_set follows; /* Union of the follows of some group. */ 194753451Speter position_set tmp; /* Temporary space for merging sets. */ 194853451Speter int state; /* New state. */ 194953451Speter int wants_newline; /* New state wants to know newline context. */ 195053451Speter int state_newline; /* New state on a newline transition. */ 195153451Speter int wants_letter; /* New state wants to know letter context. */ 195253451Speter int state_letter; /* New state on a letter transition. */ 195353472Sobrien static int initialized; /* Flag for static initialization. */ 1954131557Stjr#ifdef MBS_SUPPORT 1955131557Stjr int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */ 1956131557Stjr#endif 195753451Speter int i, j, k; 195853451Speter 195953451Speter /* Initialize the set of letters, if necessary. */ 196053451Speter if (! initialized) 196153451Speter { 196253451Speter initialized = 1; 196353451Speter for (i = 0; i < NOTCHAR; ++i) 196453472Sobrien if (IS_WORD_CONSTITUENT(i)) 196553451Speter setbit(i, letters); 196655379Sobrien setbit(eolbyte, newline); 196753451Speter } 196853451Speter 196953451Speter zeroset(matches); 197053451Speter 197153451Speter for (i = 0; i < d->states[s].elems.nelem; ++i) 197253451Speter { 197353451Speter pos = d->states[s].elems.elems[i]; 197453451Speter if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 197553451Speter setbit(d->tokens[pos.index], matches); 197653451Speter else if (d->tokens[pos.index] >= CSET) 197753451Speter copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1978131557Stjr#ifdef MBS_SUPPORT 1979131557Stjr else if (d->tokens[pos.index] == ANYCHAR 1980131557Stjr || d->tokens[pos.index] == MBCSET) 1981131557Stjr /* MB_CUR_MAX > 1 */ 1982131557Stjr { 1983131557Stjr /* ANYCHAR and MBCSET must match with a single character, so we 1984131557Stjr must put it to d->states[s].mbps, which contains the positions 1985131557Stjr which can match with a single character not a byte. */ 1986131557Stjr if (d->states[s].mbps.nelem == 0) 1987131557Stjr { 1988131557Stjr MALLOC(d->states[s].mbps.elems, position, 1989131557Stjr d->states[s].elems.nelem); 1990131557Stjr } 1991131557Stjr insert(pos, &(d->states[s].mbps)); 1992131557Stjr continue; 1993131557Stjr } 1994131557Stjr#endif /* MBS_SUPPORT */ 199553451Speter else 199653451Speter continue; 199753451Speter 199853451Speter /* Some characters may need to be eliminated from matches because 199953451Speter they fail in the current context. */ 200053451Speter if (pos.constraint != 0xFF) 200153451Speter { 200253451Speter if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 200353451Speter d->states[s].newline, 1)) 200455379Sobrien clrbit(eolbyte, matches); 200553451Speter if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 200653451Speter d->states[s].newline, 0)) 200753451Speter for (j = 0; j < CHARCLASS_INTS; ++j) 200853451Speter matches[j] &= newline[j]; 200953451Speter if (! MATCHES_LETTER_CONTEXT(pos.constraint, 201053451Speter d->states[s].letter, 1)) 201153451Speter for (j = 0; j < CHARCLASS_INTS; ++j) 201253451Speter matches[j] &= ~letters[j]; 201353451Speter if (! MATCHES_LETTER_CONTEXT(pos.constraint, 201453451Speter d->states[s].letter, 0)) 201553451Speter for (j = 0; j < CHARCLASS_INTS; ++j) 201653451Speter matches[j] &= letters[j]; 201753451Speter 201853451Speter /* If there are no characters left, there's no point in going on. */ 201953451Speter for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 202053472Sobrien continue; 202153451Speter if (j == CHARCLASS_INTS) 202253451Speter continue; 202353451Speter } 202453451Speter 202553451Speter for (j = 0; j < ngrps; ++j) 202653451Speter { 202753451Speter /* If matches contains a single character only, and the current 202853451Speter group's label doesn't contain that character, go on to the 202953451Speter next group. */ 203053451Speter if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 203153451Speter && !tstbit(d->tokens[pos.index], labels[j])) 203253451Speter continue; 203353451Speter 203453451Speter /* Check if this group's label has a nonempty intersection with 203553451Speter matches. */ 203653451Speter intersectf = 0; 203753451Speter for (k = 0; k < CHARCLASS_INTS; ++k) 203853472Sobrien (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 203953451Speter if (! intersectf) 204053451Speter continue; 204153451Speter 204253451Speter /* It does; now find the set differences both ways. */ 204353451Speter leftoversf = matchesf = 0; 204453451Speter for (k = 0; k < CHARCLASS_INTS; ++k) 204553451Speter { 204653451Speter /* Even an optimizing compiler can't know this for sure. */ 204753451Speter int match = matches[k], label = labels[j][k]; 204853451Speter 204953472Sobrien (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 205053472Sobrien (matches[k] = match & ~label) ? (matchesf = 1) : 0; 205153451Speter } 205253451Speter 205353451Speter /* If there were leftovers, create a new group labeled with them. */ 205453451Speter if (leftoversf) 205553451Speter { 205653451Speter copyset(leftovers, labels[ngrps]); 205753451Speter copyset(intersect, labels[j]); 205853451Speter MALLOC(grps[ngrps].elems, position, d->nleaves); 205953451Speter copy(&grps[j], &grps[ngrps]); 206053451Speter ++ngrps; 206153451Speter } 206253451Speter 206353451Speter /* Put the position in the current group. Note that there is no 206453451Speter reason to call insert() here. */ 206553451Speter grps[j].elems[grps[j].nelem++] = pos; 206653451Speter 206753451Speter /* If every character matching the current position has been 206853451Speter accounted for, we're done. */ 206953451Speter if (! matchesf) 207053451Speter break; 207153451Speter } 207253451Speter 207353451Speter /* If we've passed the last group, and there are still characters 207453451Speter unaccounted for, then we'll have to create a new group. */ 207553451Speter if (j == ngrps) 207653451Speter { 207753451Speter copyset(matches, labels[ngrps]); 207853451Speter zeroset(matches); 207953451Speter MALLOC(grps[ngrps].elems, position, d->nleaves); 208053451Speter grps[ngrps].nelem = 1; 208153451Speter grps[ngrps].elems[0] = pos; 208253451Speter ++ngrps; 208353451Speter } 208453451Speter } 208553451Speter 208653451Speter MALLOC(follows.elems, position, d->nleaves); 208753451Speter MALLOC(tmp.elems, position, d->nleaves); 208853451Speter 208953451Speter /* If we are a searching matcher, the default transition is to a state 209053451Speter containing the positions of state 0, otherwise the default transition 209153451Speter is to fail miserably. */ 209253451Speter if (d->searchflag) 209353451Speter { 209453451Speter wants_newline = 0; 209553451Speter wants_letter = 0; 209653451Speter for (i = 0; i < d->states[0].elems.nelem; ++i) 209753451Speter { 209853451Speter if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 209953451Speter wants_newline = 1; 210053451Speter if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 210153451Speter wants_letter = 1; 210253451Speter } 210353451Speter copy(&d->states[0].elems, &follows); 210453451Speter state = state_index(d, &follows, 0, 0); 210553451Speter if (wants_newline) 210653451Speter state_newline = state_index(d, &follows, 1, 0); 210753451Speter else 210853451Speter state_newline = state; 210953451Speter if (wants_letter) 211053451Speter state_letter = state_index(d, &follows, 0, 1); 211153451Speter else 211253451Speter state_letter = state; 211353451Speter for (i = 0; i < NOTCHAR; ++i) 211453472Sobrien trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 211555379Sobrien trans[eolbyte] = state_newline; 211653451Speter } 211753451Speter else 211853451Speter for (i = 0; i < NOTCHAR; ++i) 211953451Speter trans[i] = -1; 212053451Speter 212153451Speter for (i = 0; i < ngrps; ++i) 212253451Speter { 212353451Speter follows.nelem = 0; 212453451Speter 212553451Speter /* Find the union of the follows of the positions of the group. 212653451Speter This is a hideously inefficient loop. Fix it someday. */ 212753451Speter for (j = 0; j < grps[i].nelem; ++j) 212853451Speter for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 212953451Speter insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 213053451Speter 2131131557Stjr#ifdef MBS_SUPPORT 2132131557Stjr if (MB_CUR_MAX > 1) 2133131557Stjr { 2134131557Stjr /* If a token in follows.elems is not 1st byte of a multibyte 2135131557Stjr character, or the states of follows must accept the bytes 2136131557Stjr which are not 1st byte of the multibyte character. 2137131557Stjr Then, if a state of follows encounter a byte, it must not be 2138131557Stjr a 1st byte of a multibyte character nor singlebyte character. 2139131557Stjr We cansel to add state[0].follows to next state, because 2140131557Stjr state[0] must accept 1st-byte 2141131557Stjr 2142131557Stjr For example, we assume <sb a> is a certain singlebyte 2143131557Stjr character, <mb A> is a certain multibyte character, and the 2144131557Stjr codepoint of <sb a> equals the 2nd byte of the codepoint of 2145131557Stjr <mb A>. 2146131557Stjr When state[0] accepts <sb a>, state[i] transit to state[i+1] 2147131557Stjr by accepting accepts 1st byte of <mb A>, and state[i+1] 2148131557Stjr accepts 2nd byte of <mb A>, if state[i+1] encounter the 2149131557Stjr codepoint of <sb a>, it must not be <sb a> but 2nd byte of 2150131557Stjr <mb A>, so we can not add state[0]. */ 2151131557Stjr 2152131557Stjr next_isnt_1st_byte = 0; 2153131557Stjr for (j = 0; j < follows.nelem; ++j) 2154131557Stjr { 2155131557Stjr if (!(d->multibyte_prop[follows.elems[j].index] & 1)) 2156131557Stjr { 2157131557Stjr next_isnt_1st_byte = 1; 2158131557Stjr break; 2159131557Stjr } 2160131557Stjr } 2161131557Stjr } 2162131557Stjr#endif 2163131557Stjr 216453451Speter /* If we are building a searching matcher, throw in the positions 216553451Speter of state 0 as well. */ 2166131557Stjr#ifdef MBS_SUPPORT 2167131557Stjr if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) 2168131557Stjr#else 216953451Speter if (d->searchflag) 2170131557Stjr#endif 217153451Speter for (j = 0; j < d->states[0].elems.nelem; ++j) 217253451Speter insert(d->states[0].elems.elems[j], &follows); 217353451Speter 217453451Speter /* Find out if the new state will want any context information. */ 217553451Speter wants_newline = 0; 217655379Sobrien if (tstbit(eolbyte, labels[i])) 217753451Speter for (j = 0; j < follows.nelem; ++j) 217853451Speter if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 217953451Speter wants_newline = 1; 218053451Speter 218153451Speter wants_letter = 0; 218253451Speter for (j = 0; j < CHARCLASS_INTS; ++j) 218353451Speter if (labels[i][j] & letters[j]) 218453451Speter break; 218553451Speter if (j < CHARCLASS_INTS) 218653451Speter for (j = 0; j < follows.nelem; ++j) 218753451Speter if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 218853451Speter wants_letter = 1; 218953451Speter 219053451Speter /* Find the state(s) corresponding to the union of the follows. */ 219153451Speter state = state_index(d, &follows, 0, 0); 219253451Speter if (wants_newline) 219353451Speter state_newline = state_index(d, &follows, 1, 0); 219453451Speter else 219553451Speter state_newline = state; 219653451Speter if (wants_letter) 219753451Speter state_letter = state_index(d, &follows, 0, 1); 219853451Speter else 219953451Speter state_letter = state; 220053451Speter 220153451Speter /* Set the transitions for each character in the current label. */ 220253451Speter for (j = 0; j < CHARCLASS_INTS; ++j) 220353451Speter for (k = 0; k < INTBITS; ++k) 220453451Speter if (labels[i][j] & 1 << k) 220553451Speter { 220653451Speter int c = j * INTBITS + k; 220753451Speter 220855379Sobrien if (c == eolbyte) 220953451Speter trans[c] = state_newline; 221053472Sobrien else if (IS_WORD_CONSTITUENT(c)) 221153451Speter trans[c] = state_letter; 221253451Speter else if (c < NOTCHAR) 221353451Speter trans[c] = state; 221453451Speter } 221553451Speter } 221653451Speter 221753451Speter for (i = 0; i < ngrps; ++i) 221853451Speter free(grps[i].elems); 221953451Speter free(follows.elems); 222053451Speter free(tmp.elems); 222153451Speter} 222253451Speter 222353451Speter/* Some routines for manipulating a compiled dfa's transition tables. 222453451Speter Each state may or may not have a transition table; if it does, and it 222553451Speter is a non-accepting state, then d->trans[state] points to its table. 222653451Speter If it is an accepting state then d->fails[state] points to its table. 222753451Speter If it has no table at all, then d->trans[state] is NULL. 222853451Speter TODO: Improve this comment, get rid of the unnecessary redundancy. */ 222953451Speter 223053451Speterstatic void 223156919Srubuild_state (int s, struct dfa *d) 223253451Speter{ 223353451Speter int *trans; /* The new transition table. */ 223453451Speter int i; 223553451Speter 223653451Speter /* Set an upper limit on the number of transition tables that will ever 223753451Speter exist at once. 1024 is arbitrary. The idea is that the frequently 223853451Speter used transition tables will be quickly rebuilt, whereas the ones that 223953451Speter were only needed once or twice will be cleared away. */ 224053451Speter if (d->trcount >= 1024) 224153451Speter { 224253451Speter for (i = 0; i < d->tralloc; ++i) 224353451Speter if (d->trans[i]) 224453451Speter { 224553451Speter free((ptr_t) d->trans[i]); 224653451Speter d->trans[i] = NULL; 224753451Speter } 224853451Speter else if (d->fails[i]) 224953451Speter { 225053451Speter free((ptr_t) d->fails[i]); 225153451Speter d->fails[i] = NULL; 225253451Speter } 225353451Speter d->trcount = 0; 225453451Speter } 225553451Speter 225653451Speter ++d->trcount; 225753451Speter 225853451Speter /* Set up the success bits for this state. */ 225953451Speter d->success[s] = 0; 226053451Speter if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 226153451Speter s, *d)) 226253451Speter d->success[s] |= 4; 226353451Speter if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 226453451Speter s, *d)) 226553451Speter d->success[s] |= 2; 226653451Speter if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 226753451Speter s, *d)) 226853451Speter d->success[s] |= 1; 226953451Speter 227053451Speter MALLOC(trans, int, NOTCHAR); 227153451Speter dfastate(s, d, trans); 227253451Speter 227353451Speter /* Now go through the new transition table, and make sure that the trans 227453451Speter and fail arrays are allocated large enough to hold a pointer for the 227553451Speter largest state mentioned in the table. */ 227653451Speter for (i = 0; i < NOTCHAR; ++i) 227753451Speter if (trans[i] >= d->tralloc) 227853451Speter { 227953451Speter int oldalloc = d->tralloc; 228053451Speter 228153451Speter while (trans[i] >= d->tralloc) 228253451Speter d->tralloc *= 2; 228353451Speter REALLOC(d->realtrans, int *, d->tralloc + 1); 228453451Speter d->trans = d->realtrans + 1; 228553451Speter REALLOC(d->fails, int *, d->tralloc); 228653451Speter REALLOC(d->success, int, d->tralloc); 228753451Speter while (oldalloc < d->tralloc) 228853451Speter { 228953451Speter d->trans[oldalloc] = NULL; 229053451Speter d->fails[oldalloc++] = NULL; 229153451Speter } 229253451Speter } 229353451Speter 2294131557Stjr /* Newline is a sentinel. */ 229555379Sobrien trans[eolbyte] = -1; 229653451Speter 229753451Speter if (ACCEPTING(s, *d)) 229853451Speter d->fails[s] = trans; 229953451Speter else 230053451Speter d->trans[s] = trans; 230153451Speter} 230253451Speter 230353451Speterstatic void 230456919Srubuild_state_zero (struct dfa *d) 230553451Speter{ 230653451Speter d->tralloc = 1; 230753451Speter d->trcount = 0; 230853451Speter CALLOC(d->realtrans, int *, d->tralloc + 1); 230953451Speter d->trans = d->realtrans + 1; 231053451Speter CALLOC(d->fails, int *, d->tralloc); 231153451Speter MALLOC(d->success, int, d->tralloc); 231253451Speter build_state(0, d); 231353451Speter} 231453451Speter 2315131557Stjr#ifdef MBS_SUPPORT 2316131557Stjr/* Multibyte character handling sub-routins for dfaexec. */ 2317131557Stjr 2318131557Stjr/* Initial state may encounter the byte which is not a singlebyte character 2319131557Stjr nor 1st byte of a multibyte character. But it is incorrect for initial 2320131557Stjr state to accept such a byte. 2321131557Stjr For example, in sjis encoding the regular expression like "\\" accepts 2322131557Stjr the codepoint 0x5c, but should not accept the 2nd byte of the codepoint 2323131557Stjr 0x815c. Then Initial state must skip the bytes which are not a singlebyte 2324131557Stjr character nor 1st byte of a multibyte character. */ 2325131557Stjr#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ 2326131557Stjr if (s == 0) \ 2327131557Stjr { \ 2328131557Stjr while (inputwcs[p - buf_begin] == 0 \ 2329131557Stjr && mblen_buf[p - buf_begin] > 0 \ 2330131557Stjr && p < buf_end) \ 2331131557Stjr ++p; \ 2332131557Stjr if (p >= end) \ 2333131557Stjr { \ 2334131557Stjr free(mblen_buf); \ 2335131557Stjr free(inputwcs); \ 2336131557Stjr return (size_t) -1; \ 2337131557Stjr } \ 2338131557Stjr } 2339131557Stjr 2340131557Stjrstatic void 2341131557Stjrrealloc_trans_if_necessary(struct dfa *d, int new_state) 2342131557Stjr{ 2343131557Stjr /* Make sure that the trans and fail arrays are allocated large enough 2344131557Stjr to hold a pointer for the new state. */ 2345131557Stjr if (new_state >= d->tralloc) 2346131557Stjr { 2347131557Stjr int oldalloc = d->tralloc; 2348131557Stjr 2349131557Stjr while (new_state >= d->tralloc) 2350131557Stjr d->tralloc *= 2; 2351131557Stjr REALLOC(d->realtrans, int *, d->tralloc + 1); 2352131557Stjr d->trans = d->realtrans + 1; 2353131557Stjr REALLOC(d->fails, int *, d->tralloc); 2354131557Stjr REALLOC(d->success, int, d->tralloc); 2355131557Stjr while (oldalloc < d->tralloc) 2356131557Stjr { 2357131557Stjr d->trans[oldalloc] = NULL; 2358131557Stjr d->fails[oldalloc++] = NULL; 2359131557Stjr } 2360131557Stjr } 2361131557Stjr} 2362131557Stjr 2363131557Stjr/* Return values of transit_state_singlebyte(), and 2364131557Stjr transit_state_consume_1char. */ 2365131557Stjrtypedef enum 2366131557Stjr{ 2367131557Stjr TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ 2368131557Stjr TRANSIT_STATE_DONE, /* State transition has finished. */ 2369131557Stjr TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ 2370131557Stjr} status_transit_state; 2371131557Stjr 2372131557Stjr/* Consume a single byte and transit state from 's' to '*next_state'. 2373131557Stjr This function is almost same as the state transition routin in dfaexec(). 2374131557Stjr But state transition is done just once, otherwise matching succeed or 2375131557Stjr reach the end of the buffer. */ 2376131557Stjrstatic status_transit_state 2377131557Stjrtransit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, 2378131557Stjr int *next_state) 2379131557Stjr{ 2380131557Stjr int *t; 2381131557Stjr int works = s; 2382131557Stjr 2383131557Stjr status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; 2384131557Stjr 2385131557Stjr while (rval == TRANSIT_STATE_IN_PROGRESS) 2386131557Stjr { 2387131557Stjr if ((t = d->trans[works]) != NULL) 2388131557Stjr { 2389131557Stjr works = t[*p]; 2390131557Stjr rval = TRANSIT_STATE_DONE; 2391131557Stjr if (works < 0) 2392131557Stjr works = 0; 2393131557Stjr } 2394131557Stjr else if (works < 0) 2395131557Stjr { 2396131557Stjr if (p == buf_end) 2397131557Stjr /* At the moment, it must not happen. */ 2398131557Stjr return TRANSIT_STATE_END_BUFFER; 2399131557Stjr works = 0; 2400131557Stjr } 2401131557Stjr else if (d->fails[works]) 2402131557Stjr { 2403131557Stjr works = d->fails[works][*p]; 2404131557Stjr rval = TRANSIT_STATE_DONE; 2405131557Stjr } 2406131557Stjr else 2407131557Stjr { 2408131557Stjr build_state(works, d); 2409131557Stjr } 2410131557Stjr } 2411131557Stjr *next_state = works; 2412131557Stjr return rval; 2413131557Stjr} 2414131557Stjr 2415131557Stjr/* Check whether period can match or not in the current context. If it can, 2416131557Stjr return the amount of the bytes with which period can match, otherwise 2417131557Stjr return 0. 2418131557Stjr `pos' is the position of the period. `index' is the index from the 2419131557Stjr buf_begin, and it is the current position in the buffer. */ 2420131557Stjrstatic int 2421131557Stjrmatch_anychar (struct dfa *d, int s, position pos, int index) 2422131557Stjr{ 2423131557Stjr int newline = 0; 2424131557Stjr int letter = 0; 2425131557Stjr wchar_t wc; 2426131557Stjr int mbclen; 2427131557Stjr 2428131557Stjr wc = inputwcs[index]; 2429131557Stjr mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2430131557Stjr 2431131557Stjr /* Check context. */ 2432131557Stjr if (wc == (wchar_t)eolbyte) 2433131557Stjr { 2434131557Stjr if (!(syntax_bits & RE_DOT_NEWLINE)) 2435131557Stjr return 0; 2436131557Stjr newline = 1; 2437131557Stjr } 2438131557Stjr else if (wc == (wchar_t)'\0') 2439131557Stjr { 2440131557Stjr if (syntax_bits & RE_DOT_NOT_NULL) 2441131557Stjr return 0; 2442131557Stjr newline = 1; 2443131557Stjr } 2444131557Stjr 2445131557Stjr if (iswalnum(wc) || wc == L'_') 2446131557Stjr letter = 1; 2447131557Stjr 2448131557Stjr if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2449131557Stjr newline, d->states[s].letter, letter)) 2450131557Stjr return 0; 2451131557Stjr 2452131557Stjr return mbclen; 2453131557Stjr} 2454131557Stjr 2455131557Stjr/* Check whether bracket expression can match or not in the current context. 2456131557Stjr If it can, return the amount of the bytes with which expression can match, 2457131557Stjr otherwise return 0. 2458131557Stjr `pos' is the position of the bracket expression. `index' is the index 2459131557Stjr from the buf_begin, and it is the current position in the buffer. */ 2460131557Stjrint 2461131557Stjrmatch_mb_charset (struct dfa *d, int s, position pos, int index) 2462131557Stjr{ 2463131557Stjr int i; 2464131557Stjr int match; /* Flag which represent that matching succeed. */ 2465131557Stjr int match_len; /* Length of the character (or collating element) 2466131557Stjr with which this operator match. */ 2467250823Spfg size_t op_len; /* Length of the operator. */ 2468131557Stjr char buffer[128]; 2469131557Stjr wchar_t wcbuf[6]; 2470131557Stjr 2471131557Stjr /* Pointer to the structure to which we are currently reffering. */ 2472131557Stjr struct mb_char_classes *work_mbc; 2473131557Stjr 2474131557Stjr int newline = 0; 2475131557Stjr int letter = 0; 2476131557Stjr wchar_t wc; /* Current reffering character. */ 2477131557Stjr 2478131557Stjr wc = inputwcs[index]; 2479131557Stjr 2480131557Stjr /* Check context. */ 2481131557Stjr if (wc == (wchar_t)eolbyte) 2482131557Stjr { 2483131557Stjr if (!(syntax_bits & RE_DOT_NEWLINE)) 2484131557Stjr return 0; 2485131557Stjr newline = 1; 2486131557Stjr } 2487131557Stjr else if (wc == (wchar_t)'\0') 2488131557Stjr { 2489131557Stjr if (syntax_bits & RE_DOT_NOT_NULL) 2490131557Stjr return 0; 2491131557Stjr newline = 1; 2492131557Stjr } 2493131557Stjr if (iswalnum(wc) || wc == L'_') 2494131557Stjr letter = 1; 2495131557Stjr if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2496131557Stjr newline, d->states[s].letter, letter)) 2497131557Stjr return 0; 2498131557Stjr 2499131557Stjr /* Assign the current reffering operator to work_mbc. */ 2500131557Stjr work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); 2501131557Stjr match = !work_mbc->invert; 2502131557Stjr match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2503131557Stjr 2504131557Stjr /* match with a character class? */ 2505131557Stjr for (i = 0; i<work_mbc->nch_classes; i++) 2506131557Stjr { 2507131557Stjr if (iswctype((wint_t)wc, work_mbc->ch_classes[i])) 2508131557Stjr goto charset_matched; 2509131557Stjr } 2510131557Stjr 2511131557Stjr strncpy(buffer, buf_begin + index, match_len); 2512131557Stjr buffer[match_len] = '\0'; 2513131557Stjr 2514131557Stjr /* match with an equivalent class? */ 2515131557Stjr for (i = 0; i<work_mbc->nequivs; i++) 2516131557Stjr { 2517131557Stjr op_len = strlen(work_mbc->equivs[i]); 2518131557Stjr strncpy(buffer, buf_begin + index, op_len); 2519131557Stjr buffer[op_len] = '\0'; 2520131557Stjr if (strcoll(work_mbc->equivs[i], buffer) == 0) 2521131557Stjr { 2522131557Stjr match_len = op_len; 2523131557Stjr goto charset_matched; 2524131557Stjr } 2525131557Stjr } 2526131557Stjr 2527131557Stjr /* match with a collating element? */ 2528131557Stjr for (i = 0; i<work_mbc->ncoll_elems; i++) 2529131557Stjr { 2530131557Stjr op_len = strlen(work_mbc->coll_elems[i]); 2531131557Stjr strncpy(buffer, buf_begin + index, op_len); 2532131557Stjr buffer[op_len] = '\0'; 2533131557Stjr 2534131557Stjr if (strcoll(work_mbc->coll_elems[i], buffer) == 0) 2535131557Stjr { 2536131557Stjr match_len = op_len; 2537131557Stjr goto charset_matched; 2538131557Stjr } 2539131557Stjr } 2540131557Stjr 2541131557Stjr wcbuf[0] = wc; 2542131557Stjr wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; 2543131557Stjr 2544131557Stjr /* match with a range? */ 2545131557Stjr for (i = 0; i<work_mbc->nranges; i++) 2546131557Stjr { 2547131557Stjr wcbuf[2] = work_mbc->range_sts[i]; 2548131557Stjr wcbuf[4] = work_mbc->range_ends[i]; 2549131557Stjr 2550131557Stjr if (wcscoll(wcbuf, wcbuf+2) >= 0 && 2551131557Stjr wcscoll(wcbuf+4, wcbuf) >= 0) 2552131557Stjr goto charset_matched; 2553131557Stjr } 2554131557Stjr 2555131557Stjr /* match with a character? */ 2556146200Stjr if (case_fold) 2557146200Stjr wc = towlower (wc); 2558131557Stjr for (i = 0; i<work_mbc->nchars; i++) 2559131557Stjr { 2560131557Stjr if (wc == work_mbc->chars[i]) 2561131557Stjr goto charset_matched; 2562131557Stjr } 2563131557Stjr 2564131557Stjr match = !match; 2565131557Stjr 2566131557Stjr charset_matched: 2567131557Stjr return match ? match_len : 0; 2568131557Stjr} 2569131557Stjr 2570131557Stjr/* Check each of `d->states[s].mbps.elem' can match or not. Then return the 2571131557Stjr array which corresponds to `d->states[s].mbps.elem' and each element of 2572131557Stjr the array contains the amount of the bytes with which the element can 2573131557Stjr match. 2574131557Stjr `index' is the index from the buf_begin, and it is the current position 2575131557Stjr in the buffer. 2576131557Stjr Caller MUST free the array which this function return. */ 2577131557Stjrstatic int* 2578131557Stjrcheck_matching_with_multibyte_ops (struct dfa *d, int s, int index) 2579131557Stjr{ 2580131557Stjr int i; 2581131557Stjr int* rarray; 2582131557Stjr 2583131557Stjr MALLOC(rarray, int, d->states[s].mbps.nelem); 2584131557Stjr for (i = 0; i < d->states[s].mbps.nelem; ++i) 2585131557Stjr { 2586131557Stjr position pos = d->states[s].mbps.elems[i]; 2587131557Stjr switch(d->tokens[pos.index]) 2588131557Stjr { 2589131557Stjr case ANYCHAR: 2590131557Stjr rarray[i] = match_anychar(d, s, pos, index); 2591131557Stjr break; 2592131557Stjr case MBCSET: 2593131557Stjr rarray[i] = match_mb_charset(d, s, pos, index); 2594131557Stjr break; 2595131557Stjr default: 2596131557Stjr break; /* can not happen. */ 2597131557Stjr } 2598131557Stjr } 2599131557Stjr return rarray; 2600131557Stjr} 2601131557Stjr 2602131557Stjr/* Consume a single character and enumerate all of the positions which can 2603131557Stjr be next position from the state `s'. 2604131557Stjr `match_lens' is the input. It can be NULL, but it can also be the output 2605131557Stjr of check_matching_with_multibyte_ops() for optimization. 2606131557Stjr `mbclen' and `pps' are the output. `mbclen' is the length of the 2607131557Stjr character consumed, and `pps' is the set this function enumerate. */ 2608131557Stjrstatic status_transit_state 2609131557Stjrtransit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp, 2610131557Stjr int *match_lens, int *mbclen, position_set *pps) 2611131557Stjr{ 2612131557Stjr int i, j; 2613131557Stjr int s1, s2; 2614131557Stjr int* work_mbls; 2615131557Stjr status_transit_state rs = TRANSIT_STATE_DONE; 2616131557Stjr 2617131557Stjr /* Calculate the length of the (single/multi byte) character 2618131557Stjr to which p points. */ 2619131557Stjr *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1 2620131557Stjr : mblen_buf[*pp - buf_begin]; 2621131557Stjr 2622131557Stjr /* Calculate the state which can be reached from the state `s' by 2623131557Stjr consuming `*mbclen' single bytes from the buffer. */ 2624131557Stjr s1 = s; 2625131557Stjr for (i = 0; i < *mbclen; i++) 2626131557Stjr { 2627131557Stjr s2 = s1; 2628131557Stjr rs = transit_state_singlebyte(d, s2, (*pp)++, &s1); 2629131557Stjr } 2630131557Stjr /* Copy the positions contained by `s1' to the set `pps'. */ 2631131557Stjr copy(&(d->states[s1].elems), pps); 2632131557Stjr 2633131557Stjr /* Check (inputed)match_lens, and initialize if it is NULL. */ 2634131557Stjr if (match_lens == NULL && d->states[s].mbps.nelem != 0) 2635131557Stjr work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2636131557Stjr else 2637131557Stjr work_mbls = match_lens; 2638131557Stjr 2639131557Stjr /* Add all of the positions which can be reached from `s' by consuming 2640131557Stjr a single character. */ 2641131557Stjr for (i = 0; i < d->states[s].mbps.nelem ; i++) 2642131557Stjr { 2643131557Stjr if (work_mbls[i] == *mbclen) 2644131557Stjr for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; 2645131557Stjr j++) 2646131557Stjr insert(d->follows[d->states[s].mbps.elems[i].index].elems[j], 2647131557Stjr pps); 2648131557Stjr } 2649131557Stjr 2650131557Stjr if (match_lens == NULL && work_mbls != NULL) 2651131557Stjr free(work_mbls); 2652131557Stjr return rs; 2653131557Stjr} 2654131557Stjr 2655131557Stjr/* Transit state from s, then return new state and update the pointer of the 2656131557Stjr buffer. This function is for some operator which can match with a multi- 2657131557Stjr byte character or a collating element(which may be multi characters). */ 2658131557Stjrstatic int 2659131557Stjrtransit_state (struct dfa *d, int s, unsigned char const **pp) 2660131557Stjr{ 2661131557Stjr int s1; 2662131557Stjr int mbclen; /* The length of current input multibyte character. */ 2663131557Stjr int maxlen = 0; 2664131557Stjr int i, j; 2665131557Stjr int *match_lens = NULL; 2666131557Stjr int nelem = d->states[s].mbps.nelem; /* Just a alias. */ 2667131557Stjr position_set follows; 2668131557Stjr unsigned char const *p1 = *pp; 2669131557Stjr status_transit_state rs; 2670131557Stjr wchar_t wc; 2671131557Stjr 2672131557Stjr if (nelem > 0) 2673131557Stjr /* This state has (a) multibyte operator(s). 2674131557Stjr We check whether each of them can match or not. */ 2675131557Stjr { 2676131557Stjr /* Note: caller must free the return value of this function. */ 2677131557Stjr match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2678131557Stjr 2679131557Stjr for (i = 0; i < nelem; i++) 2680131557Stjr /* Search the operator which match the longest string, 2681131557Stjr in this state. */ 2682131557Stjr { 2683131557Stjr if (match_lens[i] > maxlen) 2684131557Stjr maxlen = match_lens[i]; 2685131557Stjr } 2686131557Stjr } 2687131557Stjr 2688131557Stjr if (nelem == 0 || maxlen == 0) 2689131557Stjr /* This state has no multibyte operator which can match. 2690131557Stjr We need to check only one singlebyte character. */ 2691131557Stjr { 2692131557Stjr status_transit_state rs; 2693131557Stjr rs = transit_state_singlebyte(d, s, *pp, &s1); 2694131557Stjr 2695131557Stjr /* We must update the pointer if state transition succeeded. */ 2696131557Stjr if (rs == TRANSIT_STATE_DONE) 2697131557Stjr ++*pp; 2698131557Stjr 2699131557Stjr if (match_lens != NULL) 2700131557Stjr free(match_lens); 2701131557Stjr return s1; 2702131557Stjr } 2703131557Stjr 2704131557Stjr /* This state has some operators which can match a multibyte character. */ 2705131557Stjr follows.nelem = 0; 2706131557Stjr MALLOC(follows.elems, position, d->nleaves); 2707131557Stjr 2708131557Stjr /* `maxlen' may be longer than the length of a character, because it may 2709131557Stjr not be a character but a (multi character) collating element. 2710131557Stjr We enumerate all of the positions which `s' can reach by consuming 2711131557Stjr `maxlen' bytes. */ 2712131557Stjr rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows); 2713131557Stjr 2714131557Stjr wc = inputwcs[*pp - mbclen - buf_begin]; 2715131557Stjr s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2716131557Stjr realloc_trans_if_necessary(d, s1); 2717131557Stjr 2718131557Stjr while (*pp - p1 < maxlen) 2719131557Stjr { 2720131557Stjr follows.nelem = 0; 2721131557Stjr rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); 2722131557Stjr 2723131557Stjr for (i = 0; i < nelem ; i++) 2724131557Stjr { 2725131557Stjr if (match_lens[i] == *pp - p1) 2726131557Stjr for (j = 0; 2727131557Stjr j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) 2728131557Stjr insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j], 2729131557Stjr &follows); 2730131557Stjr } 2731131557Stjr 2732131557Stjr wc = inputwcs[*pp - mbclen - buf_begin]; 2733131557Stjr s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2734131557Stjr realloc_trans_if_necessary(d, s1); 2735131557Stjr } 2736131557Stjr free(match_lens); 2737131557Stjr free(follows.elems); 2738131557Stjr return s1; 2739131557Stjr} 2740131557Stjr 2741131557Stjr#endif 2742131557Stjr 274353451Speter/* Search through a buffer looking for a match to the given struct dfa. 274453451Speter Find the first occurrence of a string matching the regexp in the buffer, 2745131557Stjr and the shortest possible version thereof. Return the offset of the first 2746131557Stjr character after the match, or (size_t) -1 if none is found. BEGIN points to 2747131557Stjr the beginning of the buffer, and SIZE is the size of the buffer. If SIZE 2748131557Stjr is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place 274953451Speter where we're supposed to store a 1 if backreferencing happened and the 275053451Speter match needs to be verified by a backtracking matcher. Otherwise 275153451Speter we store a 0 in *backref. */ 2752131557Stjrsize_t 2753146199Stjrdfaexec (struct dfa *d, char const *begin, size_t size, int *backref) 275453451Speter{ 2755131557Stjr register int s; /* Current state. */ 2756131557Stjr register unsigned char const *p; /* Current input character. */ 2757131557Stjr register unsigned char const *end; /* One past the last input character. */ 275853472Sobrien register int **trans, *t; /* Copy of d->trans so it can be optimized 275953451Speter into a register. */ 276055379Sobrien register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 276153472Sobrien static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 276253472Sobrien static int sbit_init; 276353451Speter 276453451Speter if (! sbit_init) 276553451Speter { 276653451Speter int i; 276753451Speter 276853451Speter sbit_init = 1; 276953451Speter for (i = 0; i < NOTCHAR; ++i) 277053472Sobrien sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 277155379Sobrien sbit[eol] = 4; 277253451Speter } 277353451Speter 277453451Speter if (! d->tralloc) 277553451Speter build_state_zero(d); 277653451Speter 2777131557Stjr s = 0; 2778131557Stjr p = (unsigned char const *) begin; 2779131557Stjr end = p + size; 278053451Speter trans = d->trans; 278153451Speter 2782131557Stjr#ifdef MBS_SUPPORT 2783131557Stjr if (MB_CUR_MAX > 1) 2784131557Stjr { 2785146199Stjr int remain_bytes, i; 2786131557Stjr buf_begin = begin; 2787131557Stjr buf_end = end; 2788146199Stjr 2789146199Stjr /* initialize mblen_buf, and inputwcs. */ 2790146199Stjr MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); 2791146199Stjr MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); 2792146199Stjr memset(&mbs, 0, sizeof(mbstate_t)); 2793146199Stjr remain_bytes = 0; 2794146199Stjr for (i = 0; i < end - (unsigned char const *)begin + 1; i++) 2795131576Stjr { 2796146199Stjr if (remain_bytes == 0) 2797131557Stjr { 2798146199Stjr remain_bytes 2799146199Stjr = mbrtowc(inputwcs + i, begin + i, 2800146199Stjr end - (unsigned char const *)begin - i + 1, &mbs); 2801146199Stjr if (remain_bytes <= 1) 2802131557Stjr { 2803146199Stjr remain_bytes = 0; 2804146199Stjr inputwcs[i] = (wchar_t)begin[i]; 2805146199Stjr mblen_buf[i] = 0; 2806131557Stjr } 2807131557Stjr else 2808131557Stjr { 2809131557Stjr mblen_buf[i] = remain_bytes; 2810131557Stjr remain_bytes--; 2811131557Stjr } 2812131557Stjr } 2813146199Stjr else 2814131557Stjr { 2815146199Stjr mblen_buf[i] = remain_bytes; 2816146199Stjr inputwcs[i] = 0; 2817146199Stjr remain_bytes--; 2818131557Stjr } 2819131557Stjr } 2820146199Stjr mblen_buf[i] = 0; 2821146199Stjr inputwcs[i] = 0; /* sentinel */ 2822131557Stjr } 2823131557Stjr#endif /* MBS_SUPPORT */ 2824131557Stjr 282553451Speter for (;;) 282653451Speter { 2827131557Stjr#ifdef MBS_SUPPORT 2828131557Stjr if (MB_CUR_MAX > 1) 2829131557Stjr while ((t = trans[s])) 2830131557Stjr { 2831131557Stjr if (d->states[s].mbps.nelem != 0) 2832131557Stjr { 2833131557Stjr /* Can match with a multibyte character( and multi character 2834131557Stjr collating element). */ 2835131557Stjr unsigned char const *nextp; 283653451Speter 2837131557Stjr SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2838131557Stjr 2839131557Stjr nextp = p; 2840131557Stjr s = transit_state(d, s, &nextp); 2841131557Stjr p = nextp; 2842131557Stjr 2843131557Stjr /* Trans table might be updated. */ 2844131557Stjr trans = d->trans; 2845131557Stjr } 2846131557Stjr else 2847131557Stjr { 2848131557Stjr SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2849131557Stjr s = t[*p++]; 2850131557Stjr } 2851131557Stjr } 2852131557Stjr else 2853131557Stjr#endif /* MBS_SUPPORT */ 2854131557Stjr while ((t = trans[s])) 2855131557Stjr s = t[*p++]; 2856131557Stjr 2857131557Stjr if (s < 0) 285853451Speter { 2859131557Stjr if (p == end) 2860131557Stjr { 2861131557Stjr#ifdef MBS_SUPPORT 2862131557Stjr if (MB_CUR_MAX > 1) 2863131557Stjr { 2864131557Stjr free(mblen_buf); 2865131557Stjr free(inputwcs); 2866131557Stjr } 2867131557Stjr#endif /* MBS_SUPPORT */ 2868131557Stjr return (size_t) -1; 2869131557Stjr } 2870131557Stjr s = 0; 2871131557Stjr } 2872131557Stjr else if ((t = d->fails[s])) 2873131557Stjr { 287453451Speter if (d->success[s] & sbit[*p]) 287553451Speter { 287653451Speter if (backref) 287753472Sobrien *backref = (d->states[s].backref != 0); 2878131557Stjr#ifdef MBS_SUPPORT 2879131557Stjr if (MB_CUR_MAX > 1) 2880131557Stjr { 2881131557Stjr free(mblen_buf); 2882131557Stjr free(inputwcs); 2883131557Stjr } 2884131557Stjr#endif /* MBS_SUPPORT */ 2885131557Stjr return (char const *) p - begin; 288653451Speter } 288753451Speter 2888131557Stjr#ifdef MBS_SUPPORT 2889131557Stjr if (MB_CUR_MAX > 1) 2890131557Stjr { 2891131557Stjr SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2892131557Stjr if (d->states[s].mbps.nelem != 0) 2893131557Stjr { 2894131557Stjr /* Can match with a multibyte character( and multi 2895131557Stjr character collating element). */ 2896131557Stjr unsigned char const *nextp; 2897131557Stjr nextp = p; 2898131557Stjr s = transit_state(d, s, &nextp); 2899131557Stjr p = nextp; 2900131557Stjr 2901131557Stjr /* Trans table might be updated. */ 2902131557Stjr trans = d->trans; 2903131557Stjr } 2904131557Stjr else 2905131557Stjr s = t[*p++]; 2906131557Stjr } 2907131557Stjr else 2908131557Stjr#endif /* MBS_SUPPORT */ 2909131557Stjr s = t[*p++]; 291053451Speter } 2911131557Stjr else 291253451Speter { 291353451Speter build_state(s, d); 291453451Speter trans = d->trans; 291553451Speter } 291653451Speter } 291753451Speter} 291853451Speter 291953451Speter/* Initialize the components of a dfa that the other routines don't 292053451Speter initialize for themselves. */ 292153451Spetervoid 292256919Srudfainit (struct dfa *d) 292353451Speter{ 292453451Speter d->calloc = 1; 292553451Speter MALLOC(d->charclasses, charclass, d->calloc); 292653451Speter d->cindex = 0; 292753451Speter 292853451Speter d->talloc = 1; 292953451Speter MALLOC(d->tokens, token, d->talloc); 293053451Speter d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2931131557Stjr#ifdef MBS_SUPPORT 2932131557Stjr if (MB_CUR_MAX > 1) 2933131557Stjr { 2934131557Stjr d->nmultibyte_prop = 1; 2935131557Stjr MALLOC(d->multibyte_prop, int, d->nmultibyte_prop); 2936131557Stjr d->nmbcsets = 0; 2937131557Stjr d->mbcsets_alloc = 1; 2938131557Stjr MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc); 2939131557Stjr } 2940131557Stjr#endif 294153451Speter 294253451Speter d->searchflag = 0; 294353451Speter d->tralloc = 0; 294453451Speter 294553451Speter d->musts = 0; 294653451Speter} 294753451Speter 294853451Speter/* Parse and analyze a single string of the given length. */ 294953451Spetervoid 2950131557Stjrdfacomp (char const *s, size_t len, struct dfa *d, int searchflag) 295153451Speter{ 295253451Speter if (case_fold) /* dummy folding in service of dfamust() */ 295353451Speter { 295453472Sobrien char *lcopy; 295553451Speter int i; 295653451Speter 295753472Sobrien lcopy = malloc(len); 295853472Sobrien if (!lcopy) 295953472Sobrien dfaerror(_("out of memory")); 296053451Speter 296153451Speter /* This is a kludge. */ 296253451Speter case_fold = 0; 296353451Speter for (i = 0; i < len; ++i) 296453472Sobrien if (ISUPPER ((unsigned char) s[i])) 296553472Sobrien lcopy[i] = tolower ((unsigned char) s[i]); 296653451Speter else 296753472Sobrien lcopy[i] = s[i]; 296853451Speter 296953451Speter dfainit(d); 297053472Sobrien dfaparse(lcopy, len, d); 297153472Sobrien free(lcopy); 297253451Speter dfamust(d); 297353451Speter d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 297453451Speter case_fold = 1; 297553451Speter dfaparse(s, len, d); 297653451Speter dfaanalyze(d, searchflag); 297753451Speter } 297853451Speter else 297953451Speter { 298053451Speter dfainit(d); 298153451Speter dfaparse(s, len, d); 298253451Speter dfamust(d); 298353451Speter dfaanalyze(d, searchflag); 298453451Speter } 298553451Speter} 298653451Speter 298753451Speter/* Free the storage held by the components of a dfa. */ 298853451Spetervoid 298956919Srudfafree (struct dfa *d) 299053451Speter{ 299153451Speter int i; 299253451Speter struct dfamust *dm, *ndm; 299353451Speter 299453451Speter free((ptr_t) d->charclasses); 299553451Speter free((ptr_t) d->tokens); 2996131557Stjr 2997131557Stjr#ifdef MBS_SUPPORT 2998131557Stjr if (MB_CUR_MAX > 1) 2999131557Stjr { 3000131557Stjr free((ptr_t) d->multibyte_prop); 3001131557Stjr for (i = 0; i < d->nmbcsets; ++i) 3002131557Stjr { 3003131557Stjr int j; 3004131557Stjr struct mb_char_classes *p = &(d->mbcsets[i]); 3005131557Stjr if (p->chars != NULL) 3006131557Stjr free(p->chars); 3007131557Stjr if (p->ch_classes != NULL) 3008131557Stjr free(p->ch_classes); 3009131557Stjr if (p->range_sts != NULL) 3010131557Stjr free(p->range_sts); 3011131557Stjr if (p->range_ends != NULL) 3012131557Stjr free(p->range_ends); 3013131557Stjr 3014131557Stjr for (j = 0; j < p->nequivs; ++j) 3015131557Stjr free(p->equivs[j]); 3016131557Stjr if (p->equivs != NULL) 3017131557Stjr free(p->equivs); 3018131557Stjr 3019131557Stjr for (j = 0; j < p->ncoll_elems; ++j) 3020131557Stjr free(p->coll_elems[j]); 3021131557Stjr if (p->coll_elems != NULL) 3022131557Stjr free(p->coll_elems); 3023131557Stjr } 3024131557Stjr free((ptr_t) d->mbcsets); 3025131557Stjr } 3026131557Stjr#endif /* MBS_SUPPORT */ 3027131557Stjr 302853451Speter for (i = 0; i < d->sindex; ++i) 302953451Speter free((ptr_t) d->states[i].elems.elems); 303053451Speter free((ptr_t) d->states); 303153451Speter for (i = 0; i < d->tindex; ++i) 303253451Speter if (d->follows[i].elems) 303353451Speter free((ptr_t) d->follows[i].elems); 303453451Speter free((ptr_t) d->follows); 303553451Speter for (i = 0; i < d->tralloc; ++i) 303653451Speter if (d->trans[i]) 303753451Speter free((ptr_t) d->trans[i]); 303853451Speter else if (d->fails[i]) 303953451Speter free((ptr_t) d->fails[i]); 304053472Sobrien if (d->realtrans) free((ptr_t) d->realtrans); 304153472Sobrien if (d->fails) free((ptr_t) d->fails); 304253472Sobrien if (d->success) free((ptr_t) d->success); 304353451Speter for (dm = d->musts; dm; dm = ndm) 304453451Speter { 304553451Speter ndm = dm->next; 304653451Speter free(dm->must); 304753451Speter free((ptr_t) dm); 304853451Speter } 304953451Speter} 305053451Speter 305153451Speter/* Having found the postfix representation of the regular expression, 305253451Speter try to find a long sequence of characters that must appear in any line 305353451Speter containing the r.e. 305453451Speter Finding a "longest" sequence is beyond the scope here; 305553451Speter we take an easy way out and hope for the best. 305653451Speter (Take "(ab|a)b"--please.) 305753451Speter 305853451Speter We do a bottom-up calculation of sequences of characters that must appear 305953451Speter in matches of r.e.'s represented by trees rooted at the nodes of the postfix 306053451Speter representation: 306153451Speter sequences that must appear at the left of the match ("left") 306253451Speter sequences that must appear at the right of the match ("right") 306353451Speter lists of sequences that must appear somewhere in the match ("in") 306453451Speter sequences that must constitute the match ("is") 306553451Speter 306653451Speter When we get to the root of the tree, we use one of the longest of its 306753451Speter calculated "in" sequences as our answer. The sequence we find is returned in 306853451Speter d->must (where "d" is the single argument passed to "dfamust"); 306953451Speter the length of the sequence is returned in d->mustn. 307053451Speter 307153451Speter The sequences calculated for the various types of node (in pseudo ANSI c) 307253451Speter are shown below. "p" is the operand of unary operators (and the left-hand 307353451Speter operand of binary operators); "q" is the right-hand operand of binary 307453451Speter operators. 307553451Speter 307653451Speter "ZERO" means "a zero-length sequence" below. 307753451Speter 307853451Speter Type left right is in 307953451Speter ---- ---- ----- -- -- 308053451Speter char c # c # c # c # c 308153451Speter 3082131557Stjr ANYCHAR ZERO ZERO ZERO ZERO 3083131557Stjr 3084131557Stjr MBCSET ZERO ZERO ZERO ZERO 3085131557Stjr 308653451Speter CSET ZERO ZERO ZERO ZERO 308753451Speter 308853451Speter STAR ZERO ZERO ZERO ZERO 308953451Speter 309053451Speter QMARK ZERO ZERO ZERO ZERO 309153451Speter 309253451Speter PLUS p->left p->right ZERO p->in 309353451Speter 309453451Speter CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 309553451Speter p->left : q->right : q->is!=ZERO) ? q->in plus 309653451Speter p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 309753451Speter ZERO 309853451Speter 309953451Speter OR longest common longest common (do p->is and substrings common to 310053451Speter leading trailing q->is have same p->in and q->in 310153451Speter (sub)sequence (sub)sequence length and 310253451Speter of p->left of p->right content) ? 310353451Speter and q->left and q->right p->is : NULL 310453451Speter 310553451Speter If there's anything else we recognize in the tree, all four sequences get set 310653451Speter to zero-length sequences. If there's something we don't recognize in the tree, 310753451Speter we just return a zero-length sequence. 310853451Speter 310953451Speter Break ties in favor of infrequent letters (choosing 'zzz' in preference to 311053451Speter 'aaa')? 311153451Speter 311253451Speter And. . .is it here or someplace that we might ponder "optimizations" such as 311353451Speter egrep 'psi|epsilon' -> egrep 'psi' 311453451Speter egrep 'pepsi|epsilon' -> egrep 'epsi' 311553451Speter (Yes, we now find "epsi" as a "string 311653451Speter that must occur", but we might also 311753451Speter simplify the *entire* r.e. being sought) 311853451Speter grep '[c]' -> grep 'c' 311953451Speter grep '(ab|a)b' -> grep 'ab' 312053451Speter grep 'ab*' -> grep 'a' 312153451Speter grep 'a*b' -> grep 'b' 312253451Speter 312353451Speter There are several issues: 312453451Speter 312553451Speter Is optimization easy (enough)? 312653451Speter 312753451Speter Does optimization actually accomplish anything, 312853451Speter or is the automaton you get from "psi|epsilon" (for example) 312953451Speter the same as the one you get from "psi" (for example)? 313053451Speter 313153451Speter Are optimizable r.e.'s likely to be used in real-life situations 313253451Speter (something like 'ab*' is probably unlikely; something like is 313353451Speter 'psi|epsilon' is likelier)? */ 313453451Speter 313553451Speterstatic char * 313656919Sruicatalloc (char *old, char *new) 313753451Speter{ 313853451Speter char *result; 313953472Sobrien size_t oldsize, newsize; 314053451Speter 314153451Speter newsize = (new == NULL) ? 0 : strlen(new); 314253451Speter if (old == NULL) 314353451Speter oldsize = 0; 314453451Speter else if (newsize == 0) 314553451Speter return old; 314653451Speter else oldsize = strlen(old); 314753451Speter if (old == NULL) 314853451Speter result = (char *) malloc(newsize + 1); 314953451Speter else 315053451Speter result = (char *) realloc((void *) old, oldsize + newsize + 1); 315153451Speter if (result != NULL && new != NULL) 315253451Speter (void) strcpy(result + oldsize, new); 315353451Speter return result; 315453451Speter} 315553451Speter 315653451Speterstatic char * 315756919Sruicpyalloc (char *string) 315853451Speter{ 315953451Speter return icatalloc((char *) NULL, string); 316053451Speter} 316153451Speter 316253451Speterstatic char * 316356919Sruistrstr (char *lookin, char *lookfor) 316453451Speter{ 316553451Speter char *cp; 316653472Sobrien size_t len; 316753451Speter 316853451Speter len = strlen(lookfor); 316953451Speter for (cp = lookin; *cp != '\0'; ++cp) 317053451Speter if (strncmp(cp, lookfor, len) == 0) 317153451Speter return cp; 317253451Speter return NULL; 317353451Speter} 317453451Speter 317553451Speterstatic void 317656919Sruifree (char *cp) 317753451Speter{ 317853451Speter if (cp != NULL) 317953451Speter free(cp); 318053451Speter} 318153451Speter 318253451Speterstatic void 318356919Srufreelist (char **cpp) 318453451Speter{ 318553451Speter int i; 318653451Speter 318753451Speter if (cpp == NULL) 318853451Speter return; 318953451Speter for (i = 0; cpp[i] != NULL; ++i) 319053451Speter { 319153451Speter free(cpp[i]); 319253451Speter cpp[i] = NULL; 319353451Speter } 319453451Speter} 319553451Speter 319653451Speterstatic char ** 319756919Sruenlist (char **cpp, char *new, size_t len) 319853451Speter{ 319953451Speter int i, j; 320053451Speter 320153451Speter if (cpp == NULL) 320253451Speter return NULL; 320353451Speter if ((new = icpyalloc(new)) == NULL) 320453451Speter { 320553451Speter freelist(cpp); 320653451Speter return NULL; 320753451Speter } 320853451Speter new[len] = '\0'; 320953451Speter /* Is there already something in the list that's new (or longer)? */ 321053451Speter for (i = 0; cpp[i] != NULL; ++i) 321153451Speter if (istrstr(cpp[i], new) != NULL) 321253451Speter { 321353451Speter free(new); 321453451Speter return cpp; 321553451Speter } 321653451Speter /* Eliminate any obsoleted strings. */ 321753451Speter j = 0; 321853451Speter while (cpp[j] != NULL) 321953451Speter if (istrstr(new, cpp[j]) == NULL) 322053451Speter ++j; 322153451Speter else 322253451Speter { 322353451Speter free(cpp[j]); 322453451Speter if (--i == j) 322553451Speter break; 322653451Speter cpp[j] = cpp[i]; 322753451Speter cpp[i] = NULL; 322853451Speter } 322953451Speter /* Add the new string. */ 323053451Speter cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 323153451Speter if (cpp == NULL) 323253451Speter return NULL; 323353451Speter cpp[i] = new; 323453451Speter cpp[i + 1] = NULL; 323553451Speter return cpp; 323653451Speter} 323753451Speter 323853451Speter/* Given pointers to two strings, return a pointer to an allocated 323953451Speter list of their distinct common substrings. Return NULL if something 324053451Speter seems wild. */ 324153451Speterstatic char ** 324256919Srucomsubs (char *left, char *right) 324353451Speter{ 324453451Speter char **cpp; 324553451Speter char *lcp; 324653451Speter char *rcp; 324753472Sobrien size_t i, len; 324853451Speter 324953451Speter if (left == NULL || right == NULL) 325053451Speter return NULL; 325153451Speter cpp = (char **) malloc(sizeof *cpp); 325253451Speter if (cpp == NULL) 325353451Speter return NULL; 325453451Speter cpp[0] = NULL; 325553451Speter for (lcp = left; *lcp != '\0'; ++lcp) 325653451Speter { 325753451Speter len = 0; 3258131557Stjr rcp = strchr (right, *lcp); 325953451Speter while (rcp != NULL) 326053451Speter { 326153451Speter for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 326253472Sobrien continue; 326353451Speter if (i > len) 326453451Speter len = i; 3265131557Stjr rcp = strchr (rcp + 1, *lcp); 326653451Speter } 326753451Speter if (len == 0) 326853451Speter continue; 326953451Speter if ((cpp = enlist(cpp, lcp, len)) == NULL) 327053451Speter break; 327153451Speter } 327253451Speter return cpp; 327353451Speter} 327453451Speter 327553451Speterstatic char ** 327656919Sruaddlists (char **old, char **new) 327753451Speter{ 327853451Speter int i; 327953451Speter 328053451Speter if (old == NULL || new == NULL) 328153451Speter return NULL; 328253451Speter for (i = 0; new[i] != NULL; ++i) 328353451Speter { 328453451Speter old = enlist(old, new[i], strlen(new[i])); 328553451Speter if (old == NULL) 328653451Speter break; 328753451Speter } 328853451Speter return old; 328953451Speter} 329053451Speter 329153451Speter/* Given two lists of substrings, return a new list giving substrings 329253451Speter common to both. */ 329353451Speterstatic char ** 329456919Sruinboth (char **left, char **right) 329553451Speter{ 329653451Speter char **both; 329753451Speter char **temp; 329853451Speter int lnum, rnum; 329953451Speter 330053451Speter if (left == NULL || right == NULL) 330153451Speter return NULL; 330253451Speter both = (char **) malloc(sizeof *both); 330353451Speter if (both == NULL) 330453451Speter return NULL; 330553451Speter both[0] = NULL; 330653451Speter for (lnum = 0; left[lnum] != NULL; ++lnum) 330753451Speter { 330853451Speter for (rnum = 0; right[rnum] != NULL; ++rnum) 330953451Speter { 331053451Speter temp = comsubs(left[lnum], right[rnum]); 331153451Speter if (temp == NULL) 331253451Speter { 331353451Speter freelist(both); 331453451Speter return NULL; 331553451Speter } 331653451Speter both = addlists(both, temp); 331753451Speter freelist(temp); 331853472Sobrien free(temp); 331953451Speter if (both == NULL) 332053451Speter return NULL; 332153451Speter } 332253451Speter } 332353451Speter return both; 332453451Speter} 332553451Speter 332653451Spetertypedef struct 332753451Speter{ 332853451Speter char **in; 332953451Speter char *left; 333053451Speter char *right; 333153451Speter char *is; 333253451Speter} must; 333353451Speter 333453451Speterstatic void 333556919Sruresetmust (must *mp) 333653451Speter{ 333753451Speter mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 333853451Speter freelist(mp->in); 333953451Speter} 334053451Speter 334153451Speterstatic void 334256919Srudfamust (struct dfa *dfa) 334353451Speter{ 334453451Speter must *musts; 334553451Speter must *mp; 334653451Speter char *result; 334753451Speter int ri; 334853451Speter int i; 334953451Speter int exact; 335053451Speter token t; 335153451Speter static must must0; 335253451Speter struct dfamust *dm; 335353472Sobrien static char empty_string[] = ""; 335453451Speter 335553472Sobrien result = empty_string; 335653451Speter exact = 0; 335753451Speter musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 335853451Speter if (musts == NULL) 335953451Speter return; 336053451Speter mp = musts; 336153451Speter for (i = 0; i <= dfa->tindex; ++i) 336253451Speter mp[i] = must0; 336353451Speter for (i = 0; i <= dfa->tindex; ++i) 336453451Speter { 336553451Speter mp[i].in = (char **) malloc(sizeof *mp[i].in); 336653451Speter mp[i].left = malloc(2); 336753451Speter mp[i].right = malloc(2); 336853451Speter mp[i].is = malloc(2); 336953451Speter if (mp[i].in == NULL || mp[i].left == NULL || 337053451Speter mp[i].right == NULL || mp[i].is == NULL) 337153451Speter goto done; 337253451Speter mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 337353451Speter mp[i].in[0] = NULL; 337453451Speter } 337553451Speter#ifdef DEBUG 337653451Speter fprintf(stderr, "dfamust:\n"); 337753451Speter for (i = 0; i < dfa->tindex; ++i) 337853451Speter { 337953451Speter fprintf(stderr, " %d:", i); 338053451Speter prtok(dfa->tokens[i]); 338153451Speter } 338253451Speter putc('\n', stderr); 338353451Speter#endif 338453451Speter for (ri = 0; ri < dfa->tindex; ++ri) 338553451Speter { 338653451Speter switch (t = dfa->tokens[ri]) 338753451Speter { 338853451Speter case LPAREN: 338953451Speter case RPAREN: 339053451Speter goto done; /* "cannot happen" */ 339153451Speter case EMPTY: 339253451Speter case BEGLINE: 339353451Speter case ENDLINE: 339453451Speter case BEGWORD: 339553451Speter case ENDWORD: 339653451Speter case LIMWORD: 339753451Speter case NOTLIMWORD: 339853451Speter case BACKREF: 339953451Speter resetmust(mp); 340053451Speter break; 340153451Speter case STAR: 340253451Speter case QMARK: 340353451Speter if (mp <= musts) 340453451Speter goto done; /* "cannot happen" */ 340553451Speter --mp; 340653451Speter resetmust(mp); 340753451Speter break; 340853451Speter case OR: 340953451Speter case ORTOP: 341053451Speter if (mp < &musts[2]) 341153451Speter goto done; /* "cannot happen" */ 341253451Speter { 341353451Speter char **new; 341453451Speter must *lmp; 341553451Speter must *rmp; 341653451Speter int j, ln, rn, n; 341753451Speter 341853451Speter rmp = --mp; 341953451Speter lmp = --mp; 342053451Speter /* Guaranteed to be. Unlikely, but. . . */ 342153451Speter if (strcmp(lmp->is, rmp->is) != 0) 342253451Speter lmp->is[0] = '\0'; 342353451Speter /* Left side--easy */ 342453451Speter i = 0; 342553451Speter while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 342653451Speter ++i; 342753451Speter lmp->left[i] = '\0'; 342853451Speter /* Right side */ 342953451Speter ln = strlen(lmp->right); 343053451Speter rn = strlen(rmp->right); 343153451Speter n = ln; 343253451Speter if (n > rn) 343353451Speter n = rn; 343453451Speter for (i = 0; i < n; ++i) 343553451Speter if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 343653451Speter break; 343753451Speter for (j = 0; j < i; ++j) 343853451Speter lmp->right[j] = lmp->right[(ln - i) + j]; 343953451Speter lmp->right[j] = '\0'; 344053451Speter new = inboth(lmp->in, rmp->in); 344153451Speter if (new == NULL) 344253451Speter goto done; 344353451Speter freelist(lmp->in); 344453451Speter free((char *) lmp->in); 344553451Speter lmp->in = new; 344653451Speter } 344753451Speter break; 344853451Speter case PLUS: 344953451Speter if (mp <= musts) 345053451Speter goto done; /* "cannot happen" */ 345153451Speter --mp; 345253451Speter mp->is[0] = '\0'; 345353451Speter break; 345453451Speter case END: 345553451Speter if (mp != &musts[1]) 345653451Speter goto done; /* "cannot happen" */ 345753451Speter for (i = 0; musts[0].in[i] != NULL; ++i) 345853451Speter if (strlen(musts[0].in[i]) > strlen(result)) 345953451Speter result = musts[0].in[i]; 346053451Speter if (strcmp(result, musts[0].is) == 0) 346153451Speter exact = 1; 346253451Speter goto done; 346353451Speter case CAT: 346453451Speter if (mp < &musts[2]) 346553451Speter goto done; /* "cannot happen" */ 346653451Speter { 346753451Speter must *lmp; 346853451Speter must *rmp; 346953451Speter 347053451Speter rmp = --mp; 347153451Speter lmp = --mp; 347253451Speter /* In. Everything in left, plus everything in 347353451Speter right, plus catenation of 347453451Speter left's right and right's left. */ 347553451Speter lmp->in = addlists(lmp->in, rmp->in); 347653451Speter if (lmp->in == NULL) 347753451Speter goto done; 347853451Speter if (lmp->right[0] != '\0' && 347953451Speter rmp->left[0] != '\0') 348053451Speter { 348153451Speter char *tp; 348253451Speter 348353451Speter tp = icpyalloc(lmp->right); 348453451Speter if (tp == NULL) 348553451Speter goto done; 348653451Speter tp = icatalloc(tp, rmp->left); 348753451Speter if (tp == NULL) 348853451Speter goto done; 348953451Speter lmp->in = enlist(lmp->in, tp, 349053451Speter strlen(tp)); 349153451Speter free(tp); 349253451Speter if (lmp->in == NULL) 349353451Speter goto done; 349453451Speter } 349553451Speter /* Left-hand */ 349653451Speter if (lmp->is[0] != '\0') 349753451Speter { 349853451Speter lmp->left = icatalloc(lmp->left, 349953451Speter rmp->left); 350053451Speter if (lmp->left == NULL) 350153451Speter goto done; 350253451Speter } 350353451Speter /* Right-hand */ 350453451Speter if (rmp->is[0] == '\0') 350553451Speter lmp->right[0] = '\0'; 350653451Speter lmp->right = icatalloc(lmp->right, rmp->right); 350753451Speter if (lmp->right == NULL) 350853451Speter goto done; 350953451Speter /* Guaranteed to be */ 351053451Speter if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 351153451Speter { 351253451Speter lmp->is = icatalloc(lmp->is, rmp->is); 351353451Speter if (lmp->is == NULL) 351453451Speter goto done; 351553451Speter } 351653451Speter else 351753451Speter lmp->is[0] = '\0'; 351853451Speter } 351953451Speter break; 352053451Speter default: 352153451Speter if (t < END) 352253451Speter { 352353451Speter /* "cannot happen" */ 352453451Speter goto done; 352553451Speter } 352653451Speter else if (t == '\0') 352753451Speter { 352853451Speter /* not on *my* shift */ 352953451Speter goto done; 353053451Speter } 3531131557Stjr else if (t >= CSET 3532131557Stjr#ifdef MBS_SUPPORT 3533131557Stjr || t == ANYCHAR 3534131557Stjr || t == MBCSET 3535131557Stjr#endif /* MBS_SUPPORT */ 3536131557Stjr ) 353753451Speter { 353853451Speter /* easy enough */ 353953451Speter resetmust(mp); 354053451Speter } 354153451Speter else 354253451Speter { 354353451Speter /* plain character */ 354453451Speter resetmust(mp); 354553451Speter mp->is[0] = mp->left[0] = mp->right[0] = t; 354653451Speter mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 354753472Sobrien mp->in = enlist(mp->in, mp->is, (size_t)1); 354853451Speter if (mp->in == NULL) 354953451Speter goto done; 355053451Speter } 355153451Speter break; 355253451Speter } 355353451Speter#ifdef DEBUG 355453451Speter fprintf(stderr, " node: %d:", ri); 355553451Speter prtok(dfa->tokens[ri]); 355653451Speter fprintf(stderr, "\n in:"); 355753451Speter for (i = 0; mp->in[i]; ++i) 355853451Speter fprintf(stderr, " \"%s\"", mp->in[i]); 355953451Speter fprintf(stderr, "\n is: \"%s\"\n", mp->is); 356053451Speter fprintf(stderr, " left: \"%s\"\n", mp->left); 356153451Speter fprintf(stderr, " right: \"%s\"\n", mp->right); 356253451Speter#endif 356353451Speter ++mp; 356453451Speter } 356553451Speter done: 356653451Speter if (strlen(result)) 356753451Speter { 356853451Speter dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 356953451Speter dm->exact = exact; 357053451Speter dm->must = malloc(strlen(result) + 1); 357153451Speter strcpy(dm->must, result); 357253451Speter dm->next = dfa->musts; 357353451Speter dfa->musts = dm; 357453451Speter } 357553451Speter mp = musts; 357653451Speter for (i = 0; i <= dfa->tindex; ++i) 357753451Speter { 357853451Speter freelist(mp[i].in); 357953451Speter ifree((char *) mp[i].in); 358053451Speter ifree(mp[i].left); 358153451Speter ifree(mp[i].right); 358253451Speter ifree(mp[i].is); 358353451Speter } 358453451Speter free((char *) mp); 358553451Speter} 3586131557Stjr/* vim:set shiftwidth=2: */ 3587