185587Sobrien/**************************************************************** 285587SobrienCopyright (C) Lucent Technologies 1997 385587SobrienAll Rights Reserved 485587Sobrien 585587SobrienPermission to use, copy, modify, and distribute this software and 685587Sobrienits documentation for any purpose and without fee is hereby 785587Sobriengranted, provided that the above copyright notice appear in all 885587Sobriencopies and that both that the copyright notice and this 985587Sobrienpermission notice and warranty disclaimer appear in supporting 1085587Sobriendocumentation, and that the name Lucent Technologies or any of 1185587Sobrienits entities not be used in advertising or publicity pertaining 1285587Sobriento distribution of the software without specific, written prior 1385587Sobrienpermission. 1485587Sobrien 1585587SobrienLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 1685587SobrienINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 1785587SobrienIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 1885587SobrienSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1985587SobrienWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 2085587SobrienIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 2185587SobrienARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 2285587SobrienTHIS SOFTWARE. 2385587Sobrien****************************************************************/ 2485587Sobrien 25170331Srafan/* lasciate ogne speranza, voi ch'intrate. */ 2685587Sobrien 27201989Sru#include <sys/cdefs.h> 28201989Sru__FBSDID("$FreeBSD$"); 29201989Sru 3085587Sobrien#define DEBUG 3185587Sobrien 3285587Sobrien#include <ctype.h> 3385587Sobrien#include <stdio.h> 3485587Sobrien#include <string.h> 3585587Sobrien#include <stdlib.h> 3685587Sobrien#include "awk.h" 3785587Sobrien#include "ytab.h" 3885587Sobrien 39118194Sru#define HAT (NCHARS+2) /* matches ^ in regular expr */ 4085587Sobrien /* NCHARS is 2**n */ 4185587Sobrien#define MAXLIN 22 4285587Sobrien 4385587Sobrien#define type(v) (v)->nobj /* badly overloaded here */ 4485587Sobrien#define info(v) (v)->ntype /* badly overloaded here */ 4585587Sobrien#define left(v) (v)->narg[0] 4685587Sobrien#define right(v) (v)->narg[1] 4785587Sobrien#define parent(v) (v)->nnext 4885587Sobrien 4985587Sobrien#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 50170331Srafan#define ELEAF case EMPTYRE: /* empty string in regexp */ 5185587Sobrien#define UNARY case STAR: case PLUS: case QUEST: 5285587Sobrien 5385587Sobrien/* encoding in tree Nodes: 54170331Srafan leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 5585587Sobrien left is index, right contains value or pointer to value 5685587Sobrien unary (STAR, PLUS, QUEST): left is child, right is null 5785587Sobrien binary (CAT, OR): left and right are children 5885587Sobrien parent contains pointer to parent 5985587Sobrien*/ 6085587Sobrien 6185587Sobrien 6285587Sobrienint *setvec; 6385587Sobrienint *tmpset; 6485587Sobrienint maxsetvec = 0; 6585587Sobrien 6685587Sobrienint rtok; /* next token in current re */ 6785587Sobrienint rlxval; 6885587Sobrienstatic uschar *rlxstr; 6985587Sobrienstatic uschar *prestr; /* current position in current re */ 7085587Sobrienstatic uschar *lastre; /* origin of last re */ 7185587Sobrien 7285587Sobrienstatic int setcnt; 7385587Sobrienstatic int poscnt; 7485587Sobrien 7585587Sobrienchar *patbeg; 7685587Sobrienint patlen; 7785587Sobrien 7885587Sobrien#define NFA 20 /* cache this many dynamic fa's */ 7985587Sobrienfa *fatab[NFA]; 8085587Sobrienint nfatab = 0; /* entries in fatab */ 8185587Sobrien 82107806Sobrienfa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 8385587Sobrien{ 8485587Sobrien int i, use, nuse; 8585587Sobrien fa *pfa; 8685587Sobrien static int now = 1; 8785587Sobrien 8885587Sobrien if (setvec == 0) { /* first time through any RE */ 8985587Sobrien maxsetvec = MAXLIN; 9085587Sobrien setvec = (int *) malloc(maxsetvec * sizeof(int)); 9185587Sobrien tmpset = (int *) malloc(maxsetvec * sizeof(int)); 9285587Sobrien if (setvec == 0 || tmpset == 0) 9385587Sobrien overflo("out of space initializing makedfa"); 9485587Sobrien } 9585587Sobrien 9685587Sobrien if (compile_time) /* a constant for sure */ 9785587Sobrien return mkdfa(s, anchor); 9885587Sobrien for (i = 0; i < nfatab; i++) /* is it there already? */ 9985587Sobrien if (fatab[i]->anchor == anchor 10090902Sdes && strcmp((const char *) fatab[i]->restr, s) == 0) { 10185587Sobrien fatab[i]->use = now++; 10285587Sobrien return fatab[i]; 10385587Sobrien } 10485587Sobrien pfa = mkdfa(s, anchor); 10585587Sobrien if (nfatab < NFA) { /* room for another */ 10685587Sobrien fatab[nfatab] = pfa; 10785587Sobrien fatab[nfatab]->use = now++; 10885587Sobrien nfatab++; 10985587Sobrien return pfa; 11085587Sobrien } 11185587Sobrien use = fatab[0]->use; /* replace least-recently used */ 11285587Sobrien nuse = 0; 11385587Sobrien for (i = 1; i < nfatab; i++) 11485587Sobrien if (fatab[i]->use < use) { 11585587Sobrien use = fatab[i]->use; 11685587Sobrien nuse = i; 11785587Sobrien } 11885587Sobrien freefa(fatab[nuse]); 11985587Sobrien fatab[nuse] = pfa; 12085587Sobrien pfa->use = now++; 12185587Sobrien return pfa; 12285587Sobrien} 12385587Sobrien 124107806Sobrienfa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 12585587Sobrien /* anchor = 1 for anchored matches, else 0 */ 12685587Sobrien{ 12785587Sobrien Node *p, *p1; 12885587Sobrien fa *f; 12985587Sobrien 13085587Sobrien p = reparse(s); 13185587Sobrien p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 13285587Sobrien /* put ALL STAR in front of reg. exp. */ 13385587Sobrien p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 13485587Sobrien /* put FINAL after reg. exp. */ 13585587Sobrien 13685587Sobrien poscnt = 0; 13785587Sobrien penter(p1); /* enter parent pointers and leaf indices */ 13885587Sobrien if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 13985587Sobrien overflo("out of space for fa"); 14085587Sobrien f->accept = poscnt-1; /* penter has computed number of positions in re */ 14185587Sobrien cfoll(f, p1); /* set up follow sets */ 14285587Sobrien freetr(p1); 14385587Sobrien if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) 14485587Sobrien overflo("out of space in makedfa"); 14585587Sobrien if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 14685587Sobrien overflo("out of space in makedfa"); 14785587Sobrien *f->posns[1] = 0; 14885587Sobrien f->initstat = makeinit(f, anchor); 14985587Sobrien f->anchor = anchor; 15085587Sobrien f->restr = (uschar *) tostring(s); 15185587Sobrien return f; 15285587Sobrien} 15385587Sobrien 15485587Sobrienint makeinit(fa *f, int anchor) 15585587Sobrien{ 15685587Sobrien int i, k; 15785587Sobrien 15885587Sobrien f->curstat = 2; 15985587Sobrien f->out[2] = 0; 16085587Sobrien f->reset = 0; 16185587Sobrien k = *(f->re[0].lfollow); 16285587Sobrien xfree(f->posns[2]); 16385587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 16485587Sobrien overflo("out of space in makeinit"); 16585587Sobrien for (i=0; i <= k; i++) { 16685587Sobrien (f->posns[2])[i] = (f->re[0].lfollow)[i]; 16785587Sobrien } 16885587Sobrien if ((f->posns[2])[1] == f->accept) 16985587Sobrien f->out[2] = 1; 17085587Sobrien for (i=0; i < NCHARS; i++) 17185587Sobrien f->gototab[2][i] = 0; 17285587Sobrien f->curstat = cgoto(f, 2, HAT); 17385587Sobrien if (anchor) { 17485587Sobrien *f->posns[2] = k-1; /* leave out position 0 */ 17585587Sobrien for (i=0; i < k; i++) { 17685587Sobrien (f->posns[0])[i] = (f->posns[2])[i]; 17785587Sobrien } 17885587Sobrien 17985587Sobrien f->out[0] = f->out[2]; 18085587Sobrien if (f->curstat != 2) 18185587Sobrien --(*f->posns[f->curstat]); 18285587Sobrien } 18385587Sobrien return f->curstat; 18485587Sobrien} 18585587Sobrien 18685587Sobrienvoid penter(Node *p) /* set up parent pointers and leaf indices */ 18785587Sobrien{ 18885587Sobrien switch (type(p)) { 189170331Srafan ELEAF 19085587Sobrien LEAF 19185587Sobrien info(p) = poscnt; 19285587Sobrien poscnt++; 19385587Sobrien break; 19485587Sobrien UNARY 19585587Sobrien penter(left(p)); 19685587Sobrien parent(left(p)) = p; 19785587Sobrien break; 19885587Sobrien case CAT: 19985587Sobrien case OR: 20085587Sobrien penter(left(p)); 20185587Sobrien penter(right(p)); 20285587Sobrien parent(left(p)) = p; 20385587Sobrien parent(right(p)) = p; 20485587Sobrien break; 20585587Sobrien default: /* can't happen */ 20685587Sobrien FATAL("can't happen: unknown type %d in penter", type(p)); 20785587Sobrien break; 20885587Sobrien } 20985587Sobrien} 21085587Sobrien 21185587Sobrienvoid freetr(Node *p) /* free parse tree */ 21285587Sobrien{ 21385587Sobrien switch (type(p)) { 214170331Srafan ELEAF 21585587Sobrien LEAF 21685587Sobrien xfree(p); 21785587Sobrien break; 21885587Sobrien UNARY 21985587Sobrien freetr(left(p)); 22085587Sobrien xfree(p); 22185587Sobrien break; 22285587Sobrien case CAT: 22385587Sobrien case OR: 22485587Sobrien freetr(left(p)); 22585587Sobrien freetr(right(p)); 22685587Sobrien xfree(p); 22785587Sobrien break; 22885587Sobrien default: /* can't happen */ 22985587Sobrien FATAL("can't happen: unknown type %d in freetr", type(p)); 23085587Sobrien break; 23185587Sobrien } 23285587Sobrien} 23385587Sobrien 23485587Sobrien/* in the parsing of regular expressions, metacharacters like . have */ 23585587Sobrien/* to be seen literally; \056 is not a metacharacter. */ 23685587Sobrien 237224731Sruint hexstr(uschar **pp) /* find and eval hex string at pp, return new p */ 23885587Sobrien{ /* only pick up one 8-bit byte (2 chars) */ 23985587Sobrien uschar *p; 24085587Sobrien int n = 0; 24185587Sobrien int i; 24285587Sobrien 24385587Sobrien for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { 24485587Sobrien if (isdigit(*p)) 24585587Sobrien n = 16 * n + *p - '0'; 24685587Sobrien else if (*p >= 'a' && *p <= 'f') 24785587Sobrien n = 16 * n + *p - 'a' + 10; 24885587Sobrien else if (*p >= 'A' && *p <= 'F') 24985587Sobrien n = 16 * n + *p - 'A' + 10; 25085587Sobrien } 251224731Sru *pp = (uschar *) p; 25285587Sobrien return n; 25385587Sobrien} 25485587Sobrien 25585587Sobrien#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 25685587Sobrien 257224731Sruint quoted(uschar **pp) /* pick up next thing after a \\ */ 25885587Sobrien /* and increment *pp */ 25985587Sobrien{ 260224731Sru uschar *p = *pp; 26185587Sobrien int c; 26285587Sobrien 26385587Sobrien if ((c = *p++) == 't') 26485587Sobrien c = '\t'; 26585587Sobrien else if (c == 'n') 26685587Sobrien c = '\n'; 26785587Sobrien else if (c == 'f') 26885587Sobrien c = '\f'; 26985587Sobrien else if (c == 'r') 27085587Sobrien c = '\r'; 27185587Sobrien else if (c == 'b') 27285587Sobrien c = '\b'; 27385587Sobrien else if (c == '\\') 27485587Sobrien c = '\\'; 27585587Sobrien else if (c == 'x') { /* hexadecimal goo follows */ 27685587Sobrien c = hexstr(&p); /* this adds a null if number is invalid */ 27785587Sobrien } else if (isoctdigit(c)) { /* \d \dd \ddd */ 27885587Sobrien int n = c - '0'; 27985587Sobrien if (isoctdigit(*p)) { 28085587Sobrien n = 8 * n + *p++ - '0'; 28185587Sobrien if (isoctdigit(*p)) 28285587Sobrien n = 8 * n + *p++ - '0'; 28385587Sobrien } 28485587Sobrien c = n; 28585587Sobrien } /* else */ 28685587Sobrien /* c = c; */ 28785587Sobrien *pp = p; 28885587Sobrien return c; 28985587Sobrien} 29085587Sobrien 291201989Srustatic int collate_range_cmp(int a, int b) 292201989Sru{ 293201989Sru static char s[2][2]; 294201989Sru 295201989Sru if ((uschar)a == (uschar)b) 296201989Sru return 0; 297201989Sru s[0][0] = a; 298201989Sru s[1][0] = b; 299201989Sru return (strcoll(s[0], s[1])); 300201989Sru} 301201989Sru 302107806Sobrienchar *cclenter(const char *argp) /* add a character class */ 303107806Sobrien{ 30485587Sobrien int i, c, c2; 305201989Sru int j; 30685587Sobrien uschar *p = (uschar *) argp; 30785587Sobrien uschar *op, *bp; 30885587Sobrien static uschar *buf = 0; 30985587Sobrien static int bufsz = 100; 31085587Sobrien 31185587Sobrien op = p; 31285587Sobrien if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 31385587Sobrien FATAL("out of space for character class [%.10s...] 1", p); 31485587Sobrien bp = buf; 31585587Sobrien for (i = 0; (c = *p++) != 0; ) { 31685587Sobrien if (c == '\\') { 317224731Sru c = quoted(&p); 31885587Sobrien } else if (c == '-' && i > 0 && bp[-1] != 0) { 31985587Sobrien if (*p != 0) { 32085587Sobrien c = bp[-1]; 32185587Sobrien c2 = *p++; 32285587Sobrien if (c2 == '\\') 323224731Sru c2 = quoted(&p); 324201989Sru if (collate_range_cmp(c, c2) > 0) { 32585587Sobrien bp--; 32685587Sobrien i--; 32785587Sobrien continue; 32885587Sobrien } 329201989Sru for (j = 0; j < NCHARS; j++) { 330201989Sru if ((collate_range_cmp(c, j) > 0) || 331201989Sru collate_range_cmp(j, c2) > 0) 332201989Sru continue; 333170331Srafan if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 33485587Sobrien FATAL("out of space for character class [%.10s...] 2", p); 335201989Sru *bp++ = j; 33685587Sobrien i++; 33785587Sobrien } 33885587Sobrien continue; 33985587Sobrien } 34085587Sobrien } 341170331Srafan if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 34285587Sobrien FATAL("out of space for character class [%.10s...] 3", p); 34385587Sobrien *bp++ = c; 34485587Sobrien i++; 34585587Sobrien } 34685587Sobrien *bp = 0; 34785587Sobrien dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 34885587Sobrien xfree(op); 34985587Sobrien return (char *) tostring((char *) buf); 35085587Sobrien} 35185587Sobrien 352107806Sobrienvoid overflo(const char *s) 35385587Sobrien{ 35485587Sobrien FATAL("regular expression too big: %.30s...", s); 35585587Sobrien} 35685587Sobrien 35785587Sobrienvoid cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 35885587Sobrien{ 35985587Sobrien int i; 36085587Sobrien int *p; 36185587Sobrien 36285587Sobrien switch (type(v)) { 363170331Srafan ELEAF 36485587Sobrien LEAF 36585587Sobrien f->re[info(v)].ltype = type(v); 36685587Sobrien f->re[info(v)].lval.np = right(v); 36785587Sobrien while (f->accept >= maxsetvec) { /* guessing here! */ 36885587Sobrien maxsetvec *= 4; 36985587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 37085587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 37185587Sobrien if (setvec == 0 || tmpset == 0) 37285587Sobrien overflo("out of space in cfoll()"); 37385587Sobrien } 37485587Sobrien for (i = 0; i <= f->accept; i++) 37585587Sobrien setvec[i] = 0; 37685587Sobrien setcnt = 0; 37785587Sobrien follow(v); /* computes setvec and setcnt */ 37885587Sobrien if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 37985587Sobrien overflo("out of space building follow set"); 38085587Sobrien f->re[info(v)].lfollow = p; 38185587Sobrien *p = setcnt; 38285587Sobrien for (i = f->accept; i >= 0; i--) 38385587Sobrien if (setvec[i] == 1) 38485587Sobrien *++p = i; 38585587Sobrien break; 38685587Sobrien UNARY 38785587Sobrien cfoll(f,left(v)); 38885587Sobrien break; 38985587Sobrien case CAT: 39085587Sobrien case OR: 39185587Sobrien cfoll(f,left(v)); 39285587Sobrien cfoll(f,right(v)); 39385587Sobrien break; 39485587Sobrien default: /* can't happen */ 39585587Sobrien FATAL("can't happen: unknown type %d in cfoll", type(v)); 39685587Sobrien } 39785587Sobrien} 39885587Sobrien 39985587Sobrienint first(Node *p) /* collects initially active leaves of p into setvec */ 400170331Srafan /* returns 0 if p matches empty string */ 40185587Sobrien{ 40285587Sobrien int b, lp; 40385587Sobrien 40485587Sobrien switch (type(p)) { 405170331Srafan ELEAF 40685587Sobrien LEAF 40785587Sobrien lp = info(p); /* look for high-water mark of subscripts */ 40885587Sobrien while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 40985587Sobrien maxsetvec *= 4; 41085587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 41185587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 41285587Sobrien if (setvec == 0 || tmpset == 0) 41385587Sobrien overflo("out of space in first()"); 41485587Sobrien } 415170331Srafan if (type(p) == EMPTYRE) { 416170331Srafan setvec[lp] = 0; 417170331Srafan return(0); 418170331Srafan } 41985587Sobrien if (setvec[lp] != 1) { 42085587Sobrien setvec[lp] = 1; 42185587Sobrien setcnt++; 42285587Sobrien } 42385587Sobrien if (type(p) == CCL && (*(char *) right(p)) == '\0') 42485587Sobrien return(0); /* empty CCL */ 42585587Sobrien else return(1); 42685587Sobrien case PLUS: 42785587Sobrien if (first(left(p)) == 0) return(0); 42885587Sobrien return(1); 42985587Sobrien case STAR: 43085587Sobrien case QUEST: 43185587Sobrien first(left(p)); 43285587Sobrien return(0); 43385587Sobrien case CAT: 43485587Sobrien if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 43585587Sobrien return(1); 43685587Sobrien case OR: 43785587Sobrien b = first(right(p)); 43885587Sobrien if (first(left(p)) == 0 || b == 0) return(0); 43985587Sobrien return(1); 44085587Sobrien } 44185587Sobrien FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 44285587Sobrien return(-1); 44385587Sobrien} 44485587Sobrien 44585587Sobrienvoid follow(Node *v) /* collects leaves that can follow v into setvec */ 44685587Sobrien{ 44785587Sobrien Node *p; 44885587Sobrien 44985587Sobrien if (type(v) == FINAL) 45085587Sobrien return; 45185587Sobrien p = parent(v); 45285587Sobrien switch (type(p)) { 45385587Sobrien case STAR: 45485587Sobrien case PLUS: 45585587Sobrien first(v); 45685587Sobrien follow(p); 45785587Sobrien return; 45885587Sobrien 45985587Sobrien case OR: 46085587Sobrien case QUEST: 46185587Sobrien follow(p); 46285587Sobrien return; 46385587Sobrien 46485587Sobrien case CAT: 46585587Sobrien if (v == left(p)) { /* v is left child of p */ 46685587Sobrien if (first(right(p)) == 0) { 46785587Sobrien follow(p); 46885587Sobrien return; 46985587Sobrien } 47085587Sobrien } else /* v is right child */ 47185587Sobrien follow(p); 47285587Sobrien return; 47385587Sobrien } 47485587Sobrien} 47585587Sobrien 476107806Sobrienint member(int c, const char *sarg) /* is c in s? */ 47785587Sobrien{ 47885587Sobrien uschar *s = (uschar *) sarg; 47985587Sobrien 48085587Sobrien while (*s) 48185587Sobrien if (c == *s++) 48285587Sobrien return(1); 48385587Sobrien return(0); 48485587Sobrien} 48585587Sobrien 486107806Sobrienint match(fa *f, const char *p0) /* shortest match ? */ 48785587Sobrien{ 48885587Sobrien int s, ns; 48985587Sobrien uschar *p = (uschar *) p0; 49085587Sobrien 49185587Sobrien s = f->reset ? makeinit(f,0) : f->initstat; 49285587Sobrien if (f->out[s]) 49385587Sobrien return(1); 49485587Sobrien do { 495170331Srafan /* assert(*p < NCHARS); */ 49685587Sobrien if ((ns = f->gototab[s][*p]) != 0) 49785587Sobrien s = ns; 49885587Sobrien else 49985587Sobrien s = cgoto(f, s, *p); 50085587Sobrien if (f->out[s]) 50185587Sobrien return(1); 50285587Sobrien } while (*p++ != 0); 50385587Sobrien return(0); 50485587Sobrien} 50585587Sobrien 506107806Sobrienint pmatch(fa *f, const char *p0) /* longest match, for sub */ 50785587Sobrien{ 50885587Sobrien int s, ns; 50985587Sobrien uschar *p = (uschar *) p0; 51085587Sobrien uschar *q; 51185587Sobrien int i, k; 51285587Sobrien 513125601Sru /* s = f->reset ? makeinit(f,1) : f->initstat; */ 514125601Sru if (f->reset) { 515125601Sru f->initstat = s = makeinit(f,1); 516125601Sru } else { 517125601Sru s = f->initstat; 518125601Sru } 51985587Sobrien patbeg = (char *) p; 52085587Sobrien patlen = -1; 52185587Sobrien do { 52285587Sobrien q = p; 52385587Sobrien do { 52485587Sobrien if (f->out[s]) /* final state */ 52585587Sobrien patlen = q-p; 526170331Srafan /* assert(*q < NCHARS); */ 52785587Sobrien if ((ns = f->gototab[s][*q]) != 0) 52885587Sobrien s = ns; 52985587Sobrien else 53085587Sobrien s = cgoto(f, s, *q); 53185587Sobrien if (s == 1) { /* no transition */ 53285587Sobrien if (patlen >= 0) { 53385587Sobrien patbeg = (char *) p; 53485587Sobrien return(1); 53585587Sobrien } 53685587Sobrien else 53785587Sobrien goto nextin; /* no match */ 53885587Sobrien } 53985587Sobrien } while (*q++ != 0); 54085587Sobrien if (f->out[s]) 54185587Sobrien patlen = q-p-1; /* don't count $ */ 54285587Sobrien if (patlen >= 0) { 54385587Sobrien patbeg = (char *) p; 54485587Sobrien return(1); 54585587Sobrien } 54685587Sobrien nextin: 54785587Sobrien s = 2; 54885587Sobrien if (f->reset) { 54985587Sobrien for (i = 2; i <= f->curstat; i++) 55085587Sobrien xfree(f->posns[i]); 55185587Sobrien k = *f->posns[0]; 55285587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 55385587Sobrien overflo("out of space in pmatch"); 55485587Sobrien for (i = 0; i <= k; i++) 55585587Sobrien (f->posns[2])[i] = (f->posns[0])[i]; 55685587Sobrien f->initstat = f->curstat = 2; 55785587Sobrien f->out[2] = f->out[0]; 55885587Sobrien for (i = 0; i < NCHARS; i++) 55985587Sobrien f->gototab[2][i] = 0; 56085587Sobrien } 56185587Sobrien } while (*p++ != 0); 56285587Sobrien return (0); 56385587Sobrien} 56485587Sobrien 565107806Sobrienint nematch(fa *f, const char *p0) /* non-empty match, for sub */ 56685587Sobrien{ 56785587Sobrien int s, ns; 56885587Sobrien uschar *p = (uschar *) p0; 56985587Sobrien uschar *q; 57085587Sobrien int i, k; 57185587Sobrien 572125601Sru /* s = f->reset ? makeinit(f,1) : f->initstat; */ 573125601Sru if (f->reset) { 574125601Sru f->initstat = s = makeinit(f,1); 575125601Sru } else { 576125601Sru s = f->initstat; 577125601Sru } 57885587Sobrien patlen = -1; 57985587Sobrien while (*p) { 58085587Sobrien q = p; 58185587Sobrien do { 58285587Sobrien if (f->out[s]) /* final state */ 58385587Sobrien patlen = q-p; 584170331Srafan /* assert(*q < NCHARS); */ 58585587Sobrien if ((ns = f->gototab[s][*q]) != 0) 58685587Sobrien s = ns; 58785587Sobrien else 58885587Sobrien s = cgoto(f, s, *q); 58985587Sobrien if (s == 1) { /* no transition */ 59085587Sobrien if (patlen > 0) { 59185587Sobrien patbeg = (char *) p; 59285587Sobrien return(1); 59385587Sobrien } else 59485587Sobrien goto nnextin; /* no nonempty match */ 59585587Sobrien } 59685587Sobrien } while (*q++ != 0); 59785587Sobrien if (f->out[s]) 59885587Sobrien patlen = q-p-1; /* don't count $ */ 59985587Sobrien if (patlen > 0 ) { 60085587Sobrien patbeg = (char *) p; 60185587Sobrien return(1); 60285587Sobrien } 60385587Sobrien nnextin: 60485587Sobrien s = 2; 60585587Sobrien if (f->reset) { 60685587Sobrien for (i = 2; i <= f->curstat; i++) 60785587Sobrien xfree(f->posns[i]); 60885587Sobrien k = *f->posns[0]; 60985587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 61085587Sobrien overflo("out of state space"); 61185587Sobrien for (i = 0; i <= k; i++) 61285587Sobrien (f->posns[2])[i] = (f->posns[0])[i]; 61385587Sobrien f->initstat = f->curstat = 2; 61485587Sobrien f->out[2] = f->out[0]; 61585587Sobrien for (i = 0; i < NCHARS; i++) 61685587Sobrien f->gototab[2][i] = 0; 61785587Sobrien } 61885587Sobrien p++; 61985587Sobrien } 62085587Sobrien return (0); 62185587Sobrien} 62285587Sobrien 623107806SobrienNode *reparse(const char *p) /* parses regular expression pointed to by p */ 62485587Sobrien{ /* uses relex() to scan regular expression */ 62585587Sobrien Node *np; 62685587Sobrien 62785587Sobrien dprintf( ("reparse <%s>\n", p) ); 62885587Sobrien lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ 62985587Sobrien rtok = relex(); 630107806Sobrien /* GNU compatibility: an empty regexp matches anything */ 631170331Srafan if (rtok == '\0') { 632107806Sobrien /* FATAL("empty regular expression"); previous */ 633170331Srafan return(op2(EMPTYRE, NIL, NIL)); 634170331Srafan } 63585587Sobrien np = regexp(); 63685587Sobrien if (rtok != '\0') 63785587Sobrien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 63885587Sobrien return(np); 63985587Sobrien} 64085587Sobrien 64185587SobrienNode *regexp(void) /* top-level parse of reg expr */ 64285587Sobrien{ 64385587Sobrien return (alt(concat(primary()))); 64485587Sobrien} 64585587Sobrien 64685587SobrienNode *primary(void) 64785587Sobrien{ 64885587Sobrien Node *np; 64985587Sobrien 65085587Sobrien switch (rtok) { 65185587Sobrien case CHAR: 65285587Sobrien np = op2(CHAR, NIL, itonp(rlxval)); 65385587Sobrien rtok = relex(); 65485587Sobrien return (unary(np)); 65585587Sobrien case ALL: 65685587Sobrien rtok = relex(); 65785587Sobrien return (unary(op2(ALL, NIL, NIL))); 658170331Srafan case EMPTYRE: 659170331Srafan rtok = relex(); 660170331Srafan return (unary(op2(ALL, NIL, NIL))); 66185587Sobrien case DOT: 66285587Sobrien rtok = relex(); 66385587Sobrien return (unary(op2(DOT, NIL, NIL))); 66485587Sobrien case CCL: 66585587Sobrien np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); 66685587Sobrien rtok = relex(); 66785587Sobrien return (unary(np)); 66885587Sobrien case NCCL: 66985587Sobrien np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); 67085587Sobrien rtok = relex(); 67185587Sobrien return (unary(np)); 67285587Sobrien case '^': 67385587Sobrien rtok = relex(); 67485587Sobrien return (unary(op2(CHAR, NIL, itonp(HAT)))); 67585587Sobrien case '$': 67685587Sobrien rtok = relex(); 67785587Sobrien return (unary(op2(CHAR, NIL, NIL))); 67885587Sobrien case '(': 67985587Sobrien rtok = relex(); 68085587Sobrien if (rtok == ')') { /* special pleading for () */ 68185587Sobrien rtok = relex(); 68285587Sobrien return unary(op2(CCL, NIL, (Node *) tostring(""))); 68385587Sobrien } 68485587Sobrien np = regexp(); 68585587Sobrien if (rtok == ')') { 68685587Sobrien rtok = relex(); 68785587Sobrien return (unary(np)); 68885587Sobrien } 68985587Sobrien else 69085587Sobrien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 69185587Sobrien default: 69285587Sobrien FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 69385587Sobrien } 69485587Sobrien return 0; /*NOTREACHED*/ 69585587Sobrien} 69685587Sobrien 69785587SobrienNode *concat(Node *np) 69885587Sobrien{ 69985587Sobrien switch (rtok) { 700170331Srafan case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': 70185587Sobrien return (concat(op2(CAT, np, primary()))); 70285587Sobrien } 70385587Sobrien return (np); 70485587Sobrien} 70585587Sobrien 70685587SobrienNode *alt(Node *np) 70785587Sobrien{ 70885587Sobrien if (rtok == OR) { 70985587Sobrien rtok = relex(); 71085587Sobrien return (alt(op2(OR, np, concat(primary())))); 71185587Sobrien } 71285587Sobrien return (np); 71385587Sobrien} 71485587Sobrien 71585587SobrienNode *unary(Node *np) 71685587Sobrien{ 71785587Sobrien switch (rtok) { 71885587Sobrien case STAR: 71985587Sobrien rtok = relex(); 72085587Sobrien return (unary(op2(STAR, np, NIL))); 72185587Sobrien case PLUS: 72285587Sobrien rtok = relex(); 72385587Sobrien return (unary(op2(PLUS, np, NIL))); 72485587Sobrien case QUEST: 72585587Sobrien rtok = relex(); 72685587Sobrien return (unary(op2(QUEST, np, NIL))); 72785587Sobrien default: 72885587Sobrien return (np); 72985587Sobrien } 73085587Sobrien} 73185587Sobrien 73290902Sdes/* 73390902Sdes * Character class definitions conformant to the POSIX locale as 73490902Sdes * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 73590902Sdes * and operating character sets are both ASCII (ISO646) or supersets 73690902Sdes * thereof. 73790902Sdes * 73890902Sdes * Note that to avoid overflowing the temporary buffer used in 73990902Sdes * relex(), the expanded character class (prior to range expansion) 74090902Sdes * must be less than twice the size of their full name. 74190902Sdes */ 742112336Sobrien 743112336Sobrien/* Because isblank doesn't show up in any of the header files on any 744112336Sobrien * system i use, it's defined here. if some other locale has a richer 745112336Sobrien * definition of "blank", define HAS_ISBLANK and provide your own 746112336Sobrien * version. 747118194Sru * the parentheses here are an attempt to find a path through the maze 748118194Sru * of macro definition and/or function and/or version provided. thanks 749118194Sru * to nelson beebe for the suggestion; let's see if it works everywhere. 750112336Sobrien */ 751112336Sobrien 752201951Sru/* #define HAS_ISBLANK */ 753112336Sobrien#ifndef HAS_ISBLANK 754112336Sobrien 755221381Sruint (xisblank)(int c) 756112336Sobrien{ 757112336Sobrien return c==' ' || c=='\t'; 758112336Sobrien} 759112336Sobrien 760112336Sobrien#endif 761112336Sobrien 76290902Sdesstruct charclass { 76390902Sdes const char *cc_name; 76490902Sdes int cc_namelen; 765112336Sobrien int (*cc_func)(int); 76690902Sdes} charclasses[] = { 767112336Sobrien { "alnum", 5, isalnum }, 768112336Sobrien { "alpha", 5, isalpha }, 769221381Sru#ifndef HAS_ISBLANK 770221381Sru { "blank", 5, isspace }, /* was isblank */ 771221381Sru#else 772112336Sobrien { "blank", 5, isblank }, 773221381Sru#endif 774112336Sobrien { "cntrl", 5, iscntrl }, 775112336Sobrien { "digit", 5, isdigit }, 776112336Sobrien { "graph", 5, isgraph }, 777112336Sobrien { "lower", 5, islower }, 778112336Sobrien { "print", 5, isprint }, 779112336Sobrien { "punct", 5, ispunct }, 780112336Sobrien { "space", 5, isspace }, 781112336Sobrien { "upper", 5, isupper }, 782112336Sobrien { "xdigit", 6, isxdigit }, 78390902Sdes { NULL, 0, NULL }, 78490902Sdes}; 78590902Sdes 78690902Sdes 78785587Sobrienint relex(void) /* lexical analyzer for reparse */ 78885587Sobrien{ 78985587Sobrien int c, n; 79085587Sobrien int cflag; 79185587Sobrien static uschar *buf = 0; 79285587Sobrien static int bufsz = 100; 79385587Sobrien uschar *bp; 79490902Sdes struct charclass *cc; 795112336Sobrien int i; 79685587Sobrien 79785587Sobrien switch (c = *prestr++) { 79885587Sobrien case '|': return OR; 79985587Sobrien case '*': return STAR; 80085587Sobrien case '+': return PLUS; 80185587Sobrien case '?': return QUEST; 80285587Sobrien case '.': return DOT; 80385587Sobrien case '\0': prestr--; return '\0'; 80485587Sobrien case '^': 80585587Sobrien case '$': 80685587Sobrien case '(': 80785587Sobrien case ')': 80885587Sobrien return c; 80985587Sobrien case '\\': 810224731Sru rlxval = quoted(&prestr); 81185587Sobrien return CHAR; 81285587Sobrien default: 81385587Sobrien rlxval = c; 81485587Sobrien return CHAR; 81585587Sobrien case '[': 81685587Sobrien if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 81785587Sobrien FATAL("out of space in reg expr %.10s..", lastre); 81885587Sobrien bp = buf; 81985587Sobrien if (*prestr == '^') { 82085587Sobrien cflag = 1; 82185587Sobrien prestr++; 82285587Sobrien } 82385587Sobrien else 82485587Sobrien cflag = 0; 82590902Sdes n = 2 * strlen((const char *) prestr)+1; 826170331Srafan if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 82785587Sobrien FATAL("out of space for reg expr %.10s...", lastre); 82885587Sobrien for (; ; ) { 82985587Sobrien if ((c = *prestr++) == '\\') { 83085587Sobrien *bp++ = '\\'; 83185587Sobrien if ((c = *prestr++) == '\0') 83285587Sobrien FATAL("nonterminated character class %.20s...", lastre); 83385587Sobrien *bp++ = c; 83485587Sobrien /* } else if (c == '\n') { */ 83585587Sobrien /* FATAL("newline in character class %.20s...", lastre); */ 83690902Sdes } else if (c == '[' && *prestr == ':') { 83790902Sdes /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 83890902Sdes for (cc = charclasses; cc->cc_name; cc++) 83990902Sdes if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 84090902Sdes break; 84190902Sdes if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 84290902Sdes prestr[2 + cc->cc_namelen] == ']') { 84390902Sdes prestr += cc->cc_namelen + 3; 844112336Sobrien for (i = 0; i < NCHARS; i++) { 845170331Srafan if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 846112336Sobrien FATAL("out of space for reg expr %.10s...", lastre); 847112336Sobrien if (cc->cc_func(i)) { 848112336Sobrien *bp++ = i; 849112336Sobrien n++; 850112336Sobrien } 851112336Sobrien } 85290902Sdes } else 85390902Sdes *bp++ = c; 85485587Sobrien } else if (c == '\0') { 85585587Sobrien FATAL("nonterminated character class %.20s", lastre); 85685587Sobrien } else if (bp == buf) { /* 1st char is special */ 85785587Sobrien *bp++ = c; 85885587Sobrien } else if (c == ']') { 85985587Sobrien *bp++ = 0; 86085587Sobrien rlxstr = (uschar *) tostring((char *) buf); 86185587Sobrien if (cflag == 0) 86285587Sobrien return CCL; 86385587Sobrien else 86485587Sobrien return NCCL; 86585587Sobrien } else 86685587Sobrien *bp++ = c; 86785587Sobrien } 86885587Sobrien } 86985587Sobrien} 87085587Sobrien 87185587Sobrienint cgoto(fa *f, int s, int c) 87285587Sobrien{ 87385587Sobrien int i, j, k; 87485587Sobrien int *p, *q; 87585587Sobrien 876146299Sru assert(c == HAT || c < NCHARS); 87785587Sobrien while (f->accept >= maxsetvec) { /* guessing here! */ 87885587Sobrien maxsetvec *= 4; 87985587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 88085587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 88185587Sobrien if (setvec == 0 || tmpset == 0) 88285587Sobrien overflo("out of space in cgoto()"); 88385587Sobrien } 88485587Sobrien for (i = 0; i <= f->accept; i++) 88585587Sobrien setvec[i] = 0; 88685587Sobrien setcnt = 0; 88785587Sobrien /* compute positions of gototab[s,c] into setvec */ 88885587Sobrien p = f->posns[s]; 88985587Sobrien for (i = 1; i <= *p; i++) { 89085587Sobrien if ((k = f->re[p[i]].ltype) != FINAL) { 89185587Sobrien if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 89285587Sobrien || (k == DOT && c != 0 && c != HAT) 89385587Sobrien || (k == ALL && c != 0) 894170331Srafan || (k == EMPTYRE && c != 0) 89585587Sobrien || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 89685587Sobrien || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 89785587Sobrien q = f->re[p[i]].lfollow; 89885587Sobrien for (j = 1; j <= *q; j++) { 89985587Sobrien if (q[j] >= maxsetvec) { 90085587Sobrien maxsetvec *= 4; 90185587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 902201951Sru tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 90385587Sobrien if (setvec == 0 || tmpset == 0) 90485587Sobrien overflo("cgoto overflow"); 90585587Sobrien } 90685587Sobrien if (setvec[q[j]] == 0) { 90785587Sobrien setcnt++; 90885587Sobrien setvec[q[j]] = 1; 90985587Sobrien } 91085587Sobrien } 91185587Sobrien } 91285587Sobrien } 91385587Sobrien } 91485587Sobrien /* determine if setvec is a previous state */ 91585587Sobrien tmpset[0] = setcnt; 91685587Sobrien j = 1; 91785587Sobrien for (i = f->accept; i >= 0; i--) 91885587Sobrien if (setvec[i]) { 91985587Sobrien tmpset[j++] = i; 92085587Sobrien } 92185587Sobrien /* tmpset == previous state? */ 92285587Sobrien for (i = 1; i <= f->curstat; i++) { 92385587Sobrien p = f->posns[i]; 92485587Sobrien if ((k = tmpset[0]) != p[0]) 92585587Sobrien goto different; 92685587Sobrien for (j = 1; j <= k; j++) 92785587Sobrien if (tmpset[j] != p[j]) 92885587Sobrien goto different; 92985587Sobrien /* setvec is state i */ 93085587Sobrien f->gototab[s][c] = i; 93185587Sobrien return i; 93285587Sobrien different:; 93385587Sobrien } 93485587Sobrien 93585587Sobrien /* add tmpset to current set of states */ 93685587Sobrien if (f->curstat >= NSTATES-1) { 93785587Sobrien f->curstat = 2; 93885587Sobrien f->reset = 1; 93985587Sobrien for (i = 2; i < NSTATES; i++) 94085587Sobrien xfree(f->posns[i]); 94185587Sobrien } else 94285587Sobrien ++(f->curstat); 94385587Sobrien for (i = 0; i < NCHARS; i++) 94485587Sobrien f->gototab[f->curstat][i] = 0; 94585587Sobrien xfree(f->posns[f->curstat]); 94685587Sobrien if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 94785587Sobrien overflo("out of space in cgoto"); 94885587Sobrien 94985587Sobrien f->posns[f->curstat] = p; 95085587Sobrien f->gototab[s][c] = f->curstat; 95185587Sobrien for (i = 0; i <= setcnt; i++) 95285587Sobrien p[i] = tmpset[i]; 95385587Sobrien if (setvec[f->accept]) 95485587Sobrien f->out[f->curstat] = 1; 95585587Sobrien else 95685587Sobrien f->out[f->curstat] = 0; 95785587Sobrien return f->curstat; 95885587Sobrien} 95985587Sobrien 96085587Sobrien 96185587Sobrienvoid freefa(fa *f) /* free a finite automaton */ 96285587Sobrien{ 96385587Sobrien int i; 96485587Sobrien 96585587Sobrien if (f == NULL) 96685587Sobrien return; 96785587Sobrien for (i = 0; i <= f->curstat; i++) 96885587Sobrien xfree(f->posns[i]); 96985587Sobrien for (i = 0; i <= f->accept; i++) { 97085587Sobrien xfree(f->re[i].lfollow); 97185587Sobrien if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 97285587Sobrien xfree((f->re[i].lval.np)); 97385587Sobrien } 97485587Sobrien xfree(f->restr); 97585587Sobrien xfree(f); 97685587Sobrien} 977