11573Srgrimes/*- 21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer. 31573Srgrimes * Copyright (c) 1992, 1993, 1994 41573Srgrimes * The Regents of the University of California. All rights reserved. 51573Srgrimes * 6227753Stheraven * Copyright (c) 2011 The FreeBSD Foundation 7227753Stheraven * All rights reserved. 8227753Stheraven * Portions of this software were developed by David Chisnall 9227753Stheraven * under sponsorship from the FreeBSD Foundation. 10227753Stheraven * 111573Srgrimes * This code is derived from software contributed to Berkeley by 121573Srgrimes * Henry Spencer. 131573Srgrimes * 141573Srgrimes * Redistribution and use in source and binary forms, with or without 151573Srgrimes * modification, are permitted provided that the following conditions 161573Srgrimes * are met: 171573Srgrimes * 1. Redistributions of source code must retain the above copyright 181573Srgrimes * notice, this list of conditions and the following disclaimer. 191573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 201573Srgrimes * notice, this list of conditions and the following disclaimer in the 211573Srgrimes * documentation and/or other materials provided with the distribution. 221573Srgrimes * 4. Neither the name of the University nor the names of its contributors 231573Srgrimes * may be used to endorse or promote products derived from this software 241573Srgrimes * without specific prior written permission. 251573Srgrimes * 261573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 271573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 281573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 291573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 301573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 311573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 321573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 331573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 341573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 351573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 361573Srgrimes * SUCH DAMAGE. 371573Srgrimes * 381573Srgrimes * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 391573Srgrimes */ 401573Srgrimes 411573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 421573Srgrimesstatic char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 431573Srgrimes#endif /* LIBC_SCCS and not lint */ 4492986Sobrien#include <sys/cdefs.h> 4592986Sobrien__FBSDID("$FreeBSD: stable/10/lib/libc/regex/regcomp.c 325394 2017-11-04 14:45:36Z pfg $"); 461573Srgrimes 471573Srgrimes#include <sys/types.h> 481573Srgrimes#include <stdio.h> 491573Srgrimes#include <string.h> 501573Srgrimes#include <ctype.h> 511573Srgrimes#include <limits.h> 521573Srgrimes#include <stdlib.h> 531573Srgrimes#include <regex.h> 54132019Stjr#include <wchar.h> 55132019Stjr#include <wctype.h> 561573Srgrimes 5719277Sache#include "collate.h" 5819277Sache 591573Srgrimes#include "utils.h" 601573Srgrimes#include "regex2.h" 611573Srgrimes 621573Srgrimes#include "cname.h" 631573Srgrimes 641573Srgrimes/* 651573Srgrimes * parse structure, passed up and down to avoid global variables and 661573Srgrimes * other clumsinesses 671573Srgrimes */ 681573Srgrimesstruct parse { 691573Srgrimes char *next; /* next character in RE */ 701573Srgrimes char *end; /* end of string (-> NUL normally) */ 711573Srgrimes int error; /* has an error been seen? */ 721573Srgrimes sop *strip; /* malloced strip */ 731573Srgrimes sopno ssize; /* malloced strip size (allocated) */ 741573Srgrimes sopno slen; /* malloced strip length (used) */ 751573Srgrimes int ncsalloc; /* number of csets allocated */ 761573Srgrimes struct re_guts *g; 771573Srgrimes# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 781573Srgrimes sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 791573Srgrimes sopno pend[NPAREN]; /* -> ) ([0] unused) */ 801573Srgrimes}; 811573Srgrimes 821573Srgrimes/* ========= begin header generated by ./mkh ========= */ 831573Srgrimes#ifdef __cplusplus 841573Srgrimesextern "C" { 851573Srgrimes#endif 861573Srgrimes 871573Srgrimes/* === regcomp.c === */ 88227435Skevlostatic void p_ere(struct parse *p, int stop); 8992905Sobrienstatic void p_ere_exp(struct parse *p); 9092905Sobrienstatic void p_str(struct parse *p); 91227435Skevlostatic void p_bre(struct parse *p, int end1, int end2); 9292905Sobrienstatic int p_simp_re(struct parse *p, int starordinary); 9392905Sobrienstatic int p_count(struct parse *p); 9492905Sobrienstatic void p_bracket(struct parse *p); 9592905Sobrienstatic void p_b_term(struct parse *p, cset *cs); 9692905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs); 9792905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs); 98132019Stjrstatic wint_t p_b_symbol(struct parse *p); 99132019Stjrstatic wint_t p_b_coll_elem(struct parse *p, wint_t endc); 100132019Stjrstatic wint_t othercase(wint_t ch); 101132019Stjrstatic void bothcases(struct parse *p, wint_t ch); 102132019Stjrstatic void ordinary(struct parse *p, wint_t ch); 10392905Sobrienstatic void nonnewline(struct parse *p); 10492905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to); 10592905Sobrienstatic int seterr(struct parse *p, int e); 10692905Sobrienstatic cset *allocset(struct parse *p); 10792905Sobrienstatic void freeset(struct parse *p, cset *cs); 108132019Stjrstatic void CHadd(struct parse *p, cset *cs, wint_t ch); 109132019Stjrstatic void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 110132019Stjrstatic void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 111132019Stjrstatic wint_t singleton(cset *cs); 11292905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish); 11392905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd); 11492905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 11592905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value); 116227414Skevlostatic int enlarge(struct parse *p, sopno size); 11792905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g); 11892905Sobrienstatic void findmust(struct parse *p, struct re_guts *g); 119131973Stjrstatic int altoffset(sop *scan, int offset); 12092905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g); 12192905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g); 12292905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g); 123132019Stjrstatic wint_t wgetnext(struct parse *p); 1241573Srgrimes 1251573Srgrimes#ifdef __cplusplus 1261573Srgrimes} 1271573Srgrimes#endif 1281573Srgrimes/* ========= end header generated by ./mkh ========= */ 1291573Srgrimes 1301573Srgrimesstatic char nuls[10]; /* place to point scanner in event of error */ 1311573Srgrimes 1321573Srgrimes/* 1331573Srgrimes * macros for use with parse structure 1341573Srgrimes * BEWARE: these know that the parse structure is named `p' !!! 1351573Srgrimes */ 1361573Srgrimes#define PEEK() (*p->next) 1371573Srgrimes#define PEEK2() (*(p->next+1)) 1381573Srgrimes#define MORE() (p->next < p->end) 1391573Srgrimes#define MORE2() (p->next+1 < p->end) 1401573Srgrimes#define SEE(c) (MORE() && PEEK() == (c)) 1411573Srgrimes#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 1421573Srgrimes#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 1431573Srgrimes#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 1441573Srgrimes#define NEXT() (p->next++) 1451573Srgrimes#define NEXT2() (p->next += 2) 1461573Srgrimes#define NEXTn(n) (p->next += (n)) 1471573Srgrimes#define GETNEXT() (*p->next++) 148132019Stjr#define WGETNEXT() wgetnext(p) 1491573Srgrimes#define SETERROR(e) seterr(p, (e)) 1501573Srgrimes#define REQUIRE(co, e) ((co) || SETERROR(e)) 1511573Srgrimes#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 1521573Srgrimes#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 1531573Srgrimes#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 1541573Srgrimes#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 1551573Srgrimes#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 1561573Srgrimes#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 1571573Srgrimes#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 1581573Srgrimes#define HERE() (p->slen) 1591573Srgrimes#define THERE() (p->slen - 1) 1601573Srgrimes#define THERETHERE() (p->slen - 2) 1611573Srgrimes#define DROP(n) (p->slen -= (n)) 1621573Srgrimes 1631573Srgrimes#ifndef NDEBUG 1641573Srgrimesstatic int never = 0; /* for use in asserts; shuts lint up */ 1651573Srgrimes#else 1661573Srgrimes#define never 0 /* some <assert.h>s have bugs too */ 1671573Srgrimes#endif 1681573Srgrimes 16962232Sdcs/* Macro used by computejump()/computematchjump() */ 17062232Sdcs#define MIN(a,b) ((a)<(b)?(a):(b)) 17162232Sdcs 1721573Srgrimes/* 1731573Srgrimes - regcomp - interface for parser and compilation 1741573Srgrimes = extern int regcomp(regex_t *, const char *, int); 1751573Srgrimes = #define REG_BASIC 0000 1761573Srgrimes = #define REG_EXTENDED 0001 1771573Srgrimes = #define REG_ICASE 0002 1781573Srgrimes = #define REG_NOSUB 0004 1791573Srgrimes = #define REG_NEWLINE 0010 1801573Srgrimes = #define REG_NOSPEC 0020 1811573Srgrimes = #define REG_PEND 0040 1821573Srgrimes = #define REG_DUMP 0200 1831573Srgrimes */ 1841573Srgrimesint /* 0 success, otherwise REG_something */ 185170528Sdelphijregcomp(regex_t * __restrict preg, 186170528Sdelphij const char * __restrict pattern, 187170528Sdelphij int cflags) 1881573Srgrimes{ 1891573Srgrimes struct parse pa; 19092889Sobrien struct re_guts *g; 19192889Sobrien struct parse *p = &pa; 19292889Sobrien int i; 19392889Sobrien size_t len; 194278910Sdelphij size_t maxlen; 1951573Srgrimes#ifdef REDEBUG 1961573Srgrimes# define GOODFLAGS(f) (f) 1971573Srgrimes#else 1981573Srgrimes# define GOODFLAGS(f) ((f)&~REG_DUMP) 1991573Srgrimes#endif 2001573Srgrimes 2011573Srgrimes cflags = GOODFLAGS(cflags); 2021573Srgrimes if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 2031573Srgrimes return(REG_INVARG); 2041573Srgrimes 2051573Srgrimes if (cflags®_PEND) { 2061573Srgrimes if (preg->re_endp < pattern) 2071573Srgrimes return(REG_INVARG); 2081573Srgrimes len = preg->re_endp - pattern; 2091573Srgrimes } else 2101573Srgrimes len = strlen((char *)pattern); 2111573Srgrimes 2121573Srgrimes /* do the mallocs early so failure handling is easy */ 213131973Stjr g = (struct re_guts *)malloc(sizeof(struct re_guts)); 2141573Srgrimes if (g == NULL) 2151573Srgrimes return(REG_ESPACE); 216278910Sdelphij /* 217278910Sdelphij * Limit the pattern space to avoid a 32-bit overflow on buffer 218278910Sdelphij * extension. Also avoid any signed overflow in case of conversion 219278910Sdelphij * so make the real limit based on a 31-bit overflow. 220278910Sdelphij * 221278910Sdelphij * Likely not applicable on 64-bit systems but handle the case 222278910Sdelphij * generically (who are we to stop people from using ~715MB+ 223278910Sdelphij * patterns?). 224278910Sdelphij */ 225278910Sdelphij maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; 226278910Sdelphij if (len >= maxlen) { 227278910Sdelphij free((char *)g); 228278910Sdelphij return(REG_ESPACE); 229278910Sdelphij } 2301573Srgrimes p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 231278910Sdelphij assert(p->ssize >= len); 232278910Sdelphij 2331573Srgrimes p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 2341573Srgrimes p->slen = 0; 2351573Srgrimes if (p->strip == NULL) { 2361573Srgrimes free((char *)g); 2371573Srgrimes return(REG_ESPACE); 2381573Srgrimes } 2391573Srgrimes 2401573Srgrimes /* set things up */ 2411573Srgrimes p->g = g; 2421573Srgrimes p->next = (char *)pattern; /* convenience; we do not modify it */ 2431573Srgrimes p->end = p->next + len; 2441573Srgrimes p->error = 0; 2451573Srgrimes p->ncsalloc = 0; 2461573Srgrimes for (i = 0; i < NPAREN; i++) { 2471573Srgrimes p->pbegin[i] = 0; 2481573Srgrimes p->pend[i] = 0; 2491573Srgrimes } 2501573Srgrimes g->sets = NULL; 2511573Srgrimes g->ncsets = 0; 2521573Srgrimes g->cflags = cflags; 2531573Srgrimes g->iflags = 0; 2541573Srgrimes g->nbol = 0; 2551573Srgrimes g->neol = 0; 2561573Srgrimes g->must = NULL; 25762391Sdcs g->moffset = -1; 25862263Sdcs g->charjump = NULL; 25962263Sdcs g->matchjump = NULL; 2601573Srgrimes g->mlen = 0; 2611573Srgrimes g->nsub = 0; 2621573Srgrimes g->backrefs = 0; 2631573Srgrimes 2641573Srgrimes /* do it */ 2651573Srgrimes EMIT(OEND, 0); 2661573Srgrimes g->firststate = THERE(); 2671573Srgrimes if (cflags®_EXTENDED) 2681573Srgrimes p_ere(p, OUT); 2691573Srgrimes else if (cflags®_NOSPEC) 2701573Srgrimes p_str(p); 2711573Srgrimes else 2721573Srgrimes p_bre(p, OUT, OUT); 2731573Srgrimes EMIT(OEND, 0); 2741573Srgrimes g->laststate = THERE(); 2751573Srgrimes 2761573Srgrimes /* tidy up loose ends and fill things in */ 2771573Srgrimes stripsnug(p, g); 2781573Srgrimes findmust(p, g); 27962232Sdcs /* only use Boyer-Moore algorithm if the pattern is bigger 28062232Sdcs * than three characters 28162232Sdcs */ 28262232Sdcs if(g->mlen > 3) { 28362232Sdcs computejumps(p, g); 28462232Sdcs computematchjumps(p, g); 28562755Sdcs if(g->matchjump == NULL && g->charjump != NULL) { 28662232Sdcs free(g->charjump); 28762232Sdcs g->charjump = NULL; 28862232Sdcs } 28962232Sdcs } 2901573Srgrimes g->nplus = pluscount(p, g); 2911573Srgrimes g->magic = MAGIC2; 2921573Srgrimes preg->re_nsub = g->nsub; 2931573Srgrimes preg->re_g = g; 2941573Srgrimes preg->re_magic = MAGIC1; 2951573Srgrimes#ifndef REDEBUG 2961573Srgrimes /* not debugging, so can't rely on the assert() in regexec() */ 2971573Srgrimes if (g->iflags&BAD) 2981573Srgrimes SETERROR(REG_ASSERT); 2991573Srgrimes#endif 3001573Srgrimes 3011573Srgrimes /* win or lose, we're done */ 3021573Srgrimes if (p->error != 0) /* lose */ 3031573Srgrimes regfree(preg); 3041573Srgrimes return(p->error); 3051573Srgrimes} 3061573Srgrimes 3071573Srgrimes/* 3081573Srgrimes - p_ere - ERE parser top level, concatenation and alternation 309227435Skevlo == static void p_ere(struct parse *p, int_t stop); 3101573Srgrimes */ 3111573Srgrimesstatic void 312170528Sdelphijp_ere(struct parse *p, 313227435Skevlo int stop) /* character this ERE should end at */ 3141573Srgrimes{ 31592889Sobrien char c; 31692889Sobrien sopno prevback; 31792889Sobrien sopno prevfwd; 31892889Sobrien sopno conc; 31992889Sobrien int first = 1; /* is this the first alternative? */ 3201573Srgrimes 3211573Srgrimes for (;;) { 3221573Srgrimes /* do a bunch of concatenated expressions */ 3231573Srgrimes conc = HERE(); 3241573Srgrimes while (MORE() && (c = PEEK()) != '|' && c != stop) 3251573Srgrimes p_ere_exp(p); 32617141Sjkh (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 3271573Srgrimes 3281573Srgrimes if (!EAT('|')) 3291573Srgrimes break; /* NOTE BREAK OUT */ 3301573Srgrimes 3311573Srgrimes if (first) { 3321573Srgrimes INSERT(OCH_, conc); /* offset is wrong */ 3331573Srgrimes prevfwd = conc; 3341573Srgrimes prevback = conc; 3351573Srgrimes first = 0; 3361573Srgrimes } 3371573Srgrimes ASTERN(OOR1, prevback); 3381573Srgrimes prevback = THERE(); 3391573Srgrimes AHEAD(prevfwd); /* fix previous offset */ 3401573Srgrimes prevfwd = HERE(); 3411573Srgrimes EMIT(OOR2, 0); /* offset is very wrong */ 3421573Srgrimes } 3431573Srgrimes 3441573Srgrimes if (!first) { /* tail-end fixups */ 3451573Srgrimes AHEAD(prevfwd); 3461573Srgrimes ASTERN(O_CH, prevback); 3471573Srgrimes } 3481573Srgrimes 3491573Srgrimes assert(!MORE() || SEE(stop)); 3501573Srgrimes} 3511573Srgrimes 3521573Srgrimes/* 3531573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 35492889Sobrien == static void p_ere_exp(struct parse *p); 3551573Srgrimes */ 3561573Srgrimesstatic void 357170528Sdelphijp_ere_exp(struct parse *p) 3581573Srgrimes{ 35992889Sobrien char c; 360132019Stjr wint_t wc; 36192889Sobrien sopno pos; 36292889Sobrien int count; 36392889Sobrien int count2; 36492889Sobrien sopno subno; 3651573Srgrimes int wascaret = 0; 3661573Srgrimes 3671573Srgrimes assert(MORE()); /* caller should have ensured this */ 3681573Srgrimes c = GETNEXT(); 3691573Srgrimes 3701573Srgrimes pos = HERE(); 3711573Srgrimes switch (c) { 3721573Srgrimes case '(': 37317141Sjkh (void)REQUIRE(MORE(), REG_EPAREN); 3741573Srgrimes p->g->nsub++; 3751573Srgrimes subno = p->g->nsub; 3761573Srgrimes if (subno < NPAREN) 3771573Srgrimes p->pbegin[subno] = HERE(); 3781573Srgrimes EMIT(OLPAREN, subno); 3791573Srgrimes if (!SEE(')')) 3801573Srgrimes p_ere(p, ')'); 3811573Srgrimes if (subno < NPAREN) { 3821573Srgrimes p->pend[subno] = HERE(); 3831573Srgrimes assert(p->pend[subno] != 0); 3841573Srgrimes } 3851573Srgrimes EMIT(ORPAREN, subno); 38617141Sjkh (void)MUSTEAT(')', REG_EPAREN); 3871573Srgrimes break; 3881573Srgrimes#ifndef POSIX_MISTAKE 3891573Srgrimes case ')': /* happens only if no current unmatched ( */ 3901573Srgrimes /* 3911573Srgrimes * You may ask, why the ifndef? Because I didn't notice 3921573Srgrimes * this until slightly too late for 1003.2, and none of the 3931573Srgrimes * other 1003.2 regular-expression reviewers noticed it at 3941573Srgrimes * all. So an unmatched ) is legal POSIX, at least until 3951573Srgrimes * we can get it fixed. 3961573Srgrimes */ 3971573Srgrimes SETERROR(REG_EPAREN); 3981573Srgrimes break; 3991573Srgrimes#endif 4001573Srgrimes case '^': 4011573Srgrimes EMIT(OBOL, 0); 4021573Srgrimes p->g->iflags |= USEBOL; 4031573Srgrimes p->g->nbol++; 4041573Srgrimes wascaret = 1; 4051573Srgrimes break; 4061573Srgrimes case '$': 4071573Srgrimes EMIT(OEOL, 0); 4081573Srgrimes p->g->iflags |= USEEOL; 4091573Srgrimes p->g->neol++; 4101573Srgrimes break; 4111573Srgrimes case '|': 4121573Srgrimes SETERROR(REG_EMPTY); 4131573Srgrimes break; 4141573Srgrimes case '*': 4151573Srgrimes case '+': 4161573Srgrimes case '?': 4171573Srgrimes SETERROR(REG_BADRPT); 4181573Srgrimes break; 4191573Srgrimes case '.': 4201573Srgrimes if (p->g->cflags®_NEWLINE) 4211573Srgrimes nonnewline(p); 4221573Srgrimes else 4231573Srgrimes EMIT(OANY, 0); 4241573Srgrimes break; 4251573Srgrimes case '[': 4261573Srgrimes p_bracket(p); 4271573Srgrimes break; 4281573Srgrimes case '\\': 42917141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 430132019Stjr wc = WGETNEXT(); 431269484Spfg switch (wc) { 432269484Spfg case '<': 433269484Spfg EMIT(OBOW, 0); 434269484Spfg break; 435269484Spfg case '>': 436269484Spfg EMIT(OEOW, 0); 437269484Spfg break; 438269484Spfg default: 439269484Spfg ordinary(p, wc); 440269484Spfg break; 441269484Spfg } 4421573Srgrimes break; 4431573Srgrimes case '{': /* okay as ordinary except if digit follows */ 44449094Sache (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 4451573Srgrimes /* FALLTHROUGH */ 4461573Srgrimes default: 447318030Sbrooks if (p->error != 0) 448318030Sbrooks return; 449132019Stjr p->next--; 450132019Stjr wc = WGETNEXT(); 451132019Stjr ordinary(p, wc); 4521573Srgrimes break; 4531573Srgrimes } 4541573Srgrimes 4551573Srgrimes if (!MORE()) 4561573Srgrimes return; 4571573Srgrimes c = PEEK(); 4581573Srgrimes /* we call { a repetition if followed by a digit */ 4591573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 46049094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 4611573Srgrimes return; /* no repetition, we're done */ 4621573Srgrimes NEXT(); 4631573Srgrimes 46417141Sjkh (void)REQUIRE(!wascaret, REG_BADRPT); 4651573Srgrimes switch (c) { 4661573Srgrimes case '*': /* implemented as +? */ 4671573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 4681573Srgrimes INSERT(OPLUS_, pos); 4691573Srgrimes ASTERN(O_PLUS, pos); 4701573Srgrimes INSERT(OQUEST_, pos); 4711573Srgrimes ASTERN(O_QUEST, pos); 4721573Srgrimes break; 4731573Srgrimes case '+': 4741573Srgrimes INSERT(OPLUS_, pos); 4751573Srgrimes ASTERN(O_PLUS, pos); 4761573Srgrimes break; 4771573Srgrimes case '?': 4781573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 4791573Srgrimes INSERT(OCH_, pos); /* offset slightly wrong */ 4801573Srgrimes ASTERN(OOR1, pos); /* this one's right */ 4811573Srgrimes AHEAD(pos); /* fix the OCH_ */ 4821573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 4831573Srgrimes AHEAD(THERE()); /* ...so fix it */ 4841573Srgrimes ASTERN(O_CH, THERETHERE()); 4851573Srgrimes break; 4861573Srgrimes case '{': 4871573Srgrimes count = p_count(p); 4881573Srgrimes if (EAT(',')) { 48949094Sache if (isdigit((uch)PEEK())) { 4901573Srgrimes count2 = p_count(p); 49117141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 4921573Srgrimes } else /* single number with comma */ 4931573Srgrimes count2 = INFINITY; 4941573Srgrimes } else /* just a single number */ 4951573Srgrimes count2 = count; 4961573Srgrimes repeat(p, pos, count, count2); 4971573Srgrimes if (!EAT('}')) { /* error heuristics */ 4981573Srgrimes while (MORE() && PEEK() != '}') 4991573Srgrimes NEXT(); 50017141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 5011573Srgrimes SETERROR(REG_BADBR); 5021573Srgrimes } 5031573Srgrimes break; 5041573Srgrimes } 5051573Srgrimes 5061573Srgrimes if (!MORE()) 5071573Srgrimes return; 5081573Srgrimes c = PEEK(); 5091573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 51049094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 5111573Srgrimes return; 5121573Srgrimes SETERROR(REG_BADRPT); 5131573Srgrimes} 5141573Srgrimes 5151573Srgrimes/* 5161573Srgrimes - p_str - string (no metacharacters) "parser" 51792889Sobrien == static void p_str(struct parse *p); 5181573Srgrimes */ 5191573Srgrimesstatic void 520170528Sdelphijp_str(struct parse *p) 5211573Srgrimes{ 52217141Sjkh (void)REQUIRE(MORE(), REG_EMPTY); 5231573Srgrimes while (MORE()) 524132019Stjr ordinary(p, WGETNEXT()); 5251573Srgrimes} 5261573Srgrimes 5271573Srgrimes/* 5281573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation 529227435Skevlo == static void p_bre(struct parse *p, int end1, \ 530227435Skevlo == int end2); 5311573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check. 5321573Srgrimes * 5331573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first 534131973Stjr * taken as an ordinary character and then revised to be an anchor. 5351573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive. 5361573Srgrimes */ 5371573Srgrimesstatic void 538170528Sdelphijp_bre(struct parse *p, 539227435Skevlo int end1, /* first terminating character */ 540227435Skevlo int end2) /* second terminating character */ 5411573Srgrimes{ 54292889Sobrien sopno start = HERE(); 54392889Sobrien int first = 1; /* first subexpression? */ 54492889Sobrien int wasdollar = 0; 5451573Srgrimes 5461573Srgrimes if (EAT('^')) { 5471573Srgrimes EMIT(OBOL, 0); 5481573Srgrimes p->g->iflags |= USEBOL; 5491573Srgrimes p->g->nbol++; 5501573Srgrimes } 5511573Srgrimes while (MORE() && !SEETWO(end1, end2)) { 5521573Srgrimes wasdollar = p_simp_re(p, first); 5531573Srgrimes first = 0; 5541573Srgrimes } 5551573Srgrimes if (wasdollar) { /* oops, that was a trailing anchor */ 5561573Srgrimes DROP(1); 5571573Srgrimes EMIT(OEOL, 0); 5581573Srgrimes p->g->iflags |= USEEOL; 5591573Srgrimes p->g->neol++; 5601573Srgrimes } 5611573Srgrimes 56217141Sjkh (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 5631573Srgrimes} 5641573Srgrimes 5651573Srgrimes/* 5661573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 56792889Sobrien == static int p_simp_re(struct parse *p, int starordinary); 5681573Srgrimes */ 5691573Srgrimesstatic int /* was the simple RE an unbackslashed $? */ 570170528Sdelphijp_simp_re(struct parse *p, 571170528Sdelphij int starordinary) /* is a leading * an ordinary character? */ 5721573Srgrimes{ 57392889Sobrien int c; 57492889Sobrien int count; 57592889Sobrien int count2; 57692889Sobrien sopno pos; 57792889Sobrien int i; 578132019Stjr wint_t wc; 57992889Sobrien sopno subno; 5801573Srgrimes# define BACKSL (1<<CHAR_BIT) 5811573Srgrimes 5821573Srgrimes pos = HERE(); /* repetion op, if any, covers from here */ 5831573Srgrimes 5841573Srgrimes assert(MORE()); /* caller should have ensured this */ 5851573Srgrimes c = GETNEXT(); 5861573Srgrimes if (c == '\\') { 58717141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 58849094Sache c = BACKSL | GETNEXT(); 5891573Srgrimes } 5901573Srgrimes switch (c) { 5911573Srgrimes case '.': 5921573Srgrimes if (p->g->cflags®_NEWLINE) 5931573Srgrimes nonnewline(p); 5941573Srgrimes else 5951573Srgrimes EMIT(OANY, 0); 5961573Srgrimes break; 5971573Srgrimes case '[': 5981573Srgrimes p_bracket(p); 5991573Srgrimes break; 600269484Spfg case BACKSL|'<': 601269484Spfg EMIT(OBOW, 0); 602269484Spfg break; 603269484Spfg case BACKSL|'>': 604269484Spfg EMIT(OEOW, 0); 605269484Spfg break; 6061573Srgrimes case BACKSL|'{': 6071573Srgrimes SETERROR(REG_BADRPT); 6081573Srgrimes break; 6091573Srgrimes case BACKSL|'(': 6101573Srgrimes p->g->nsub++; 6111573Srgrimes subno = p->g->nsub; 6121573Srgrimes if (subno < NPAREN) 6131573Srgrimes p->pbegin[subno] = HERE(); 6141573Srgrimes EMIT(OLPAREN, subno); 6151573Srgrimes /* the MORE here is an error heuristic */ 6161573Srgrimes if (MORE() && !SEETWO('\\', ')')) 6171573Srgrimes p_bre(p, '\\', ')'); 6181573Srgrimes if (subno < NPAREN) { 6191573Srgrimes p->pend[subno] = HERE(); 6201573Srgrimes assert(p->pend[subno] != 0); 6211573Srgrimes } 6221573Srgrimes EMIT(ORPAREN, subno); 62317141Sjkh (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 6241573Srgrimes break; 6251573Srgrimes case BACKSL|')': /* should not get here -- must be user */ 6261573Srgrimes case BACKSL|'}': 6271573Srgrimes SETERROR(REG_EPAREN); 6281573Srgrimes break; 6291573Srgrimes case BACKSL|'1': 6301573Srgrimes case BACKSL|'2': 6311573Srgrimes case BACKSL|'3': 6321573Srgrimes case BACKSL|'4': 6331573Srgrimes case BACKSL|'5': 6341573Srgrimes case BACKSL|'6': 6351573Srgrimes case BACKSL|'7': 6361573Srgrimes case BACKSL|'8': 6371573Srgrimes case BACKSL|'9': 6381573Srgrimes i = (c&~BACKSL) - '0'; 6391573Srgrimes assert(i < NPAREN); 6401573Srgrimes if (p->pend[i] != 0) { 6411573Srgrimes assert(i <= p->g->nsub); 6421573Srgrimes EMIT(OBACK_, i); 6431573Srgrimes assert(p->pbegin[i] != 0); 6441573Srgrimes assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 6451573Srgrimes assert(OP(p->strip[p->pend[i]]) == ORPAREN); 6461573Srgrimes (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 6471573Srgrimes EMIT(O_BACK, i); 6481573Srgrimes } else 6491573Srgrimes SETERROR(REG_ESUBREG); 6501573Srgrimes p->g->backrefs = 1; 6511573Srgrimes break; 6521573Srgrimes case '*': 65317141Sjkh (void)REQUIRE(starordinary, REG_BADRPT); 6541573Srgrimes /* FALLTHROUGH */ 6551573Srgrimes default: 656318030Sbrooks if (p->error != 0) 657318030Sbrooks return(0); /* Definitely not $... */ 658132019Stjr p->next--; 659132019Stjr wc = WGETNEXT(); 660132019Stjr ordinary(p, wc); 6611573Srgrimes break; 6621573Srgrimes } 6631573Srgrimes 6641573Srgrimes if (EAT('*')) { /* implemented as +? */ 6651573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 6661573Srgrimes INSERT(OPLUS_, pos); 6671573Srgrimes ASTERN(O_PLUS, pos); 6681573Srgrimes INSERT(OQUEST_, pos); 6691573Srgrimes ASTERN(O_QUEST, pos); 6701573Srgrimes } else if (EATTWO('\\', '{')) { 6711573Srgrimes count = p_count(p); 6721573Srgrimes if (EAT(',')) { 67349094Sache if (MORE() && isdigit((uch)PEEK())) { 6741573Srgrimes count2 = p_count(p); 67517141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 6761573Srgrimes } else /* single number with comma */ 6771573Srgrimes count2 = INFINITY; 6781573Srgrimes } else /* just a single number */ 6791573Srgrimes count2 = count; 6801573Srgrimes repeat(p, pos, count, count2); 6811573Srgrimes if (!EATTWO('\\', '}')) { /* error heuristics */ 6821573Srgrimes while (MORE() && !SEETWO('\\', '}')) 6831573Srgrimes NEXT(); 68417141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 6851573Srgrimes SETERROR(REG_BADBR); 6861573Srgrimes } 68749094Sache } else if (c == '$') /* $ (but not \$) ends it */ 6881573Srgrimes return(1); 6891573Srgrimes 6901573Srgrimes return(0); 6911573Srgrimes} 6921573Srgrimes 6931573Srgrimes/* 6941573Srgrimes - p_count - parse a repetition count 69592889Sobrien == static int p_count(struct parse *p); 6961573Srgrimes */ 6971573Srgrimesstatic int /* the value */ 698170528Sdelphijp_count(struct parse *p) 6991573Srgrimes{ 70092889Sobrien int count = 0; 70192889Sobrien int ndigits = 0; 7021573Srgrimes 70349094Sache while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 7041573Srgrimes count = count*10 + (GETNEXT() - '0'); 7051573Srgrimes ndigits++; 7061573Srgrimes } 7071573Srgrimes 70817141Sjkh (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 7091573Srgrimes return(count); 7101573Srgrimes} 7111573Srgrimes 7121573Srgrimes/* 7131573Srgrimes - p_bracket - parse a bracketed character list 71492889Sobrien == static void p_bracket(struct parse *p); 7151573Srgrimes */ 7161573Srgrimesstatic void 717170528Sdelphijp_bracket(struct parse *p) 7181573Srgrimes{ 719132019Stjr cset *cs; 720132019Stjr wint_t ch; 7211573Srgrimes 7221573Srgrimes /* Dept of Truly Sickening Special-Case Kludges */ 7231573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 7241573Srgrimes EMIT(OBOW, 0); 7251573Srgrimes NEXTn(6); 7261573Srgrimes return; 7271573Srgrimes } 7281573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 7291573Srgrimes EMIT(OEOW, 0); 7301573Srgrimes NEXTn(6); 7311573Srgrimes return; 7321573Srgrimes } 7331573Srgrimes 734132019Stjr if ((cs = allocset(p)) == NULL) 735132019Stjr return; 736132019Stjr 737132019Stjr if (p->g->cflags®_ICASE) 738132019Stjr cs->icase = 1; 7391573Srgrimes if (EAT('^')) 740132019Stjr cs->invert = 1; 7411573Srgrimes if (EAT(']')) 742132019Stjr CHadd(p, cs, ']'); 7431573Srgrimes else if (EAT('-')) 744132019Stjr CHadd(p, cs, '-'); 7451573Srgrimes while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 7461573Srgrimes p_b_term(p, cs); 7471573Srgrimes if (EAT('-')) 748132019Stjr CHadd(p, cs, '-'); 74917141Sjkh (void)MUSTEAT(']', REG_EBRACK); 7501573Srgrimes 7511573Srgrimes if (p->error != 0) /* don't mess things up further */ 7521573Srgrimes return; 7531573Srgrimes 754132019Stjr if (cs->invert && p->g->cflags®_NEWLINE) 755132019Stjr cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 7561573Srgrimes 757132019Stjr if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 758132019Stjr ordinary(p, ch); 7591573Srgrimes freeset(p, cs); 7601573Srgrimes } else 761132019Stjr EMIT(OANYOF, (int)(cs - p->g->sets)); 7621573Srgrimes} 7631573Srgrimes 7641573Srgrimes/* 7651573Srgrimes - p_b_term - parse one term of a bracketed character list 76692889Sobrien == static void p_b_term(struct parse *p, cset *cs); 7671573Srgrimes */ 7681573Srgrimesstatic void 769170528Sdelphijp_b_term(struct parse *p, cset *cs) 7701573Srgrimes{ 77192889Sobrien char c; 772132019Stjr wint_t start, finish; 773132019Stjr wint_t i; 774227753Stheraven struct xlocale_collate *table = 775227753Stheraven (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 7761573Srgrimes 7771573Srgrimes /* classify what we've got */ 7781573Srgrimes switch ((MORE()) ? PEEK() : '\0') { 7791573Srgrimes case '[': 7801573Srgrimes c = (MORE2()) ? PEEK2() : '\0'; 7811573Srgrimes break; 7821573Srgrimes case '-': 7831573Srgrimes SETERROR(REG_ERANGE); 7841573Srgrimes return; /* NOTE RETURN */ 7851573Srgrimes default: 7861573Srgrimes c = '\0'; 7871573Srgrimes break; 7881573Srgrimes } 7891573Srgrimes 7901573Srgrimes switch (c) { 7911573Srgrimes case ':': /* character class */ 7921573Srgrimes NEXT2(); 79317141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7941573Srgrimes c = PEEK(); 79517141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 7961573Srgrimes p_b_cclass(p, cs); 79717141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 79817141Sjkh (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 7991573Srgrimes break; 8001573Srgrimes case '=': /* equivalence class */ 8011573Srgrimes NEXT2(); 80217141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 8031573Srgrimes c = PEEK(); 80417141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 8051573Srgrimes p_b_eclass(p, cs); 80617141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 80717141Sjkh (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 8081573Srgrimes break; 8091573Srgrimes default: /* symbol, ordinary character, or range */ 8101573Srgrimes start = p_b_symbol(p); 8111573Srgrimes if (SEE('-') && MORE2() && PEEK2() != ']') { 8121573Srgrimes /* range */ 8131573Srgrimes NEXT(); 8141573Srgrimes if (EAT('-')) 8151573Srgrimes finish = '-'; 8161573Srgrimes else 8171573Srgrimes finish = p_b_symbol(p); 8181573Srgrimes } else 8191573Srgrimes finish = start; 82017514Sache if (start == finish) 821132019Stjr CHadd(p, cs, start); 82217514Sache else { 823303185Sache if (table->__collate_load_error || MB_CUR_MAX > 1) { 824303185Sache (void)REQUIRE(start <= finish, REG_ERANGE); 825132019Stjr CHaddrange(p, cs, start, finish); 82624637Sache } else { 827303185Sache (void)REQUIRE(__wcollate_range_cmp(start, finish) <= 0, REG_ERANGE); 828132019Stjr for (i = 0; i <= UCHAR_MAX; i++) { 829303185Sache if ( __wcollate_range_cmp(start, i) <= 0 830303185Sache && __wcollate_range_cmp(i, finish) <= 0 83124637Sache ) 832132019Stjr CHadd(p, cs, i); 83324637Sache } 83417514Sache } 83517514Sache } 8361573Srgrimes break; 8371573Srgrimes } 8381573Srgrimes} 8391573Srgrimes 8401573Srgrimes/* 8411573Srgrimes - p_b_cclass - parse a character-class name and deal with it 84292889Sobrien == static void p_b_cclass(struct parse *p, cset *cs); 8431573Srgrimes */ 8441573Srgrimesstatic void 845170528Sdelphijp_b_cclass(struct parse *p, cset *cs) 8461573Srgrimes{ 84792889Sobrien char *sp = p->next; 84892889Sobrien size_t len; 849132019Stjr wctype_t wct; 850132019Stjr char clname[16]; 8511573Srgrimes 85217508Sache while (MORE() && isalpha((uch)PEEK())) 8531573Srgrimes NEXT(); 8541573Srgrimes len = p->next - sp; 855132019Stjr if (len >= sizeof(clname) - 1) { 8561573Srgrimes SETERROR(REG_ECTYPE); 8571573Srgrimes return; 8581573Srgrimes } 859132019Stjr memcpy(clname, sp, len); 860132019Stjr clname[len] = '\0'; 861132019Stjr if ((wct = wctype(clname)) == 0) { 862132019Stjr SETERROR(REG_ECTYPE); 863132019Stjr return; 86417508Sache } 865132019Stjr CHaddtype(p, cs, wct); 8661573Srgrimes} 8671573Srgrimes 8681573Srgrimes/* 8691573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it 87092889Sobrien == static void p_b_eclass(struct parse *p, cset *cs); 8711573Srgrimes * 8721573Srgrimes * This implementation is incomplete. xxx 8731573Srgrimes */ 8741573Srgrimesstatic void 875170528Sdelphijp_b_eclass(struct parse *p, cset *cs) 8761573Srgrimes{ 877132019Stjr wint_t c; 8781573Srgrimes 8791573Srgrimes c = p_b_coll_elem(p, '='); 880132019Stjr CHadd(p, cs, c); 8811573Srgrimes} 8821573Srgrimes 8831573Srgrimes/* 8841573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 885227414Skevlo == static wint_t p_b_symbol(struct parse *p); 8861573Srgrimes */ 887132019Stjrstatic wint_t /* value of symbol */ 888170528Sdelphijp_b_symbol(struct parse *p) 8891573Srgrimes{ 890132019Stjr wint_t value; 8911573Srgrimes 89217141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 8931573Srgrimes if (!EATTWO('[', '.')) 894132019Stjr return(WGETNEXT()); 8951573Srgrimes 8961573Srgrimes /* collating symbol */ 8971573Srgrimes value = p_b_coll_elem(p, '.'); 89817141Sjkh (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 8991573Srgrimes return(value); 9001573Srgrimes} 9011573Srgrimes 9021573Srgrimes/* 9031573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up 904227414Skevlo == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 9051573Srgrimes */ 906132019Stjrstatic wint_t /* value of collating element */ 907170528Sdelphijp_b_coll_elem(struct parse *p, 908170528Sdelphij wint_t endc) /* name ended by endc,']' */ 9091573Srgrimes{ 91092889Sobrien char *sp = p->next; 91192889Sobrien struct cname *cp; 91292889Sobrien int len; 913132019Stjr mbstate_t mbs; 914132019Stjr wchar_t wc; 915132019Stjr size_t clen; 9161573Srgrimes 9171573Srgrimes while (MORE() && !SEETWO(endc, ']')) 9181573Srgrimes NEXT(); 9191573Srgrimes if (!MORE()) { 9201573Srgrimes SETERROR(REG_EBRACK); 9211573Srgrimes return(0); 9221573Srgrimes } 9231573Srgrimes len = p->next - sp; 9241573Srgrimes for (cp = cnames; cp->name != NULL; cp++) 925325394Spfg if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 9261573Srgrimes return(cp->code); /* known name */ 927132019Stjr memset(&mbs, 0, sizeof(mbs)); 928132019Stjr if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 929132019Stjr return (wc); /* single character */ 930132019Stjr else if (clen == (size_t)-1 || clen == (size_t)-2) 931132019Stjr SETERROR(REG_ILLSEQ); 932132019Stjr else 933132019Stjr SETERROR(REG_ECOLLATE); /* neither */ 9341573Srgrimes return(0); 9351573Srgrimes} 9361573Srgrimes 9371573Srgrimes/* 9381573Srgrimes - othercase - return the case counterpart of an alphabetic 939227414Skevlo == static wint_t othercase(wint_t ch); 9401573Srgrimes */ 941132019Stjrstatic wint_t /* if no counterpart, return ch */ 942170528Sdelphijothercase(wint_t ch) 9431573Srgrimes{ 944132019Stjr assert(iswalpha(ch)); 945132019Stjr if (iswupper(ch)) 946132019Stjr return(towlower(ch)); 947132019Stjr else if (iswlower(ch)) 948132019Stjr return(towupper(ch)); 9491573Srgrimes else /* peculiar, but could happen */ 9501573Srgrimes return(ch); 9511573Srgrimes} 9521573Srgrimes 9531573Srgrimes/* 9541573Srgrimes - bothcases - emit a dualcase version of a two-case character 955227414Skevlo == static void bothcases(struct parse *p, wint_t ch); 9561573Srgrimes * 9571573Srgrimes * Boy, is this implementation ever a kludge... 9581573Srgrimes */ 9591573Srgrimesstatic void 960170528Sdelphijbothcases(struct parse *p, wint_t ch) 9611573Srgrimes{ 96292889Sobrien char *oldnext = p->next; 96392889Sobrien char *oldend = p->end; 964132019Stjr char bracket[3 + MB_LEN_MAX]; 965132019Stjr size_t n; 966132019Stjr mbstate_t mbs; 9671573Srgrimes 9681573Srgrimes assert(othercase(ch) != ch); /* p_bracket() would recurse */ 9691573Srgrimes p->next = bracket; 970132019Stjr memset(&mbs, 0, sizeof(mbs)); 971132019Stjr n = wcrtomb(bracket, ch, &mbs); 972132019Stjr assert(n != (size_t)-1); 973132019Stjr bracket[n] = ']'; 974132019Stjr bracket[n + 1] = '\0'; 975132019Stjr p->end = bracket+n+1; 9761573Srgrimes p_bracket(p); 977132019Stjr assert(p->next == p->end); 9781573Srgrimes p->next = oldnext; 9791573Srgrimes p->end = oldend; 9801573Srgrimes} 9811573Srgrimes 9821573Srgrimes/* 9831573Srgrimes - ordinary - emit an ordinary character 984227414Skevlo == static void ordinary(struct parse *p, wint_t ch); 9851573Srgrimes */ 9861573Srgrimesstatic void 987170528Sdelphijordinary(struct parse *p, wint_t ch) 9881573Srgrimes{ 989132019Stjr cset *cs; 9901573Srgrimes 991132019Stjr if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 9921573Srgrimes bothcases(p, ch); 993132019Stjr else if ((ch & OPDMASK) == ch) 994132019Stjr EMIT(OCHAR, ch); 995132019Stjr else { 996132019Stjr /* 997132019Stjr * Kludge: character is too big to fit into an OCHAR operand. 998132019Stjr * Emit a singleton set. 999132019Stjr */ 1000132019Stjr if ((cs = allocset(p)) == NULL) 1001132019Stjr return; 1002132019Stjr CHadd(p, cs, ch); 1003132019Stjr EMIT(OANYOF, (int)(cs - p->g->sets)); 1004132019Stjr } 10051573Srgrimes} 10061573Srgrimes 10071573Srgrimes/* 10081573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY 100992889Sobrien == static void nonnewline(struct parse *p); 10101573Srgrimes * 10111573Srgrimes * Boy, is this implementation ever a kludge... 10121573Srgrimes */ 10131573Srgrimesstatic void 1014170528Sdelphijnonnewline(struct parse *p) 10151573Srgrimes{ 101692889Sobrien char *oldnext = p->next; 101792889Sobrien char *oldend = p->end; 10181573Srgrimes char bracket[4]; 10191573Srgrimes 10201573Srgrimes p->next = bracket; 10211573Srgrimes p->end = bracket+3; 10221573Srgrimes bracket[0] = '^'; 10231573Srgrimes bracket[1] = '\n'; 10241573Srgrimes bracket[2] = ']'; 10251573Srgrimes bracket[3] = '\0'; 10261573Srgrimes p_bracket(p); 10271573Srgrimes assert(p->next == bracket+3); 10281573Srgrimes p->next = oldnext; 10291573Srgrimes p->end = oldend; 10301573Srgrimes} 10311573Srgrimes 10321573Srgrimes/* 10331573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed 103492889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to); 10351573Srgrimes */ 10361573Srgrimesstatic void 1037170528Sdelphijrepeat(struct parse *p, 1038170528Sdelphij sopno start, /* operand from here to end of strip */ 1039170528Sdelphij int from, /* repeated from this number */ 1040170528Sdelphij int to) /* to this number of times (maybe INFINITY) */ 10411573Srgrimes{ 104292889Sobrien sopno finish = HERE(); 10431573Srgrimes# define N 2 10441573Srgrimes# define INF 3 10451573Srgrimes# define REP(f, t) ((f)*8 + (t)) 10461573Srgrimes# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 104792889Sobrien sopno copy; 10481573Srgrimes 10491573Srgrimes if (p->error != 0) /* head off possible runaway recursion */ 10501573Srgrimes return; 10511573Srgrimes 10521573Srgrimes assert(from <= to); 10531573Srgrimes 10541573Srgrimes switch (REP(MAP(from), MAP(to))) { 10551573Srgrimes case REP(0, 0): /* must be user doing this */ 10561573Srgrimes DROP(finish-start); /* drop the operand */ 10571573Srgrimes break; 10581573Srgrimes case REP(0, 1): /* as x{1,1}? */ 10591573Srgrimes case REP(0, N): /* as x{1,n}? */ 10601573Srgrimes case REP(0, INF): /* as x{1,}? */ 10611573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10621573Srgrimes INSERT(OCH_, start); /* offset is wrong... */ 10631573Srgrimes repeat(p, start+1, 1, to); 10641573Srgrimes ASTERN(OOR1, start); 10651573Srgrimes AHEAD(start); /* ... fix it */ 10661573Srgrimes EMIT(OOR2, 0); 10671573Srgrimes AHEAD(THERE()); 10681573Srgrimes ASTERN(O_CH, THERETHERE()); 10691573Srgrimes break; 10701573Srgrimes case REP(1, 1): /* trivial case */ 10711573Srgrimes /* done */ 10721573Srgrimes break; 10731573Srgrimes case REP(1, N): /* as x?x{1,n-1} */ 10741573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10751573Srgrimes INSERT(OCH_, start); 10761573Srgrimes ASTERN(OOR1, start); 10771573Srgrimes AHEAD(start); 10781573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 10791573Srgrimes AHEAD(THERE()); /* ...so fix it */ 10801573Srgrimes ASTERN(O_CH, THERETHERE()); 10811573Srgrimes copy = dupl(p, start+1, finish+1); 10821573Srgrimes assert(copy == finish+4); 10831573Srgrimes repeat(p, copy, 1, to-1); 10841573Srgrimes break; 10851573Srgrimes case REP(1, INF): /* as x+ */ 10861573Srgrimes INSERT(OPLUS_, start); 10871573Srgrimes ASTERN(O_PLUS, start); 10881573Srgrimes break; 10891573Srgrimes case REP(N, N): /* as xx{m-1,n-1} */ 10901573Srgrimes copy = dupl(p, start, finish); 10911573Srgrimes repeat(p, copy, from-1, to-1); 10921573Srgrimes break; 10931573Srgrimes case REP(N, INF): /* as xx{n-1,INF} */ 10941573Srgrimes copy = dupl(p, start, finish); 10951573Srgrimes repeat(p, copy, from-1, to); 10961573Srgrimes break; 10971573Srgrimes default: /* "can't happen" */ 10981573Srgrimes SETERROR(REG_ASSERT); /* just in case */ 10991573Srgrimes break; 11001573Srgrimes } 11011573Srgrimes} 11021573Srgrimes 11031573Srgrimes/* 1104132019Stjr - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1105132019Stjr - character from the parse struct, signals a REG_ILLSEQ error if the 1106132019Stjr - character can't be converted. Returns the number of bytes consumed. 1107132019Stjr */ 1108132019Stjrstatic wint_t 1109170528Sdelphijwgetnext(struct parse *p) 1110132019Stjr{ 1111132019Stjr mbstate_t mbs; 1112132019Stjr wchar_t wc; 1113132019Stjr size_t n; 1114132019Stjr 1115132019Stjr memset(&mbs, 0, sizeof(mbs)); 1116132019Stjr n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1117132019Stjr if (n == (size_t)-1 || n == (size_t)-2) { 1118132019Stjr SETERROR(REG_ILLSEQ); 1119132019Stjr return (0); 1120132019Stjr } 1121132019Stjr if (n == 0) 1122132019Stjr n = 1; 1123132019Stjr p->next += n; 1124132019Stjr return (wc); 1125132019Stjr} 1126132019Stjr 1127132019Stjr/* 11281573Srgrimes - seterr - set an error condition 112992889Sobrien == static int seterr(struct parse *p, int e); 11301573Srgrimes */ 11311573Srgrimesstatic int /* useless but makes type checking happy */ 1132170528Sdelphijseterr(struct parse *p, int e) 11331573Srgrimes{ 11341573Srgrimes if (p->error == 0) /* keep earliest error condition */ 11351573Srgrimes p->error = e; 11361573Srgrimes p->next = nuls; /* try to bring things to a halt */ 11371573Srgrimes p->end = nuls; 11381573Srgrimes return(0); /* make the return value well-defined */ 11391573Srgrimes} 11401573Srgrimes 11411573Srgrimes/* 11421573Srgrimes - allocset - allocate a set of characters for [] 114392889Sobrien == static cset *allocset(struct parse *p); 11441573Srgrimes */ 11451573Srgrimesstatic cset * 1146170528Sdelphijallocset(struct parse *p) 11471573Srgrimes{ 1148132019Stjr cset *cs, *ncs; 11491573Srgrimes 1150132019Stjr ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); 1151132019Stjr if (ncs == NULL) { 1152132019Stjr SETERROR(REG_ESPACE); 1153132019Stjr return (NULL); 11541573Srgrimes } 1155132019Stjr p->g->sets = ncs; 1156132019Stjr cs = &p->g->sets[p->g->ncsets++]; 1157132019Stjr memset(cs, 0, sizeof(*cs)); 11581573Srgrimes 11591573Srgrimes return(cs); 11601573Srgrimes} 11611573Srgrimes 11621573Srgrimes/* 11631573Srgrimes - freeset - free a now-unused set 116492889Sobrien == static void freeset(struct parse *p, cset *cs); 11651573Srgrimes */ 11661573Srgrimesstatic void 1167170528Sdelphijfreeset(struct parse *p, cset *cs) 11681573Srgrimes{ 116992889Sobrien cset *top = &p->g->sets[p->g->ncsets]; 11701573Srgrimes 1171132019Stjr free(cs->wides); 1172132019Stjr free(cs->ranges); 1173132019Stjr free(cs->types); 1174132019Stjr memset(cs, 0, sizeof(*cs)); 11751573Srgrimes if (cs == top-1) /* recover only the easy case */ 11761573Srgrimes p->g->ncsets--; 11771573Srgrimes} 11781573Srgrimes 11791573Srgrimes/* 1180132019Stjr - singleton - Determine whether a set contains only one character, 1181132019Stjr - returning it if so, otherwise returning OUT. 11821573Srgrimes */ 1183132019Stjrstatic wint_t 1184170528Sdelphijsingleton(cset *cs) 11851573Srgrimes{ 1186132019Stjr wint_t i, s, n; 11871573Srgrimes 1188132019Stjr for (i = n = 0; i < NC; i++) 1189132019Stjr if (CHIN(cs, i)) { 1190132019Stjr n++; 1191132019Stjr s = i; 11921573Srgrimes } 1193132019Stjr if (n == 1) 1194132019Stjr return (s); 1195132019Stjr if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1196132019Stjr cs->icase == 0) 1197132019Stjr return (cs->wides[0]); 1198132019Stjr /* Don't bother handling the other cases. */ 1199132019Stjr return (OUT); 1200132019Stjr} 12011573Srgrimes 1202132019Stjr/* 1203132019Stjr - CHadd - add character to character set. 1204132019Stjr */ 1205132019Stjrstatic void 1206170528SdelphijCHadd(struct parse *p, cset *cs, wint_t ch) 1207132019Stjr{ 1208134802Stjr wint_t nch, *newwides; 1209132019Stjr assert(ch >= 0); 1210134802Stjr if (ch < NC) 1211132019Stjr cs->bmp[ch >> 3] |= 1 << (ch & 7); 1212134802Stjr else { 1213132019Stjr newwides = realloc(cs->wides, (cs->nwides + 1) * 1214132019Stjr sizeof(*cs->wides)); 1215132019Stjr if (newwides == NULL) { 1216132019Stjr SETERROR(REG_ESPACE); 1217132019Stjr return; 1218132019Stjr } 1219132019Stjr cs->wides = newwides; 1220132019Stjr cs->wides[cs->nwides++] = ch; 12211573Srgrimes } 1222134802Stjr if (cs->icase) { 1223134802Stjr if ((nch = towlower(ch)) < NC) 1224134802Stjr cs->bmp[nch >> 3] |= 1 << (nch & 7); 1225134802Stjr if ((nch = towupper(ch)) < NC) 1226134802Stjr cs->bmp[nch >> 3] |= 1 << (nch & 7); 1227134802Stjr } 12281573Srgrimes} 12291573Srgrimes 12301573Srgrimes/* 1231132019Stjr - CHaddrange - add all characters in the range [min,max] to a character set. 12321573Srgrimes */ 1233132019Stjrstatic void 1234170528SdelphijCHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 12351573Srgrimes{ 1236132019Stjr crange *newranges; 12371573Srgrimes 1238132019Stjr for (; min < NC && min <= max; min++) 1239132019Stjr CHadd(p, cs, min); 1240132019Stjr if (min >= max) 1241132019Stjr return; 1242132019Stjr newranges = realloc(cs->ranges, (cs->nranges + 1) * 1243132019Stjr sizeof(*cs->ranges)); 1244132019Stjr if (newranges == NULL) { 1245132019Stjr SETERROR(REG_ESPACE); 1246132019Stjr return; 1247132019Stjr } 1248132019Stjr cs->ranges = newranges; 1249132019Stjr cs->ranges[cs->nranges].min = min; 1250247596Sdelphij cs->ranges[cs->nranges].max = max; 1251132019Stjr cs->nranges++; 12521573Srgrimes} 12531573Srgrimes 12541573Srgrimes/* 1255132019Stjr - CHaddtype - add all characters of a certain type to a character set. 12561573Srgrimes */ 1257132019Stjrstatic void 1258170528SdelphijCHaddtype(struct parse *p, cset *cs, wctype_t wct) 12591573Srgrimes{ 1260132019Stjr wint_t i; 1261132019Stjr wctype_t *newtypes; 12621573Srgrimes 1263134802Stjr for (i = 0; i < NC; i++) 1264132019Stjr if (iswctype(i, wct)) 1265132019Stjr CHadd(p, cs, i); 1266132019Stjr newtypes = realloc(cs->types, (cs->ntypes + 1) * 1267132019Stjr sizeof(*cs->types)); 1268132019Stjr if (newtypes == NULL) { 1269132019Stjr SETERROR(REG_ESPACE); 1270132019Stjr return; 1271132019Stjr } 1272132019Stjr cs->types = newtypes; 1273132019Stjr cs->types[cs->ntypes++] = wct; 12741573Srgrimes} 12751573Srgrimes 12761573Srgrimes/* 12771573Srgrimes - dupl - emit a duplicate of a bunch of sops 127892889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish); 12791573Srgrimes */ 12801573Srgrimesstatic sopno /* start of duplicate */ 1281170528Sdelphijdupl(struct parse *p, 1282170528Sdelphij sopno start, /* from here */ 1283170528Sdelphij sopno finish) /* to this less one */ 12841573Srgrimes{ 128592889Sobrien sopno ret = HERE(); 128692889Sobrien sopno len = finish - start; 12871573Srgrimes 12881573Srgrimes assert(finish >= start); 12891573Srgrimes if (len == 0) 12901573Srgrimes return(ret); 1291227414Skevlo if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1292227414Skevlo return(ret); 12931573Srgrimes (void) memcpy((char *)(p->strip + p->slen), 12941573Srgrimes (char *)(p->strip + start), (size_t)len*sizeof(sop)); 12951573Srgrimes p->slen += len; 12961573Srgrimes return(ret); 12971573Srgrimes} 12981573Srgrimes 12991573Srgrimes/* 13001573Srgrimes - doemit - emit a strip operator 130192889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd); 13021573Srgrimes * 13031573Srgrimes * It might seem better to implement this as a macro with a function as 13041573Srgrimes * hard-case backup, but it's just too big and messy unless there are 13051573Srgrimes * some changes to the data structures. Maybe later. 13061573Srgrimes */ 13071573Srgrimesstatic void 1308170528Sdelphijdoemit(struct parse *p, sop op, size_t opnd) 13091573Srgrimes{ 13101573Srgrimes /* avoid making error situations worse */ 13111573Srgrimes if (p->error != 0) 13121573Srgrimes return; 13131573Srgrimes 13141573Srgrimes /* deal with oversize operands ("can't happen", more or less) */ 13151573Srgrimes assert(opnd < 1<<OPSHIFT); 13161573Srgrimes 13171573Srgrimes /* deal with undersized strip */ 13181573Srgrimes if (p->slen >= p->ssize) 1319227414Skevlo if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1320227414Skevlo return; 13211573Srgrimes 13221573Srgrimes /* finally, it's all reduced to the easy case */ 13231573Srgrimes p->strip[p->slen++] = SOP(op, opnd); 13241573Srgrimes} 13251573Srgrimes 13261573Srgrimes/* 13271573Srgrimes - doinsert - insert a sop into the strip 132892889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 13291573Srgrimes */ 13301573Srgrimesstatic void 1331170528Sdelphijdoinsert(struct parse *p, sop op, size_t opnd, sopno pos) 13321573Srgrimes{ 133392889Sobrien sopno sn; 133492889Sobrien sop s; 133592889Sobrien int i; 13361573Srgrimes 13371573Srgrimes /* avoid making error situations worse */ 13381573Srgrimes if (p->error != 0) 13391573Srgrimes return; 13401573Srgrimes 13411573Srgrimes sn = HERE(); 13421573Srgrimes EMIT(op, opnd); /* do checks, ensure space */ 13431573Srgrimes assert(HERE() == sn+1); 13441573Srgrimes s = p->strip[sn]; 13451573Srgrimes 13461573Srgrimes /* adjust paren pointers */ 13471573Srgrimes assert(pos > 0); 13481573Srgrimes for (i = 1; i < NPAREN; i++) { 13491573Srgrimes if (p->pbegin[i] >= pos) { 13501573Srgrimes p->pbegin[i]++; 13511573Srgrimes } 13521573Srgrimes if (p->pend[i] >= pos) { 13531573Srgrimes p->pend[i]++; 13541573Srgrimes } 13551573Srgrimes } 13561573Srgrimes 13571573Srgrimes memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 13581573Srgrimes (HERE()-pos-1)*sizeof(sop)); 13591573Srgrimes p->strip[pos] = s; 13601573Srgrimes} 13611573Srgrimes 13621573Srgrimes/* 13631573Srgrimes - dofwd - complete a forward reference 136492889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value); 13651573Srgrimes */ 13661573Srgrimesstatic void 1367170528Sdelphijdofwd(struct parse *p, sopno pos, sop value) 13681573Srgrimes{ 13691573Srgrimes /* avoid making error situations worse */ 13701573Srgrimes if (p->error != 0) 13711573Srgrimes return; 13721573Srgrimes 13731573Srgrimes assert(value < 1<<OPSHIFT); 13741573Srgrimes p->strip[pos] = OP(p->strip[pos]) | value; 13751573Srgrimes} 13761573Srgrimes 13771573Srgrimes/* 13781573Srgrimes - enlarge - enlarge the strip 1379227414Skevlo == static int enlarge(struct parse *p, sopno size); 13801573Srgrimes */ 1381227414Skevlostatic int 1382170528Sdelphijenlarge(struct parse *p, sopno size) 13831573Srgrimes{ 138492889Sobrien sop *sp; 13851573Srgrimes 13861573Srgrimes if (p->ssize >= size) 1387227414Skevlo return 1; 13881573Srgrimes 13891573Srgrimes sp = (sop *)realloc(p->strip, size*sizeof(sop)); 13901573Srgrimes if (sp == NULL) { 13911573Srgrimes SETERROR(REG_ESPACE); 1392227414Skevlo return 0; 13931573Srgrimes } 13941573Srgrimes p->strip = sp; 13951573Srgrimes p->ssize = size; 1396227414Skevlo return 1; 13971573Srgrimes} 13981573Srgrimes 13991573Srgrimes/* 14001573Srgrimes - stripsnug - compact the strip 140192889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g); 14021573Srgrimes */ 14031573Srgrimesstatic void 1404170528Sdelphijstripsnug(struct parse *p, struct re_guts *g) 14051573Srgrimes{ 14061573Srgrimes g->nstates = p->slen; 14071573Srgrimes g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 14081573Srgrimes if (g->strip == NULL) { 14091573Srgrimes SETERROR(REG_ESPACE); 14101573Srgrimes g->strip = p->strip; 14111573Srgrimes } 14121573Srgrimes} 14131573Srgrimes 14141573Srgrimes/* 14151573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string 141692889Sobrien == static void findmust(struct parse *p, struct re_guts *g); 14171573Srgrimes * 14181573Srgrimes * This algorithm could do fancy things like analyzing the operands of | 14191573Srgrimes * for common subsequences. Someday. This code is simple and finds most 14201573Srgrimes * of the interesting cases. 14211573Srgrimes * 14221573Srgrimes * Note that must and mlen got initialized during setup. 14231573Srgrimes */ 14241573Srgrimesstatic void 1425170528Sdelphijfindmust(struct parse *p, struct re_guts *g) 14261573Srgrimes{ 142792889Sobrien sop *scan; 14281573Srgrimes sop *start; 142992889Sobrien sop *newstart; 143092889Sobrien sopno newlen; 143192889Sobrien sop s; 143292889Sobrien char *cp; 143362391Sdcs int offset; 1434132019Stjr char buf[MB_LEN_MAX]; 1435132019Stjr size_t clen; 1436132019Stjr mbstate_t mbs; 14371573Srgrimes 14381573Srgrimes /* avoid making error situations worse */ 14391573Srgrimes if (p->error != 0) 14401573Srgrimes return; 14411573Srgrimes 1442132019Stjr /* 1443132019Stjr * It's not generally safe to do a ``char'' substring search on 1444132019Stjr * multibyte character strings, but it's safe for at least 1445132019Stjr * UTF-8 (see RFC 3629). 1446132019Stjr */ 1447132019Stjr if (MB_CUR_MAX > 1 && 1448132019Stjr strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1449132019Stjr return; 1450132019Stjr 14511573Srgrimes /* find the longest OCHAR sequence in strip */ 14521573Srgrimes newlen = 0; 145362391Sdcs offset = 0; 145462391Sdcs g->moffset = 0; 14551573Srgrimes scan = g->strip + 1; 14561573Srgrimes do { 14571573Srgrimes s = *scan++; 14581573Srgrimes switch (OP(s)) { 14591573Srgrimes case OCHAR: /* sequence member */ 1460132019Stjr if (newlen == 0) { /* new sequence */ 1461132019Stjr memset(&mbs, 0, sizeof(mbs)); 14621573Srgrimes newstart = scan - 1; 1463132019Stjr } 1464132019Stjr clen = wcrtomb(buf, OPND(s), &mbs); 1465132019Stjr if (clen == (size_t)-1) 1466132019Stjr goto toohard; 1467132019Stjr newlen += clen; 14681573Srgrimes break; 14691573Srgrimes case OPLUS_: /* things that don't break one */ 14701573Srgrimes case OLPAREN: 14711573Srgrimes case ORPAREN: 14721573Srgrimes break; 14731573Srgrimes case OQUEST_: /* things that must be skipped */ 14741573Srgrimes case OCH_: 1475131973Stjr offset = altoffset(scan, offset); 14761573Srgrimes scan--; 14771573Srgrimes do { 14781573Srgrimes scan += OPND(s); 14791573Srgrimes s = *scan; 14801573Srgrimes /* assert() interferes w debug printouts */ 14811573Srgrimes if (OP(s) != O_QUEST && OP(s) != O_CH && 14821573Srgrimes OP(s) != OOR2) { 14831573Srgrimes g->iflags |= BAD; 14841573Srgrimes return; 14851573Srgrimes } 14861573Srgrimes } while (OP(s) != O_QUEST && OP(s) != O_CH); 1487102411Scharnier /* FALLTHROUGH */ 148862391Sdcs case OBOW: /* things that break a sequence */ 148962391Sdcs case OEOW: 149062391Sdcs case OBOL: 149162391Sdcs case OEOL: 149262391Sdcs case O_QUEST: 149362391Sdcs case O_CH: 149462391Sdcs case OEND: 14951573Srgrimes if (newlen > g->mlen) { /* ends one */ 14961573Srgrimes start = newstart; 14971573Srgrimes g->mlen = newlen; 149862391Sdcs if (offset > -1) { 149962391Sdcs g->moffset += offset; 150062391Sdcs offset = newlen; 150162391Sdcs } else 150262391Sdcs g->moffset = offset; 150362391Sdcs } else { 150462391Sdcs if (offset > -1) 150562391Sdcs offset += newlen; 15061573Srgrimes } 15071573Srgrimes newlen = 0; 15081573Srgrimes break; 150962391Sdcs case OANY: 151062391Sdcs if (newlen > g->mlen) { /* ends one */ 151162391Sdcs start = newstart; 151262391Sdcs g->mlen = newlen; 151362391Sdcs if (offset > -1) { 151462391Sdcs g->moffset += offset; 151562391Sdcs offset = newlen; 151662391Sdcs } else 151762391Sdcs g->moffset = offset; 151862391Sdcs } else { 151962391Sdcs if (offset > -1) 152062391Sdcs offset += newlen; 152162391Sdcs } 152262391Sdcs if (offset > -1) 152362391Sdcs offset++; 152462391Sdcs newlen = 0; 152562391Sdcs break; 152662391Sdcs case OANYOF: /* may or may not invalidate offset */ 152762391Sdcs /* First, everything as OANY */ 152862391Sdcs if (newlen > g->mlen) { /* ends one */ 152962391Sdcs start = newstart; 153062391Sdcs g->mlen = newlen; 153162391Sdcs if (offset > -1) { 153262391Sdcs g->moffset += offset; 153362391Sdcs offset = newlen; 153462391Sdcs } else 153562391Sdcs g->moffset = offset; 153662391Sdcs } else { 153762391Sdcs if (offset > -1) 153862391Sdcs offset += newlen; 153962391Sdcs } 154062391Sdcs if (offset > -1) 154162391Sdcs offset++; 154262391Sdcs newlen = 0; 154362391Sdcs break; 1544132019Stjr toohard: 154562391Sdcs default: 154662391Sdcs /* Anything here makes it impossible or too hard 154762391Sdcs * to calculate the offset -- so we give up; 154862391Sdcs * save the last known good offset, in case the 154962391Sdcs * must sequence doesn't occur later. 155062391Sdcs */ 155162391Sdcs if (newlen > g->mlen) { /* ends one */ 155262391Sdcs start = newstart; 155362391Sdcs g->mlen = newlen; 155462391Sdcs if (offset > -1) 155562391Sdcs g->moffset += offset; 155662391Sdcs else 155762391Sdcs g->moffset = offset; 155862391Sdcs } 155962391Sdcs offset = -1; 156062391Sdcs newlen = 0; 156162391Sdcs break; 15621573Srgrimes } 15631573Srgrimes } while (OP(s) != OEND); 15641573Srgrimes 156562391Sdcs if (g->mlen == 0) { /* there isn't one */ 156662391Sdcs g->moffset = -1; 15671573Srgrimes return; 156862391Sdcs } 15691573Srgrimes 15701573Srgrimes /* turn it into a character string */ 15711573Srgrimes g->must = malloc((size_t)g->mlen + 1); 15721573Srgrimes if (g->must == NULL) { /* argh; just forget it */ 15731573Srgrimes g->mlen = 0; 157462391Sdcs g->moffset = -1; 15751573Srgrimes return; 15761573Srgrimes } 15771573Srgrimes cp = g->must; 15781573Srgrimes scan = start; 1579132019Stjr memset(&mbs, 0, sizeof(mbs)); 1580132019Stjr while (cp < g->must + g->mlen) { 15811573Srgrimes while (OP(s = *scan++) != OCHAR) 15821573Srgrimes continue; 1583132019Stjr clen = wcrtomb(cp, OPND(s), &mbs); 1584132019Stjr assert(clen != (size_t)-1); 1585132019Stjr cp += clen; 15861573Srgrimes } 15871573Srgrimes assert(cp == g->must + g->mlen); 15881573Srgrimes *cp++ = '\0'; /* just on general principles */ 15891573Srgrimes} 15901573Srgrimes 15911573Srgrimes/* 159262391Sdcs - altoffset - choose biggest offset among multiple choices 1593131973Stjr == static int altoffset(sop *scan, int offset); 159462391Sdcs * 159562391Sdcs * Compute, recursively if necessary, the largest offset among multiple 159662391Sdcs * re paths. 159762391Sdcs */ 159862391Sdcsstatic int 1599170528Sdelphijaltoffset(sop *scan, int offset) 160062391Sdcs{ 160162391Sdcs int largest; 160262391Sdcs int try; 160362391Sdcs sop s; 160462391Sdcs 160562391Sdcs /* If we gave up already on offsets, return */ 160662391Sdcs if (offset == -1) 160762391Sdcs return -1; 160862391Sdcs 160962391Sdcs largest = 0; 161062391Sdcs try = 0; 161162391Sdcs s = *scan++; 161262391Sdcs while (OP(s) != O_QUEST && OP(s) != O_CH) { 161362391Sdcs switch (OP(s)) { 161462391Sdcs case OOR1: 161562391Sdcs if (try > largest) 161662391Sdcs largest = try; 161762391Sdcs try = 0; 161862391Sdcs break; 161962391Sdcs case OQUEST_: 162062391Sdcs case OCH_: 1621131973Stjr try = altoffset(scan, try); 162262391Sdcs if (try == -1) 162362391Sdcs return -1; 162462391Sdcs scan--; 162562391Sdcs do { 162662391Sdcs scan += OPND(s); 162762391Sdcs s = *scan; 162862391Sdcs if (OP(s) != O_QUEST && OP(s) != O_CH && 162962391Sdcs OP(s) != OOR2) 163062391Sdcs return -1; 163162391Sdcs } while (OP(s) != O_QUEST && OP(s) != O_CH); 163262855Sdcs /* We must skip to the next position, or we'll 163362855Sdcs * leave altoffset() too early. 163462855Sdcs */ 163562855Sdcs scan++; 163662391Sdcs break; 163762391Sdcs case OANYOF: 163862391Sdcs case OCHAR: 163962391Sdcs case OANY: 164062391Sdcs try++; 164162391Sdcs case OBOW: 164262391Sdcs case OEOW: 164362391Sdcs case OLPAREN: 164462391Sdcs case ORPAREN: 164562391Sdcs case OOR2: 164662391Sdcs break; 164762391Sdcs default: 164862391Sdcs try = -1; 164962391Sdcs break; 165062391Sdcs } 165162391Sdcs if (try == -1) 165262391Sdcs return -1; 165362391Sdcs s = *scan++; 165462391Sdcs } 165562391Sdcs 165662391Sdcs if (try > largest) 165762391Sdcs largest = try; 165862391Sdcs 165962391Sdcs return largest+offset; 166062391Sdcs} 166162391Sdcs 166262391Sdcs/* 166362232Sdcs - computejumps - compute char jumps for BM scan 166492889Sobrien == static void computejumps(struct parse *p, struct re_guts *g); 166562232Sdcs * 166662232Sdcs * This algorithm assumes g->must exists and is has size greater than 166762232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 166862232Sdcs * Sara Baase. 166962232Sdcs * 167062232Sdcs * A char jump is the number of characters one needs to jump based on 167162232Sdcs * the value of the character from the text that was mismatched. 167262232Sdcs */ 167362232Sdcsstatic void 1674170528Sdelphijcomputejumps(struct parse *p, struct re_guts *g) 167562232Sdcs{ 167662232Sdcs int ch; 167762232Sdcs int mindex; 167862232Sdcs 167962232Sdcs /* Avoid making errors worse */ 168062232Sdcs if (p->error != 0) 168162232Sdcs return; 168262232Sdcs 168362848Sdcs g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 168462232Sdcs if (g->charjump == NULL) /* Not a fatal error */ 168562232Sdcs return; 168662754Sdcs /* Adjust for signed chars, if necessary */ 168762754Sdcs g->charjump = &g->charjump[-(CHAR_MIN)]; 168862232Sdcs 168962232Sdcs /* If the character does not exist in the pattern, the jump 169062232Sdcs * is equal to the number of characters in the pattern. 169162232Sdcs */ 169262754Sdcs for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 169362232Sdcs g->charjump[ch] = g->mlen; 169462232Sdcs 169562232Sdcs /* If the character does exist, compute the jump that would 169662232Sdcs * take us to the last character in the pattern equal to it 169762232Sdcs * (notice that we match right to left, so that last character 169862232Sdcs * is the first one that would be matched). 169962232Sdcs */ 170062232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 1701111010Snectar g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 170262232Sdcs} 170362232Sdcs 170462232Sdcs/* 170562232Sdcs - computematchjumps - compute match jumps for BM scan 170692889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g); 170762232Sdcs * 170862232Sdcs * This algorithm assumes g->must exists and is has size greater than 170962232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 171062232Sdcs * Sara Baase. 171162232Sdcs * 171262232Sdcs * A match jump is the number of characters one needs to advance based 171362232Sdcs * on the already-matched suffix. 171462232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way 171562232Sdcs * the search algorithm works. 171662232Sdcs */ 171762232Sdcsstatic void 1718170528Sdelphijcomputematchjumps(struct parse *p, struct re_guts *g) 171962232Sdcs{ 172062232Sdcs int mindex; /* General "must" iterator */ 172162232Sdcs int suffix; /* Keeps track of matching suffix */ 172262232Sdcs int ssuffix; /* Keeps track of suffixes' suffix */ 172362232Sdcs int* pmatches; /* pmatches[k] points to the next i 172462232Sdcs * such that i+1...mlen is a substring 172562232Sdcs * of k+1...k+mlen-i-1 172662232Sdcs */ 172762232Sdcs 172862232Sdcs /* Avoid making errors worse */ 172962232Sdcs if (p->error != 0) 173062232Sdcs return; 173162232Sdcs 173262848Sdcs pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 173362232Sdcs if (pmatches == NULL) { 173462232Sdcs g->matchjump = NULL; 173562232Sdcs return; 173662232Sdcs } 173762232Sdcs 173862848Sdcs g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 1739276548Sdelphij if (g->matchjump == NULL) { /* Not a fatal error */ 1740276548Sdelphij free(pmatches); 174162232Sdcs return; 1742276548Sdelphij } 174362232Sdcs 174462232Sdcs /* Set maximum possible jump for each character in the pattern */ 174562232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 174662232Sdcs g->matchjump[mindex] = 2*g->mlen - mindex - 1; 174762232Sdcs 174862232Sdcs /* Compute pmatches[] */ 174962232Sdcs for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 175062232Sdcs mindex--, suffix--) { 175162232Sdcs pmatches[mindex] = suffix; 175262232Sdcs 175362232Sdcs /* If a mismatch is found, interrupting the substring, 175462232Sdcs * compute the matchjump for that position. If no 175562232Sdcs * mismatch is found, then a text substring mismatched 175662232Sdcs * against the suffix will also mismatch against the 175762232Sdcs * substring. 175862232Sdcs */ 175962232Sdcs while (suffix < g->mlen 176062232Sdcs && g->must[mindex] != g->must[suffix]) { 176162232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 176262232Sdcs g->mlen - mindex - 1); 176362232Sdcs suffix = pmatches[suffix]; 176462232Sdcs } 176562232Sdcs } 176662232Sdcs 176762232Sdcs /* Compute the matchjump up to the last substring found to jump 176862232Sdcs * to the beginning of the largest must pattern prefix matching 176962232Sdcs * it's own suffix. 177062232Sdcs */ 177162232Sdcs for (mindex = 0; mindex <= suffix; mindex++) 177262232Sdcs g->matchjump[mindex] = MIN(g->matchjump[mindex], 177362232Sdcs g->mlen + suffix - mindex); 177462232Sdcs 177562232Sdcs ssuffix = pmatches[suffix]; 177662391Sdcs while (suffix < g->mlen) { 177762673Sdcs while (suffix <= ssuffix && suffix < g->mlen) { 177862232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 177962232Sdcs g->mlen + ssuffix - suffix); 178062232Sdcs suffix++; 178162232Sdcs } 178286208Sdcs if (suffix < g->mlen) 178386208Sdcs ssuffix = pmatches[ssuffix]; 178462232Sdcs } 178562232Sdcs 178662232Sdcs free(pmatches); 178762232Sdcs} 178862232Sdcs 178962232Sdcs/* 17901573Srgrimes - pluscount - count + nesting 179192889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g); 17921573Srgrimes */ 17931573Srgrimesstatic sopno /* nesting depth */ 1794170528Sdelphijpluscount(struct parse *p, struct re_guts *g) 17951573Srgrimes{ 179692889Sobrien sop *scan; 179792889Sobrien sop s; 179892889Sobrien sopno plusnest = 0; 179992889Sobrien sopno maxnest = 0; 18001573Srgrimes 18011573Srgrimes if (p->error != 0) 18021573Srgrimes return(0); /* there may not be an OEND */ 18031573Srgrimes 18041573Srgrimes scan = g->strip + 1; 18051573Srgrimes do { 18061573Srgrimes s = *scan++; 18071573Srgrimes switch (OP(s)) { 18081573Srgrimes case OPLUS_: 18091573Srgrimes plusnest++; 18101573Srgrimes break; 18111573Srgrimes case O_PLUS: 18121573Srgrimes if (plusnest > maxnest) 18131573Srgrimes maxnest = plusnest; 18141573Srgrimes plusnest--; 18151573Srgrimes break; 18161573Srgrimes } 18171573Srgrimes } while (OP(s) != OEND); 18181573Srgrimes if (plusnest != 0) 18191573Srgrimes g->iflags |= BAD; 18201573Srgrimes return(maxnest); 18211573Srgrimes} 1822