11573Srgrimes/*-
21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer.
31573Srgrimes * Copyright (c) 1992, 1993, 1994
41573Srgrimes *	The Regents of the University of California.  All rights reserved.
51573Srgrimes *
6227753Stheraven * Copyright (c) 2011 The FreeBSD Foundation
7227753Stheraven * All rights reserved.
8227753Stheraven * Portions of this software were developed by David Chisnall
9227753Stheraven * under sponsorship from the FreeBSD Foundation.
10227753Stheraven *
111573Srgrimes * This code is derived from software contributed to Berkeley by
121573Srgrimes * Henry Spencer.
131573Srgrimes *
141573Srgrimes * Redistribution and use in source and binary forms, with or without
151573Srgrimes * modification, are permitted provided that the following conditions
161573Srgrimes * are met:
171573Srgrimes * 1. Redistributions of source code must retain the above copyright
181573Srgrimes *    notice, this list of conditions and the following disclaimer.
191573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
201573Srgrimes *    notice, this list of conditions and the following disclaimer in the
211573Srgrimes *    documentation and/or other materials provided with the distribution.
221573Srgrimes * 4. Neither the name of the University nor the names of its contributors
231573Srgrimes *    may be used to endorse or promote products derived from this software
241573Srgrimes *    without specific prior written permission.
251573Srgrimes *
261573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
271573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
281573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
291573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
301573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
311573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
321573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
331573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
341573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
351573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
361573Srgrimes * SUCH DAMAGE.
371573Srgrimes *
381573Srgrimes *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
391573Srgrimes */
401573Srgrimes
411573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
421573Srgrimesstatic char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
431573Srgrimes#endif /* LIBC_SCCS and not lint */
4492986Sobrien#include <sys/cdefs.h>
4592986Sobrien__FBSDID("$FreeBSD: stable/10/lib/libc/regex/regcomp.c 325394 2017-11-04 14:45:36Z pfg $");
461573Srgrimes
471573Srgrimes#include <sys/types.h>
481573Srgrimes#include <stdio.h>
491573Srgrimes#include <string.h>
501573Srgrimes#include <ctype.h>
511573Srgrimes#include <limits.h>
521573Srgrimes#include <stdlib.h>
531573Srgrimes#include <regex.h>
54132019Stjr#include <wchar.h>
55132019Stjr#include <wctype.h>
561573Srgrimes
5719277Sache#include "collate.h"
5819277Sache
591573Srgrimes#include "utils.h"
601573Srgrimes#include "regex2.h"
611573Srgrimes
621573Srgrimes#include "cname.h"
631573Srgrimes
641573Srgrimes/*
651573Srgrimes * parse structure, passed up and down to avoid global variables and
661573Srgrimes * other clumsinesses
671573Srgrimes */
681573Srgrimesstruct parse {
691573Srgrimes	char *next;		/* next character in RE */
701573Srgrimes	char *end;		/* end of string (-> NUL normally) */
711573Srgrimes	int error;		/* has an error been seen? */
721573Srgrimes	sop *strip;		/* malloced strip */
731573Srgrimes	sopno ssize;		/* malloced strip size (allocated) */
741573Srgrimes	sopno slen;		/* malloced strip length (used) */
751573Srgrimes	int ncsalloc;		/* number of csets allocated */
761573Srgrimes	struct re_guts *g;
771573Srgrimes#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
781573Srgrimes	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
791573Srgrimes	sopno pend[NPAREN];	/* -> ) ([0] unused) */
801573Srgrimes};
811573Srgrimes
821573Srgrimes/* ========= begin header generated by ./mkh ========= */
831573Srgrimes#ifdef __cplusplus
841573Srgrimesextern "C" {
851573Srgrimes#endif
861573Srgrimes
871573Srgrimes/* === regcomp.c === */
88227435Skevlostatic void p_ere(struct parse *p, int stop);
8992905Sobrienstatic void p_ere_exp(struct parse *p);
9092905Sobrienstatic void p_str(struct parse *p);
91227435Skevlostatic void p_bre(struct parse *p, int end1, int end2);
9292905Sobrienstatic int p_simp_re(struct parse *p, int starordinary);
9392905Sobrienstatic int p_count(struct parse *p);
9492905Sobrienstatic void p_bracket(struct parse *p);
9592905Sobrienstatic void p_b_term(struct parse *p, cset *cs);
9692905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs);
9792905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs);
98132019Stjrstatic wint_t p_b_symbol(struct parse *p);
99132019Stjrstatic wint_t p_b_coll_elem(struct parse *p, wint_t endc);
100132019Stjrstatic wint_t othercase(wint_t ch);
101132019Stjrstatic void bothcases(struct parse *p, wint_t ch);
102132019Stjrstatic void ordinary(struct parse *p, wint_t ch);
10392905Sobrienstatic void nonnewline(struct parse *p);
10492905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to);
10592905Sobrienstatic int seterr(struct parse *p, int e);
10692905Sobrienstatic cset *allocset(struct parse *p);
10792905Sobrienstatic void freeset(struct parse *p, cset *cs);
108132019Stjrstatic void CHadd(struct parse *p, cset *cs, wint_t ch);
109132019Stjrstatic void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
110132019Stjrstatic void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
111132019Stjrstatic wint_t singleton(cset *cs);
11292905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish);
11392905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd);
11492905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
11592905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value);
116227414Skevlostatic int enlarge(struct parse *p, sopno size);
11792905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g);
11892905Sobrienstatic void findmust(struct parse *p, struct re_guts *g);
119131973Stjrstatic int altoffset(sop *scan, int offset);
12092905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g);
12192905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g);
12292905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g);
123132019Stjrstatic wint_t wgetnext(struct parse *p);
1241573Srgrimes
1251573Srgrimes#ifdef __cplusplus
1261573Srgrimes}
1271573Srgrimes#endif
1281573Srgrimes/* ========= end header generated by ./mkh ========= */
1291573Srgrimes
1301573Srgrimesstatic char nuls[10];		/* place to point scanner in event of error */
1311573Srgrimes
1321573Srgrimes/*
1331573Srgrimes * macros for use with parse structure
1341573Srgrimes * BEWARE:  these know that the parse structure is named `p' !!!
1351573Srgrimes */
1361573Srgrimes#define	PEEK()	(*p->next)
1371573Srgrimes#define	PEEK2()	(*(p->next+1))
1381573Srgrimes#define	MORE()	(p->next < p->end)
1391573Srgrimes#define	MORE2()	(p->next+1 < p->end)
1401573Srgrimes#define	SEE(c)	(MORE() && PEEK() == (c))
1411573Srgrimes#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
1421573Srgrimes#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
1431573Srgrimes#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
1441573Srgrimes#define	NEXT()	(p->next++)
1451573Srgrimes#define	NEXT2()	(p->next += 2)
1461573Srgrimes#define	NEXTn(n)	(p->next += (n))
1471573Srgrimes#define	GETNEXT()	(*p->next++)
148132019Stjr#define	WGETNEXT()	wgetnext(p)
1491573Srgrimes#define	SETERROR(e)	seterr(p, (e))
1501573Srgrimes#define	REQUIRE(co, e)	((co) || SETERROR(e))
1511573Srgrimes#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
1521573Srgrimes#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
1531573Srgrimes#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
1541573Srgrimes#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
1551573Srgrimes#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
1561573Srgrimes#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
1571573Srgrimes#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
1581573Srgrimes#define	HERE()		(p->slen)
1591573Srgrimes#define	THERE()		(p->slen - 1)
1601573Srgrimes#define	THERETHERE()	(p->slen - 2)
1611573Srgrimes#define	DROP(n)	(p->slen -= (n))
1621573Srgrimes
1631573Srgrimes#ifndef NDEBUG
1641573Srgrimesstatic int never = 0;		/* for use in asserts; shuts lint up */
1651573Srgrimes#else
1661573Srgrimes#define	never	0		/* some <assert.h>s have bugs too */
1671573Srgrimes#endif
1681573Srgrimes
16962232Sdcs/* Macro used by computejump()/computematchjump() */
17062232Sdcs#define MIN(a,b)	((a)<(b)?(a):(b))
17162232Sdcs
1721573Srgrimes/*
1731573Srgrimes - regcomp - interface for parser and compilation
1741573Srgrimes = extern int regcomp(regex_t *, const char *, int);
1751573Srgrimes = #define	REG_BASIC	0000
1761573Srgrimes = #define	REG_EXTENDED	0001
1771573Srgrimes = #define	REG_ICASE	0002
1781573Srgrimes = #define	REG_NOSUB	0004
1791573Srgrimes = #define	REG_NEWLINE	0010
1801573Srgrimes = #define	REG_NOSPEC	0020
1811573Srgrimes = #define	REG_PEND	0040
1821573Srgrimes = #define	REG_DUMP	0200
1831573Srgrimes */
1841573Srgrimesint				/* 0 success, otherwise REG_something */
185170528Sdelphijregcomp(regex_t * __restrict preg,
186170528Sdelphij	const char * __restrict pattern,
187170528Sdelphij	int cflags)
1881573Srgrimes{
1891573Srgrimes	struct parse pa;
19092889Sobrien	struct re_guts *g;
19192889Sobrien	struct parse *p = &pa;
19292889Sobrien	int i;
19392889Sobrien	size_t len;
194278910Sdelphij	size_t maxlen;
1951573Srgrimes#ifdef REDEBUG
1961573Srgrimes#	define	GOODFLAGS(f)	(f)
1971573Srgrimes#else
1981573Srgrimes#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
1991573Srgrimes#endif
2001573Srgrimes
2011573Srgrimes	cflags = GOODFLAGS(cflags);
2021573Srgrimes	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
2031573Srgrimes		return(REG_INVARG);
2041573Srgrimes
2051573Srgrimes	if (cflags&REG_PEND) {
2061573Srgrimes		if (preg->re_endp < pattern)
2071573Srgrimes			return(REG_INVARG);
2081573Srgrimes		len = preg->re_endp - pattern;
2091573Srgrimes	} else
2101573Srgrimes		len = strlen((char *)pattern);
2111573Srgrimes
2121573Srgrimes	/* do the mallocs early so failure handling is easy */
213131973Stjr	g = (struct re_guts *)malloc(sizeof(struct re_guts));
2141573Srgrimes	if (g == NULL)
2151573Srgrimes		return(REG_ESPACE);
216278910Sdelphij	/*
217278910Sdelphij	 * Limit the pattern space to avoid a 32-bit overflow on buffer
218278910Sdelphij	 * extension.  Also avoid any signed overflow in case of conversion
219278910Sdelphij	 * so make the real limit based on a 31-bit overflow.
220278910Sdelphij	 *
221278910Sdelphij	 * Likely not applicable on 64-bit systems but handle the case
222278910Sdelphij	 * generically (who are we to stop people from using ~715MB+
223278910Sdelphij	 * patterns?).
224278910Sdelphij	 */
225278910Sdelphij	maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
226278910Sdelphij	if (len >= maxlen) {
227278910Sdelphij		free((char *)g);
228278910Sdelphij		return(REG_ESPACE);
229278910Sdelphij	}
2301573Srgrimes	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
231278910Sdelphij	assert(p->ssize >= len);
232278910Sdelphij
2331573Srgrimes	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
2341573Srgrimes	p->slen = 0;
2351573Srgrimes	if (p->strip == NULL) {
2361573Srgrimes		free((char *)g);
2371573Srgrimes		return(REG_ESPACE);
2381573Srgrimes	}
2391573Srgrimes
2401573Srgrimes	/* set things up */
2411573Srgrimes	p->g = g;
2421573Srgrimes	p->next = (char *)pattern;	/* convenience; we do not modify it */
2431573Srgrimes	p->end = p->next + len;
2441573Srgrimes	p->error = 0;
2451573Srgrimes	p->ncsalloc = 0;
2461573Srgrimes	for (i = 0; i < NPAREN; i++) {
2471573Srgrimes		p->pbegin[i] = 0;
2481573Srgrimes		p->pend[i] = 0;
2491573Srgrimes	}
2501573Srgrimes	g->sets = NULL;
2511573Srgrimes	g->ncsets = 0;
2521573Srgrimes	g->cflags = cflags;
2531573Srgrimes	g->iflags = 0;
2541573Srgrimes	g->nbol = 0;
2551573Srgrimes	g->neol = 0;
2561573Srgrimes	g->must = NULL;
25762391Sdcs	g->moffset = -1;
25862263Sdcs	g->charjump = NULL;
25962263Sdcs	g->matchjump = NULL;
2601573Srgrimes	g->mlen = 0;
2611573Srgrimes	g->nsub = 0;
2621573Srgrimes	g->backrefs = 0;
2631573Srgrimes
2641573Srgrimes	/* do it */
2651573Srgrimes	EMIT(OEND, 0);
2661573Srgrimes	g->firststate = THERE();
2671573Srgrimes	if (cflags&REG_EXTENDED)
2681573Srgrimes		p_ere(p, OUT);
2691573Srgrimes	else if (cflags&REG_NOSPEC)
2701573Srgrimes		p_str(p);
2711573Srgrimes	else
2721573Srgrimes		p_bre(p, OUT, OUT);
2731573Srgrimes	EMIT(OEND, 0);
2741573Srgrimes	g->laststate = THERE();
2751573Srgrimes
2761573Srgrimes	/* tidy up loose ends and fill things in */
2771573Srgrimes	stripsnug(p, g);
2781573Srgrimes	findmust(p, g);
27962232Sdcs	/* only use Boyer-Moore algorithm if the pattern is bigger
28062232Sdcs	 * than three characters
28162232Sdcs	 */
28262232Sdcs	if(g->mlen > 3) {
28362232Sdcs		computejumps(p, g);
28462232Sdcs		computematchjumps(p, g);
28562755Sdcs		if(g->matchjump == NULL && g->charjump != NULL) {
28662232Sdcs			free(g->charjump);
28762232Sdcs			g->charjump = NULL;
28862232Sdcs		}
28962232Sdcs	}
2901573Srgrimes	g->nplus = pluscount(p, g);
2911573Srgrimes	g->magic = MAGIC2;
2921573Srgrimes	preg->re_nsub = g->nsub;
2931573Srgrimes	preg->re_g = g;
2941573Srgrimes	preg->re_magic = MAGIC1;
2951573Srgrimes#ifndef REDEBUG
2961573Srgrimes	/* not debugging, so can't rely on the assert() in regexec() */
2971573Srgrimes	if (g->iflags&BAD)
2981573Srgrimes		SETERROR(REG_ASSERT);
2991573Srgrimes#endif
3001573Srgrimes
3011573Srgrimes	/* win or lose, we're done */
3021573Srgrimes	if (p->error != 0)	/* lose */
3031573Srgrimes		regfree(preg);
3041573Srgrimes	return(p->error);
3051573Srgrimes}
3061573Srgrimes
3071573Srgrimes/*
3081573Srgrimes - p_ere - ERE parser top level, concatenation and alternation
309227435Skevlo == static void p_ere(struct parse *p, int_t stop);
3101573Srgrimes */
3111573Srgrimesstatic void
312170528Sdelphijp_ere(struct parse *p,
313227435Skevlo	int stop)		/* character this ERE should end at */
3141573Srgrimes{
31592889Sobrien	char c;
31692889Sobrien	sopno prevback;
31792889Sobrien	sopno prevfwd;
31892889Sobrien	sopno conc;
31992889Sobrien	int first = 1;		/* is this the first alternative? */
3201573Srgrimes
3211573Srgrimes	for (;;) {
3221573Srgrimes		/* do a bunch of concatenated expressions */
3231573Srgrimes		conc = HERE();
3241573Srgrimes		while (MORE() && (c = PEEK()) != '|' && c != stop)
3251573Srgrimes			p_ere_exp(p);
32617141Sjkh		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
3271573Srgrimes
3281573Srgrimes		if (!EAT('|'))
3291573Srgrimes			break;		/* NOTE BREAK OUT */
3301573Srgrimes
3311573Srgrimes		if (first) {
3321573Srgrimes			INSERT(OCH_, conc);	/* offset is wrong */
3331573Srgrimes			prevfwd = conc;
3341573Srgrimes			prevback = conc;
3351573Srgrimes			first = 0;
3361573Srgrimes		}
3371573Srgrimes		ASTERN(OOR1, prevback);
3381573Srgrimes		prevback = THERE();
3391573Srgrimes		AHEAD(prevfwd);			/* fix previous offset */
3401573Srgrimes		prevfwd = HERE();
3411573Srgrimes		EMIT(OOR2, 0);			/* offset is very wrong */
3421573Srgrimes	}
3431573Srgrimes
3441573Srgrimes	if (!first) {		/* tail-end fixups */
3451573Srgrimes		AHEAD(prevfwd);
3461573Srgrimes		ASTERN(O_CH, prevback);
3471573Srgrimes	}
3481573Srgrimes
3491573Srgrimes	assert(!MORE() || SEE(stop));
3501573Srgrimes}
3511573Srgrimes
3521573Srgrimes/*
3531573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
35492889Sobrien == static void p_ere_exp(struct parse *p);
3551573Srgrimes */
3561573Srgrimesstatic void
357170528Sdelphijp_ere_exp(struct parse *p)
3581573Srgrimes{
35992889Sobrien	char c;
360132019Stjr	wint_t wc;
36192889Sobrien	sopno pos;
36292889Sobrien	int count;
36392889Sobrien	int count2;
36492889Sobrien	sopno subno;
3651573Srgrimes	int wascaret = 0;
3661573Srgrimes
3671573Srgrimes	assert(MORE());		/* caller should have ensured this */
3681573Srgrimes	c = GETNEXT();
3691573Srgrimes
3701573Srgrimes	pos = HERE();
3711573Srgrimes	switch (c) {
3721573Srgrimes	case '(':
37317141Sjkh		(void)REQUIRE(MORE(), REG_EPAREN);
3741573Srgrimes		p->g->nsub++;
3751573Srgrimes		subno = p->g->nsub;
3761573Srgrimes		if (subno < NPAREN)
3771573Srgrimes			p->pbegin[subno] = HERE();
3781573Srgrimes		EMIT(OLPAREN, subno);
3791573Srgrimes		if (!SEE(')'))
3801573Srgrimes			p_ere(p, ')');
3811573Srgrimes		if (subno < NPAREN) {
3821573Srgrimes			p->pend[subno] = HERE();
3831573Srgrimes			assert(p->pend[subno] != 0);
3841573Srgrimes		}
3851573Srgrimes		EMIT(ORPAREN, subno);
38617141Sjkh		(void)MUSTEAT(')', REG_EPAREN);
3871573Srgrimes		break;
3881573Srgrimes#ifndef POSIX_MISTAKE
3891573Srgrimes	case ')':		/* happens only if no current unmatched ( */
3901573Srgrimes		/*
3911573Srgrimes		 * You may ask, why the ifndef?  Because I didn't notice
3921573Srgrimes		 * this until slightly too late for 1003.2, and none of the
3931573Srgrimes		 * other 1003.2 regular-expression reviewers noticed it at
3941573Srgrimes		 * all.  So an unmatched ) is legal POSIX, at least until
3951573Srgrimes		 * we can get it fixed.
3961573Srgrimes		 */
3971573Srgrimes		SETERROR(REG_EPAREN);
3981573Srgrimes		break;
3991573Srgrimes#endif
4001573Srgrimes	case '^':
4011573Srgrimes		EMIT(OBOL, 0);
4021573Srgrimes		p->g->iflags |= USEBOL;
4031573Srgrimes		p->g->nbol++;
4041573Srgrimes		wascaret = 1;
4051573Srgrimes		break;
4061573Srgrimes	case '$':
4071573Srgrimes		EMIT(OEOL, 0);
4081573Srgrimes		p->g->iflags |= USEEOL;
4091573Srgrimes		p->g->neol++;
4101573Srgrimes		break;
4111573Srgrimes	case '|':
4121573Srgrimes		SETERROR(REG_EMPTY);
4131573Srgrimes		break;
4141573Srgrimes	case '*':
4151573Srgrimes	case '+':
4161573Srgrimes	case '?':
4171573Srgrimes		SETERROR(REG_BADRPT);
4181573Srgrimes		break;
4191573Srgrimes	case '.':
4201573Srgrimes		if (p->g->cflags&REG_NEWLINE)
4211573Srgrimes			nonnewline(p);
4221573Srgrimes		else
4231573Srgrimes			EMIT(OANY, 0);
4241573Srgrimes		break;
4251573Srgrimes	case '[':
4261573Srgrimes		p_bracket(p);
4271573Srgrimes		break;
4281573Srgrimes	case '\\':
42917141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
430132019Stjr		wc = WGETNEXT();
431269484Spfg		switch (wc) {
432269484Spfg		case '<':
433269484Spfg			EMIT(OBOW, 0);
434269484Spfg			break;
435269484Spfg		case '>':
436269484Spfg			EMIT(OEOW, 0);
437269484Spfg			break;
438269484Spfg		default:
439269484Spfg			ordinary(p, wc);
440269484Spfg			break;
441269484Spfg		}
4421573Srgrimes		break;
4431573Srgrimes	case '{':		/* okay as ordinary except if digit follows */
44449094Sache		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
4451573Srgrimes		/* FALLTHROUGH */
4461573Srgrimes	default:
447318030Sbrooks		if (p->error != 0)
448318030Sbrooks			return;
449132019Stjr		p->next--;
450132019Stjr		wc = WGETNEXT();
451132019Stjr		ordinary(p, wc);
4521573Srgrimes		break;
4531573Srgrimes	}
4541573Srgrimes
4551573Srgrimes	if (!MORE())
4561573Srgrimes		return;
4571573Srgrimes	c = PEEK();
4581573Srgrimes	/* we call { a repetition if followed by a digit */
4591573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
46049094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
4611573Srgrimes		return;		/* no repetition, we're done */
4621573Srgrimes	NEXT();
4631573Srgrimes
46417141Sjkh	(void)REQUIRE(!wascaret, REG_BADRPT);
4651573Srgrimes	switch (c) {
4661573Srgrimes	case '*':	/* implemented as +? */
4671573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
4681573Srgrimes		INSERT(OPLUS_, pos);
4691573Srgrimes		ASTERN(O_PLUS, pos);
4701573Srgrimes		INSERT(OQUEST_, pos);
4711573Srgrimes		ASTERN(O_QUEST, pos);
4721573Srgrimes		break;
4731573Srgrimes	case '+':
4741573Srgrimes		INSERT(OPLUS_, pos);
4751573Srgrimes		ASTERN(O_PLUS, pos);
4761573Srgrimes		break;
4771573Srgrimes	case '?':
4781573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
4791573Srgrimes		INSERT(OCH_, pos);		/* offset slightly wrong */
4801573Srgrimes		ASTERN(OOR1, pos);		/* this one's right */
4811573Srgrimes		AHEAD(pos);			/* fix the OCH_ */
4821573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
4831573Srgrimes		AHEAD(THERE());			/* ...so fix it */
4841573Srgrimes		ASTERN(O_CH, THERETHERE());
4851573Srgrimes		break;
4861573Srgrimes	case '{':
4871573Srgrimes		count = p_count(p);
4881573Srgrimes		if (EAT(',')) {
48949094Sache			if (isdigit((uch)PEEK())) {
4901573Srgrimes				count2 = p_count(p);
49117141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
4921573Srgrimes			} else		/* single number with comma */
4931573Srgrimes				count2 = INFINITY;
4941573Srgrimes		} else		/* just a single number */
4951573Srgrimes			count2 = count;
4961573Srgrimes		repeat(p, pos, count, count2);
4971573Srgrimes		if (!EAT('}')) {	/* error heuristics */
4981573Srgrimes			while (MORE() && PEEK() != '}')
4991573Srgrimes				NEXT();
50017141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
5011573Srgrimes			SETERROR(REG_BADBR);
5021573Srgrimes		}
5031573Srgrimes		break;
5041573Srgrimes	}
5051573Srgrimes
5061573Srgrimes	if (!MORE())
5071573Srgrimes		return;
5081573Srgrimes	c = PEEK();
5091573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
51049094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
5111573Srgrimes		return;
5121573Srgrimes	SETERROR(REG_BADRPT);
5131573Srgrimes}
5141573Srgrimes
5151573Srgrimes/*
5161573Srgrimes - p_str - string (no metacharacters) "parser"
51792889Sobrien == static void p_str(struct parse *p);
5181573Srgrimes */
5191573Srgrimesstatic void
520170528Sdelphijp_str(struct parse *p)
5211573Srgrimes{
52217141Sjkh	(void)REQUIRE(MORE(), REG_EMPTY);
5231573Srgrimes	while (MORE())
524132019Stjr		ordinary(p, WGETNEXT());
5251573Srgrimes}
5261573Srgrimes
5271573Srgrimes/*
5281573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation
529227435Skevlo == static void p_bre(struct parse *p,  int end1, \
530227435Skevlo ==	int end2);
5311573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check.
5321573Srgrimes *
5331573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first
534131973Stjr * taken as an ordinary character and then revised to be an anchor.
5351573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive.
5361573Srgrimes */
5371573Srgrimesstatic void
538170528Sdelphijp_bre(struct parse *p,
539227435Skevlo	int end1,		/* first terminating character */
540227435Skevlo	int end2)		/* second terminating character */
5411573Srgrimes{
54292889Sobrien	sopno start = HERE();
54392889Sobrien	int first = 1;			/* first subexpression? */
54492889Sobrien	int wasdollar = 0;
5451573Srgrimes
5461573Srgrimes	if (EAT('^')) {
5471573Srgrimes		EMIT(OBOL, 0);
5481573Srgrimes		p->g->iflags |= USEBOL;
5491573Srgrimes		p->g->nbol++;
5501573Srgrimes	}
5511573Srgrimes	while (MORE() && !SEETWO(end1, end2)) {
5521573Srgrimes		wasdollar = p_simp_re(p, first);
5531573Srgrimes		first = 0;
5541573Srgrimes	}
5551573Srgrimes	if (wasdollar) {	/* oops, that was a trailing anchor */
5561573Srgrimes		DROP(1);
5571573Srgrimes		EMIT(OEOL, 0);
5581573Srgrimes		p->g->iflags |= USEEOL;
5591573Srgrimes		p->g->neol++;
5601573Srgrimes	}
5611573Srgrimes
56217141Sjkh	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
5631573Srgrimes}
5641573Srgrimes
5651573Srgrimes/*
5661573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
56792889Sobrien == static int p_simp_re(struct parse *p, int starordinary);
5681573Srgrimes */
5691573Srgrimesstatic int			/* was the simple RE an unbackslashed $? */
570170528Sdelphijp_simp_re(struct parse *p,
571170528Sdelphij	int starordinary)	/* is a leading * an ordinary character? */
5721573Srgrimes{
57392889Sobrien	int c;
57492889Sobrien	int count;
57592889Sobrien	int count2;
57692889Sobrien	sopno pos;
57792889Sobrien	int i;
578132019Stjr	wint_t wc;
57992889Sobrien	sopno subno;
5801573Srgrimes#	define	BACKSL	(1<<CHAR_BIT)
5811573Srgrimes
5821573Srgrimes	pos = HERE();		/* repetion op, if any, covers from here */
5831573Srgrimes
5841573Srgrimes	assert(MORE());		/* caller should have ensured this */
5851573Srgrimes	c = GETNEXT();
5861573Srgrimes	if (c == '\\') {
58717141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
58849094Sache		c = BACKSL | GETNEXT();
5891573Srgrimes	}
5901573Srgrimes	switch (c) {
5911573Srgrimes	case '.':
5921573Srgrimes		if (p->g->cflags&REG_NEWLINE)
5931573Srgrimes			nonnewline(p);
5941573Srgrimes		else
5951573Srgrimes			EMIT(OANY, 0);
5961573Srgrimes		break;
5971573Srgrimes	case '[':
5981573Srgrimes		p_bracket(p);
5991573Srgrimes		break;
600269484Spfg	case BACKSL|'<':
601269484Spfg		EMIT(OBOW, 0);
602269484Spfg		break;
603269484Spfg	case BACKSL|'>':
604269484Spfg		EMIT(OEOW, 0);
605269484Spfg		break;
6061573Srgrimes	case BACKSL|'{':
6071573Srgrimes		SETERROR(REG_BADRPT);
6081573Srgrimes		break;
6091573Srgrimes	case BACKSL|'(':
6101573Srgrimes		p->g->nsub++;
6111573Srgrimes		subno = p->g->nsub;
6121573Srgrimes		if (subno < NPAREN)
6131573Srgrimes			p->pbegin[subno] = HERE();
6141573Srgrimes		EMIT(OLPAREN, subno);
6151573Srgrimes		/* the MORE here is an error heuristic */
6161573Srgrimes		if (MORE() && !SEETWO('\\', ')'))
6171573Srgrimes			p_bre(p, '\\', ')');
6181573Srgrimes		if (subno < NPAREN) {
6191573Srgrimes			p->pend[subno] = HERE();
6201573Srgrimes			assert(p->pend[subno] != 0);
6211573Srgrimes		}
6221573Srgrimes		EMIT(ORPAREN, subno);
62317141Sjkh		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
6241573Srgrimes		break;
6251573Srgrimes	case BACKSL|')':	/* should not get here -- must be user */
6261573Srgrimes	case BACKSL|'}':
6271573Srgrimes		SETERROR(REG_EPAREN);
6281573Srgrimes		break;
6291573Srgrimes	case BACKSL|'1':
6301573Srgrimes	case BACKSL|'2':
6311573Srgrimes	case BACKSL|'3':
6321573Srgrimes	case BACKSL|'4':
6331573Srgrimes	case BACKSL|'5':
6341573Srgrimes	case BACKSL|'6':
6351573Srgrimes	case BACKSL|'7':
6361573Srgrimes	case BACKSL|'8':
6371573Srgrimes	case BACKSL|'9':
6381573Srgrimes		i = (c&~BACKSL) - '0';
6391573Srgrimes		assert(i < NPAREN);
6401573Srgrimes		if (p->pend[i] != 0) {
6411573Srgrimes			assert(i <= p->g->nsub);
6421573Srgrimes			EMIT(OBACK_, i);
6431573Srgrimes			assert(p->pbegin[i] != 0);
6441573Srgrimes			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
6451573Srgrimes			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
6461573Srgrimes			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
6471573Srgrimes			EMIT(O_BACK, i);
6481573Srgrimes		} else
6491573Srgrimes			SETERROR(REG_ESUBREG);
6501573Srgrimes		p->g->backrefs = 1;
6511573Srgrimes		break;
6521573Srgrimes	case '*':
65317141Sjkh		(void)REQUIRE(starordinary, REG_BADRPT);
6541573Srgrimes		/* FALLTHROUGH */
6551573Srgrimes	default:
656318030Sbrooks		if (p->error != 0)
657318030Sbrooks			return(0);	/* Definitely not $... */
658132019Stjr		p->next--;
659132019Stjr		wc = WGETNEXT();
660132019Stjr		ordinary(p, wc);
6611573Srgrimes		break;
6621573Srgrimes	}
6631573Srgrimes
6641573Srgrimes	if (EAT('*')) {		/* implemented as +? */
6651573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
6661573Srgrimes		INSERT(OPLUS_, pos);
6671573Srgrimes		ASTERN(O_PLUS, pos);
6681573Srgrimes		INSERT(OQUEST_, pos);
6691573Srgrimes		ASTERN(O_QUEST, pos);
6701573Srgrimes	} else if (EATTWO('\\', '{')) {
6711573Srgrimes		count = p_count(p);
6721573Srgrimes		if (EAT(',')) {
67349094Sache			if (MORE() && isdigit((uch)PEEK())) {
6741573Srgrimes				count2 = p_count(p);
67517141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
6761573Srgrimes			} else		/* single number with comma */
6771573Srgrimes				count2 = INFINITY;
6781573Srgrimes		} else		/* just a single number */
6791573Srgrimes			count2 = count;
6801573Srgrimes		repeat(p, pos, count, count2);
6811573Srgrimes		if (!EATTWO('\\', '}')) {	/* error heuristics */
6821573Srgrimes			while (MORE() && !SEETWO('\\', '}'))
6831573Srgrimes				NEXT();
68417141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
6851573Srgrimes			SETERROR(REG_BADBR);
6861573Srgrimes		}
68749094Sache	} else if (c == '$')     /* $ (but not \$) ends it */
6881573Srgrimes		return(1);
6891573Srgrimes
6901573Srgrimes	return(0);
6911573Srgrimes}
6921573Srgrimes
6931573Srgrimes/*
6941573Srgrimes - p_count - parse a repetition count
69592889Sobrien == static int p_count(struct parse *p);
6961573Srgrimes */
6971573Srgrimesstatic int			/* the value */
698170528Sdelphijp_count(struct parse *p)
6991573Srgrimes{
70092889Sobrien	int count = 0;
70192889Sobrien	int ndigits = 0;
7021573Srgrimes
70349094Sache	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
7041573Srgrimes		count = count*10 + (GETNEXT() - '0');
7051573Srgrimes		ndigits++;
7061573Srgrimes	}
7071573Srgrimes
70817141Sjkh	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
7091573Srgrimes	return(count);
7101573Srgrimes}
7111573Srgrimes
7121573Srgrimes/*
7131573Srgrimes - p_bracket - parse a bracketed character list
71492889Sobrien == static void p_bracket(struct parse *p);
7151573Srgrimes */
7161573Srgrimesstatic void
717170528Sdelphijp_bracket(struct parse *p)
7181573Srgrimes{
719132019Stjr	cset *cs;
720132019Stjr	wint_t ch;
7211573Srgrimes
7221573Srgrimes	/* Dept of Truly Sickening Special-Case Kludges */
7231573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
7241573Srgrimes		EMIT(OBOW, 0);
7251573Srgrimes		NEXTn(6);
7261573Srgrimes		return;
7271573Srgrimes	}
7281573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
7291573Srgrimes		EMIT(OEOW, 0);
7301573Srgrimes		NEXTn(6);
7311573Srgrimes		return;
7321573Srgrimes	}
7331573Srgrimes
734132019Stjr	if ((cs = allocset(p)) == NULL)
735132019Stjr		return;
736132019Stjr
737132019Stjr	if (p->g->cflags&REG_ICASE)
738132019Stjr		cs->icase = 1;
7391573Srgrimes	if (EAT('^'))
740132019Stjr		cs->invert = 1;
7411573Srgrimes	if (EAT(']'))
742132019Stjr		CHadd(p, cs, ']');
7431573Srgrimes	else if (EAT('-'))
744132019Stjr		CHadd(p, cs, '-');
7451573Srgrimes	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
7461573Srgrimes		p_b_term(p, cs);
7471573Srgrimes	if (EAT('-'))
748132019Stjr		CHadd(p, cs, '-');
74917141Sjkh	(void)MUSTEAT(']', REG_EBRACK);
7501573Srgrimes
7511573Srgrimes	if (p->error != 0)	/* don't mess things up further */
7521573Srgrimes		return;
7531573Srgrimes
754132019Stjr	if (cs->invert && p->g->cflags&REG_NEWLINE)
755132019Stjr		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
7561573Srgrimes
757132019Stjr	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
758132019Stjr		ordinary(p, ch);
7591573Srgrimes		freeset(p, cs);
7601573Srgrimes	} else
761132019Stjr		EMIT(OANYOF, (int)(cs - p->g->sets));
7621573Srgrimes}
7631573Srgrimes
7641573Srgrimes/*
7651573Srgrimes - p_b_term - parse one term of a bracketed character list
76692889Sobrien == static void p_b_term(struct parse *p, cset *cs);
7671573Srgrimes */
7681573Srgrimesstatic void
769170528Sdelphijp_b_term(struct parse *p, cset *cs)
7701573Srgrimes{
77192889Sobrien	char c;
772132019Stjr	wint_t start, finish;
773132019Stjr	wint_t i;
774227753Stheraven	struct xlocale_collate *table =
775227753Stheraven		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
7761573Srgrimes
7771573Srgrimes	/* classify what we've got */
7781573Srgrimes	switch ((MORE()) ? PEEK() : '\0') {
7791573Srgrimes	case '[':
7801573Srgrimes		c = (MORE2()) ? PEEK2() : '\0';
7811573Srgrimes		break;
7821573Srgrimes	case '-':
7831573Srgrimes		SETERROR(REG_ERANGE);
7841573Srgrimes		return;			/* NOTE RETURN */
7851573Srgrimes	default:
7861573Srgrimes		c = '\0';
7871573Srgrimes		break;
7881573Srgrimes	}
7891573Srgrimes
7901573Srgrimes	switch (c) {
7911573Srgrimes	case ':':		/* character class */
7921573Srgrimes		NEXT2();
79317141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7941573Srgrimes		c = PEEK();
79517141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
7961573Srgrimes		p_b_cclass(p, cs);
79717141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
79817141Sjkh		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
7991573Srgrimes		break;
8001573Srgrimes	case '=':		/* equivalence class */
8011573Srgrimes		NEXT2();
80217141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
8031573Srgrimes		c = PEEK();
80417141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
8051573Srgrimes		p_b_eclass(p, cs);
80617141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
80717141Sjkh		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
8081573Srgrimes		break;
8091573Srgrimes	default:		/* symbol, ordinary character, or range */
8101573Srgrimes		start = p_b_symbol(p);
8111573Srgrimes		if (SEE('-') && MORE2() && PEEK2() != ']') {
8121573Srgrimes			/* range */
8131573Srgrimes			NEXT();
8141573Srgrimes			if (EAT('-'))
8151573Srgrimes				finish = '-';
8161573Srgrimes			else
8171573Srgrimes				finish = p_b_symbol(p);
8181573Srgrimes		} else
8191573Srgrimes			finish = start;
82017514Sache		if (start == finish)
821132019Stjr			CHadd(p, cs, start);
82217514Sache		else {
823303185Sache			if (table->__collate_load_error || MB_CUR_MAX > 1) {
824303185Sache				(void)REQUIRE(start <= finish, REG_ERANGE);
825132019Stjr				CHaddrange(p, cs, start, finish);
82624637Sache			} else {
827303185Sache				(void)REQUIRE(__wcollate_range_cmp(start, finish) <= 0, REG_ERANGE);
828132019Stjr				for (i = 0; i <= UCHAR_MAX; i++) {
829303185Sache					if (   __wcollate_range_cmp(start, i) <= 0
830303185Sache					    && __wcollate_range_cmp(i, finish) <= 0
83124637Sache					   )
832132019Stjr						CHadd(p, cs, i);
83324637Sache				}
83417514Sache			}
83517514Sache		}
8361573Srgrimes		break;
8371573Srgrimes	}
8381573Srgrimes}
8391573Srgrimes
8401573Srgrimes/*
8411573Srgrimes - p_b_cclass - parse a character-class name and deal with it
84292889Sobrien == static void p_b_cclass(struct parse *p, cset *cs);
8431573Srgrimes */
8441573Srgrimesstatic void
845170528Sdelphijp_b_cclass(struct parse *p, cset *cs)
8461573Srgrimes{
84792889Sobrien	char *sp = p->next;
84892889Sobrien	size_t len;
849132019Stjr	wctype_t wct;
850132019Stjr	char clname[16];
8511573Srgrimes
85217508Sache	while (MORE() && isalpha((uch)PEEK()))
8531573Srgrimes		NEXT();
8541573Srgrimes	len = p->next - sp;
855132019Stjr	if (len >= sizeof(clname) - 1) {
8561573Srgrimes		SETERROR(REG_ECTYPE);
8571573Srgrimes		return;
8581573Srgrimes	}
859132019Stjr	memcpy(clname, sp, len);
860132019Stjr	clname[len] = '\0';
861132019Stjr	if ((wct = wctype(clname)) == 0) {
862132019Stjr		SETERROR(REG_ECTYPE);
863132019Stjr		return;
86417508Sache	}
865132019Stjr	CHaddtype(p, cs, wct);
8661573Srgrimes}
8671573Srgrimes
8681573Srgrimes/*
8691573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it
87092889Sobrien == static void p_b_eclass(struct parse *p, cset *cs);
8711573Srgrimes *
8721573Srgrimes * This implementation is incomplete. xxx
8731573Srgrimes */
8741573Srgrimesstatic void
875170528Sdelphijp_b_eclass(struct parse *p, cset *cs)
8761573Srgrimes{
877132019Stjr	wint_t c;
8781573Srgrimes
8791573Srgrimes	c = p_b_coll_elem(p, '=');
880132019Stjr	CHadd(p, cs, c);
8811573Srgrimes}
8821573Srgrimes
8831573Srgrimes/*
8841573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
885227414Skevlo == static wint_t p_b_symbol(struct parse *p);
8861573Srgrimes */
887132019Stjrstatic wint_t			/* value of symbol */
888170528Sdelphijp_b_symbol(struct parse *p)
8891573Srgrimes{
890132019Stjr	wint_t value;
8911573Srgrimes
89217141Sjkh	(void)REQUIRE(MORE(), REG_EBRACK);
8931573Srgrimes	if (!EATTWO('[', '.'))
894132019Stjr		return(WGETNEXT());
8951573Srgrimes
8961573Srgrimes	/* collating symbol */
8971573Srgrimes	value = p_b_coll_elem(p, '.');
89817141Sjkh	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
8991573Srgrimes	return(value);
9001573Srgrimes}
9011573Srgrimes
9021573Srgrimes/*
9031573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up
904227414Skevlo == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
9051573Srgrimes */
906132019Stjrstatic wint_t			/* value of collating element */
907170528Sdelphijp_b_coll_elem(struct parse *p,
908170528Sdelphij	wint_t endc)		/* name ended by endc,']' */
9091573Srgrimes{
91092889Sobrien	char *sp = p->next;
91192889Sobrien	struct cname *cp;
91292889Sobrien	int len;
913132019Stjr	mbstate_t mbs;
914132019Stjr	wchar_t wc;
915132019Stjr	size_t clen;
9161573Srgrimes
9171573Srgrimes	while (MORE() && !SEETWO(endc, ']'))
9181573Srgrimes		NEXT();
9191573Srgrimes	if (!MORE()) {
9201573Srgrimes		SETERROR(REG_EBRACK);
9211573Srgrimes		return(0);
9221573Srgrimes	}
9231573Srgrimes	len = p->next - sp;
9241573Srgrimes	for (cp = cnames; cp->name != NULL; cp++)
925325394Spfg		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
9261573Srgrimes			return(cp->code);	/* known name */
927132019Stjr	memset(&mbs, 0, sizeof(mbs));
928132019Stjr	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
929132019Stjr		return (wc);			/* single character */
930132019Stjr	else if (clen == (size_t)-1 || clen == (size_t)-2)
931132019Stjr		SETERROR(REG_ILLSEQ);
932132019Stjr	else
933132019Stjr		SETERROR(REG_ECOLLATE);		/* neither */
9341573Srgrimes	return(0);
9351573Srgrimes}
9361573Srgrimes
9371573Srgrimes/*
9381573Srgrimes - othercase - return the case counterpart of an alphabetic
939227414Skevlo == static wint_t othercase(wint_t ch);
9401573Srgrimes */
941132019Stjrstatic wint_t			/* if no counterpart, return ch */
942170528Sdelphijothercase(wint_t ch)
9431573Srgrimes{
944132019Stjr	assert(iswalpha(ch));
945132019Stjr	if (iswupper(ch))
946132019Stjr		return(towlower(ch));
947132019Stjr	else if (iswlower(ch))
948132019Stjr		return(towupper(ch));
9491573Srgrimes	else			/* peculiar, but could happen */
9501573Srgrimes		return(ch);
9511573Srgrimes}
9521573Srgrimes
9531573Srgrimes/*
9541573Srgrimes - bothcases - emit a dualcase version of a two-case character
955227414Skevlo == static void bothcases(struct parse *p, wint_t ch);
9561573Srgrimes *
9571573Srgrimes * Boy, is this implementation ever a kludge...
9581573Srgrimes */
9591573Srgrimesstatic void
960170528Sdelphijbothcases(struct parse *p, wint_t ch)
9611573Srgrimes{
96292889Sobrien	char *oldnext = p->next;
96392889Sobrien	char *oldend = p->end;
964132019Stjr	char bracket[3 + MB_LEN_MAX];
965132019Stjr	size_t n;
966132019Stjr	mbstate_t mbs;
9671573Srgrimes
9681573Srgrimes	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
9691573Srgrimes	p->next = bracket;
970132019Stjr	memset(&mbs, 0, sizeof(mbs));
971132019Stjr	n = wcrtomb(bracket, ch, &mbs);
972132019Stjr	assert(n != (size_t)-1);
973132019Stjr	bracket[n] = ']';
974132019Stjr	bracket[n + 1] = '\0';
975132019Stjr	p->end = bracket+n+1;
9761573Srgrimes	p_bracket(p);
977132019Stjr	assert(p->next == p->end);
9781573Srgrimes	p->next = oldnext;
9791573Srgrimes	p->end = oldend;
9801573Srgrimes}
9811573Srgrimes
9821573Srgrimes/*
9831573Srgrimes - ordinary - emit an ordinary character
984227414Skevlo == static void ordinary(struct parse *p, wint_t ch);
9851573Srgrimes */
9861573Srgrimesstatic void
987170528Sdelphijordinary(struct parse *p, wint_t ch)
9881573Srgrimes{
989132019Stjr	cset *cs;
9901573Srgrimes
991132019Stjr	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
9921573Srgrimes		bothcases(p, ch);
993132019Stjr	else if ((ch & OPDMASK) == ch)
994132019Stjr		EMIT(OCHAR, ch);
995132019Stjr	else {
996132019Stjr		/*
997132019Stjr		 * Kludge: character is too big to fit into an OCHAR operand.
998132019Stjr		 * Emit a singleton set.
999132019Stjr		 */
1000132019Stjr		if ((cs = allocset(p)) == NULL)
1001132019Stjr			return;
1002132019Stjr		CHadd(p, cs, ch);
1003132019Stjr		EMIT(OANYOF, (int)(cs - p->g->sets));
1004132019Stjr	}
10051573Srgrimes}
10061573Srgrimes
10071573Srgrimes/*
10081573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY
100992889Sobrien == static void nonnewline(struct parse *p);
10101573Srgrimes *
10111573Srgrimes * Boy, is this implementation ever a kludge...
10121573Srgrimes */
10131573Srgrimesstatic void
1014170528Sdelphijnonnewline(struct parse *p)
10151573Srgrimes{
101692889Sobrien	char *oldnext = p->next;
101792889Sobrien	char *oldend = p->end;
10181573Srgrimes	char bracket[4];
10191573Srgrimes
10201573Srgrimes	p->next = bracket;
10211573Srgrimes	p->end = bracket+3;
10221573Srgrimes	bracket[0] = '^';
10231573Srgrimes	bracket[1] = '\n';
10241573Srgrimes	bracket[2] = ']';
10251573Srgrimes	bracket[3] = '\0';
10261573Srgrimes	p_bracket(p);
10271573Srgrimes	assert(p->next == bracket+3);
10281573Srgrimes	p->next = oldnext;
10291573Srgrimes	p->end = oldend;
10301573Srgrimes}
10311573Srgrimes
10321573Srgrimes/*
10331573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed
103492889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to);
10351573Srgrimes */
10361573Srgrimesstatic void
1037170528Sdelphijrepeat(struct parse *p,
1038170528Sdelphij	sopno start,		/* operand from here to end of strip */
1039170528Sdelphij	int from,		/* repeated from this number */
1040170528Sdelphij	int to)			/* to this number of times (maybe INFINITY) */
10411573Srgrimes{
104292889Sobrien	sopno finish = HERE();
10431573Srgrimes#	define	N	2
10441573Srgrimes#	define	INF	3
10451573Srgrimes#	define	REP(f, t)	((f)*8 + (t))
10461573Srgrimes#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
104792889Sobrien	sopno copy;
10481573Srgrimes
10491573Srgrimes	if (p->error != 0)	/* head off possible runaway recursion */
10501573Srgrimes		return;
10511573Srgrimes
10521573Srgrimes	assert(from <= to);
10531573Srgrimes
10541573Srgrimes	switch (REP(MAP(from), MAP(to))) {
10551573Srgrimes	case REP(0, 0):			/* must be user doing this */
10561573Srgrimes		DROP(finish-start);	/* drop the operand */
10571573Srgrimes		break;
10581573Srgrimes	case REP(0, 1):			/* as x{1,1}? */
10591573Srgrimes	case REP(0, N):			/* as x{1,n}? */
10601573Srgrimes	case REP(0, INF):		/* as x{1,}? */
10611573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10621573Srgrimes		INSERT(OCH_, start);		/* offset is wrong... */
10631573Srgrimes		repeat(p, start+1, 1, to);
10641573Srgrimes		ASTERN(OOR1, start);
10651573Srgrimes		AHEAD(start);			/* ... fix it */
10661573Srgrimes		EMIT(OOR2, 0);
10671573Srgrimes		AHEAD(THERE());
10681573Srgrimes		ASTERN(O_CH, THERETHERE());
10691573Srgrimes		break;
10701573Srgrimes	case REP(1, 1):			/* trivial case */
10711573Srgrimes		/* done */
10721573Srgrimes		break;
10731573Srgrimes	case REP(1, N):			/* as x?x{1,n-1} */
10741573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10751573Srgrimes		INSERT(OCH_, start);
10761573Srgrimes		ASTERN(OOR1, start);
10771573Srgrimes		AHEAD(start);
10781573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
10791573Srgrimes		AHEAD(THERE());			/* ...so fix it */
10801573Srgrimes		ASTERN(O_CH, THERETHERE());
10811573Srgrimes		copy = dupl(p, start+1, finish+1);
10821573Srgrimes		assert(copy == finish+4);
10831573Srgrimes		repeat(p, copy, 1, to-1);
10841573Srgrimes		break;
10851573Srgrimes	case REP(1, INF):		/* as x+ */
10861573Srgrimes		INSERT(OPLUS_, start);
10871573Srgrimes		ASTERN(O_PLUS, start);
10881573Srgrimes		break;
10891573Srgrimes	case REP(N, N):			/* as xx{m-1,n-1} */
10901573Srgrimes		copy = dupl(p, start, finish);
10911573Srgrimes		repeat(p, copy, from-1, to-1);
10921573Srgrimes		break;
10931573Srgrimes	case REP(N, INF):		/* as xx{n-1,INF} */
10941573Srgrimes		copy = dupl(p, start, finish);
10951573Srgrimes		repeat(p, copy, from-1, to);
10961573Srgrimes		break;
10971573Srgrimes	default:			/* "can't happen" */
10981573Srgrimes		SETERROR(REG_ASSERT);	/* just in case */
10991573Srgrimes		break;
11001573Srgrimes	}
11011573Srgrimes}
11021573Srgrimes
11031573Srgrimes/*
1104132019Stjr - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1105132019Stjr - character from the parse struct, signals a REG_ILLSEQ error if the
1106132019Stjr - character can't be converted. Returns the number of bytes consumed.
1107132019Stjr */
1108132019Stjrstatic wint_t
1109170528Sdelphijwgetnext(struct parse *p)
1110132019Stjr{
1111132019Stjr	mbstate_t mbs;
1112132019Stjr	wchar_t wc;
1113132019Stjr	size_t n;
1114132019Stjr
1115132019Stjr	memset(&mbs, 0, sizeof(mbs));
1116132019Stjr	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1117132019Stjr	if (n == (size_t)-1 || n == (size_t)-2) {
1118132019Stjr		SETERROR(REG_ILLSEQ);
1119132019Stjr		return (0);
1120132019Stjr	}
1121132019Stjr	if (n == 0)
1122132019Stjr		n = 1;
1123132019Stjr	p->next += n;
1124132019Stjr	return (wc);
1125132019Stjr}
1126132019Stjr
1127132019Stjr/*
11281573Srgrimes - seterr - set an error condition
112992889Sobrien == static int seterr(struct parse *p, int e);
11301573Srgrimes */
11311573Srgrimesstatic int			/* useless but makes type checking happy */
1132170528Sdelphijseterr(struct parse *p, int e)
11331573Srgrimes{
11341573Srgrimes	if (p->error == 0)	/* keep earliest error condition */
11351573Srgrimes		p->error = e;
11361573Srgrimes	p->next = nuls;		/* try to bring things to a halt */
11371573Srgrimes	p->end = nuls;
11381573Srgrimes	return(0);		/* make the return value well-defined */
11391573Srgrimes}
11401573Srgrimes
11411573Srgrimes/*
11421573Srgrimes - allocset - allocate a set of characters for []
114392889Sobrien == static cset *allocset(struct parse *p);
11441573Srgrimes */
11451573Srgrimesstatic cset *
1146170528Sdelphijallocset(struct parse *p)
11471573Srgrimes{
1148132019Stjr	cset *cs, *ncs;
11491573Srgrimes
1150132019Stjr	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1151132019Stjr	if (ncs == NULL) {
1152132019Stjr		SETERROR(REG_ESPACE);
1153132019Stjr		return (NULL);
11541573Srgrimes	}
1155132019Stjr	p->g->sets = ncs;
1156132019Stjr	cs = &p->g->sets[p->g->ncsets++];
1157132019Stjr	memset(cs, 0, sizeof(*cs));
11581573Srgrimes
11591573Srgrimes	return(cs);
11601573Srgrimes}
11611573Srgrimes
11621573Srgrimes/*
11631573Srgrimes - freeset - free a now-unused set
116492889Sobrien == static void freeset(struct parse *p, cset *cs);
11651573Srgrimes */
11661573Srgrimesstatic void
1167170528Sdelphijfreeset(struct parse *p, cset *cs)
11681573Srgrimes{
116992889Sobrien	cset *top = &p->g->sets[p->g->ncsets];
11701573Srgrimes
1171132019Stjr	free(cs->wides);
1172132019Stjr	free(cs->ranges);
1173132019Stjr	free(cs->types);
1174132019Stjr	memset(cs, 0, sizeof(*cs));
11751573Srgrimes	if (cs == top-1)	/* recover only the easy case */
11761573Srgrimes		p->g->ncsets--;
11771573Srgrimes}
11781573Srgrimes
11791573Srgrimes/*
1180132019Stjr - singleton - Determine whether a set contains only one character,
1181132019Stjr - returning it if so, otherwise returning OUT.
11821573Srgrimes */
1183132019Stjrstatic wint_t
1184170528Sdelphijsingleton(cset *cs)
11851573Srgrimes{
1186132019Stjr	wint_t i, s, n;
11871573Srgrimes
1188132019Stjr	for (i = n = 0; i < NC; i++)
1189132019Stjr		if (CHIN(cs, i)) {
1190132019Stjr			n++;
1191132019Stjr			s = i;
11921573Srgrimes		}
1193132019Stjr	if (n == 1)
1194132019Stjr		return (s);
1195132019Stjr	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1196132019Stjr	    cs->icase == 0)
1197132019Stjr		return (cs->wides[0]);
1198132019Stjr	/* Don't bother handling the other cases. */
1199132019Stjr	return (OUT);
1200132019Stjr}
12011573Srgrimes
1202132019Stjr/*
1203132019Stjr - CHadd - add character to character set.
1204132019Stjr */
1205132019Stjrstatic void
1206170528SdelphijCHadd(struct parse *p, cset *cs, wint_t ch)
1207132019Stjr{
1208134802Stjr	wint_t nch, *newwides;
1209132019Stjr	assert(ch >= 0);
1210134802Stjr	if (ch < NC)
1211132019Stjr		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1212134802Stjr	else {
1213132019Stjr		newwides = realloc(cs->wides, (cs->nwides + 1) *
1214132019Stjr		    sizeof(*cs->wides));
1215132019Stjr		if (newwides == NULL) {
1216132019Stjr			SETERROR(REG_ESPACE);
1217132019Stjr			return;
1218132019Stjr		}
1219132019Stjr		cs->wides = newwides;
1220132019Stjr		cs->wides[cs->nwides++] = ch;
12211573Srgrimes	}
1222134802Stjr	if (cs->icase) {
1223134802Stjr		if ((nch = towlower(ch)) < NC)
1224134802Stjr			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1225134802Stjr		if ((nch = towupper(ch)) < NC)
1226134802Stjr			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1227134802Stjr	}
12281573Srgrimes}
12291573Srgrimes
12301573Srgrimes/*
1231132019Stjr - CHaddrange - add all characters in the range [min,max] to a character set.
12321573Srgrimes */
1233132019Stjrstatic void
1234170528SdelphijCHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
12351573Srgrimes{
1236132019Stjr	crange *newranges;
12371573Srgrimes
1238132019Stjr	for (; min < NC && min <= max; min++)
1239132019Stjr		CHadd(p, cs, min);
1240132019Stjr	if (min >= max)
1241132019Stjr		return;
1242132019Stjr	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1243132019Stjr	    sizeof(*cs->ranges));
1244132019Stjr	if (newranges == NULL) {
1245132019Stjr		SETERROR(REG_ESPACE);
1246132019Stjr		return;
1247132019Stjr	}
1248132019Stjr	cs->ranges = newranges;
1249132019Stjr	cs->ranges[cs->nranges].min = min;
1250247596Sdelphij	cs->ranges[cs->nranges].max = max;
1251132019Stjr	cs->nranges++;
12521573Srgrimes}
12531573Srgrimes
12541573Srgrimes/*
1255132019Stjr - CHaddtype - add all characters of a certain type to a character set.
12561573Srgrimes */
1257132019Stjrstatic void
1258170528SdelphijCHaddtype(struct parse *p, cset *cs, wctype_t wct)
12591573Srgrimes{
1260132019Stjr	wint_t i;
1261132019Stjr	wctype_t *newtypes;
12621573Srgrimes
1263134802Stjr	for (i = 0; i < NC; i++)
1264132019Stjr		if (iswctype(i, wct))
1265132019Stjr			CHadd(p, cs, i);
1266132019Stjr	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1267132019Stjr	    sizeof(*cs->types));
1268132019Stjr	if (newtypes == NULL) {
1269132019Stjr		SETERROR(REG_ESPACE);
1270132019Stjr		return;
1271132019Stjr	}
1272132019Stjr	cs->types = newtypes;
1273132019Stjr	cs->types[cs->ntypes++] = wct;
12741573Srgrimes}
12751573Srgrimes
12761573Srgrimes/*
12771573Srgrimes - dupl - emit a duplicate of a bunch of sops
127892889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish);
12791573Srgrimes */
12801573Srgrimesstatic sopno			/* start of duplicate */
1281170528Sdelphijdupl(struct parse *p,
1282170528Sdelphij	sopno start,		/* from here */
1283170528Sdelphij	sopno finish)		/* to this less one */
12841573Srgrimes{
128592889Sobrien	sopno ret = HERE();
128692889Sobrien	sopno len = finish - start;
12871573Srgrimes
12881573Srgrimes	assert(finish >= start);
12891573Srgrimes	if (len == 0)
12901573Srgrimes		return(ret);
1291227414Skevlo	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1292227414Skevlo		return(ret);
12931573Srgrimes	(void) memcpy((char *)(p->strip + p->slen),
12941573Srgrimes		(char *)(p->strip + start), (size_t)len*sizeof(sop));
12951573Srgrimes	p->slen += len;
12961573Srgrimes	return(ret);
12971573Srgrimes}
12981573Srgrimes
12991573Srgrimes/*
13001573Srgrimes - doemit - emit a strip operator
130192889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd);
13021573Srgrimes *
13031573Srgrimes * It might seem better to implement this as a macro with a function as
13041573Srgrimes * hard-case backup, but it's just too big and messy unless there are
13051573Srgrimes * some changes to the data structures.  Maybe later.
13061573Srgrimes */
13071573Srgrimesstatic void
1308170528Sdelphijdoemit(struct parse *p, sop op, size_t opnd)
13091573Srgrimes{
13101573Srgrimes	/* avoid making error situations worse */
13111573Srgrimes	if (p->error != 0)
13121573Srgrimes		return;
13131573Srgrimes
13141573Srgrimes	/* deal with oversize operands ("can't happen", more or less) */
13151573Srgrimes	assert(opnd < 1<<OPSHIFT);
13161573Srgrimes
13171573Srgrimes	/* deal with undersized strip */
13181573Srgrimes	if (p->slen >= p->ssize)
1319227414Skevlo		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1320227414Skevlo			return;
13211573Srgrimes
13221573Srgrimes	/* finally, it's all reduced to the easy case */
13231573Srgrimes	p->strip[p->slen++] = SOP(op, opnd);
13241573Srgrimes}
13251573Srgrimes
13261573Srgrimes/*
13271573Srgrimes - doinsert - insert a sop into the strip
132892889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
13291573Srgrimes */
13301573Srgrimesstatic void
1331170528Sdelphijdoinsert(struct parse *p, sop op, size_t opnd, sopno pos)
13321573Srgrimes{
133392889Sobrien	sopno sn;
133492889Sobrien	sop s;
133592889Sobrien	int i;
13361573Srgrimes
13371573Srgrimes	/* avoid making error situations worse */
13381573Srgrimes	if (p->error != 0)
13391573Srgrimes		return;
13401573Srgrimes
13411573Srgrimes	sn = HERE();
13421573Srgrimes	EMIT(op, opnd);		/* do checks, ensure space */
13431573Srgrimes	assert(HERE() == sn+1);
13441573Srgrimes	s = p->strip[sn];
13451573Srgrimes
13461573Srgrimes	/* adjust paren pointers */
13471573Srgrimes	assert(pos > 0);
13481573Srgrimes	for (i = 1; i < NPAREN; i++) {
13491573Srgrimes		if (p->pbegin[i] >= pos) {
13501573Srgrimes			p->pbegin[i]++;
13511573Srgrimes		}
13521573Srgrimes		if (p->pend[i] >= pos) {
13531573Srgrimes			p->pend[i]++;
13541573Srgrimes		}
13551573Srgrimes	}
13561573Srgrimes
13571573Srgrimes	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
13581573Srgrimes						(HERE()-pos-1)*sizeof(sop));
13591573Srgrimes	p->strip[pos] = s;
13601573Srgrimes}
13611573Srgrimes
13621573Srgrimes/*
13631573Srgrimes - dofwd - complete a forward reference
136492889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value);
13651573Srgrimes */
13661573Srgrimesstatic void
1367170528Sdelphijdofwd(struct parse *p, sopno pos, sop value)
13681573Srgrimes{
13691573Srgrimes	/* avoid making error situations worse */
13701573Srgrimes	if (p->error != 0)
13711573Srgrimes		return;
13721573Srgrimes
13731573Srgrimes	assert(value < 1<<OPSHIFT);
13741573Srgrimes	p->strip[pos] = OP(p->strip[pos]) | value;
13751573Srgrimes}
13761573Srgrimes
13771573Srgrimes/*
13781573Srgrimes - enlarge - enlarge the strip
1379227414Skevlo == static int enlarge(struct parse *p, sopno size);
13801573Srgrimes */
1381227414Skevlostatic int
1382170528Sdelphijenlarge(struct parse *p, sopno size)
13831573Srgrimes{
138492889Sobrien	sop *sp;
13851573Srgrimes
13861573Srgrimes	if (p->ssize >= size)
1387227414Skevlo		return 1;
13881573Srgrimes
13891573Srgrimes	sp = (sop *)realloc(p->strip, size*sizeof(sop));
13901573Srgrimes	if (sp == NULL) {
13911573Srgrimes		SETERROR(REG_ESPACE);
1392227414Skevlo		return 0;
13931573Srgrimes	}
13941573Srgrimes	p->strip = sp;
13951573Srgrimes	p->ssize = size;
1396227414Skevlo	return 1;
13971573Srgrimes}
13981573Srgrimes
13991573Srgrimes/*
14001573Srgrimes - stripsnug - compact the strip
140192889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g);
14021573Srgrimes */
14031573Srgrimesstatic void
1404170528Sdelphijstripsnug(struct parse *p, struct re_guts *g)
14051573Srgrimes{
14061573Srgrimes	g->nstates = p->slen;
14071573Srgrimes	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
14081573Srgrimes	if (g->strip == NULL) {
14091573Srgrimes		SETERROR(REG_ESPACE);
14101573Srgrimes		g->strip = p->strip;
14111573Srgrimes	}
14121573Srgrimes}
14131573Srgrimes
14141573Srgrimes/*
14151573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string
141692889Sobrien == static void findmust(struct parse *p, struct re_guts *g);
14171573Srgrimes *
14181573Srgrimes * This algorithm could do fancy things like analyzing the operands of |
14191573Srgrimes * for common subsequences.  Someday.  This code is simple and finds most
14201573Srgrimes * of the interesting cases.
14211573Srgrimes *
14221573Srgrimes * Note that must and mlen got initialized during setup.
14231573Srgrimes */
14241573Srgrimesstatic void
1425170528Sdelphijfindmust(struct parse *p, struct re_guts *g)
14261573Srgrimes{
142792889Sobrien	sop *scan;
14281573Srgrimes	sop *start;
142992889Sobrien	sop *newstart;
143092889Sobrien	sopno newlen;
143192889Sobrien	sop s;
143292889Sobrien	char *cp;
143362391Sdcs	int offset;
1434132019Stjr	char buf[MB_LEN_MAX];
1435132019Stjr	size_t clen;
1436132019Stjr	mbstate_t mbs;
14371573Srgrimes
14381573Srgrimes	/* avoid making error situations worse */
14391573Srgrimes	if (p->error != 0)
14401573Srgrimes		return;
14411573Srgrimes
1442132019Stjr	/*
1443132019Stjr	 * It's not generally safe to do a ``char'' substring search on
1444132019Stjr	 * multibyte character strings, but it's safe for at least
1445132019Stjr	 * UTF-8 (see RFC 3629).
1446132019Stjr	 */
1447132019Stjr	if (MB_CUR_MAX > 1 &&
1448132019Stjr	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1449132019Stjr		return;
1450132019Stjr
14511573Srgrimes	/* find the longest OCHAR sequence in strip */
14521573Srgrimes	newlen = 0;
145362391Sdcs	offset = 0;
145462391Sdcs	g->moffset = 0;
14551573Srgrimes	scan = g->strip + 1;
14561573Srgrimes	do {
14571573Srgrimes		s = *scan++;
14581573Srgrimes		switch (OP(s)) {
14591573Srgrimes		case OCHAR:		/* sequence member */
1460132019Stjr			if (newlen == 0) {		/* new sequence */
1461132019Stjr				memset(&mbs, 0, sizeof(mbs));
14621573Srgrimes				newstart = scan - 1;
1463132019Stjr			}
1464132019Stjr			clen = wcrtomb(buf, OPND(s), &mbs);
1465132019Stjr			if (clen == (size_t)-1)
1466132019Stjr				goto toohard;
1467132019Stjr			newlen += clen;
14681573Srgrimes			break;
14691573Srgrimes		case OPLUS_:		/* things that don't break one */
14701573Srgrimes		case OLPAREN:
14711573Srgrimes		case ORPAREN:
14721573Srgrimes			break;
14731573Srgrimes		case OQUEST_:		/* things that must be skipped */
14741573Srgrimes		case OCH_:
1475131973Stjr			offset = altoffset(scan, offset);
14761573Srgrimes			scan--;
14771573Srgrimes			do {
14781573Srgrimes				scan += OPND(s);
14791573Srgrimes				s = *scan;
14801573Srgrimes				/* assert() interferes w debug printouts */
14811573Srgrimes				if (OP(s) != O_QUEST && OP(s) != O_CH &&
14821573Srgrimes							OP(s) != OOR2) {
14831573Srgrimes					g->iflags |= BAD;
14841573Srgrimes					return;
14851573Srgrimes				}
14861573Srgrimes			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1487102411Scharnier			/* FALLTHROUGH */
148862391Sdcs		case OBOW:		/* things that break a sequence */
148962391Sdcs		case OEOW:
149062391Sdcs		case OBOL:
149162391Sdcs		case OEOL:
149262391Sdcs		case O_QUEST:
149362391Sdcs		case O_CH:
149462391Sdcs		case OEND:
14951573Srgrimes			if (newlen > g->mlen) {		/* ends one */
14961573Srgrimes				start = newstart;
14971573Srgrimes				g->mlen = newlen;
149862391Sdcs				if (offset > -1) {
149962391Sdcs					g->moffset += offset;
150062391Sdcs					offset = newlen;
150162391Sdcs				} else
150262391Sdcs					g->moffset = offset;
150362391Sdcs			} else {
150462391Sdcs				if (offset > -1)
150562391Sdcs					offset += newlen;
15061573Srgrimes			}
15071573Srgrimes			newlen = 0;
15081573Srgrimes			break;
150962391Sdcs		case OANY:
151062391Sdcs			if (newlen > g->mlen) {		/* ends one */
151162391Sdcs				start = newstart;
151262391Sdcs				g->mlen = newlen;
151362391Sdcs				if (offset > -1) {
151462391Sdcs					g->moffset += offset;
151562391Sdcs					offset = newlen;
151662391Sdcs				} else
151762391Sdcs					g->moffset = offset;
151862391Sdcs			} else {
151962391Sdcs				if (offset > -1)
152062391Sdcs					offset += newlen;
152162391Sdcs			}
152262391Sdcs			if (offset > -1)
152362391Sdcs				offset++;
152462391Sdcs			newlen = 0;
152562391Sdcs			break;
152662391Sdcs		case OANYOF:		/* may or may not invalidate offset */
152762391Sdcs			/* First, everything as OANY */
152862391Sdcs			if (newlen > g->mlen) {		/* ends one */
152962391Sdcs				start = newstart;
153062391Sdcs				g->mlen = newlen;
153162391Sdcs				if (offset > -1) {
153262391Sdcs					g->moffset += offset;
153362391Sdcs					offset = newlen;
153462391Sdcs				} else
153562391Sdcs					g->moffset = offset;
153662391Sdcs			} else {
153762391Sdcs				if (offset > -1)
153862391Sdcs					offset += newlen;
153962391Sdcs			}
154062391Sdcs			if (offset > -1)
154162391Sdcs				offset++;
154262391Sdcs			newlen = 0;
154362391Sdcs			break;
1544132019Stjr		toohard:
154562391Sdcs		default:
154662391Sdcs			/* Anything here makes it impossible or too hard
154762391Sdcs			 * to calculate the offset -- so we give up;
154862391Sdcs			 * save the last known good offset, in case the
154962391Sdcs			 * must sequence doesn't occur later.
155062391Sdcs			 */
155162391Sdcs			if (newlen > g->mlen) {		/* ends one */
155262391Sdcs				start = newstart;
155362391Sdcs				g->mlen = newlen;
155462391Sdcs				if (offset > -1)
155562391Sdcs					g->moffset += offset;
155662391Sdcs				else
155762391Sdcs					g->moffset = offset;
155862391Sdcs			}
155962391Sdcs			offset = -1;
156062391Sdcs			newlen = 0;
156162391Sdcs			break;
15621573Srgrimes		}
15631573Srgrimes	} while (OP(s) != OEND);
15641573Srgrimes
156562391Sdcs	if (g->mlen == 0) {		/* there isn't one */
156662391Sdcs		g->moffset = -1;
15671573Srgrimes		return;
156862391Sdcs	}
15691573Srgrimes
15701573Srgrimes	/* turn it into a character string */
15711573Srgrimes	g->must = malloc((size_t)g->mlen + 1);
15721573Srgrimes	if (g->must == NULL) {		/* argh; just forget it */
15731573Srgrimes		g->mlen = 0;
157462391Sdcs		g->moffset = -1;
15751573Srgrimes		return;
15761573Srgrimes	}
15771573Srgrimes	cp = g->must;
15781573Srgrimes	scan = start;
1579132019Stjr	memset(&mbs, 0, sizeof(mbs));
1580132019Stjr	while (cp < g->must + g->mlen) {
15811573Srgrimes		while (OP(s = *scan++) != OCHAR)
15821573Srgrimes			continue;
1583132019Stjr		clen = wcrtomb(cp, OPND(s), &mbs);
1584132019Stjr		assert(clen != (size_t)-1);
1585132019Stjr		cp += clen;
15861573Srgrimes	}
15871573Srgrimes	assert(cp == g->must + g->mlen);
15881573Srgrimes	*cp++ = '\0';		/* just on general principles */
15891573Srgrimes}
15901573Srgrimes
15911573Srgrimes/*
159262391Sdcs - altoffset - choose biggest offset among multiple choices
1593131973Stjr == static int altoffset(sop *scan, int offset);
159462391Sdcs *
159562391Sdcs * Compute, recursively if necessary, the largest offset among multiple
159662391Sdcs * re paths.
159762391Sdcs */
159862391Sdcsstatic int
1599170528Sdelphijaltoffset(sop *scan, int offset)
160062391Sdcs{
160162391Sdcs	int largest;
160262391Sdcs	int try;
160362391Sdcs	sop s;
160462391Sdcs
160562391Sdcs	/* If we gave up already on offsets, return */
160662391Sdcs	if (offset == -1)
160762391Sdcs		return -1;
160862391Sdcs
160962391Sdcs	largest = 0;
161062391Sdcs	try = 0;
161162391Sdcs	s = *scan++;
161262391Sdcs	while (OP(s) != O_QUEST && OP(s) != O_CH) {
161362391Sdcs		switch (OP(s)) {
161462391Sdcs		case OOR1:
161562391Sdcs			if (try > largest)
161662391Sdcs				largest = try;
161762391Sdcs			try = 0;
161862391Sdcs			break;
161962391Sdcs		case OQUEST_:
162062391Sdcs		case OCH_:
1621131973Stjr			try = altoffset(scan, try);
162262391Sdcs			if (try == -1)
162362391Sdcs				return -1;
162462391Sdcs			scan--;
162562391Sdcs			do {
162662391Sdcs				scan += OPND(s);
162762391Sdcs				s = *scan;
162862391Sdcs				if (OP(s) != O_QUEST && OP(s) != O_CH &&
162962391Sdcs							OP(s) != OOR2)
163062391Sdcs					return -1;
163162391Sdcs			} while (OP(s) != O_QUEST && OP(s) != O_CH);
163262855Sdcs			/* We must skip to the next position, or we'll
163362855Sdcs			 * leave altoffset() too early.
163462855Sdcs			 */
163562855Sdcs			scan++;
163662391Sdcs			break;
163762391Sdcs		case OANYOF:
163862391Sdcs		case OCHAR:
163962391Sdcs		case OANY:
164062391Sdcs			try++;
164162391Sdcs		case OBOW:
164262391Sdcs		case OEOW:
164362391Sdcs		case OLPAREN:
164462391Sdcs		case ORPAREN:
164562391Sdcs		case OOR2:
164662391Sdcs			break;
164762391Sdcs		default:
164862391Sdcs			try = -1;
164962391Sdcs			break;
165062391Sdcs		}
165162391Sdcs		if (try == -1)
165262391Sdcs			return -1;
165362391Sdcs		s = *scan++;
165462391Sdcs	}
165562391Sdcs
165662391Sdcs	if (try > largest)
165762391Sdcs		largest = try;
165862391Sdcs
165962391Sdcs	return largest+offset;
166062391Sdcs}
166162391Sdcs
166262391Sdcs/*
166362232Sdcs - computejumps - compute char jumps for BM scan
166492889Sobrien == static void computejumps(struct parse *p, struct re_guts *g);
166562232Sdcs *
166662232Sdcs * This algorithm assumes g->must exists and is has size greater than
166762232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
166862232Sdcs * Sara Baase.
166962232Sdcs *
167062232Sdcs * A char jump is the number of characters one needs to jump based on
167162232Sdcs * the value of the character from the text that was mismatched.
167262232Sdcs */
167362232Sdcsstatic void
1674170528Sdelphijcomputejumps(struct parse *p, struct re_guts *g)
167562232Sdcs{
167662232Sdcs	int ch;
167762232Sdcs	int mindex;
167862232Sdcs
167962232Sdcs	/* Avoid making errors worse */
168062232Sdcs	if (p->error != 0)
168162232Sdcs		return;
168262232Sdcs
168362848Sdcs	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
168462232Sdcs	if (g->charjump == NULL)	/* Not a fatal error */
168562232Sdcs		return;
168662754Sdcs	/* Adjust for signed chars, if necessary */
168762754Sdcs	g->charjump = &g->charjump[-(CHAR_MIN)];
168862232Sdcs
168962232Sdcs	/* If the character does not exist in the pattern, the jump
169062232Sdcs	 * is equal to the number of characters in the pattern.
169162232Sdcs	 */
169262754Sdcs	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
169362232Sdcs		g->charjump[ch] = g->mlen;
169462232Sdcs
169562232Sdcs	/* If the character does exist, compute the jump that would
169662232Sdcs	 * take us to the last character in the pattern equal to it
169762232Sdcs	 * (notice that we match right to left, so that last character
169862232Sdcs	 * is the first one that would be matched).
169962232Sdcs	 */
170062232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
1701111010Snectar		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
170262232Sdcs}
170362232Sdcs
170462232Sdcs/*
170562232Sdcs - computematchjumps - compute match jumps for BM scan
170692889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g);
170762232Sdcs *
170862232Sdcs * This algorithm assumes g->must exists and is has size greater than
170962232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
171062232Sdcs * Sara Baase.
171162232Sdcs *
171262232Sdcs * A match jump is the number of characters one needs to advance based
171362232Sdcs * on the already-matched suffix.
171462232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way
171562232Sdcs * the search algorithm works.
171662232Sdcs */
171762232Sdcsstatic void
1718170528Sdelphijcomputematchjumps(struct parse *p, struct re_guts *g)
171962232Sdcs{
172062232Sdcs	int mindex;		/* General "must" iterator */
172162232Sdcs	int suffix;		/* Keeps track of matching suffix */
172262232Sdcs	int ssuffix;		/* Keeps track of suffixes' suffix */
172362232Sdcs	int* pmatches;		/* pmatches[k] points to the next i
172462232Sdcs				 * such that i+1...mlen is a substring
172562232Sdcs				 * of k+1...k+mlen-i-1
172662232Sdcs				 */
172762232Sdcs
172862232Sdcs	/* Avoid making errors worse */
172962232Sdcs	if (p->error != 0)
173062232Sdcs		return;
173162232Sdcs
173262848Sdcs	pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
173362232Sdcs	if (pmatches == NULL) {
173462232Sdcs		g->matchjump = NULL;
173562232Sdcs		return;
173662232Sdcs	}
173762232Sdcs
173862848Sdcs	g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
1739276548Sdelphij	if (g->matchjump == NULL) {	/* Not a fatal error */
1740276548Sdelphij		free(pmatches);
174162232Sdcs		return;
1742276548Sdelphij	}
174362232Sdcs
174462232Sdcs	/* Set maximum possible jump for each character in the pattern */
174562232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
174662232Sdcs		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
174762232Sdcs
174862232Sdcs	/* Compute pmatches[] */
174962232Sdcs	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
175062232Sdcs	    mindex--, suffix--) {
175162232Sdcs		pmatches[mindex] = suffix;
175262232Sdcs
175362232Sdcs		/* If a mismatch is found, interrupting the substring,
175462232Sdcs		 * compute the matchjump for that position. If no
175562232Sdcs		 * mismatch is found, then a text substring mismatched
175662232Sdcs		 * against the suffix will also mismatch against the
175762232Sdcs		 * substring.
175862232Sdcs		 */
175962232Sdcs		while (suffix < g->mlen
176062232Sdcs		    && g->must[mindex] != g->must[suffix]) {
176162232Sdcs			g->matchjump[suffix] = MIN(g->matchjump[suffix],
176262232Sdcs			    g->mlen - mindex - 1);
176362232Sdcs			suffix = pmatches[suffix];
176462232Sdcs		}
176562232Sdcs	}
176662232Sdcs
176762232Sdcs	/* Compute the matchjump up to the last substring found to jump
176862232Sdcs	 * to the beginning of the largest must pattern prefix matching
176962232Sdcs	 * it's own suffix.
177062232Sdcs	 */
177162232Sdcs	for (mindex = 0; mindex <= suffix; mindex++)
177262232Sdcs		g->matchjump[mindex] = MIN(g->matchjump[mindex],
177362232Sdcs		    g->mlen + suffix - mindex);
177462232Sdcs
177562232Sdcs        ssuffix = pmatches[suffix];
177662391Sdcs        while (suffix < g->mlen) {
177762673Sdcs                while (suffix <= ssuffix && suffix < g->mlen) {
177862232Sdcs                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
177962232Sdcs			    g->mlen + ssuffix - suffix);
178062232Sdcs                        suffix++;
178162232Sdcs                }
178286208Sdcs		if (suffix < g->mlen)
178386208Sdcs                	ssuffix = pmatches[ssuffix];
178462232Sdcs        }
178562232Sdcs
178662232Sdcs	free(pmatches);
178762232Sdcs}
178862232Sdcs
178962232Sdcs/*
17901573Srgrimes - pluscount - count + nesting
179192889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g);
17921573Srgrimes */
17931573Srgrimesstatic sopno			/* nesting depth */
1794170528Sdelphijpluscount(struct parse *p, struct re_guts *g)
17951573Srgrimes{
179692889Sobrien	sop *scan;
179792889Sobrien	sop s;
179892889Sobrien	sopno plusnest = 0;
179992889Sobrien	sopno maxnest = 0;
18001573Srgrimes
18011573Srgrimes	if (p->error != 0)
18021573Srgrimes		return(0);	/* there may not be an OEND */
18031573Srgrimes
18041573Srgrimes	scan = g->strip + 1;
18051573Srgrimes	do {
18061573Srgrimes		s = *scan++;
18071573Srgrimes		switch (OP(s)) {
18081573Srgrimes		case OPLUS_:
18091573Srgrimes			plusnest++;
18101573Srgrimes			break;
18111573Srgrimes		case O_PLUS:
18121573Srgrimes			if (plusnest > maxnest)
18131573Srgrimes				maxnest = plusnest;
18141573Srgrimes			plusnest--;
18151573Srgrimes			break;
18161573Srgrimes		}
18171573Srgrimes	} while (OP(s) != OEND);
18181573Srgrimes	if (plusnest != 0)
18191573Srgrimes		g->iflags |= BAD;
18201573Srgrimes	return(maxnest);
18211573Srgrimes}
1822