1198090Srdivacky/*-
2198090Srdivacky * This code is derived from OpenBSD's libc/regex, original license follows:
3198090Srdivacky *
4198090Srdivacky * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5198090Srdivacky * Copyright (c) 1992, 1993, 1994
6198090Srdivacky *	The Regents of the University of California.  All rights reserved.
7198090Srdivacky *
8198090Srdivacky * This code is derived from software contributed to Berkeley by
9198090Srdivacky * Henry Spencer.
10198090Srdivacky *
11198090Srdivacky * Redistribution and use in source and binary forms, with or without
12198090Srdivacky * modification, are permitted provided that the following conditions
13198090Srdivacky * are met:
14198090Srdivacky * 1. Redistributions of source code must retain the above copyright
15198090Srdivacky *    notice, this list of conditions and the following disclaimer.
16198090Srdivacky * 2. Redistributions in binary form must reproduce the above copyright
17198090Srdivacky *    notice, this list of conditions and the following disclaimer in the
18198090Srdivacky *    documentation and/or other materials provided with the distribution.
19198090Srdivacky * 3. Neither the name of the University nor the names of its contributors
20198090Srdivacky *    may be used to endorse or promote products derived from this software
21198090Srdivacky *    without specific prior written permission.
22198090Srdivacky *
23198090Srdivacky * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24198090Srdivacky * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25198090Srdivacky * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26198090Srdivacky * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27198090Srdivacky * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28198090Srdivacky * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29198090Srdivacky * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30198090Srdivacky * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31198090Srdivacky * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32198090Srdivacky * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33198090Srdivacky * SUCH DAMAGE.
34198090Srdivacky *
35198090Srdivacky *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
36198090Srdivacky */
37198090Srdivacky
38198090Srdivacky#include <sys/types.h>
39198090Srdivacky#include <stdio.h>
40198090Srdivacky#include <string.h>
41198090Srdivacky#include <ctype.h>
42198090Srdivacky#include <limits.h>
43198090Srdivacky#include <stdlib.h>
44198090Srdivacky#include "regex_impl.h"
45198090Srdivacky
46198090Srdivacky#include "regutils.h"
47198090Srdivacky#include "regex2.h"
48198090Srdivacky
49198090Srdivacky#include "regcclass.h"
50198090Srdivacky#include "regcname.h"
51198090Srdivacky
52198090Srdivacky/*
53198090Srdivacky * parse structure, passed up and down to avoid global variables and
54198090Srdivacky * other clumsinesses
55198090Srdivacky */
56198090Srdivackystruct parse {
57198090Srdivacky	char *next;		/* next character in RE */
58198090Srdivacky	char *end;		/* end of string (-> NUL normally) */
59198090Srdivacky	int error;		/* has an error been seen? */
60198090Srdivacky	sop *strip;		/* malloced strip */
61198090Srdivacky	sopno ssize;		/* malloced strip size (allocated) */
62198090Srdivacky	sopno slen;		/* malloced strip length (used) */
63198090Srdivacky	int ncsalloc;		/* number of csets allocated */
64198090Srdivacky	struct re_guts *g;
65198090Srdivacky#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
66198090Srdivacky	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
67198090Srdivacky	sopno pend[NPAREN];	/* -> ) ([0] unused) */
68198090Srdivacky};
69198090Srdivacky
70198090Srdivackystatic void p_ere(struct parse *, int);
71198090Srdivackystatic void p_ere_exp(struct parse *);
72198090Srdivackystatic void p_str(struct parse *);
73198090Srdivackystatic void p_bre(struct parse *, int, int);
74198090Srdivackystatic int p_simp_re(struct parse *, int);
75198090Srdivackystatic int p_count(struct parse *);
76198090Srdivackystatic void p_bracket(struct parse *);
77198090Srdivackystatic void p_b_term(struct parse *, cset *);
78198090Srdivackystatic void p_b_cclass(struct parse *, cset *);
79198090Srdivackystatic void p_b_eclass(struct parse *, cset *);
80198090Srdivackystatic char p_b_symbol(struct parse *);
81198090Srdivackystatic char p_b_coll_elem(struct parse *, int);
82198090Srdivackystatic char othercase(int);
83198090Srdivackystatic void bothcases(struct parse *, int);
84198090Srdivackystatic void ordinary(struct parse *, int);
85198090Srdivackystatic void nonnewline(struct parse *);
86198090Srdivackystatic void repeat(struct parse *, sopno, int, int);
87198090Srdivackystatic int seterr(struct parse *, int);
88198090Srdivackystatic cset *allocset(struct parse *);
89198090Srdivackystatic void freeset(struct parse *, cset *);
90198090Srdivackystatic int freezeset(struct parse *, cset *);
91198090Srdivackystatic int firstch(struct parse *, cset *);
92198090Srdivackystatic int nch(struct parse *, cset *);
93198090Srdivackystatic void mcadd(struct parse *, cset *, const char *);
94198090Srdivackystatic void mcinvert(struct parse *, cset *);
95198090Srdivackystatic void mccase(struct parse *, cset *);
96198090Srdivackystatic int isinsets(struct re_guts *, int);
97198090Srdivackystatic int samesets(struct re_guts *, int, int);
98198090Srdivackystatic void categorize(struct parse *, struct re_guts *);
99198090Srdivackystatic sopno dupl(struct parse *, sopno, sopno);
100198090Srdivackystatic void doemit(struct parse *, sop, size_t);
101198090Srdivackystatic void doinsert(struct parse *, sop, size_t, sopno);
102198090Srdivackystatic void dofwd(struct parse *, sopno, sop);
103198090Srdivackystatic void enlarge(struct parse *, sopno);
104198090Srdivackystatic void stripsnug(struct parse *, struct re_guts *);
105198090Srdivackystatic void findmust(struct parse *, struct re_guts *);
106198090Srdivackystatic sopno pluscount(struct parse *, struct re_guts *);
107198090Srdivacky
108198090Srdivackystatic char nuls[10];		/* place to point scanner in event of error */
109198090Srdivacky
110198090Srdivacky/*
111198090Srdivacky * macros for use with parse structure
112198090Srdivacky * BEWARE:  these know that the parse structure is named `p' !!!
113198090Srdivacky */
114198090Srdivacky#define	PEEK()	(*p->next)
115198090Srdivacky#define	PEEK2()	(*(p->next+1))
116198090Srdivacky#define	MORE()	(p->next < p->end)
117198090Srdivacky#define	MORE2()	(p->next+1 < p->end)
118198090Srdivacky#define	SEE(c)	(MORE() && PEEK() == (c))
119198090Srdivacky#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
120198090Srdivacky#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
121198090Srdivacky#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
122198090Srdivacky#define	NEXT()	(p->next++)
123198090Srdivacky#define	NEXT2()	(p->next += 2)
124198090Srdivacky#define	NEXTn(n)	(p->next += (n))
125198090Srdivacky#define	GETNEXT()	(*p->next++)
126198090Srdivacky#define	SETERROR(e)	seterr(p, (e))
127198090Srdivacky#define	REQUIRE(co, e)	(void)((co) || SETERROR(e))
128198090Srdivacky#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
129198090Srdivacky#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
130198090Srdivacky#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
131198090Srdivacky#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
132198090Srdivacky#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
133198090Srdivacky#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
134198090Srdivacky#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
135198090Srdivacky#define	HERE()		(p->slen)
136198090Srdivacky#define	THERE()		(p->slen - 1)
137198090Srdivacky#define	THERETHERE()	(p->slen - 2)
138198090Srdivacky#define	DROP(n)	(p->slen -= (n))
139198090Srdivacky
140198090Srdivacky#ifdef	_POSIX2_RE_DUP_MAX
141198090Srdivacky#define	DUPMAX	_POSIX2_RE_DUP_MAX
142198090Srdivacky#else
143198090Srdivacky#define	DUPMAX	255
144198090Srdivacky#endif
145198090Srdivacky#define	INFINITY	(DUPMAX + 1)
146198090Srdivacky
147198090Srdivacky#ifndef NDEBUG
148198090Srdivackystatic int never = 0;		/* for use in asserts; shuts lint up */
149198090Srdivacky#else
150198090Srdivacky#define	never	0		/* some <assert.h>s have bugs too */
151198090Srdivacky#endif
152198090Srdivacky
153198090Srdivacky/*
154198090Srdivacky - llvm_regcomp - interface for parser and compilation
155198090Srdivacky */
156198090Srdivackyint				/* 0 success, otherwise REG_something */
157198090Srdivackyllvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
158198090Srdivacky{
159198090Srdivacky	struct parse pa;
160198090Srdivacky	struct re_guts *g;
161198090Srdivacky	struct parse *p = &pa;
162198090Srdivacky	int i;
163198090Srdivacky	size_t len;
164198090Srdivacky#ifdef REDEBUG
165198090Srdivacky#	define	GOODFLAGS(f)	(f)
166198090Srdivacky#else
167198090Srdivacky#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
168198090Srdivacky#endif
169198090Srdivacky
170198090Srdivacky	cflags = GOODFLAGS(cflags);
171198090Srdivacky	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
172198090Srdivacky		return(REG_INVARG);
173198090Srdivacky
174198090Srdivacky	if (cflags&REG_PEND) {
175198090Srdivacky		if (preg->re_endp < pattern)
176198090Srdivacky			return(REG_INVARG);
177198090Srdivacky		len = preg->re_endp - pattern;
178198090Srdivacky	} else
179198090Srdivacky		len = strlen((const char *)pattern);
180198090Srdivacky
181198090Srdivacky	/* do the mallocs early so failure handling is easy */
182198090Srdivacky	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
183198090Srdivacky							(NC-1)*sizeof(cat_t));
184198090Srdivacky	if (g == NULL)
185198090Srdivacky		return(REG_ESPACE);
186198090Srdivacky	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
187198090Srdivacky	p->strip = (sop *)calloc(p->ssize, sizeof(sop));
188198090Srdivacky	p->slen = 0;
189198090Srdivacky	if (p->strip == NULL) {
190198090Srdivacky		free((char *)g);
191198090Srdivacky		return(REG_ESPACE);
192198090Srdivacky	}
193198090Srdivacky
194198090Srdivacky	/* set things up */
195198090Srdivacky	p->g = g;
196198090Srdivacky	p->next = (char *)pattern;	/* convenience; we do not modify it */
197198090Srdivacky	p->end = p->next + len;
198198090Srdivacky	p->error = 0;
199198090Srdivacky	p->ncsalloc = 0;
200198090Srdivacky	for (i = 0; i < NPAREN; i++) {
201198090Srdivacky		p->pbegin[i] = 0;
202198090Srdivacky		p->pend[i] = 0;
203198090Srdivacky	}
204198090Srdivacky	g->csetsize = NC;
205198090Srdivacky	g->sets = NULL;
206198090Srdivacky	g->setbits = NULL;
207198090Srdivacky	g->ncsets = 0;
208198090Srdivacky	g->cflags = cflags;
209198090Srdivacky	g->iflags = 0;
210198090Srdivacky	g->nbol = 0;
211198090Srdivacky	g->neol = 0;
212198090Srdivacky	g->must = NULL;
213198090Srdivacky	g->mlen = 0;
214198090Srdivacky	g->nsub = 0;
215198090Srdivacky	g->ncategories = 1;	/* category 0 is "everything else" */
216198090Srdivacky	g->categories = &g->catspace[-(CHAR_MIN)];
217198090Srdivacky	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
218198090Srdivacky	g->backrefs = 0;
219198090Srdivacky
220198090Srdivacky	/* do it */
221198090Srdivacky	EMIT(OEND, 0);
222198090Srdivacky	g->firststate = THERE();
223198090Srdivacky	if (cflags&REG_EXTENDED)
224198090Srdivacky		p_ere(p, OUT);
225198090Srdivacky	else if (cflags&REG_NOSPEC)
226198090Srdivacky		p_str(p);
227198090Srdivacky	else
228198090Srdivacky		p_bre(p, OUT, OUT);
229198090Srdivacky	EMIT(OEND, 0);
230198090Srdivacky	g->laststate = THERE();
231198090Srdivacky
232198090Srdivacky	/* tidy up loose ends and fill things in */
233198090Srdivacky	categorize(p, g);
234198090Srdivacky	stripsnug(p, g);
235198090Srdivacky	findmust(p, g);
236198090Srdivacky	g->nplus = pluscount(p, g);
237198090Srdivacky	g->magic = MAGIC2;
238198090Srdivacky	preg->re_nsub = g->nsub;
239198090Srdivacky	preg->re_g = g;
240198090Srdivacky	preg->re_magic = MAGIC1;
241198090Srdivacky#ifndef REDEBUG
242198090Srdivacky	/* not debugging, so can't rely on the assert() in llvm_regexec() */
243198090Srdivacky	if (g->iflags&REGEX_BAD)
244198090Srdivacky		SETERROR(REG_ASSERT);
245198090Srdivacky#endif
246198090Srdivacky
247198090Srdivacky	/* win or lose, we're done */
248198090Srdivacky	if (p->error != 0)	/* lose */
249198090Srdivacky		llvm_regfree(preg);
250198090Srdivacky	return(p->error);
251198090Srdivacky}
252198090Srdivacky
253198090Srdivacky/*
254198090Srdivacky - p_ere - ERE parser top level, concatenation and alternation
255198090Srdivacky */
256198090Srdivackystatic void
257198090Srdivackyp_ere(struct parse *p, int stop)	/* character this ERE should end at */
258198090Srdivacky{
259198090Srdivacky	char c;
260198090Srdivacky	sopno prevback = 0;
261198090Srdivacky	sopno prevfwd = 0;
262198090Srdivacky	sopno conc;
263198090Srdivacky	int first = 1;		/* is this the first alternative? */
264198090Srdivacky
265198090Srdivacky	for (;;) {
266198090Srdivacky		/* do a bunch of concatenated expressions */
267198090Srdivacky		conc = HERE();
268198090Srdivacky		while (MORE() && (c = PEEK()) != '|' && c != stop)
269198090Srdivacky			p_ere_exp(p);
270198090Srdivacky		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
271198090Srdivacky
272198090Srdivacky		if (!EAT('|'))
273198090Srdivacky			break;		/* NOTE BREAK OUT */
274198090Srdivacky
275198090Srdivacky		if (first) {
276198090Srdivacky			INSERT(OCH_, conc);	/* offset is wrong */
277198090Srdivacky			prevfwd = conc;
278198090Srdivacky			prevback = conc;
279198090Srdivacky			first = 0;
280198090Srdivacky		}
281198090Srdivacky		ASTERN(OOR1, prevback);
282198090Srdivacky		prevback = THERE();
283198090Srdivacky		AHEAD(prevfwd);			/* fix previous offset */
284198090Srdivacky		prevfwd = HERE();
285198090Srdivacky		EMIT(OOR2, 0);			/* offset is very wrong */
286198090Srdivacky	}
287198090Srdivacky
288198090Srdivacky	if (!first) {		/* tail-end fixups */
289198090Srdivacky		AHEAD(prevfwd);
290198090Srdivacky		ASTERN(O_CH, prevback);
291198090Srdivacky	}
292198090Srdivacky
293198090Srdivacky	assert(!MORE() || SEE(stop));
294198090Srdivacky}
295198090Srdivacky
296198090Srdivacky/*
297198090Srdivacky - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
298198090Srdivacky */
299198090Srdivackystatic void
300198090Srdivackyp_ere_exp(struct parse *p)
301198090Srdivacky{
302198090Srdivacky	char c;
303198090Srdivacky	sopno pos;
304198090Srdivacky	int count;
305198090Srdivacky	int count2;
306249423Sdim	int backrefnum;
307198090Srdivacky	sopno subno;
308198090Srdivacky	int wascaret = 0;
309198090Srdivacky
310198090Srdivacky	assert(MORE());		/* caller should have ensured this */
311198090Srdivacky	c = GETNEXT();
312198090Srdivacky
313198090Srdivacky	pos = HERE();
314198090Srdivacky	switch (c) {
315198090Srdivacky	case '(':
316198090Srdivacky		REQUIRE(MORE(), REG_EPAREN);
317198090Srdivacky		p->g->nsub++;
318198090Srdivacky		subno = p->g->nsub;
319198090Srdivacky		if (subno < NPAREN)
320198090Srdivacky			p->pbegin[subno] = HERE();
321198090Srdivacky		EMIT(OLPAREN, subno);
322198090Srdivacky		if (!SEE(')'))
323198090Srdivacky			p_ere(p, ')');
324198090Srdivacky		if (subno < NPAREN) {
325198090Srdivacky			p->pend[subno] = HERE();
326198090Srdivacky			assert(p->pend[subno] != 0);
327198090Srdivacky		}
328198090Srdivacky		EMIT(ORPAREN, subno);
329198090Srdivacky		MUSTEAT(')', REG_EPAREN);
330198090Srdivacky		break;
331198090Srdivacky#ifndef POSIX_MISTAKE
332198090Srdivacky	case ')':		/* happens only if no current unmatched ( */
333198090Srdivacky		/*
334198090Srdivacky		 * You may ask, why the ifndef?  Because I didn't notice
335198090Srdivacky		 * this until slightly too late for 1003.2, and none of the
336198090Srdivacky		 * other 1003.2 regular-expression reviewers noticed it at
337198090Srdivacky		 * all.  So an unmatched ) is legal POSIX, at least until
338198090Srdivacky		 * we can get it fixed.
339198090Srdivacky		 */
340198090Srdivacky		SETERROR(REG_EPAREN);
341198090Srdivacky		break;
342198090Srdivacky#endif
343198090Srdivacky	case '^':
344198090Srdivacky		EMIT(OBOL, 0);
345198090Srdivacky		p->g->iflags |= USEBOL;
346198090Srdivacky		p->g->nbol++;
347198090Srdivacky		wascaret = 1;
348198090Srdivacky		break;
349198090Srdivacky	case '$':
350198090Srdivacky		EMIT(OEOL, 0);
351198090Srdivacky		p->g->iflags |= USEEOL;
352198090Srdivacky		p->g->neol++;
353198090Srdivacky		break;
354198090Srdivacky	case '|':
355198090Srdivacky		SETERROR(REG_EMPTY);
356198090Srdivacky		break;
357198090Srdivacky	case '*':
358198090Srdivacky	case '+':
359198090Srdivacky	case '?':
360198090Srdivacky		SETERROR(REG_BADRPT);
361198090Srdivacky		break;
362198090Srdivacky	case '.':
363198090Srdivacky		if (p->g->cflags&REG_NEWLINE)
364198090Srdivacky			nonnewline(p);
365198090Srdivacky		else
366198090Srdivacky			EMIT(OANY, 0);
367198090Srdivacky		break;
368198090Srdivacky	case '[':
369198090Srdivacky		p_bracket(p);
370198090Srdivacky		break;
371198090Srdivacky	case '\\':
372198090Srdivacky		REQUIRE(MORE(), REG_EESCAPE);
373198090Srdivacky		c = GETNEXT();
374249423Sdim		if (c >= '1' && c <= '9') {
375249423Sdim			/* \[0-9] is taken to be a back-reference to a previously specified
376249423Sdim			 * matching group. backrefnum will hold the number. The matching
377249423Sdim			 * group must exist (i.e. if \4 is found there must have been at
378249423Sdim			 * least 4 matching groups specified in the pattern previously).
379249423Sdim			 */
380249423Sdim			backrefnum = c - '0';
381249423Sdim			if (p->pend[backrefnum] == 0) {
382249423Sdim				SETERROR(REG_ESUBREG);
383249423Sdim				break;
384249423Sdim			}
385249423Sdim
386249423Sdim			/* Make sure everything checks out and emit the sequence
387249423Sdim			 * that marks a back-reference to the parse structure.
388249423Sdim			 */
389249423Sdim			assert(backrefnum <= p->g->nsub);
390249423Sdim			EMIT(OBACK_, backrefnum);
391249423Sdim			assert(p->pbegin[backrefnum] != 0);
392249423Sdim			assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
393249423Sdim			assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
394249423Sdim			(void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
395249423Sdim			EMIT(O_BACK, backrefnum);
396249423Sdim			p->g->backrefs = 1;
397249423Sdim		} else {
398249423Sdim			/* Other chars are simply themselves when escaped with a backslash.
399249423Sdim			 */
400249423Sdim			ordinary(p, c);
401249423Sdim		}
402198090Srdivacky		break;
403198090Srdivacky	case '{':		/* okay as ordinary except if digit follows */
404198090Srdivacky		REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
405198090Srdivacky		/* FALLTHROUGH */
406198090Srdivacky	default:
407198090Srdivacky		ordinary(p, c);
408198090Srdivacky		break;
409198090Srdivacky	}
410198090Srdivacky
411198090Srdivacky	if (!MORE())
412198090Srdivacky		return;
413198090Srdivacky	c = PEEK();
414198090Srdivacky	/* we call { a repetition if followed by a digit */
415198090Srdivacky	if (!( c == '*' || c == '+' || c == '?' ||
416198090Srdivacky				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
417198090Srdivacky		return;		/* no repetition, we're done */
418198090Srdivacky	NEXT();
419198090Srdivacky
420198090Srdivacky	REQUIRE(!wascaret, REG_BADRPT);
421198090Srdivacky	switch (c) {
422198090Srdivacky	case '*':	/* implemented as +? */
423198090Srdivacky		/* this case does not require the (y|) trick, noKLUDGE */
424198090Srdivacky		INSERT(OPLUS_, pos);
425198090Srdivacky		ASTERN(O_PLUS, pos);
426198090Srdivacky		INSERT(OQUEST_, pos);
427198090Srdivacky		ASTERN(O_QUEST, pos);
428198090Srdivacky		break;
429198090Srdivacky	case '+':
430198090Srdivacky		INSERT(OPLUS_, pos);
431198090Srdivacky		ASTERN(O_PLUS, pos);
432198090Srdivacky		break;
433198090Srdivacky	case '?':
434198090Srdivacky		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
435198090Srdivacky		INSERT(OCH_, pos);		/* offset slightly wrong */
436198090Srdivacky		ASTERN(OOR1, pos);		/* this one's right */
437198090Srdivacky		AHEAD(pos);			/* fix the OCH_ */
438198090Srdivacky		EMIT(OOR2, 0);			/* offset very wrong... */
439198090Srdivacky		AHEAD(THERE());			/* ...so fix it */
440198090Srdivacky		ASTERN(O_CH, THERETHERE());
441198090Srdivacky		break;
442198090Srdivacky	case '{':
443198090Srdivacky		count = p_count(p);
444198090Srdivacky		if (EAT(',')) {
445198090Srdivacky			if (isdigit((uch)PEEK())) {
446198090Srdivacky				count2 = p_count(p);
447198090Srdivacky				REQUIRE(count <= count2, REG_BADBR);
448198090Srdivacky			} else		/* single number with comma */
449198090Srdivacky				count2 = INFINITY;
450198090Srdivacky		} else		/* just a single number */
451198090Srdivacky			count2 = count;
452198090Srdivacky		repeat(p, pos, count, count2);
453198090Srdivacky		if (!EAT('}')) {	/* error heuristics */
454198090Srdivacky			while (MORE() && PEEK() != '}')
455198090Srdivacky				NEXT();
456198090Srdivacky			REQUIRE(MORE(), REG_EBRACE);
457198090Srdivacky			SETERROR(REG_BADBR);
458198090Srdivacky		}
459198090Srdivacky		break;
460198090Srdivacky	}
461198090Srdivacky
462198090Srdivacky	if (!MORE())
463198090Srdivacky		return;
464198090Srdivacky	c = PEEK();
465198090Srdivacky	if (!( c == '*' || c == '+' || c == '?' ||
466198090Srdivacky				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
467198090Srdivacky		return;
468198090Srdivacky	SETERROR(REG_BADRPT);
469198090Srdivacky}
470198090Srdivacky
471198090Srdivacky/*
472198090Srdivacky - p_str - string (no metacharacters) "parser"
473198090Srdivacky */
474198090Srdivackystatic void
475198090Srdivackyp_str(struct parse *p)
476198090Srdivacky{
477198090Srdivacky	REQUIRE(MORE(), REG_EMPTY);
478198090Srdivacky	while (MORE())
479198090Srdivacky		ordinary(p, GETNEXT());
480198090Srdivacky}
481198090Srdivacky
482198090Srdivacky/*
483198090Srdivacky - p_bre - BRE parser top level, anchoring and concatenation
484198090Srdivacky * Giving end1 as OUT essentially eliminates the end1/end2 check.
485198090Srdivacky *
486198090Srdivacky * This implementation is a bit of a kludge, in that a trailing $ is first
487198090Srdivacky * taken as an ordinary character and then revised to be an anchor.  The
488198090Srdivacky * only undesirable side effect is that '$' gets included as a character
489198090Srdivacky * category in such cases.  This is fairly harmless; not worth fixing.
490198090Srdivacky * The amount of lookahead needed to avoid this kludge is excessive.
491198090Srdivacky */
492198090Srdivackystatic void
493198090Srdivackyp_bre(struct parse *p,
494198090Srdivacky    int end1,		/* first terminating character */
495198090Srdivacky    int end2)		/* second terminating character */
496198090Srdivacky{
497198090Srdivacky	sopno start = HERE();
498198090Srdivacky	int first = 1;			/* first subexpression? */
499198090Srdivacky	int wasdollar = 0;
500198090Srdivacky
501198090Srdivacky	if (EAT('^')) {
502198090Srdivacky		EMIT(OBOL, 0);
503198090Srdivacky		p->g->iflags |= USEBOL;
504198090Srdivacky		p->g->nbol++;
505198090Srdivacky	}
506198090Srdivacky	while (MORE() && !SEETWO(end1, end2)) {
507198090Srdivacky		wasdollar = p_simp_re(p, first);
508198090Srdivacky		first = 0;
509198090Srdivacky	}
510198090Srdivacky	if (wasdollar) {	/* oops, that was a trailing anchor */
511198090Srdivacky		DROP(1);
512198090Srdivacky		EMIT(OEOL, 0);
513198090Srdivacky		p->g->iflags |= USEEOL;
514198090Srdivacky		p->g->neol++;
515198090Srdivacky	}
516198090Srdivacky
517198090Srdivacky	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
518198090Srdivacky}
519198090Srdivacky
520198090Srdivacky/*
521198090Srdivacky - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
522198090Srdivacky */
523198090Srdivackystatic int			/* was the simple RE an unbackslashed $? */
524198090Srdivackyp_simp_re(struct parse *p,
525198090Srdivacky    int starordinary)		/* is a leading * an ordinary character? */
526198090Srdivacky{
527198090Srdivacky	int c;
528198090Srdivacky	int count;
529198090Srdivacky	int count2;
530198090Srdivacky	sopno pos;
531198090Srdivacky	int i;
532198090Srdivacky	sopno subno;
533198090Srdivacky#	define	BACKSL	(1<<CHAR_BIT)
534198090Srdivacky
535198090Srdivacky	pos = HERE();		/* repetion op, if any, covers from here */
536198090Srdivacky
537198090Srdivacky	assert(MORE());		/* caller should have ensured this */
538198090Srdivacky	c = GETNEXT();
539198090Srdivacky	if (c == '\\') {
540198090Srdivacky		REQUIRE(MORE(), REG_EESCAPE);
541198090Srdivacky		c = BACKSL | GETNEXT();
542198090Srdivacky	}
543198090Srdivacky	switch (c) {
544198090Srdivacky	case '.':
545198090Srdivacky		if (p->g->cflags&REG_NEWLINE)
546198090Srdivacky			nonnewline(p);
547198090Srdivacky		else
548198090Srdivacky			EMIT(OANY, 0);
549198090Srdivacky		break;
550198090Srdivacky	case '[':
551198090Srdivacky		p_bracket(p);
552198090Srdivacky		break;
553198090Srdivacky	case BACKSL|'{':
554198090Srdivacky		SETERROR(REG_BADRPT);
555198090Srdivacky		break;
556198090Srdivacky	case BACKSL|'(':
557198090Srdivacky		p->g->nsub++;
558198090Srdivacky		subno = p->g->nsub;
559198090Srdivacky		if (subno < NPAREN)
560198090Srdivacky			p->pbegin[subno] = HERE();
561198090Srdivacky		EMIT(OLPAREN, subno);
562198090Srdivacky		/* the MORE here is an error heuristic */
563198090Srdivacky		if (MORE() && !SEETWO('\\', ')'))
564198090Srdivacky			p_bre(p, '\\', ')');
565198090Srdivacky		if (subno < NPAREN) {
566198090Srdivacky			p->pend[subno] = HERE();
567198090Srdivacky			assert(p->pend[subno] != 0);
568198090Srdivacky		}
569198090Srdivacky		EMIT(ORPAREN, subno);
570198090Srdivacky		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
571198090Srdivacky		break;
572198090Srdivacky	case BACKSL|')':	/* should not get here -- must be user */
573198090Srdivacky	case BACKSL|'}':
574198090Srdivacky		SETERROR(REG_EPAREN);
575198090Srdivacky		break;
576198090Srdivacky	case BACKSL|'1':
577198090Srdivacky	case BACKSL|'2':
578198090Srdivacky	case BACKSL|'3':
579198090Srdivacky	case BACKSL|'4':
580198090Srdivacky	case BACKSL|'5':
581198090Srdivacky	case BACKSL|'6':
582198090Srdivacky	case BACKSL|'7':
583198090Srdivacky	case BACKSL|'8':
584198090Srdivacky	case BACKSL|'9':
585198090Srdivacky		i = (c&~BACKSL) - '0';
586198090Srdivacky		assert(i < NPAREN);
587198090Srdivacky		if (p->pend[i] != 0) {
588198090Srdivacky			assert(i <= p->g->nsub);
589198090Srdivacky			EMIT(OBACK_, i);
590198090Srdivacky			assert(p->pbegin[i] != 0);
591198090Srdivacky			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
592198090Srdivacky			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
593198090Srdivacky			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
594198090Srdivacky			EMIT(O_BACK, i);
595198090Srdivacky		} else
596198090Srdivacky			SETERROR(REG_ESUBREG);
597198090Srdivacky		p->g->backrefs = 1;
598198090Srdivacky		break;
599198090Srdivacky	case '*':
600198090Srdivacky		REQUIRE(starordinary, REG_BADRPT);
601198090Srdivacky		/* FALLTHROUGH */
602198090Srdivacky	default:
603198090Srdivacky		ordinary(p, (char)c);
604198090Srdivacky		break;
605198090Srdivacky	}
606198090Srdivacky
607198090Srdivacky	if (EAT('*')) {		/* implemented as +? */
608198090Srdivacky		/* this case does not require the (y|) trick, noKLUDGE */
609198090Srdivacky		INSERT(OPLUS_, pos);
610198090Srdivacky		ASTERN(O_PLUS, pos);
611198090Srdivacky		INSERT(OQUEST_, pos);
612198090Srdivacky		ASTERN(O_QUEST, pos);
613198090Srdivacky	} else if (EATTWO('\\', '{')) {
614198090Srdivacky		count = p_count(p);
615198090Srdivacky		if (EAT(',')) {
616198090Srdivacky			if (MORE() && isdigit((uch)PEEK())) {
617198090Srdivacky				count2 = p_count(p);
618198090Srdivacky				REQUIRE(count <= count2, REG_BADBR);
619198090Srdivacky			} else		/* single number with comma */
620198090Srdivacky				count2 = INFINITY;
621198090Srdivacky		} else		/* just a single number */
622198090Srdivacky			count2 = count;
623198090Srdivacky		repeat(p, pos, count, count2);
624198090Srdivacky		if (!EATTWO('\\', '}')) {	/* error heuristics */
625198090Srdivacky			while (MORE() && !SEETWO('\\', '}'))
626198090Srdivacky				NEXT();
627198090Srdivacky			REQUIRE(MORE(), REG_EBRACE);
628198090Srdivacky			SETERROR(REG_BADBR);
629198090Srdivacky		}
630198090Srdivacky	} else if (c == '$')	/* $ (but not \$) ends it */
631198090Srdivacky		return(1);
632198090Srdivacky
633198090Srdivacky	return(0);
634198090Srdivacky}
635198090Srdivacky
636198090Srdivacky/*
637198090Srdivacky - p_count - parse a repetition count
638198090Srdivacky */
639198090Srdivackystatic int			/* the value */
640198090Srdivackyp_count(struct parse *p)
641198090Srdivacky{
642198090Srdivacky	int count = 0;
643198090Srdivacky	int ndigits = 0;
644198090Srdivacky
645198090Srdivacky	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
646198090Srdivacky		count = count*10 + (GETNEXT() - '0');
647198090Srdivacky		ndigits++;
648198090Srdivacky	}
649198090Srdivacky
650198090Srdivacky	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
651198090Srdivacky	return(count);
652198090Srdivacky}
653198090Srdivacky
654198090Srdivacky/*
655198090Srdivacky - p_bracket - parse a bracketed character list
656198090Srdivacky *
657198090Srdivacky * Note a significant property of this code:  if the allocset() did SETERROR,
658198090Srdivacky * no set operations are done.
659198090Srdivacky */
660198090Srdivackystatic void
661198090Srdivackyp_bracket(struct parse *p)
662198090Srdivacky{
663198090Srdivacky	cset *cs;
664198090Srdivacky	int invert = 0;
665198090Srdivacky
666198090Srdivacky	/* Dept of Truly Sickening Special-Case Kludges */
667198090Srdivacky	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
668198090Srdivacky		EMIT(OBOW, 0);
669198090Srdivacky		NEXTn(6);
670198090Srdivacky		return;
671198090Srdivacky	}
672198090Srdivacky	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
673198090Srdivacky		EMIT(OEOW, 0);
674198090Srdivacky		NEXTn(6);
675198090Srdivacky		return;
676198090Srdivacky	}
677198090Srdivacky
678198090Srdivacky	if ((cs = allocset(p)) == NULL) {
679198090Srdivacky		/* allocset did set error status in p */
680198090Srdivacky		return;
681198090Srdivacky	}
682198090Srdivacky
683198090Srdivacky	if (EAT('^'))
684198090Srdivacky		invert++;	/* make note to invert set at end */
685198090Srdivacky	if (EAT(']'))
686198090Srdivacky		CHadd(cs, ']');
687198090Srdivacky	else if (EAT('-'))
688198090Srdivacky		CHadd(cs, '-');
689198090Srdivacky	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
690198090Srdivacky		p_b_term(p, cs);
691198090Srdivacky	if (EAT('-'))
692198090Srdivacky		CHadd(cs, '-');
693198090Srdivacky	MUSTEAT(']', REG_EBRACK);
694198090Srdivacky
695198090Srdivacky	if (p->error != 0) {	/* don't mess things up further */
696198090Srdivacky		freeset(p, cs);
697198090Srdivacky		return;
698198090Srdivacky	}
699198090Srdivacky
700198090Srdivacky	if (p->g->cflags&REG_ICASE) {
701198090Srdivacky		int i;
702198090Srdivacky		int ci;
703198090Srdivacky
704198090Srdivacky		for (i = p->g->csetsize - 1; i >= 0; i--)
705198090Srdivacky			if (CHIN(cs, i) && isalpha(i)) {
706198090Srdivacky				ci = othercase(i);
707198090Srdivacky				if (ci != i)
708198090Srdivacky					CHadd(cs, ci);
709198090Srdivacky			}
710198090Srdivacky		if (cs->multis != NULL)
711198090Srdivacky			mccase(p, cs);
712198090Srdivacky	}
713198090Srdivacky	if (invert) {
714198090Srdivacky		int i;
715198090Srdivacky
716198090Srdivacky		for (i = p->g->csetsize - 1; i >= 0; i--)
717198090Srdivacky			if (CHIN(cs, i))
718198090Srdivacky				CHsub(cs, i);
719198090Srdivacky			else
720198090Srdivacky				CHadd(cs, i);
721198090Srdivacky		if (p->g->cflags&REG_NEWLINE)
722198090Srdivacky			CHsub(cs, '\n');
723198090Srdivacky		if (cs->multis != NULL)
724198090Srdivacky			mcinvert(p, cs);
725198090Srdivacky	}
726198090Srdivacky
727198090Srdivacky	assert(cs->multis == NULL);		/* xxx */
728198090Srdivacky
729198090Srdivacky	if (nch(p, cs) == 1) {		/* optimize singleton sets */
730198090Srdivacky		ordinary(p, firstch(p, cs));
731198090Srdivacky		freeset(p, cs);
732198090Srdivacky	} else
733198090Srdivacky		EMIT(OANYOF, freezeset(p, cs));
734198090Srdivacky}
735198090Srdivacky
736198090Srdivacky/*
737198090Srdivacky - p_b_term - parse one term of a bracketed character list
738198090Srdivacky */
739198090Srdivackystatic void
740198090Srdivackyp_b_term(struct parse *p, cset *cs)
741198090Srdivacky{
742198090Srdivacky	char c;
743198090Srdivacky	char start, finish;
744198090Srdivacky	int i;
745198090Srdivacky
746198090Srdivacky	/* classify what we've got */
747198090Srdivacky	switch ((MORE()) ? PEEK() : '\0') {
748198090Srdivacky	case '[':
749198090Srdivacky		c = (MORE2()) ? PEEK2() : '\0';
750198090Srdivacky		break;
751198090Srdivacky	case '-':
752198090Srdivacky		SETERROR(REG_ERANGE);
753198090Srdivacky		return;			/* NOTE RETURN */
754198090Srdivacky		break;
755198090Srdivacky	default:
756198090Srdivacky		c = '\0';
757198090Srdivacky		break;
758198090Srdivacky	}
759198090Srdivacky
760198090Srdivacky	switch (c) {
761198090Srdivacky	case ':':		/* character class */
762198090Srdivacky		NEXT2();
763198090Srdivacky		REQUIRE(MORE(), REG_EBRACK);
764198090Srdivacky		c = PEEK();
765198090Srdivacky		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
766198090Srdivacky		p_b_cclass(p, cs);
767198090Srdivacky		REQUIRE(MORE(), REG_EBRACK);
768198090Srdivacky		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
769198090Srdivacky		break;
770198090Srdivacky	case '=':		/* equivalence class */
771198090Srdivacky		NEXT2();
772198090Srdivacky		REQUIRE(MORE(), REG_EBRACK);
773198090Srdivacky		c = PEEK();
774198090Srdivacky		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
775198090Srdivacky		p_b_eclass(p, cs);
776198090Srdivacky		REQUIRE(MORE(), REG_EBRACK);
777198090Srdivacky		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
778198090Srdivacky		break;
779198090Srdivacky	default:		/* symbol, ordinary character, or range */
780198090Srdivacky/* xxx revision needed for multichar stuff */
781198090Srdivacky		start = p_b_symbol(p);
782198090Srdivacky		if (SEE('-') && MORE2() && PEEK2() != ']') {
783198090Srdivacky			/* range */
784198090Srdivacky			NEXT();
785198090Srdivacky			if (EAT('-'))
786198090Srdivacky				finish = '-';
787198090Srdivacky			else
788198090Srdivacky				finish = p_b_symbol(p);
789198090Srdivacky		} else
790198090Srdivacky			finish = start;
791198090Srdivacky/* xxx what about signed chars here... */
792198090Srdivacky		REQUIRE(start <= finish, REG_ERANGE);
793198090Srdivacky		for (i = start; i <= finish; i++)
794198090Srdivacky			CHadd(cs, i);
795198090Srdivacky		break;
796198090Srdivacky	}
797198090Srdivacky}
798198090Srdivacky
799198090Srdivacky/*
800198090Srdivacky - p_b_cclass - parse a character-class name and deal with it
801198090Srdivacky */
802198090Srdivackystatic void
803198090Srdivackyp_b_cclass(struct parse *p, cset *cs)
804198090Srdivacky{
805198090Srdivacky	char *sp = p->next;
806198090Srdivacky	struct cclass *cp;
807198090Srdivacky	size_t len;
808198090Srdivacky	const char *u;
809198090Srdivacky	char c;
810198090Srdivacky
811221345Sdim	while (MORE() && isalpha((uch)PEEK()))
812198090Srdivacky		NEXT();
813198090Srdivacky	len = p->next - sp;
814198090Srdivacky	for (cp = cclasses; cp->name != NULL; cp++)
815198090Srdivacky		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
816198090Srdivacky			break;
817198090Srdivacky	if (cp->name == NULL) {
818198090Srdivacky		/* oops, didn't find it */
819198090Srdivacky		SETERROR(REG_ECTYPE);
820198090Srdivacky		return;
821198090Srdivacky	}
822198090Srdivacky
823198090Srdivacky	u = cp->chars;
824198090Srdivacky	while ((c = *u++) != '\0')
825198090Srdivacky		CHadd(cs, c);
826198090Srdivacky	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
827198090Srdivacky		MCadd(p, cs, u);
828198090Srdivacky}
829198090Srdivacky
830198090Srdivacky/*
831198090Srdivacky - p_b_eclass - parse an equivalence-class name and deal with it
832198090Srdivacky *
833198090Srdivacky * This implementation is incomplete. xxx
834198090Srdivacky */
835198090Srdivackystatic void
836198090Srdivackyp_b_eclass(struct parse *p, cset *cs)
837198090Srdivacky{
838198090Srdivacky	char c;
839198090Srdivacky
840198090Srdivacky	c = p_b_coll_elem(p, '=');
841198090Srdivacky	CHadd(cs, c);
842198090Srdivacky}
843198090Srdivacky
844198090Srdivacky/*
845198090Srdivacky - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
846198090Srdivacky */
847198090Srdivackystatic char			/* value of symbol */
848198090Srdivackyp_b_symbol(struct parse *p)
849198090Srdivacky{
850198090Srdivacky	char value;
851198090Srdivacky
852198090Srdivacky	REQUIRE(MORE(), REG_EBRACK);
853198090Srdivacky	if (!EATTWO('[', '.'))
854198090Srdivacky		return(GETNEXT());
855198090Srdivacky
856198090Srdivacky	/* collating symbol */
857198090Srdivacky	value = p_b_coll_elem(p, '.');
858198090Srdivacky	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
859198090Srdivacky	return(value);
860198090Srdivacky}
861198090Srdivacky
862198090Srdivacky/*
863198090Srdivacky - p_b_coll_elem - parse a collating-element name and look it up
864198090Srdivacky */
865198090Srdivackystatic char			/* value of collating element */
866198090Srdivackyp_b_coll_elem(struct parse *p,
867198090Srdivacky    int endc)			/* name ended by endc,']' */
868198090Srdivacky{
869198090Srdivacky	char *sp = p->next;
870198090Srdivacky	struct cname *cp;
871198090Srdivacky	int len;
872198090Srdivacky
873198090Srdivacky	while (MORE() && !SEETWO(endc, ']'))
874198090Srdivacky		NEXT();
875198090Srdivacky	if (!MORE()) {
876198090Srdivacky		SETERROR(REG_EBRACK);
877198090Srdivacky		return(0);
878198090Srdivacky	}
879198090Srdivacky	len = p->next - sp;
880198090Srdivacky	for (cp = cnames; cp->name != NULL; cp++)
881198090Srdivacky		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
882198090Srdivacky			return(cp->code);	/* known name */
883198090Srdivacky	if (len == 1)
884198090Srdivacky		return(*sp);	/* single character */
885198090Srdivacky	SETERROR(REG_ECOLLATE);			/* neither */
886198090Srdivacky	return(0);
887198090Srdivacky}
888198090Srdivacky
889198090Srdivacky/*
890198090Srdivacky - othercase - return the case counterpart of an alphabetic
891198090Srdivacky */
892198090Srdivackystatic char			/* if no counterpart, return ch */
893198090Srdivackyothercase(int ch)
894198090Srdivacky{
895198090Srdivacky	ch = (uch)ch;
896198090Srdivacky	assert(isalpha(ch));
897198090Srdivacky	if (isupper(ch))
898198090Srdivacky		return ((uch)tolower(ch));
899198090Srdivacky	else if (islower(ch))
900198090Srdivacky		return ((uch)toupper(ch));
901198090Srdivacky	else			/* peculiar, but could happen */
902198090Srdivacky		return(ch);
903198090Srdivacky}
904198090Srdivacky
905198090Srdivacky/*
906198090Srdivacky - bothcases - emit a dualcase version of a two-case character
907198090Srdivacky *
908198090Srdivacky * Boy, is this implementation ever a kludge...
909198090Srdivacky */
910198090Srdivackystatic void
911198090Srdivackybothcases(struct parse *p, int ch)
912198090Srdivacky{
913198090Srdivacky	char *oldnext = p->next;
914198090Srdivacky	char *oldend = p->end;
915198090Srdivacky	char bracket[3];
916198090Srdivacky
917198090Srdivacky	ch = (uch)ch;
918198090Srdivacky	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
919198090Srdivacky	p->next = bracket;
920198090Srdivacky	p->end = bracket+2;
921198090Srdivacky	bracket[0] = ch;
922198090Srdivacky	bracket[1] = ']';
923198090Srdivacky	bracket[2] = '\0';
924198090Srdivacky	p_bracket(p);
925198090Srdivacky	assert(p->next == bracket+2);
926198090Srdivacky	p->next = oldnext;
927198090Srdivacky	p->end = oldend;
928198090Srdivacky}
929198090Srdivacky
930198090Srdivacky/*
931198090Srdivacky - ordinary - emit an ordinary character
932198090Srdivacky */
933198090Srdivackystatic void
934198090Srdivackyordinary(struct parse *p, int ch)
935198090Srdivacky{
936198090Srdivacky	cat_t *cap = p->g->categories;
937198090Srdivacky
938198090Srdivacky	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
939198090Srdivacky		bothcases(p, ch);
940198090Srdivacky	else {
941198090Srdivacky		EMIT(OCHAR, (uch)ch);
942198090Srdivacky		if (cap[ch] == 0)
943198090Srdivacky			cap[ch] = p->g->ncategories++;
944198090Srdivacky	}
945198090Srdivacky}
946198090Srdivacky
947198090Srdivacky/*
948198090Srdivacky - nonnewline - emit REG_NEWLINE version of OANY
949198090Srdivacky *
950198090Srdivacky * Boy, is this implementation ever a kludge...
951198090Srdivacky */
952198090Srdivackystatic void
953198090Srdivackynonnewline(struct parse *p)
954198090Srdivacky{
955198090Srdivacky	char *oldnext = p->next;
956198090Srdivacky	char *oldend = p->end;
957198090Srdivacky	char bracket[4];
958198090Srdivacky
959198090Srdivacky	p->next = bracket;
960198090Srdivacky	p->end = bracket+3;
961198090Srdivacky	bracket[0] = '^';
962198090Srdivacky	bracket[1] = '\n';
963198090Srdivacky	bracket[2] = ']';
964198090Srdivacky	bracket[3] = '\0';
965198090Srdivacky	p_bracket(p);
966198090Srdivacky	assert(p->next == bracket+3);
967198090Srdivacky	p->next = oldnext;
968198090Srdivacky	p->end = oldend;
969198090Srdivacky}
970198090Srdivacky
971198090Srdivacky/*
972198090Srdivacky - repeat - generate code for a bounded repetition, recursively if needed
973198090Srdivacky */
974198090Srdivackystatic void
975198090Srdivackyrepeat(struct parse *p,
976198090Srdivacky    sopno start,		/* operand from here to end of strip */
977198090Srdivacky    int from,			/* repeated from this number */
978198090Srdivacky    int to)			/* to this number of times (maybe INFINITY) */
979198090Srdivacky{
980198090Srdivacky	sopno finish = HERE();
981198090Srdivacky#	define	N	2
982198090Srdivacky#	define	INF	3
983198090Srdivacky#	define	REP(f, t)	((f)*8 + (t))
984198090Srdivacky#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
985198090Srdivacky	sopno copy;
986198090Srdivacky
987198090Srdivacky	if (p->error != 0)	/* head off possible runaway recursion */
988198090Srdivacky		return;
989198090Srdivacky
990198090Srdivacky	assert(from <= to);
991198090Srdivacky
992198090Srdivacky	switch (REP(MAP(from), MAP(to))) {
993198090Srdivacky	case REP(0, 0):			/* must be user doing this */
994198090Srdivacky		DROP(finish-start);	/* drop the operand */
995198090Srdivacky		break;
996198090Srdivacky	case REP(0, 1):			/* as x{1,1}? */
997198090Srdivacky	case REP(0, N):			/* as x{1,n}? */
998198090Srdivacky	case REP(0, INF):		/* as x{1,}? */
999198090Srdivacky		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1000198090Srdivacky		INSERT(OCH_, start);		/* offset is wrong... */
1001198090Srdivacky		repeat(p, start+1, 1, to);
1002198090Srdivacky		ASTERN(OOR1, start);
1003198090Srdivacky		AHEAD(start);			/* ... fix it */
1004198090Srdivacky		EMIT(OOR2, 0);
1005198090Srdivacky		AHEAD(THERE());
1006198090Srdivacky		ASTERN(O_CH, THERETHERE());
1007198090Srdivacky		break;
1008198090Srdivacky	case REP(1, 1):			/* trivial case */
1009198090Srdivacky		/* done */
1010198090Srdivacky		break;
1011198090Srdivacky	case REP(1, N):			/* as x?x{1,n-1} */
1012198090Srdivacky		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1013198090Srdivacky		INSERT(OCH_, start);
1014198090Srdivacky		ASTERN(OOR1, start);
1015198090Srdivacky		AHEAD(start);
1016198090Srdivacky		EMIT(OOR2, 0);			/* offset very wrong... */
1017198090Srdivacky		AHEAD(THERE());			/* ...so fix it */
1018198090Srdivacky		ASTERN(O_CH, THERETHERE());
1019198090Srdivacky		copy = dupl(p, start+1, finish+1);
1020198090Srdivacky		assert(copy == finish+4);
1021198090Srdivacky		repeat(p, copy, 1, to-1);
1022198090Srdivacky		break;
1023198090Srdivacky	case REP(1, INF):		/* as x+ */
1024198090Srdivacky		INSERT(OPLUS_, start);
1025198090Srdivacky		ASTERN(O_PLUS, start);
1026198090Srdivacky		break;
1027198090Srdivacky	case REP(N, N):			/* as xx{m-1,n-1} */
1028198090Srdivacky		copy = dupl(p, start, finish);
1029198090Srdivacky		repeat(p, copy, from-1, to-1);
1030198090Srdivacky		break;
1031198090Srdivacky	case REP(N, INF):		/* as xx{n-1,INF} */
1032198090Srdivacky		copy = dupl(p, start, finish);
1033198090Srdivacky		repeat(p, copy, from-1, to);
1034198090Srdivacky		break;
1035198090Srdivacky	default:			/* "can't happen" */
1036198090Srdivacky		SETERROR(REG_ASSERT);	/* just in case */
1037198090Srdivacky		break;
1038198090Srdivacky	}
1039198090Srdivacky}
1040198090Srdivacky
1041198090Srdivacky/*
1042198090Srdivacky - seterr - set an error condition
1043198090Srdivacky */
1044198090Srdivackystatic int			/* useless but makes type checking happy */
1045198090Srdivackyseterr(struct parse *p, int e)
1046198090Srdivacky{
1047198090Srdivacky	if (p->error == 0)	/* keep earliest error condition */
1048198090Srdivacky		p->error = e;
1049198090Srdivacky	p->next = nuls;		/* try to bring things to a halt */
1050198090Srdivacky	p->end = nuls;
1051198090Srdivacky	return(0);		/* make the return value well-defined */
1052198090Srdivacky}
1053198090Srdivacky
1054198090Srdivacky/*
1055198090Srdivacky - allocset - allocate a set of characters for []
1056198090Srdivacky */
1057198090Srdivackystatic cset *
1058198090Srdivackyallocset(struct parse *p)
1059198090Srdivacky{
1060198090Srdivacky	int no = p->g->ncsets++;
1061198090Srdivacky	size_t nc;
1062198090Srdivacky	size_t nbytes;
1063198090Srdivacky	cset *cs;
1064198090Srdivacky	size_t css = (size_t)p->g->csetsize;
1065198090Srdivacky	int i;
1066198090Srdivacky
1067198090Srdivacky	if (no >= p->ncsalloc) {	/* need another column of space */
1068198090Srdivacky		void *ptr;
1069198090Srdivacky
1070198090Srdivacky		p->ncsalloc += CHAR_BIT;
1071198090Srdivacky		nc = p->ncsalloc;
1072198090Srdivacky		assert(nc % CHAR_BIT == 0);
1073198090Srdivacky		nbytes = nc / CHAR_BIT * css;
1074198090Srdivacky
1075198090Srdivacky		ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1076198090Srdivacky		if (ptr == NULL)
1077198090Srdivacky			goto nomem;
1078198090Srdivacky		p->g->sets = ptr;
1079198090Srdivacky
1080198090Srdivacky		ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1081198090Srdivacky		if (ptr == NULL)
1082198090Srdivacky			goto nomem;
1083198090Srdivacky		p->g->setbits = ptr;
1084198090Srdivacky
1085198090Srdivacky		for (i = 0; i < no; i++)
1086198090Srdivacky			p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1087198090Srdivacky
1088198090Srdivacky		(void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1089198090Srdivacky	}
1090198090Srdivacky	/* XXX should not happen */
1091198090Srdivacky	if (p->g->sets == NULL || p->g->setbits == NULL)
1092198090Srdivacky		goto nomem;
1093198090Srdivacky
1094198090Srdivacky	cs = &p->g->sets[no];
1095198090Srdivacky	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1096198090Srdivacky	cs->mask = 1 << ((no) % CHAR_BIT);
1097198090Srdivacky	cs->hash = 0;
1098198090Srdivacky	cs->smultis = 0;
1099198090Srdivacky	cs->multis = NULL;
1100198090Srdivacky
1101198090Srdivacky	return(cs);
1102198090Srdivackynomem:
1103198090Srdivacky	free(p->g->sets);
1104198090Srdivacky	p->g->sets = NULL;
1105198090Srdivacky	free(p->g->setbits);
1106198090Srdivacky	p->g->setbits = NULL;
1107198090Srdivacky
1108198090Srdivacky	SETERROR(REG_ESPACE);
1109198090Srdivacky	/* caller's responsibility not to do set ops */
1110198090Srdivacky	return(NULL);
1111198090Srdivacky}
1112198090Srdivacky
1113198090Srdivacky/*
1114198090Srdivacky - freeset - free a now-unused set
1115198090Srdivacky */
1116198090Srdivackystatic void
1117198090Srdivackyfreeset(struct parse *p, cset *cs)
1118198090Srdivacky{
1119198090Srdivacky	size_t i;
1120198090Srdivacky	cset *top = &p->g->sets[p->g->ncsets];
1121198090Srdivacky	size_t css = (size_t)p->g->csetsize;
1122198090Srdivacky
1123198090Srdivacky	for (i = 0; i < css; i++)
1124198090Srdivacky		CHsub(cs, i);
1125198090Srdivacky	if (cs == top-1)	/* recover only the easy case */
1126198090Srdivacky		p->g->ncsets--;
1127198090Srdivacky}
1128198090Srdivacky
1129198090Srdivacky/*
1130198090Srdivacky - freezeset - final processing on a set of characters
1131198090Srdivacky *
1132198090Srdivacky * The main task here is merging identical sets.  This is usually a waste
1133198090Srdivacky * of time (although the hash code minimizes the overhead), but can win
1134198090Srdivacky * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1135198090Srdivacky * is done using addition rather than xor -- all ASCII [aA] sets xor to
1136198090Srdivacky * the same value!
1137198090Srdivacky */
1138198090Srdivackystatic int			/* set number */
1139198090Srdivackyfreezeset(struct parse *p, cset *cs)
1140198090Srdivacky{
1141198090Srdivacky	uch h = cs->hash;
1142198090Srdivacky	size_t i;
1143198090Srdivacky	cset *top = &p->g->sets[p->g->ncsets];
1144198090Srdivacky	cset *cs2;
1145198090Srdivacky	size_t css = (size_t)p->g->csetsize;
1146198090Srdivacky
1147198090Srdivacky	/* look for an earlier one which is the same */
1148198090Srdivacky	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1149198090Srdivacky		if (cs2->hash == h && cs2 != cs) {
1150198090Srdivacky			/* maybe */
1151198090Srdivacky			for (i = 0; i < css; i++)
1152198090Srdivacky				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1153198090Srdivacky					break;		/* no */
1154198090Srdivacky			if (i == css)
1155198090Srdivacky				break;			/* yes */
1156198090Srdivacky		}
1157198090Srdivacky
1158198090Srdivacky	if (cs2 < top) {	/* found one */
1159198090Srdivacky		freeset(p, cs);
1160198090Srdivacky		cs = cs2;
1161198090Srdivacky	}
1162198090Srdivacky
1163198090Srdivacky	return((int)(cs - p->g->sets));
1164198090Srdivacky}
1165198090Srdivacky
1166198090Srdivacky/*
1167198090Srdivacky - firstch - return first character in a set (which must have at least one)
1168198090Srdivacky */
1169198090Srdivackystatic int			/* character; there is no "none" value */
1170198090Srdivackyfirstch(struct parse *p, cset *cs)
1171198090Srdivacky{
1172198090Srdivacky	size_t i;
1173198090Srdivacky	size_t css = (size_t)p->g->csetsize;
1174198090Srdivacky
1175198090Srdivacky	for (i = 0; i < css; i++)
1176198090Srdivacky		if (CHIN(cs, i))
1177198090Srdivacky			return((char)i);
1178198090Srdivacky	assert(never);
1179198090Srdivacky	return(0);		/* arbitrary */
1180198090Srdivacky}
1181198090Srdivacky
1182198090Srdivacky/*
1183198090Srdivacky - nch - number of characters in a set
1184198090Srdivacky */
1185198090Srdivackystatic int
1186198090Srdivackynch(struct parse *p, cset *cs)
1187198090Srdivacky{
1188198090Srdivacky	size_t i;
1189198090Srdivacky	size_t css = (size_t)p->g->csetsize;
1190198090Srdivacky	int n = 0;
1191198090Srdivacky
1192198090Srdivacky	for (i = 0; i < css; i++)
1193198090Srdivacky		if (CHIN(cs, i))
1194198090Srdivacky			n++;
1195198090Srdivacky	return(n);
1196198090Srdivacky}
1197198090Srdivacky
1198198090Srdivacky/*
1199198090Srdivacky - mcadd - add a collating element to a cset
1200198090Srdivacky */
1201198090Srdivackystatic void
1202198090Srdivackymcadd( struct parse *p, cset *cs, const char *cp)
1203198090Srdivacky{
1204198090Srdivacky	size_t oldend = cs->smultis;
1205198090Srdivacky	void *np;
1206198090Srdivacky
1207198090Srdivacky	cs->smultis += strlen(cp) + 1;
1208198090Srdivacky	np = realloc(cs->multis, cs->smultis);
1209198090Srdivacky	if (np == NULL) {
1210198090Srdivacky		if (cs->multis)
1211198090Srdivacky			free(cs->multis);
1212198090Srdivacky		cs->multis = NULL;
1213198090Srdivacky		SETERROR(REG_ESPACE);
1214198090Srdivacky		return;
1215198090Srdivacky	}
1216198090Srdivacky	cs->multis = np;
1217198090Srdivacky
1218198090Srdivacky	llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1219198090Srdivacky}
1220198090Srdivacky
1221198090Srdivacky/*
1222198090Srdivacky - mcinvert - invert the list of collating elements in a cset
1223198090Srdivacky *
1224198090Srdivacky * This would have to know the set of possibilities.  Implementation
1225198090Srdivacky * is deferred.
1226198090Srdivacky */
1227198090Srdivacky/* ARGSUSED */
1228198090Srdivackystatic void
1229198090Srdivackymcinvert(struct parse *p, cset *cs)
1230198090Srdivacky{
1231198090Srdivacky	assert(cs->multis == NULL);	/* xxx */
1232198090Srdivacky}
1233198090Srdivacky
1234198090Srdivacky/*
1235198090Srdivacky - mccase - add case counterparts of the list of collating elements in a cset
1236198090Srdivacky *
1237198090Srdivacky * This would have to know the set of possibilities.  Implementation
1238198090Srdivacky * is deferred.
1239198090Srdivacky */
1240198090Srdivacky/* ARGSUSED */
1241198090Srdivackystatic void
1242198090Srdivackymccase(struct parse *p, cset *cs)
1243198090Srdivacky{
1244198090Srdivacky	assert(cs->multis == NULL);	/* xxx */
1245198090Srdivacky}
1246198090Srdivacky
1247198090Srdivacky/*
1248198090Srdivacky - isinsets - is this character in any sets?
1249198090Srdivacky */
1250198090Srdivackystatic int			/* predicate */
1251198090Srdivackyisinsets(struct re_guts *g, int c)
1252198090Srdivacky{
1253198090Srdivacky	uch *col;
1254198090Srdivacky	int i;
1255198090Srdivacky	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1256198090Srdivacky	unsigned uc = (uch)c;
1257198090Srdivacky
1258198090Srdivacky	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1259198090Srdivacky		if (col[uc] != 0)
1260198090Srdivacky			return(1);
1261198090Srdivacky	return(0);
1262198090Srdivacky}
1263198090Srdivacky
1264198090Srdivacky/*
1265198090Srdivacky - samesets - are these two characters in exactly the same sets?
1266198090Srdivacky */
1267198090Srdivackystatic int			/* predicate */
1268198090Srdivackysamesets(struct re_guts *g, int c1, int c2)
1269198090Srdivacky{
1270198090Srdivacky	uch *col;
1271198090Srdivacky	int i;
1272198090Srdivacky	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1273198090Srdivacky	unsigned uc1 = (uch)c1;
1274198090Srdivacky	unsigned uc2 = (uch)c2;
1275198090Srdivacky
1276198090Srdivacky	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1277198090Srdivacky		if (col[uc1] != col[uc2])
1278198090Srdivacky			return(0);
1279198090Srdivacky	return(1);
1280198090Srdivacky}
1281198090Srdivacky
1282198090Srdivacky/*
1283198090Srdivacky - categorize - sort out character categories
1284198090Srdivacky */
1285198090Srdivackystatic void
1286198090Srdivackycategorize(struct parse *p, struct re_guts *g)
1287198090Srdivacky{
1288198090Srdivacky	cat_t *cats = g->categories;
1289198090Srdivacky	int c;
1290198090Srdivacky	int c2;
1291198090Srdivacky	cat_t cat;
1292198090Srdivacky
1293198090Srdivacky	/* avoid making error situations worse */
1294198090Srdivacky	if (p->error != 0)
1295198090Srdivacky		return;
1296198090Srdivacky
1297198090Srdivacky	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1298198090Srdivacky		if (cats[c] == 0 && isinsets(g, c)) {
1299198090Srdivacky			cat = g->ncategories++;
1300198090Srdivacky			cats[c] = cat;
1301198090Srdivacky			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1302198090Srdivacky				if (cats[c2] == 0 && samesets(g, c, c2))
1303198090Srdivacky					cats[c2] = cat;
1304198090Srdivacky		}
1305198090Srdivacky}
1306198090Srdivacky
1307198090Srdivacky/*
1308198090Srdivacky - dupl - emit a duplicate of a bunch of sops
1309198090Srdivacky */
1310198090Srdivackystatic sopno			/* start of duplicate */
1311198090Srdivackydupl(struct parse *p,
1312198090Srdivacky    sopno start,		/* from here */
1313198090Srdivacky    sopno finish)		/* to this less one */
1314198090Srdivacky{
1315198090Srdivacky	sopno ret = HERE();
1316198090Srdivacky	sopno len = finish - start;
1317198090Srdivacky
1318198090Srdivacky	assert(finish >= start);
1319198090Srdivacky	if (len == 0)
1320198090Srdivacky		return(ret);
1321198090Srdivacky	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1322198090Srdivacky	assert(p->ssize >= p->slen + len);
1323198090Srdivacky	(void) memmove((char *)(p->strip + p->slen),
1324198090Srdivacky		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1325198090Srdivacky	p->slen += len;
1326198090Srdivacky	return(ret);
1327198090Srdivacky}
1328198090Srdivacky
1329198090Srdivacky/*
1330198090Srdivacky - doemit - emit a strip operator
1331198090Srdivacky *
1332198090Srdivacky * It might seem better to implement this as a macro with a function as
1333198090Srdivacky * hard-case backup, but it's just too big and messy unless there are
1334198090Srdivacky * some changes to the data structures.  Maybe later.
1335198090Srdivacky */
1336198090Srdivackystatic void
1337198090Srdivackydoemit(struct parse *p, sop op, size_t opnd)
1338198090Srdivacky{
1339198090Srdivacky	/* avoid making error situations worse */
1340198090Srdivacky	if (p->error != 0)
1341198090Srdivacky		return;
1342198090Srdivacky
1343198090Srdivacky	/* deal with oversize operands ("can't happen", more or less) */
1344198090Srdivacky	assert(opnd < 1<<OPSHIFT);
1345198090Srdivacky
1346198090Srdivacky	/* deal with undersized strip */
1347198090Srdivacky	if (p->slen >= p->ssize)
1348198090Srdivacky		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1349198090Srdivacky	assert(p->slen < p->ssize);
1350198090Srdivacky
1351198090Srdivacky	/* finally, it's all reduced to the easy case */
1352198090Srdivacky	p->strip[p->slen++] = SOP(op, opnd);
1353198090Srdivacky}
1354198090Srdivacky
1355198090Srdivacky/*
1356198090Srdivacky - doinsert - insert a sop into the strip
1357198090Srdivacky */
1358198090Srdivackystatic void
1359198090Srdivackydoinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1360198090Srdivacky{
1361198090Srdivacky	sopno sn;
1362198090Srdivacky	sop s;
1363198090Srdivacky	int i;
1364198090Srdivacky
1365198090Srdivacky	/* avoid making error situations worse */
1366198090Srdivacky	if (p->error != 0)
1367198090Srdivacky		return;
1368198090Srdivacky
1369198090Srdivacky	sn = HERE();
1370198090Srdivacky	EMIT(op, opnd);		/* do checks, ensure space */
1371198090Srdivacky	assert(HERE() == sn+1);
1372198090Srdivacky	s = p->strip[sn];
1373198090Srdivacky
1374198090Srdivacky	/* adjust paren pointers */
1375198090Srdivacky	assert(pos > 0);
1376198090Srdivacky	for (i = 1; i < NPAREN; i++) {
1377198090Srdivacky		if (p->pbegin[i] >= pos) {
1378198090Srdivacky			p->pbegin[i]++;
1379198090Srdivacky		}
1380198090Srdivacky		if (p->pend[i] >= pos) {
1381198090Srdivacky			p->pend[i]++;
1382198090Srdivacky		}
1383198090Srdivacky	}
1384198090Srdivacky
1385198090Srdivacky	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1386198090Srdivacky						(HERE()-pos-1)*sizeof(sop));
1387198090Srdivacky	p->strip[pos] = s;
1388198090Srdivacky}
1389198090Srdivacky
1390198090Srdivacky/*
1391198090Srdivacky - dofwd - complete a forward reference
1392198090Srdivacky */
1393198090Srdivackystatic void
1394198090Srdivackydofwd(struct parse *p, sopno pos, sop value)
1395198090Srdivacky{
1396198090Srdivacky	/* avoid making error situations worse */
1397198090Srdivacky	if (p->error != 0)
1398198090Srdivacky		return;
1399198090Srdivacky
1400198090Srdivacky	assert(value < 1<<OPSHIFT);
1401198090Srdivacky	p->strip[pos] = OP(p->strip[pos]) | value;
1402198090Srdivacky}
1403198090Srdivacky
1404198090Srdivacky/*
1405198090Srdivacky - enlarge - enlarge the strip
1406198090Srdivacky */
1407198090Srdivackystatic void
1408198090Srdivackyenlarge(struct parse *p, sopno size)
1409198090Srdivacky{
1410198090Srdivacky	sop *sp;
1411198090Srdivacky
1412198090Srdivacky	if (p->ssize >= size)
1413198090Srdivacky		return;
1414198090Srdivacky
1415198090Srdivacky	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1416198090Srdivacky	if (sp == NULL) {
1417198090Srdivacky		SETERROR(REG_ESPACE);
1418198090Srdivacky		return;
1419198090Srdivacky	}
1420198090Srdivacky	p->strip = sp;
1421198090Srdivacky	p->ssize = size;
1422198090Srdivacky}
1423198090Srdivacky
1424198090Srdivacky/*
1425198090Srdivacky - stripsnug - compact the strip
1426198090Srdivacky */
1427198090Srdivackystatic void
1428198090Srdivackystripsnug(struct parse *p, struct re_guts *g)
1429198090Srdivacky{
1430198090Srdivacky	g->nstates = p->slen;
1431198090Srdivacky	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1432198090Srdivacky	if (g->strip == NULL) {
1433198090Srdivacky		SETERROR(REG_ESPACE);
1434198090Srdivacky		g->strip = p->strip;
1435198090Srdivacky	}
1436198090Srdivacky}
1437198090Srdivacky
1438198090Srdivacky/*
1439198090Srdivacky - findmust - fill in must and mlen with longest mandatory literal string
1440198090Srdivacky *
1441198090Srdivacky * This algorithm could do fancy things like analyzing the operands of |
1442198090Srdivacky * for common subsequences.  Someday.  This code is simple and finds most
1443198090Srdivacky * of the interesting cases.
1444198090Srdivacky *
1445198090Srdivacky * Note that must and mlen got initialized during setup.
1446198090Srdivacky */
1447198090Srdivackystatic void
1448198090Srdivackyfindmust(struct parse *p, struct re_guts *g)
1449198090Srdivacky{
1450198090Srdivacky	sop *scan;
1451198090Srdivacky	sop *start = 0; /* start initialized in the default case, after that */
1452198090Srdivacky	sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1453198090Srdivacky	sopno newlen;
1454198090Srdivacky	sop s;
1455198090Srdivacky	char *cp;
1456198090Srdivacky	sopno i;
1457198090Srdivacky
1458198090Srdivacky	/* avoid making error situations worse */
1459198090Srdivacky	if (p->error != 0)
1460198090Srdivacky		return;
1461198090Srdivacky
1462198090Srdivacky	/* find the longest OCHAR sequence in strip */
1463198090Srdivacky	newlen = 0;
1464198090Srdivacky	scan = g->strip + 1;
1465198090Srdivacky	do {
1466198090Srdivacky		s = *scan++;
1467198090Srdivacky		switch (OP(s)) {
1468198090Srdivacky		case OCHAR:		/* sequence member */
1469198090Srdivacky			if (newlen == 0)		/* new sequence */
1470198090Srdivacky				newstart = scan - 1;
1471198090Srdivacky			newlen++;
1472198090Srdivacky			break;
1473198090Srdivacky		case OPLUS_:		/* things that don't break one */
1474198090Srdivacky		case OLPAREN:
1475198090Srdivacky		case ORPAREN:
1476198090Srdivacky			break;
1477198090Srdivacky		case OQUEST_:		/* things that must be skipped */
1478198090Srdivacky		case OCH_:
1479198090Srdivacky			scan--;
1480198090Srdivacky			do {
1481198090Srdivacky				scan += OPND(s);
1482198090Srdivacky				s = *scan;
1483198090Srdivacky				/* assert() interferes w debug printouts */
1484198090Srdivacky				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1485198090Srdivacky							OP(s) != OOR2) {
1486198090Srdivacky					g->iflags |= REGEX_BAD;
1487198090Srdivacky					return;
1488198090Srdivacky				}
1489198090Srdivacky			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1490198090Srdivacky			/* fallthrough */
1491198090Srdivacky		default:		/* things that break a sequence */
1492198090Srdivacky			if (newlen > g->mlen) {		/* ends one */
1493198090Srdivacky				start = newstart;
1494198090Srdivacky				g->mlen = newlen;
1495198090Srdivacky			}
1496198090Srdivacky			newlen = 0;
1497198090Srdivacky			break;
1498198090Srdivacky		}
1499198090Srdivacky	} while (OP(s) != OEND);
1500198090Srdivacky
1501198090Srdivacky	if (g->mlen == 0)		/* there isn't one */
1502198090Srdivacky		return;
1503198090Srdivacky
1504198090Srdivacky	/* turn it into a character string */
1505198090Srdivacky	g->must = malloc((size_t)g->mlen + 1);
1506198090Srdivacky	if (g->must == NULL) {		/* argh; just forget it */
1507198090Srdivacky		g->mlen = 0;
1508198090Srdivacky		return;
1509198090Srdivacky	}
1510198090Srdivacky	cp = g->must;
1511198090Srdivacky	scan = start;
1512198090Srdivacky	for (i = g->mlen; i > 0; i--) {
1513198090Srdivacky		while (OP(s = *scan++) != OCHAR)
1514198090Srdivacky			continue;
1515198090Srdivacky		assert(cp < g->must + g->mlen);
1516198090Srdivacky		*cp++ = (char)OPND(s);
1517198090Srdivacky	}
1518198090Srdivacky	assert(cp == g->must + g->mlen);
1519198090Srdivacky	*cp++ = '\0';		/* just on general principles */
1520198090Srdivacky}
1521198090Srdivacky
1522198090Srdivacky/*
1523198090Srdivacky - pluscount - count + nesting
1524198090Srdivacky */
1525198090Srdivackystatic sopno			/* nesting depth */
1526198090Srdivackypluscount(struct parse *p, struct re_guts *g)
1527198090Srdivacky{
1528198090Srdivacky	sop *scan;
1529198090Srdivacky	sop s;
1530198090Srdivacky	sopno plusnest = 0;
1531198090Srdivacky	sopno maxnest = 0;
1532198090Srdivacky
1533198090Srdivacky	if (p->error != 0)
1534198090Srdivacky		return(0);	/* there may not be an OEND */
1535198090Srdivacky
1536198090Srdivacky	scan = g->strip + 1;
1537198090Srdivacky	do {
1538198090Srdivacky		s = *scan++;
1539198090Srdivacky		switch (OP(s)) {
1540198090Srdivacky		case OPLUS_:
1541198090Srdivacky			plusnest++;
1542198090Srdivacky			break;
1543198090Srdivacky		case O_PLUS:
1544198090Srdivacky			if (plusnest > maxnest)
1545198090Srdivacky				maxnest = plusnest;
1546198090Srdivacky			plusnest--;
1547198090Srdivacky			break;
1548198090Srdivacky		}
1549198090Srdivacky	} while (OP(s) != OEND);
1550198090Srdivacky	if (plusnest != 0)
1551198090Srdivacky		g->iflags |= REGEX_BAD;
1552198090Srdivacky	return(maxnest);
1553198090Srdivacky}
1554