1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * Copyright (c) 2011 The FreeBSD Foundation
9 *
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * Henry Spencer.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 *    notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 *    notice, this list of conditions and the following disclaimer in the
23 *    documentation and/or other materials provided with the distribution.
24 * 3. Neither the name of the University nor the names of its contributors
25 *    may be used to endorse or promote products derived from this software
26 *    without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41#include <sys/types.h>
42#include <stdio.h>
43#include <string.h>
44#include <ctype.h>
45#include <limits.h>
46#include <stdlib.h>
47#include <regex.h>
48#include <stdbool.h>
49#include <wchar.h>
50#include <wctype.h>
51
52#ifndef LIBREGEX
53#include "collate.h"
54#endif
55
56#include "utils.h"
57#include "regex2.h"
58
59#include "cname.h"
60
61/*
62 * Branching context, used to keep track of branch state for all of the branch-
63 * aware functions. In addition to keeping track of branch positions for the
64 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
65 * detection of whether ^ is acting as an anchor or being used erroneously and
66 * also for whether we're in a sub-expression or not.
67 */
68struct branchc {
69	sopno start;
70	sopno back;
71	sopno fwd;
72
73	int nbranch;
74	int nchain;
75	bool outer;
76	bool terminate;
77};
78
79/*
80 * parse structure, passed up and down to avoid global variables and
81 * other clumsinesses
82 */
83struct parse {
84	const char *next;	/* next character in RE */
85	const char *end;	/* end of string (-> NUL normally) */
86	int error;		/* has an error been seen? */
87	int gnuext;
88	sop *strip;		/* malloced strip */
89	sopno ssize;		/* malloced strip size (allocated) */
90	sopno slen;		/* malloced strip length (used) */
91	int ncsalloc;		/* number of csets allocated */
92	struct re_guts *g;
93#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
94	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
95	sopno pend[NPAREN];	/* -> ) ([0] unused) */
96	bool allowbranch;	/* can this expression branch? */
97	bool bre;		/* convenience; is this a BRE? */
98	int pflags;		/* other parsing flags -- legacy escapes? */
99	bool (*parse_expr)(struct parse *, struct branchc *);
100	void (*pre_parse)(struct parse *, struct branchc *);
101	void (*post_parse)(struct parse *, struct branchc *);
102};
103
104#define PFLAG_LEGACY_ESC	0x00000001
105
106/* ========= begin header generated by ./mkh ========= */
107#ifdef __cplusplus
108extern "C" {
109#endif
110
111/* === regcomp.c === */
112static bool p_ere_exp(struct parse *p, struct branchc *bc);
113static void p_str(struct parse *p);
114static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
115static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
116static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
117static bool p_branch_empty(struct parse *p, struct branchc *bc);
118static bool p_branch_do(struct parse *p, struct branchc *bc);
119static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
120static void p_bre_post_parse(struct parse *p, struct branchc *bc);
121static void p_re(struct parse *p, int end1, int end2);
122static bool p_simp_re(struct parse *p, struct branchc *bc);
123static int p_count(struct parse *p);
124static void p_bracket(struct parse *p);
125static int p_range_cmp(wchar_t c1, wchar_t c2);
126static void p_b_term(struct parse *p, cset *cs);
127static int p_b_pseudoclass(struct parse *p, char c);
128static void p_b_cclass(struct parse *p, cset *cs);
129static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
130static void p_b_eclass(struct parse *p, cset *cs);
131static wint_t p_b_symbol(struct parse *p);
132static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
133static bool may_escape(struct parse *p, const wint_t ch);
134static wint_t othercase(wint_t ch);
135static void bothcases(struct parse *p, wint_t ch);
136static void ordinary(struct parse *p, wint_t ch);
137static void nonnewline(struct parse *p);
138static void repeat(struct parse *p, sopno start, int from, int to);
139static int seterr(struct parse *p, int e);
140static cset *allocset(struct parse *p);
141static void freeset(struct parse *p, cset *cs);
142static void CHadd(struct parse *p, cset *cs, wint_t ch);
143static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
144static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
145static wint_t singleton(cset *cs);
146static sopno dupl(struct parse *p, sopno start, sopno finish);
147static void doemit(struct parse *p, sop op, size_t opnd);
148static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
149static void dofwd(struct parse *p, sopno pos, sop value);
150static int enlarge(struct parse *p, sopno size);
151static void stripsnug(struct parse *p, struct re_guts *g);
152static void findmust(struct parse *p, struct re_guts *g);
153static int altoffset(sop *scan, int offset);
154static void computejumps(struct parse *p, struct re_guts *g);
155static void computematchjumps(struct parse *p, struct re_guts *g);
156static sopno pluscount(struct parse *p, struct re_guts *g);
157static wint_t wgetnext(struct parse *p);
158
159#ifdef __cplusplus
160}
161#endif
162/* ========= end header generated by ./mkh ========= */
163
164static char nuls[10];		/* place to point scanner in event of error */
165
166/*
167 * macros for use with parse structure
168 * BEWARE:  these know that the parse structure is named `p' !!!
169 */
170#define	PEEK()	(*p->next)
171#define	PEEK2()	(*(p->next+1))
172#define	MORE()	(p->end - p->next > 0)
173#define	MORE2()	(p->end - p->next > 1)
174#define	SEE(c)	(MORE() && PEEK() == (c))
175#define	SEETWO(a, b)	(MORE2() && PEEK() == (a) && PEEK2() == (b))
176#define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
177#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
178#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
179#define	EATSPEC(a)	(p->bre ? EATTWO('\\', a) : EAT(a))
180#define	NEXT()	(p->next++)
181#define	NEXT2()	(p->next += 2)
182#define	NEXTn(n)	(p->next += (n))
183#define	GETNEXT()	(*p->next++)
184#define	WGETNEXT()	wgetnext(p)
185#define	SETERROR(e)	seterr(p, (e))
186#define	REQUIRE(co, e)	((co) || SETERROR(e))
187#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
188#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
189#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
190#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
191#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
192#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
193#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
194#define	HERE()		(p->slen)
195#define	THERE()		(p->slen - 1)
196#define	THERETHERE()	(p->slen - 2)
197#define	DROP(n)	(p->slen -= (n))
198
199/* Macro used by computejump()/computematchjump() */
200#define MIN(a,b)	((a)<(b)?(a):(b))
201
202static int				/* 0 success, otherwise REG_something */
203regcomp_internal(regex_t * __restrict preg,
204	const char * __restrict pattern,
205	int cflags, int pflags)
206{
207	struct parse pa;
208	struct re_guts *g;
209	struct parse *p = &pa;
210	int i;
211	size_t len;
212	size_t maxlen;
213#ifdef REDEBUG
214#	define	GOODFLAGS(f)	(f)
215#else
216#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
217#endif
218
219	cflags = GOODFLAGS(cflags);
220	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
221		return(REG_INVARG);
222
223	if (cflags&REG_PEND) {
224		if (preg->re_endp < pattern)
225			return(REG_INVARG);
226		len = preg->re_endp - pattern;
227	} else
228		len = strlen(pattern);
229
230	/* do the mallocs early so failure handling is easy */
231	g = (struct re_guts *)malloc(sizeof(struct re_guts));
232	if (g == NULL)
233		return(REG_ESPACE);
234	/*
235	 * Limit the pattern space to avoid a 32-bit overflow on buffer
236	 * extension.  Also avoid any signed overflow in case of conversion
237	 * so make the real limit based on a 31-bit overflow.
238	 *
239	 * Likely not applicable on 64-bit systems but handle the case
240	 * generically (who are we to stop people from using ~715MB+
241	 * patterns?).
242	 */
243	maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
244	if (len >= maxlen) {
245		free((char *)g);
246		return(REG_ESPACE);
247	}
248	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
249	assert(p->ssize >= len);
250
251	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
252	p->slen = 0;
253	if (p->strip == NULL) {
254		free((char *)g);
255		return(REG_ESPACE);
256	}
257
258	/* set things up */
259	p->g = g;
260	p->next = pattern;	/* convenience; we do not modify it */
261	p->end = p->next + len;
262	p->error = 0;
263	p->ncsalloc = 0;
264	p->pflags = pflags;
265	for (i = 0; i < NPAREN; i++) {
266		p->pbegin[i] = 0;
267		p->pend[i] = 0;
268	}
269#ifdef LIBREGEX
270	if (cflags&REG_POSIX) {
271		p->gnuext = false;
272		p->allowbranch = (cflags & REG_EXTENDED) != 0;
273	} else
274		p->gnuext = p->allowbranch = true;
275#else
276	p->gnuext = false;
277	p->allowbranch = (cflags & REG_EXTENDED) != 0;
278#endif
279	if (cflags & REG_EXTENDED) {
280		p->bre = false;
281		p->parse_expr = p_ere_exp;
282		p->pre_parse = NULL;
283		p->post_parse = NULL;
284	} else {
285		p->bre = true;
286		p->parse_expr = p_simp_re;
287		p->pre_parse = p_bre_pre_parse;
288		p->post_parse = p_bre_post_parse;
289	}
290	g->sets = NULL;
291	g->ncsets = 0;
292	g->cflags = cflags;
293	g->iflags = 0;
294	g->nbol = 0;
295	g->neol = 0;
296	g->must = NULL;
297	g->moffset = -1;
298	g->charjump = NULL;
299	g->matchjump = NULL;
300	g->mlen = 0;
301	g->nsub = 0;
302	g->backrefs = 0;
303
304	/* do it */
305	EMIT(OEND, 0);
306	g->firststate = THERE();
307	if (cflags & REG_NOSPEC)
308		p_str(p);
309	else
310		p_re(p, OUT, OUT);
311	EMIT(OEND, 0);
312	g->laststate = THERE();
313
314	/* tidy up loose ends and fill things in */
315	stripsnug(p, g);
316	findmust(p, g);
317	/* only use Boyer-Moore algorithm if the pattern is bigger
318	 * than three characters
319	 */
320	if(g->mlen > 3) {
321		computejumps(p, g);
322		computematchjumps(p, g);
323		if(g->matchjump == NULL && g->charjump != NULL) {
324			free(&g->charjump[CHAR_MIN]);
325			g->charjump = NULL;
326		}
327	}
328	g->nplus = pluscount(p, g);
329	g->magic = MAGIC2;
330	preg->re_nsub = g->nsub;
331	preg->re_g = g;
332	preg->re_magic = MAGIC1;
333#ifndef REDEBUG
334	/* not debugging, so can't rely on the assert() in regexec() */
335	if (g->iflags&BAD)
336		SETERROR(REG_ASSERT);
337#endif
338
339	/* win or lose, we're done */
340	if (p->error != 0)	/* lose */
341		regfree(preg);
342	return(p->error);
343}
344
345/*
346 - regcomp - interface for parser and compilation
347 = extern int regcomp(regex_t *, const char *, int);
348 = #define	REG_BASIC	0000
349 = #define	REG_EXTENDED	0001
350 = #define	REG_ICASE	0002
351 = #define	REG_NOSUB	0004
352 = #define	REG_NEWLINE	0010
353 = #define	REG_NOSPEC	0020
354 = #define	REG_PEND	0040
355 = #define	REG_DUMP	0200
356 */
357int				/* 0 success, otherwise REG_something */
358regcomp(regex_t * __restrict preg,
359	const char * __restrict pattern,
360	int cflags)
361{
362
363	return (regcomp_internal(preg, pattern, cflags, 0));
364}
365
366#ifndef LIBREGEX
367/*
368 * Legacy interface that requires more lax escaping behavior.
369 */
370int
371freebsd12_regcomp(regex_t * __restrict preg,
372	const char * __restrict pattern,
373	int cflags, int pflags)
374{
375
376	return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC));
377}
378
379__sym_compat(regcomp, freebsd12_regcomp, FBSD_1.0);
380#endif	/* !LIBREGEX */
381
382/*
383 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
384 - return whether we should terminate or not
385 == static bool p_ere_exp(struct parse *p);
386 */
387static bool
388p_ere_exp(struct parse *p, struct branchc *bc)
389{
390	char c;
391	wint_t wc;
392	sopno pos;
393	int count;
394	int count2;
395#ifdef LIBREGEX
396	int i;
397	int handled;
398#endif
399	sopno subno;
400	int wascaret = 0;
401
402	(void)bc;
403	assert(MORE());		/* caller should have ensured this */
404	c = GETNEXT();
405
406#ifdef LIBREGEX
407	handled = 0;
408#endif
409	pos = HERE();
410	switch (c) {
411	case '(':
412		(void)REQUIRE(MORE(), REG_EPAREN);
413		p->g->nsub++;
414		subno = p->g->nsub;
415		if (subno < NPAREN)
416			p->pbegin[subno] = HERE();
417		EMIT(OLPAREN, subno);
418		if (!SEE(')'))
419			p_re(p, ')', IGN);
420		if (subno < NPAREN) {
421			p->pend[subno] = HERE();
422			assert(p->pend[subno] != 0);
423		}
424		EMIT(ORPAREN, subno);
425		(void)MUSTEAT(')', REG_EPAREN);
426		break;
427#ifndef POSIX_MISTAKE
428	case ')':		/* happens only if no current unmatched ( */
429		/*
430		 * You may ask, why the ifndef?  Because I didn't notice
431		 * this until slightly too late for 1003.2, and none of the
432		 * other 1003.2 regular-expression reviewers noticed it at
433		 * all.  So an unmatched ) is legal POSIX, at least until
434		 * we can get it fixed.
435		 */
436		SETERROR(REG_EPAREN);
437		break;
438#endif
439	case '^':
440		EMIT(OBOL, 0);
441		p->g->iflags |= USEBOL;
442		p->g->nbol++;
443		wascaret = 1;
444		break;
445	case '$':
446		EMIT(OEOL, 0);
447		p->g->iflags |= USEEOL;
448		p->g->neol++;
449		break;
450	case '|':
451		SETERROR(REG_EMPTY);
452		break;
453	case '*':
454	case '+':
455	case '?':
456#ifndef NO_STRICT_REGEX
457	case '{':
458#endif
459		SETERROR(REG_BADRPT);
460		break;
461	case '.':
462		if (p->g->cflags&REG_NEWLINE)
463			nonnewline(p);
464		else
465			EMIT(OANY, 0);
466		break;
467	case '[':
468		p_bracket(p);
469		break;
470	case '\\':
471		(void)REQUIRE(MORE(), REG_EESCAPE);
472		wc = WGETNEXT();
473#ifdef LIBREGEX
474		if (p->gnuext) {
475			handled = 1;
476			switch (wc) {
477			case '`':
478				EMIT(OBOS, 0);
479				break;
480			case '\'':
481				EMIT(OEOS, 0);
482				break;
483			case 'B':
484				EMIT(ONWBND, 0);
485				break;
486			case 'b':
487				EMIT(OWBND, 0);
488				break;
489			case 'W':
490			case 'w':
491			case 'S':
492			case 's':
493				p_b_pseudoclass(p, wc);
494				break;
495			case '1':
496			case '2':
497			case '3':
498			case '4':
499			case '5':
500			case '6':
501			case '7':
502			case '8':
503			case '9':
504				i = wc - '0';
505				assert(i < NPAREN);
506				if (p->pend[i] != 0) {
507					assert(i <= p->g->nsub);
508					EMIT(OBACK_, i);
509					assert(p->pbegin[i] != 0);
510					assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
511					assert(OP(p->strip[p->pend[i]]) == ORPAREN);
512					(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
513					EMIT(O_BACK, i);
514				} else
515					SETERROR(REG_ESUBREG);
516				p->g->backrefs = 1;
517				break;
518			default:
519				handled = 0;
520			}
521			/* Don't proceed to the POSIX bits if we've already handled it */
522			if (handled)
523				break;
524		}
525#endif
526		switch (wc) {
527		case '<':
528			EMIT(OBOW, 0);
529			break;
530		case '>':
531			EMIT(OEOW, 0);
532			break;
533		default:
534			if (may_escape(p, wc))
535				ordinary(p, wc);
536			else
537				SETERROR(REG_EESCAPE);
538			break;
539		}
540		break;
541#ifdef NO_STRICT_REGEX
542	case '{':               /* okay as ordinary except if digit follows */
543	    (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
544	    /* FALLTHROUGH */
545#endif
546	default:
547		if (p->error != 0)
548			return (false);
549		p->next--;
550		wc = WGETNEXT();
551		ordinary(p, wc);
552		break;
553	}
554
555	if (!MORE())
556		return (false);
557	c = PEEK();
558	/* we call { a repetition if followed by a digit */
559	if (!( c == '*' || c == '+' || c == '?' ||
560#ifdef NO_STRICT_REGEX
561	       (c == '{' && MORE2() && isdigit((uch)PEEK2()))
562#else
563	       c == '{'
564#endif
565	       ))
566		return (false);		/* no repetition, we're done */
567#ifndef NO_STRICT_REGEX
568	else if (c == '{')
569		(void)REQUIRE(MORE2() && \
570		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
571#endif
572	NEXT();
573
574	(void)REQUIRE(!wascaret, REG_BADRPT);
575	switch (c) {
576	case '*':	/* implemented as +? */
577		/* this case does not require the (y|) trick, noKLUDGE */
578		INSERT(OPLUS_, pos);
579		ASTERN(O_PLUS, pos);
580		INSERT(OQUEST_, pos);
581		ASTERN(O_QUEST, pos);
582		break;
583	case '+':
584		INSERT(OPLUS_, pos);
585		ASTERN(O_PLUS, pos);
586		break;
587	case '?':
588		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
589		INSERT(OCH_, pos);		/* offset slightly wrong */
590		ASTERN(OOR1, pos);		/* this one's right */
591		AHEAD(pos);			/* fix the OCH_ */
592		EMIT(OOR2, 0);			/* offset very wrong... */
593		AHEAD(THERE());			/* ...so fix it */
594		ASTERN(O_CH, THERETHERE());
595		break;
596	case '{':
597		count = p_count(p);
598		if (EAT(',')) {
599			if (isdigit((uch)PEEK())) {
600				count2 = p_count(p);
601				(void)REQUIRE(count <= count2, REG_BADBR);
602			} else		/* single number with comma */
603				count2 = INFINITY;
604		} else		/* just a single number */
605			count2 = count;
606		repeat(p, pos, count, count2);
607		if (!EAT('}')) {	/* error heuristics */
608			while (MORE() && PEEK() != '}')
609				NEXT();
610			(void)REQUIRE(MORE(), REG_EBRACE);
611			SETERROR(REG_BADBR);
612		}
613		break;
614	}
615
616	if (!MORE())
617		return (false);
618	c = PEEK();
619	if (!( c == '*' || c == '+' || c == '?' ||
620				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
621		return (false);
622	SETERROR(REG_BADRPT);
623	return (false);
624}
625
626/*
627 - p_str - string (no metacharacters) "parser"
628 == static void p_str(struct parse *p);
629 */
630static void
631p_str(struct parse *p)
632{
633	(void)REQUIRE(MORE(), REG_EMPTY);
634	while (MORE())
635		ordinary(p, WGETNEXT());
636}
637
638/*
639 * Eat consecutive branch delimiters for the kind of expression that we are
640 * parsing, return the number of delimiters that we ate.
641 */
642static int
643p_branch_eat_delim(struct parse *p, struct branchc *bc)
644{
645	int nskip;
646
647	(void)bc;
648	nskip = 0;
649	while (EATSPEC('|'))
650		++nskip;
651	return (nskip);
652}
653
654/*
655 * Insert necessary branch book-keeping operations. This emits a
656 * bogus 'next' offset, since we still have more to parse
657 */
658static void
659p_branch_ins_offset(struct parse *p, struct branchc *bc)
660{
661
662	if (bc->nbranch == 0) {
663		INSERT(OCH_, bc->start);	/* offset is wrong */
664		bc->fwd = bc->start;
665		bc->back = bc->start;
666	}
667
668	ASTERN(OOR1, bc->back);
669	bc->back = THERE();
670	AHEAD(bc->fwd);			/* fix previous offset */
671	bc->fwd = HERE();
672	EMIT(OOR2, 0);			/* offset is very wrong */
673	++bc->nbranch;
674}
675
676/*
677 * Fix the offset of the tail branch, if we actually had any branches.
678 * This is to correct the bogus placeholder offset that we use.
679 */
680static void
681p_branch_fix_tail(struct parse *p, struct branchc *bc)
682{
683
684	/* Fix bogus offset at the tail if we actually have branches */
685	if (bc->nbranch > 0) {
686		AHEAD(bc->fwd);
687		ASTERN(O_CH, bc->back);
688	}
689}
690
691/*
692 * Signal to the parser that an empty branch has been encountered; this will,
693 * in the future, be used to allow for more permissive behavior with empty
694 * branches. The return value should indicate whether parsing may continue
695 * or not.
696 */
697static bool
698p_branch_empty(struct parse *p, struct branchc *bc)
699{
700
701	(void)bc;
702	SETERROR(REG_EMPTY);
703	return (false);
704}
705
706/*
707 * Take care of any branching requirements. This includes inserting the
708 * appropriate branching instructions as well as eating all of the branch
709 * delimiters until we either run out of pattern or need to parse more pattern.
710 */
711static bool
712p_branch_do(struct parse *p, struct branchc *bc)
713{
714	int ate = 0;
715
716	ate = p_branch_eat_delim(p, bc);
717	if (ate == 0)
718		return (false);
719	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
720		/*
721		 * Halt parsing only if we have an empty branch and p_branch_empty
722		 * indicates that we must not continue. In the future, this will not
723		 * necessarily be an error.
724		 */
725		return (false);
726	p_branch_ins_offset(p, bc);
727
728	return (true);
729}
730
731static void
732p_bre_pre_parse(struct parse *p, struct branchc *bc)
733{
734
735	(void) bc;
736	/*
737	 * Does not move cleanly into expression parser because of
738	 * ordinary interpration of * at the beginning position of
739	 * an expression.
740	 */
741	if (EAT('^')) {
742		EMIT(OBOL, 0);
743		p->g->iflags |= USEBOL;
744		p->g->nbol++;
745	}
746}
747
748static void
749p_bre_post_parse(struct parse *p, struct branchc *bc)
750{
751
752	/* Expression is terminating due to EOL token */
753	if (bc->terminate) {
754		DROP(1);
755		EMIT(OEOL, 0);
756		p->g->iflags |= USEEOL;
757		p->g->neol++;
758	}
759}
760
761/*
762 - p_re - Top level parser, concatenation and BRE anchoring
763 == static void p_re(struct parse *p, int end1, int end2);
764 * Giving end1 as OUT essentially eliminates the end1/end2 check.
765 *
766 * This implementation is a bit of a kludge, in that a trailing $ is first
767 * taken as an ordinary character and then revised to be an anchor.
768 * The amount of lookahead needed to avoid this kludge is excessive.
769 */
770static void
771p_re(struct parse *p,
772	int end1,	/* first terminating character */
773	int end2)	/* second terminating character; ignored for EREs */
774{
775	struct branchc bc;
776
777	bc.nbranch = 0;
778	if (end1 == OUT && end2 == OUT)
779		bc.outer = true;
780	else
781		bc.outer = false;
782#define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
783	for (;;) {
784		bc.start = HERE();
785		bc.nchain = 0;
786		bc.terminate = false;
787		if (p->pre_parse != NULL)
788			p->pre_parse(p, &bc);
789		while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
790			bc.terminate = p->parse_expr(p, &bc);
791			++bc.nchain;
792		}
793		if (p->post_parse != NULL)
794			p->post_parse(p, &bc);
795		(void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
796#ifdef LIBREGEX
797		if (HERE() == bc.start && !p_branch_empty(p, &bc))
798			break;
799#endif
800		if (!p->allowbranch)
801			break;
802		/*
803		 * p_branch_do's return value indicates whether we should
804		 * continue parsing or not. This is both for correctness and
805		 * a slight optimization, because it will check if we've
806		 * encountered an empty branch or the end of the string
807		 * immediately following a branch delimiter.
808		 */
809		if (!p_branch_do(p, &bc))
810			break;
811	}
812#undef SEE_END
813	if (p->allowbranch)
814		p_branch_fix_tail(p, &bc);
815	assert(!MORE() || SEE(end1));
816}
817
818/*
819 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
820 == static bool p_simp_re(struct parse *p, struct branchc *bc);
821 */
822static bool			/* was the simple RE an unbackslashed $? */
823p_simp_re(struct parse *p, struct branchc *bc)
824{
825	int c;
826	int cc;			/* convenient/control character */
827	int count;
828	int count2;
829	sopno pos;
830	bool handled;
831	int i;
832	wint_t wc;
833	sopno subno;
834#	define	BACKSL	(1<<CHAR_BIT)
835
836	pos = HERE();		/* repetition op, if any, covers from here */
837	handled = false;
838
839	assert(MORE());		/* caller should have ensured this */
840	c = (uch)GETNEXT();
841	if (c == '\\') {
842		(void)REQUIRE(MORE(), REG_EESCAPE);
843		cc = (uch)GETNEXT();
844		c = BACKSL | cc;
845#ifdef LIBREGEX
846		if (p->gnuext) {
847			handled = true;
848			switch (c) {
849			case BACKSL|'`':
850				EMIT(OBOS, 0);
851				break;
852			case BACKSL|'\'':
853				EMIT(OEOS, 0);
854				break;
855			case BACKSL|'B':
856				EMIT(ONWBND, 0);
857				break;
858			case BACKSL|'b':
859				EMIT(OWBND, 0);
860				break;
861			case BACKSL|'W':
862			case BACKSL|'w':
863			case BACKSL|'S':
864			case BACKSL|'s':
865				p_b_pseudoclass(p, cc);
866				break;
867			default:
868				handled = false;
869			}
870		}
871#endif
872	}
873	if (!handled) {
874		switch (c) {
875		case '.':
876			if (p->g->cflags&REG_NEWLINE)
877				nonnewline(p);
878			else
879				EMIT(OANY, 0);
880			break;
881		case '[':
882			p_bracket(p);
883			break;
884		case BACKSL|'<':
885			EMIT(OBOW, 0);
886			break;
887		case BACKSL|'>':
888			EMIT(OEOW, 0);
889			break;
890		case BACKSL|'{':
891			SETERROR(REG_BADRPT);
892			break;
893		case BACKSL|'(':
894			p->g->nsub++;
895			subno = p->g->nsub;
896			if (subno < NPAREN)
897				p->pbegin[subno] = HERE();
898			EMIT(OLPAREN, subno);
899			/* the MORE here is an error heuristic */
900			if (MORE() && !SEETWO('\\', ')'))
901				p_re(p, '\\', ')');
902			if (subno < NPAREN) {
903				p->pend[subno] = HERE();
904				assert(p->pend[subno] != 0);
905			}
906			EMIT(ORPAREN, subno);
907			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
908			break;
909		case BACKSL|')':	/* should not get here -- must be user */
910#ifdef NO_STRICT_REGEX
911		case BACKSL|'}':
912#endif
913			SETERROR(REG_EPAREN);
914			break;
915		case BACKSL|'1':
916		case BACKSL|'2':
917		case BACKSL|'3':
918		case BACKSL|'4':
919		case BACKSL|'5':
920		case BACKSL|'6':
921		case BACKSL|'7':
922		case BACKSL|'8':
923		case BACKSL|'9':
924			i = (c&~BACKSL) - '0';
925			assert(i < NPAREN);
926			if (p->pend[i] != 0) {
927				assert(i <= p->g->nsub);
928				EMIT(OBACK_, i);
929				assert(p->pbegin[i] != 0);
930				assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
931				assert(OP(p->strip[p->pend[i]]) == ORPAREN);
932				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
933				EMIT(O_BACK, i);
934			} else
935				SETERROR(REG_ESUBREG);
936			p->g->backrefs = 1;
937			break;
938		case '*':
939			/*
940			 * Ordinary if used as the first character beyond BOL anchor of
941			 * a (sub-)expression, counts as a bad repetition operator if it
942			 * appears otherwise.
943			 */
944			(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
945			/* FALLTHROUGH */
946		default:
947			if (p->error != 0)
948				return (false);	/* Definitely not $... */
949			p->next--;
950			wc = WGETNEXT();
951			if ((c & BACKSL) == 0 || may_escape(p, wc))
952				ordinary(p, wc);
953			else
954				SETERROR(REG_EESCAPE);
955			break;
956		}
957	}
958
959	if (EAT('*')) {		/* implemented as +? */
960		/* this case does not require the (y|) trick, noKLUDGE */
961		INSERT(OPLUS_, pos);
962		ASTERN(O_PLUS, pos);
963		INSERT(OQUEST_, pos);
964		ASTERN(O_QUEST, pos);
965#ifdef LIBREGEX
966	} else if (p->gnuext && EATTWO('\\', '?')) {
967		INSERT(OQUEST_, pos);
968		ASTERN(O_QUEST, pos);
969	} else if (p->gnuext && EATTWO('\\', '+')) {
970		INSERT(OPLUS_, pos);
971		ASTERN(O_PLUS, pos);
972#endif
973	} else if (EATTWO('\\', '{')) {
974		count = p_count(p);
975		if (EAT(',')) {
976			if (MORE() && isdigit((uch)PEEK())) {
977				count2 = p_count(p);
978				(void)REQUIRE(count <= count2, REG_BADBR);
979			} else		/* single number with comma */
980				count2 = INFINITY;
981		} else		/* just a single number */
982			count2 = count;
983		repeat(p, pos, count, count2);
984		if (!EATTWO('\\', '}')) {	/* error heuristics */
985			while (MORE() && !SEETWO('\\', '}'))
986				NEXT();
987			(void)REQUIRE(MORE(), REG_EBRACE);
988			SETERROR(REG_BADBR);
989		}
990	} else if (c == '$')     /* $ (but not \$) ends it */
991		return (true);
992
993	return (false);
994}
995
996/*
997 - p_count - parse a repetition count
998 == static int p_count(struct parse *p);
999 */
1000static int			/* the value */
1001p_count(struct parse *p)
1002{
1003	int count = 0;
1004	int ndigits = 0;
1005
1006	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1007		count = count*10 + ((uch)GETNEXT() - '0');
1008		ndigits++;
1009	}
1010
1011	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1012	return(count);
1013}
1014
1015/*
1016 - p_bracket - parse a bracketed character list
1017 == static void p_bracket(struct parse *p);
1018 */
1019static void
1020p_bracket(struct parse *p)
1021{
1022	cset *cs;
1023	wint_t ch;
1024
1025	/* Dept of Truly Sickening Special-Case Kludges */
1026	if (p->end - p->next > 5) {
1027		if (strncmp(p->next, "[:<:]]", 6) == 0) {
1028			EMIT(OBOW, 0);
1029			NEXTn(6);
1030			return;
1031		}
1032		if (strncmp(p->next, "[:>:]]", 6) == 0) {
1033			EMIT(OEOW, 0);
1034			NEXTn(6);
1035			return;
1036		}
1037	}
1038
1039	if ((cs = allocset(p)) == NULL)
1040		return;
1041
1042	if (p->g->cflags&REG_ICASE)
1043		cs->icase = 1;
1044	if (EAT('^'))
1045		cs->invert = 1;
1046	if (EAT(']'))
1047		CHadd(p, cs, ']');
1048	else if (EAT('-'))
1049		CHadd(p, cs, '-');
1050	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1051		p_b_term(p, cs);
1052	if (EAT('-'))
1053		CHadd(p, cs, '-');
1054	(void)MUSTEAT(']', REG_EBRACK);
1055
1056	if (p->error != 0)	/* don't mess things up further */
1057		return;
1058
1059	if (cs->invert && p->g->cflags&REG_NEWLINE)
1060		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1061
1062	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
1063		ordinary(p, ch);
1064		freeset(p, cs);
1065	} else
1066		EMIT(OANYOF, (int)(cs - p->g->sets));
1067}
1068
1069static int
1070p_range_cmp(wchar_t c1, wchar_t c2)
1071{
1072#ifndef LIBREGEX
1073	return __wcollate_range_cmp(c1, c2);
1074#else
1075	/* Copied from libc/collate __wcollate_range_cmp */
1076	wchar_t s1[2], s2[2];
1077
1078	s1[0] = c1;
1079	s1[1] = L'\0';
1080	s2[0] = c2;
1081	s2[1] = L'\0';
1082	return (wcscoll(s1, s2));
1083#endif
1084}
1085
1086/*
1087 - p_b_term - parse one term of a bracketed character list
1088 == static void p_b_term(struct parse *p, cset *cs);
1089 */
1090static void
1091p_b_term(struct parse *p, cset *cs)
1092{
1093	char c;
1094	wint_t start, finish;
1095	wint_t i;
1096#ifndef LIBREGEX
1097	struct xlocale_collate *table =
1098		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1099#endif
1100	/* classify what we've got */
1101	switch ((MORE()) ? PEEK() : '\0') {
1102	case '[':
1103		c = (MORE2()) ? PEEK2() : '\0';
1104		break;
1105	case '-':
1106		SETERROR(REG_ERANGE);
1107		return;			/* NOTE RETURN */
1108	default:
1109		c = '\0';
1110		break;
1111	}
1112
1113	switch (c) {
1114	case ':':		/* character class */
1115		NEXT2();
1116		(void)REQUIRE(MORE(), REG_EBRACK);
1117		c = PEEK();
1118		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1119		p_b_cclass(p, cs);
1120		(void)REQUIRE(MORE(), REG_EBRACK);
1121		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1122		break;
1123	case '=':		/* equivalence class */
1124		NEXT2();
1125		(void)REQUIRE(MORE(), REG_EBRACK);
1126		c = PEEK();
1127		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1128		p_b_eclass(p, cs);
1129		(void)REQUIRE(MORE(), REG_EBRACK);
1130		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1131		break;
1132	default:		/* symbol, ordinary character, or range */
1133		start = p_b_symbol(p);
1134		if (SEE('-') && MORE2() && PEEK2() != ']') {
1135			/* range */
1136			NEXT();
1137			if (EAT('-'))
1138				finish = '-';
1139			else
1140				finish = p_b_symbol(p);
1141		} else
1142			finish = start;
1143		if (start == finish)
1144			CHadd(p, cs, start);
1145		else {
1146#ifndef LIBREGEX
1147			if (table->__collate_load_error || MB_CUR_MAX > 1) {
1148#else
1149			if (MB_CUR_MAX > 1) {
1150#endif
1151				(void)REQUIRE(start <= finish, REG_ERANGE);
1152				CHaddrange(p, cs, start, finish);
1153			} else {
1154				(void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1155				for (i = 0; i <= UCHAR_MAX; i++) {
1156					if (p_range_cmp(start, i) <= 0 &&
1157					    p_range_cmp(i, finish) <= 0 )
1158						CHadd(p, cs, i);
1159				}
1160			}
1161		}
1162		break;
1163	}
1164}
1165
1166/*
1167 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1168 == static int p_b_pseudoclass(struct parse *p, char c)
1169 */
1170static int
1171p_b_pseudoclass(struct parse *p, char c) {
1172	cset *cs;
1173
1174	if ((cs = allocset(p)) == NULL)
1175		return(0);
1176
1177	if (p->g->cflags&REG_ICASE)
1178		cs->icase = 1;
1179
1180	switch (c) {
1181	case 'W':
1182		cs->invert = 1;
1183		/* PASSTHROUGH */
1184	case 'w':
1185		p_b_cclass_named(p, cs, "alnum");
1186		break;
1187	case 'S':
1188		cs->invert = 1;
1189		/* PASSTHROUGH */
1190	case 's':
1191		p_b_cclass_named(p, cs, "space");
1192		break;
1193	default:
1194		return(0);
1195	}
1196
1197	EMIT(OANYOF, (int)(cs - p->g->sets));
1198	return(1);
1199}
1200
1201/*
1202 - p_b_cclass - parse a character-class name and deal with it
1203 == static void p_b_cclass(struct parse *p, cset *cs);
1204 */
1205static void
1206p_b_cclass(struct parse *p, cset *cs)
1207{
1208	const char *sp = p->next;
1209	size_t len;
1210	char clname[16];
1211
1212	while (MORE() && isalpha((uch)PEEK()))
1213		NEXT();
1214	len = p->next - sp;
1215	if (len >= sizeof(clname) - 1) {
1216		SETERROR(REG_ECTYPE);
1217		return;
1218	}
1219	memcpy(clname, sp, len);
1220	clname[len] = '\0';
1221
1222	p_b_cclass_named(p, cs, clname);
1223}
1224/*
1225 - p_b_cclass_named - deal with a named character class
1226 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1227 */
1228static void
1229p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1230	wctype_t wct;
1231
1232	if ((wct = wctype(clname)) == 0) {
1233		SETERROR(REG_ECTYPE);
1234		return;
1235	}
1236	CHaddtype(p, cs, wct);
1237}
1238
1239/*
1240 - p_b_eclass - parse an equivalence-class name and deal with it
1241 == static void p_b_eclass(struct parse *p, cset *cs);
1242 *
1243 * This implementation is incomplete. xxx
1244 */
1245static void
1246p_b_eclass(struct parse *p, cset *cs)
1247{
1248	wint_t c;
1249
1250	c = p_b_coll_elem(p, '=');
1251	CHadd(p, cs, c);
1252}
1253
1254/*
1255 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1256 == static wint_t p_b_symbol(struct parse *p);
1257 */
1258static wint_t			/* value of symbol */
1259p_b_symbol(struct parse *p)
1260{
1261	wint_t value;
1262
1263	(void)REQUIRE(MORE(), REG_EBRACK);
1264	if (!EATTWO('[', '.'))
1265		return(WGETNEXT());
1266
1267	/* collating symbol */
1268	value = p_b_coll_elem(p, '.');
1269	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1270	return(value);
1271}
1272
1273/*
1274 - p_b_coll_elem - parse a collating-element name and look it up
1275 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1276 */
1277static wint_t			/* value of collating element */
1278p_b_coll_elem(struct parse *p,
1279	wint_t endc)		/* name ended by endc,']' */
1280{
1281	const char *sp = p->next;
1282	struct cname *cp;
1283	mbstate_t mbs;
1284	wchar_t wc;
1285	size_t clen, len;
1286
1287	while (MORE() && !SEETWO(endc, ']'))
1288		NEXT();
1289	if (!MORE()) {
1290		SETERROR(REG_EBRACK);
1291		return(0);
1292	}
1293	len = p->next - sp;
1294	for (cp = cnames; cp->name != NULL; cp++)
1295		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1296			return(cp->code);	/* known name */
1297	memset(&mbs, 0, sizeof(mbs));
1298	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1299		return (wc);			/* single character */
1300	else if (clen == (size_t)-1 || clen == (size_t)-2)
1301		SETERROR(REG_ILLSEQ);
1302	else
1303		SETERROR(REG_ECOLLATE);		/* neither */
1304	return(0);
1305}
1306
1307/*
1308 - may_escape - determine whether 'ch' is escape-able in the current context
1309 == static int may_escape(struct parse *p, const wint_t ch)
1310 */
1311static bool
1312may_escape(struct parse *p, const wint_t ch)
1313{
1314
1315	if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1316		return (true);
1317	if (iswalpha(ch) || ch == '\'' || ch == '`')
1318		return (false);
1319	return (true);
1320#ifdef NOTYET
1321	/*
1322	 * Build a whitelist of characters that may be escaped to produce an
1323	 * ordinary in the current context. This assumes that these have not
1324	 * been otherwise interpreted as a special character. Escaping an
1325	 * ordinary character yields undefined results according to
1326	 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1327	 * advantage of this and use escaped ordinary characters to provide
1328	 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1329	 */
1330	switch(ch) {
1331	case '|':
1332	case '+':
1333	case '?':
1334		/* The above characters may not be escaped in BREs */
1335		if (!(p->g->cflags&REG_EXTENDED))
1336			return (false);
1337		/* Fallthrough */
1338	case '(':
1339	case ')':
1340	case '{':
1341	case '}':
1342	case '.':
1343	case '[':
1344	case ']':
1345	case '\\':
1346	case '*':
1347	case '^':
1348	case '$':
1349		return (true);
1350	default:
1351		return (false);
1352	}
1353#endif
1354}
1355
1356/*
1357 - othercase - return the case counterpart of an alphabetic
1358 == static wint_t othercase(wint_t ch);
1359 */
1360static wint_t			/* if no counterpart, return ch */
1361othercase(wint_t ch)
1362{
1363	assert(iswalpha(ch));
1364	if (iswupper(ch))
1365		return(towlower(ch));
1366	else if (iswlower(ch))
1367		return(towupper(ch));
1368	else			/* peculiar, but could happen */
1369		return(ch);
1370}
1371
1372/*
1373 - bothcases - emit a dualcase version of a two-case character
1374 == static void bothcases(struct parse *p, wint_t ch);
1375 *
1376 * Boy, is this implementation ever a kludge...
1377 */
1378static void
1379bothcases(struct parse *p, wint_t ch)
1380{
1381	const char *oldnext = p->next;
1382	const char *oldend = p->end;
1383	char bracket[3 + MB_LEN_MAX];
1384	size_t n;
1385	mbstate_t mbs;
1386
1387	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1388	p->next = bracket;
1389	memset(&mbs, 0, sizeof(mbs));
1390	n = wcrtomb(bracket, ch, &mbs);
1391	assert(n != (size_t)-1);
1392	bracket[n] = ']';
1393	bracket[n + 1] = '\0';
1394	p->end = bracket+n+1;
1395	p_bracket(p);
1396	assert(p->next == p->end);
1397	p->next = oldnext;
1398	p->end = oldend;
1399}
1400
1401/*
1402 - ordinary - emit an ordinary character
1403 == static void ordinary(struct parse *p, wint_t ch);
1404 */
1405static void
1406ordinary(struct parse *p, wint_t ch)
1407{
1408	cset *cs;
1409
1410	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1411		bothcases(p, ch);
1412	else if ((ch & OPDMASK) == ch)
1413		EMIT(OCHAR, ch);
1414	else {
1415		/*
1416		 * Kludge: character is too big to fit into an OCHAR operand.
1417		 * Emit a singleton set.
1418		 */
1419		if ((cs = allocset(p)) == NULL)
1420			return;
1421		CHadd(p, cs, ch);
1422		EMIT(OANYOF, (int)(cs - p->g->sets));
1423	}
1424}
1425
1426/*
1427 - nonnewline - emit REG_NEWLINE version of OANY
1428 == static void nonnewline(struct parse *p);
1429 *
1430 * Boy, is this implementation ever a kludge...
1431 */
1432static void
1433nonnewline(struct parse *p)
1434{
1435	const char *oldnext = p->next;
1436	const char *oldend = p->end;
1437	char bracket[4];
1438
1439	p->next = bracket;
1440	p->end = bracket+3;
1441	bracket[0] = '^';
1442	bracket[1] = '\n';
1443	bracket[2] = ']';
1444	bracket[3] = '\0';
1445	p_bracket(p);
1446	assert(p->next == bracket+3);
1447	p->next = oldnext;
1448	p->end = oldend;
1449}
1450
1451/*
1452 - repeat - generate code for a bounded repetition, recursively if needed
1453 == static void repeat(struct parse *p, sopno start, int from, int to);
1454 */
1455static void
1456repeat(struct parse *p,
1457	sopno start,		/* operand from here to end of strip */
1458	int from,		/* repeated from this number */
1459	int to)			/* to this number of times (maybe INFINITY) */
1460{
1461	sopno finish = HERE();
1462#	define	N	2
1463#	define	INF	3
1464#	define	REP(f, t)	((f)*8 + (t))
1465#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1466	sopno copy;
1467
1468	if (p->error != 0)	/* head off possible runaway recursion */
1469		return;
1470
1471	assert(from <= to);
1472
1473	switch (REP(MAP(from), MAP(to))) {
1474	case REP(0, 0):			/* must be user doing this */
1475		DROP(finish-start);	/* drop the operand */
1476		break;
1477	case REP(0, 1):			/* as x{1,1}? */
1478	case REP(0, N):			/* as x{1,n}? */
1479	case REP(0, INF):		/* as x{1,}? */
1480		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1481		INSERT(OCH_, start);		/* offset is wrong... */
1482		repeat(p, start+1, 1, to);
1483		ASTERN(OOR1, start);
1484		AHEAD(start);			/* ... fix it */
1485		EMIT(OOR2, 0);
1486		AHEAD(THERE());
1487		ASTERN(O_CH, THERETHERE());
1488		break;
1489	case REP(1, 1):			/* trivial case */
1490		/* done */
1491		break;
1492	case REP(1, N):			/* as x?x{1,n-1} */
1493		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1494		INSERT(OCH_, start);
1495		ASTERN(OOR1, start);
1496		AHEAD(start);
1497		EMIT(OOR2, 0);			/* offset very wrong... */
1498		AHEAD(THERE());			/* ...so fix it */
1499		ASTERN(O_CH, THERETHERE());
1500		copy = dupl(p, start+1, finish+1);
1501		assert(copy == finish+4);
1502		repeat(p, copy, 1, to-1);
1503		break;
1504	case REP(1, INF):		/* as x+ */
1505		INSERT(OPLUS_, start);
1506		ASTERN(O_PLUS, start);
1507		break;
1508	case REP(N, N):			/* as xx{m-1,n-1} */
1509		copy = dupl(p, start, finish);
1510		repeat(p, copy, from-1, to-1);
1511		break;
1512	case REP(N, INF):		/* as xx{n-1,INF} */
1513		copy = dupl(p, start, finish);
1514		repeat(p, copy, from-1, to);
1515		break;
1516	default:			/* "can't happen" */
1517		SETERROR(REG_ASSERT);	/* just in case */
1518		break;
1519	}
1520}
1521
1522/*
1523 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1524 - character from the parse struct, signals a REG_ILLSEQ error if the
1525 - character can't be converted. Returns the number of bytes consumed.
1526 */
1527static wint_t
1528wgetnext(struct parse *p)
1529{
1530	mbstate_t mbs;
1531	wchar_t wc;
1532	size_t n;
1533
1534	memset(&mbs, 0, sizeof(mbs));
1535	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1536	if (n == (size_t)-1 || n == (size_t)-2) {
1537		SETERROR(REG_ILLSEQ);
1538		return (0);
1539	}
1540	if (n == 0)
1541		n = 1;
1542	p->next += n;
1543	return (wc);
1544}
1545
1546/*
1547 - seterr - set an error condition
1548 == static int seterr(struct parse *p, int e);
1549 */
1550static int			/* useless but makes type checking happy */
1551seterr(struct parse *p, int e)
1552{
1553	if (p->error == 0)	/* keep earliest error condition */
1554		p->error = e;
1555	p->next = nuls;		/* try to bring things to a halt */
1556	p->end = nuls;
1557	return(0);		/* make the return value well-defined */
1558}
1559
1560/*
1561 - allocset - allocate a set of characters for []
1562 == static cset *allocset(struct parse *p);
1563 */
1564static cset *
1565allocset(struct parse *p)
1566{
1567	cset *cs, *ncs;
1568
1569	ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1570	if (ncs == NULL) {
1571		SETERROR(REG_ESPACE);
1572		return (NULL);
1573	}
1574	p->g->sets = ncs;
1575	cs = &p->g->sets[p->g->ncsets++];
1576	memset(cs, 0, sizeof(*cs));
1577
1578	return(cs);
1579}
1580
1581/*
1582 - freeset - free a now-unused set
1583 == static void freeset(struct parse *p, cset *cs);
1584 */
1585static void
1586freeset(struct parse *p, cset *cs)
1587{
1588	cset *top = &p->g->sets[p->g->ncsets];
1589
1590	free(cs->wides);
1591	free(cs->ranges);
1592	free(cs->types);
1593	memset(cs, 0, sizeof(*cs));
1594	if (cs == top-1)	/* recover only the easy case */
1595		p->g->ncsets--;
1596}
1597
1598/*
1599 - singleton - Determine whether a set contains only one character,
1600 - returning it if so, otherwise returning OUT.
1601 */
1602static wint_t
1603singleton(cset *cs)
1604{
1605	wint_t i, s, n;
1606
1607	/* Exclude the complicated cases we don't want to deal with */
1608	if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
1609		return (OUT);
1610
1611	if (cs->nwides > 1)
1612		return (OUT);
1613
1614	/* Count the number of characters present in the bitmap */
1615	for (i = n = 0; i < NC; i++)
1616		if (CHIN(cs, i)) {
1617			n++;
1618			s = i;
1619		}
1620
1621	if (n > 1)
1622		return (OUT);
1623
1624	if (n == 1) {
1625		if (cs->nwides == 0)
1626			return (s);
1627		else
1628			return (OUT);
1629	}
1630	if (cs->nwides == 1)
1631		return (cs->wides[0]);
1632
1633	return (OUT);
1634}
1635
1636/*
1637 - CHadd - add character to character set.
1638 */
1639static void
1640CHadd(struct parse *p, cset *cs, wint_t ch)
1641{
1642	wint_t nch, *newwides;
1643	assert(ch >= 0);
1644	if (ch < NC)
1645		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1646	else {
1647		newwides = reallocarray(cs->wides, cs->nwides + 1,
1648		    sizeof(*cs->wides));
1649		if (newwides == NULL) {
1650			SETERROR(REG_ESPACE);
1651			return;
1652		}
1653		cs->wides = newwides;
1654		cs->wides[cs->nwides++] = ch;
1655	}
1656	if (cs->icase) {
1657		if ((nch = towlower(ch)) < NC)
1658			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1659		if ((nch = towupper(ch)) < NC)
1660			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1661	}
1662}
1663
1664/*
1665 - CHaddrange - add all characters in the range [min,max] to a character set.
1666 */
1667static void
1668CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1669{
1670	crange *newranges;
1671
1672	for (; min < NC && min <= max; min++)
1673		CHadd(p, cs, min);
1674	if (min >= max)
1675		return;
1676	newranges = reallocarray(cs->ranges, cs->nranges + 1,
1677	    sizeof(*cs->ranges));
1678	if (newranges == NULL) {
1679		SETERROR(REG_ESPACE);
1680		return;
1681	}
1682	cs->ranges = newranges;
1683	cs->ranges[cs->nranges].min = min;
1684	cs->ranges[cs->nranges].max = max;
1685	cs->nranges++;
1686}
1687
1688/*
1689 - CHaddtype - add all characters of a certain type to a character set.
1690 */
1691static void
1692CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1693{
1694	wint_t i;
1695	wctype_t *newtypes;
1696
1697	for (i = 0; i < NC; i++)
1698		if (iswctype(i, wct))
1699			CHadd(p, cs, i);
1700	newtypes = reallocarray(cs->types, cs->ntypes + 1,
1701	    sizeof(*cs->types));
1702	if (newtypes == NULL) {
1703		SETERROR(REG_ESPACE);
1704		return;
1705	}
1706	cs->types = newtypes;
1707	cs->types[cs->ntypes++] = wct;
1708}
1709
1710/*
1711 - dupl - emit a duplicate of a bunch of sops
1712 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1713 */
1714static sopno			/* start of duplicate */
1715dupl(struct parse *p,
1716	sopno start,		/* from here */
1717	sopno finish)		/* to this less one */
1718{
1719	sopno ret = HERE();
1720	sopno len = finish - start;
1721
1722	assert(finish >= start);
1723	if (len == 0)
1724		return(ret);
1725	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1726		return(ret);
1727	(void) memcpy((char *)(p->strip + p->slen),
1728		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1729	p->slen += len;
1730	return(ret);
1731}
1732
1733/*
1734 - doemit - emit a strip operator
1735 == static void doemit(struct parse *p, sop op, size_t opnd);
1736 *
1737 * It might seem better to implement this as a macro with a function as
1738 * hard-case backup, but it's just too big and messy unless there are
1739 * some changes to the data structures.  Maybe later.
1740 */
1741static void
1742doemit(struct parse *p, sop op, size_t opnd)
1743{
1744	/* avoid making error situations worse */
1745	if (p->error != 0)
1746		return;
1747
1748	/* deal with oversize operands ("can't happen", more or less) */
1749	assert(opnd < 1<<OPSHIFT);
1750
1751	/* deal with undersized strip */
1752	if (p->slen >= p->ssize)
1753		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1754			return;
1755
1756	/* finally, it's all reduced to the easy case */
1757	p->strip[p->slen++] = SOP(op, opnd);
1758}
1759
1760/*
1761 - doinsert - insert a sop into the strip
1762 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1763 */
1764static void
1765doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1766{
1767	sopno sn;
1768	sop s;
1769	int i;
1770
1771	/* avoid making error situations worse */
1772	if (p->error != 0)
1773		return;
1774
1775	sn = HERE();
1776	EMIT(op, opnd);		/* do checks, ensure space */
1777	assert(HERE() == sn+1);
1778	s = p->strip[sn];
1779
1780	/* adjust paren pointers */
1781	assert(pos > 0);
1782	for (i = 1; i < NPAREN; i++) {
1783		if (p->pbegin[i] >= pos) {
1784			p->pbegin[i]++;
1785		}
1786		if (p->pend[i] >= pos) {
1787			p->pend[i]++;
1788		}
1789	}
1790
1791	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1792						(HERE()-pos-1)*sizeof(sop));
1793	p->strip[pos] = s;
1794}
1795
1796/*
1797 - dofwd - complete a forward reference
1798 == static void dofwd(struct parse *p, sopno pos, sop value);
1799 */
1800static void
1801dofwd(struct parse *p, sopno pos, sop value)
1802{
1803	/* avoid making error situations worse */
1804	if (p->error != 0)
1805		return;
1806
1807	assert(value < 1<<OPSHIFT);
1808	p->strip[pos] = OP(p->strip[pos]) | value;
1809}
1810
1811/*
1812 - enlarge - enlarge the strip
1813 == static int enlarge(struct parse *p, sopno size);
1814 */
1815static int
1816enlarge(struct parse *p, sopno size)
1817{
1818	sop *sp;
1819
1820	if (p->ssize >= size)
1821		return 1;
1822
1823	sp = reallocarray(p->strip, size, sizeof(sop));
1824	if (sp == NULL) {
1825		SETERROR(REG_ESPACE);
1826		return 0;
1827	}
1828	p->strip = sp;
1829	p->ssize = size;
1830	return 1;
1831}
1832
1833/*
1834 - stripsnug - compact the strip
1835 == static void stripsnug(struct parse *p, struct re_guts *g);
1836 */
1837static void
1838stripsnug(struct parse *p, struct re_guts *g)
1839{
1840	g->nstates = p->slen;
1841	g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop));
1842	if (g->strip == NULL) {
1843		SETERROR(REG_ESPACE);
1844		g->strip = p->strip;
1845	}
1846}
1847
1848/*
1849 - findmust - fill in must and mlen with longest mandatory literal string
1850 == static void findmust(struct parse *p, struct re_guts *g);
1851 *
1852 * This algorithm could do fancy things like analyzing the operands of |
1853 * for common subsequences.  Someday.  This code is simple and finds most
1854 * of the interesting cases.
1855 *
1856 * Note that must and mlen got initialized during setup.
1857 */
1858static void
1859findmust(struct parse *p, struct re_guts *g)
1860{
1861	sop *scan;
1862	sop *start = NULL;
1863	sop *newstart = NULL;
1864	sopno newlen;
1865	sop s;
1866	char *cp;
1867	int offset;
1868	char buf[MB_LEN_MAX];
1869	size_t clen;
1870	mbstate_t mbs;
1871
1872	/* avoid making error situations worse */
1873	if (p->error != 0)
1874		return;
1875
1876	/*
1877	 * It's not generally safe to do a ``char'' substring search on
1878	 * multibyte character strings, but it's safe for at least
1879	 * UTF-8 (see RFC 3629).
1880	 */
1881	if (MB_CUR_MAX > 1 &&
1882	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1883		return;
1884
1885	/* find the longest OCHAR sequence in strip */
1886	newlen = 0;
1887	offset = 0;
1888	g->moffset = 0;
1889	scan = g->strip + 1;
1890	do {
1891		s = *scan++;
1892		switch (OP(s)) {
1893		case OCHAR:		/* sequence member */
1894			if (newlen == 0) {		/* new sequence */
1895				memset(&mbs, 0, sizeof(mbs));
1896				newstart = scan - 1;
1897			}
1898			clen = wcrtomb(buf, OPND(s), &mbs);
1899			if (clen == (size_t)-1)
1900				goto toohard;
1901			newlen += clen;
1902			break;
1903		case OPLUS_:		/* things that don't break one */
1904		case OLPAREN:
1905		case ORPAREN:
1906			break;
1907		case OQUEST_:		/* things that must be skipped */
1908		case OCH_:
1909			offset = altoffset(scan, offset);
1910			scan--;
1911			do {
1912				scan += OPND(s);
1913				s = *scan;
1914				/* assert() interferes w debug printouts */
1915				if (OP(s) != (sop)O_QUEST &&
1916				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1917					g->iflags |= BAD;
1918					return;
1919				}
1920			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1921			/* FALLTHROUGH */
1922		case OBOW:		/* things that break a sequence */
1923		case OEOW:
1924		case OBOL:
1925		case OEOL:
1926		case OBOS:
1927		case OEOS:
1928		case OWBND:
1929		case ONWBND:
1930		case O_QUEST:
1931		case O_CH:
1932		case OEND:
1933			if (newlen > (sopno)g->mlen) {		/* ends one */
1934				start = newstart;
1935				g->mlen = newlen;
1936				if (offset > -1) {
1937					g->moffset += offset;
1938					offset = newlen;
1939				} else
1940					g->moffset = offset;
1941			} else {
1942				if (offset > -1)
1943					offset += newlen;
1944			}
1945			newlen = 0;
1946			break;
1947		case OANY:
1948			if (newlen > (sopno)g->mlen) {		/* ends one */
1949				start = newstart;
1950				g->mlen = newlen;
1951				if (offset > -1) {
1952					g->moffset += offset;
1953					offset = newlen;
1954				} else
1955					g->moffset = offset;
1956			} else {
1957				if (offset > -1)
1958					offset += newlen;
1959			}
1960			if (offset > -1)
1961				offset++;
1962			newlen = 0;
1963			break;
1964		case OANYOF:		/* may or may not invalidate offset */
1965			/* First, everything as OANY */
1966			if (newlen > (sopno)g->mlen) {		/* ends one */
1967				start = newstart;
1968				g->mlen = newlen;
1969				if (offset > -1) {
1970					g->moffset += offset;
1971					offset = newlen;
1972				} else
1973					g->moffset = offset;
1974			} else {
1975				if (offset > -1)
1976					offset += newlen;
1977			}
1978			if (offset > -1)
1979				offset++;
1980			newlen = 0;
1981			break;
1982		toohard:
1983		default:
1984			/* Anything here makes it impossible or too hard
1985			 * to calculate the offset -- so we give up;
1986			 * save the last known good offset, in case the
1987			 * must sequence doesn't occur later.
1988			 */
1989			if (newlen > (sopno)g->mlen) {		/* ends one */
1990				start = newstart;
1991				g->mlen = newlen;
1992				if (offset > -1)
1993					g->moffset += offset;
1994				else
1995					g->moffset = offset;
1996			}
1997			offset = -1;
1998			newlen = 0;
1999			break;
2000		}
2001	} while (OP(s) != OEND);
2002
2003	if (g->mlen == 0) {		/* there isn't one */
2004		g->moffset = -1;
2005		return;
2006	}
2007
2008	/* turn it into a character string */
2009	g->must = malloc((size_t)g->mlen + 1);
2010	if (g->must == NULL) {		/* argh; just forget it */
2011		g->mlen = 0;
2012		g->moffset = -1;
2013		return;
2014	}
2015	cp = g->must;
2016	scan = start;
2017	memset(&mbs, 0, sizeof(mbs));
2018	while (cp < g->must + g->mlen) {
2019		while (OP(s = *scan++) != OCHAR)
2020			continue;
2021		clen = wcrtomb(cp, OPND(s), &mbs);
2022		assert(clen != (size_t)-1);
2023		cp += clen;
2024	}
2025	assert(cp == g->must + g->mlen);
2026	*cp++ = '\0';		/* just on general principles */
2027}
2028
2029/*
2030 - altoffset - choose biggest offset among multiple choices
2031 == static int altoffset(sop *scan, int offset);
2032 *
2033 * Compute, recursively if necessary, the largest offset among multiple
2034 * re paths.
2035 */
2036static int
2037altoffset(sop *scan, int offset)
2038{
2039	int largest;
2040	int try;
2041	sop s;
2042
2043	/* If we gave up already on offsets, return */
2044	if (offset == -1)
2045		return -1;
2046
2047	largest = 0;
2048	try = 0;
2049	s = *scan++;
2050	while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
2051		switch (OP(s)) {
2052		case OOR1:
2053			if (try > largest)
2054				largest = try;
2055			try = 0;
2056			break;
2057		case OQUEST_:
2058		case OCH_:
2059			try = altoffset(scan, try);
2060			if (try == -1)
2061				return -1;
2062			scan--;
2063			do {
2064				scan += OPND(s);
2065				s = *scan;
2066				if (OP(s) != (sop)O_QUEST &&
2067				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
2068					return -1;
2069			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
2070			/* We must skip to the next position, or we'll
2071			 * leave altoffset() too early.
2072			 */
2073			scan++;
2074			break;
2075		case OANYOF:
2076		case OCHAR:
2077		case OANY:
2078			try++;
2079		case OBOW:
2080		case OEOW:
2081		case OWBND:
2082		case ONWBND:
2083		case OLPAREN:
2084		case ORPAREN:
2085		case OOR2:
2086			break;
2087		default:
2088			try = -1;
2089			break;
2090		}
2091		if (try == -1)
2092			return -1;
2093		s = *scan++;
2094	}
2095
2096	if (try > largest)
2097		largest = try;
2098
2099	return largest+offset;
2100}
2101
2102/*
2103 - computejumps - compute char jumps for BM scan
2104 == static void computejumps(struct parse *p, struct re_guts *g);
2105 *
2106 * This algorithm assumes g->must exists and is has size greater than
2107 * zero. It's based on the algorithm found on Computer Algorithms by
2108 * Sara Baase.
2109 *
2110 * A char jump is the number of characters one needs to jump based on
2111 * the value of the character from the text that was mismatched.
2112 */
2113static void
2114computejumps(struct parse *p, struct re_guts *g)
2115{
2116	int ch;
2117	int mindex;
2118
2119	/* Avoid making errors worse */
2120	if (p->error != 0)
2121		return;
2122
2123	g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int));
2124	if (g->charjump == NULL)	/* Not a fatal error */
2125		return;
2126	/* Adjust for signed chars, if necessary */
2127	g->charjump = &g->charjump[-(CHAR_MIN)];
2128
2129	/* If the character does not exist in the pattern, the jump
2130	 * is equal to the number of characters in the pattern.
2131	 */
2132	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2133		g->charjump[ch] = g->mlen;
2134
2135	/* If the character does exist, compute the jump that would
2136	 * take us to the last character in the pattern equal to it
2137	 * (notice that we match right to left, so that last character
2138	 * is the first one that would be matched).
2139	 */
2140	for (mindex = 0; mindex < g->mlen; mindex++)
2141		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2142}
2143
2144/*
2145 - computematchjumps - compute match jumps for BM scan
2146 == static void computematchjumps(struct parse *p, struct re_guts *g);
2147 *
2148 * This algorithm assumes g->must exists and is has size greater than
2149 * zero. It's based on the algorithm found on Computer Algorithms by
2150 * Sara Baase.
2151 *
2152 * A match jump is the number of characters one needs to advance based
2153 * on the already-matched suffix.
2154 * Notice that all values here are minus (g->mlen-1), because of the way
2155 * the search algorithm works.
2156 */
2157static void
2158computematchjumps(struct parse *p, struct re_guts *g)
2159{
2160	int mindex;		/* General "must" iterator */
2161	int suffix;		/* Keeps track of matching suffix */
2162	int ssuffix;		/* Keeps track of suffixes' suffix */
2163	int* pmatches;		/* pmatches[k] points to the next i
2164				 * such that i+1...mlen is a substring
2165				 * of k+1...k+mlen-i-1
2166				 */
2167
2168	/* Avoid making errors worse */
2169	if (p->error != 0)
2170		return;
2171
2172	pmatches = (int*) malloc(g->mlen * sizeof(int));
2173	if (pmatches == NULL) {
2174		g->matchjump = NULL;
2175		return;
2176	}
2177
2178	g->matchjump = (int*) malloc(g->mlen * sizeof(int));
2179	if (g->matchjump == NULL) {	/* Not a fatal error */
2180		free(pmatches);
2181		return;
2182	}
2183
2184	/* Set maximum possible jump for each character in the pattern */
2185	for (mindex = 0; mindex < g->mlen; mindex++)
2186		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
2187
2188	/* Compute pmatches[] */
2189	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
2190	    mindex--, suffix--) {
2191		pmatches[mindex] = suffix;
2192
2193		/* If a mismatch is found, interrupting the substring,
2194		 * compute the matchjump for that position. If no
2195		 * mismatch is found, then a text substring mismatched
2196		 * against the suffix will also mismatch against the
2197		 * substring.
2198		 */
2199		while (suffix < g->mlen
2200		    && g->must[mindex] != g->must[suffix]) {
2201			g->matchjump[suffix] = MIN(g->matchjump[suffix],
2202			    g->mlen - mindex - 1);
2203			suffix = pmatches[suffix];
2204		}
2205	}
2206
2207	/* Compute the matchjump up to the last substring found to jump
2208	 * to the beginning of the largest must pattern prefix matching
2209	 * it's own suffix.
2210	 */
2211	for (mindex = 0; mindex <= suffix; mindex++)
2212		g->matchjump[mindex] = MIN(g->matchjump[mindex],
2213		    g->mlen + suffix - mindex);
2214
2215        ssuffix = pmatches[suffix];
2216        while (suffix < g->mlen) {
2217                while (suffix <= ssuffix && suffix < g->mlen) {
2218                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
2219			    g->mlen + ssuffix - suffix);
2220                        suffix++;
2221                }
2222		if (suffix < g->mlen)
2223                	ssuffix = pmatches[ssuffix];
2224        }
2225
2226	free(pmatches);
2227}
2228
2229/*
2230 - pluscount - count + nesting
2231 == static sopno pluscount(struct parse *p, struct re_guts *g);
2232 */
2233static sopno			/* nesting depth */
2234pluscount(struct parse *p, struct re_guts *g)
2235{
2236	sop *scan;
2237	sop s;
2238	sopno plusnest = 0;
2239	sopno maxnest = 0;
2240
2241	if (p->error != 0)
2242		return(0);	/* there may not be an OEND */
2243
2244	scan = g->strip + 1;
2245	do {
2246		s = *scan++;
2247		switch (OP(s)) {
2248		case OPLUS_:
2249			plusnest++;
2250			break;
2251		case O_PLUS:
2252			if (plusnest > maxnest)
2253				maxnest = plusnest;
2254			plusnest--;
2255			break;
2256		}
2257	} while (OP(s) != OEND);
2258	if (plusnest != 0)
2259		g->iflags |= BAD;
2260	return(maxnest);
2261}
2262