1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25/* lasciate ogne speranza, voi ch'intrate. */
26
27#define	DEBUG
28
29#include <ctype.h>
30#include <limits.h>
31#include <stdio.h>
32#include <string.h>
33#include <stdlib.h>
34#include "awk.h"
35#include "awkgram.tab.h"
36
37#define MAXLIN 22
38
39#define type(v)		(v)->nobj	/* badly overloaded here */
40#define info(v)		(v)->ntype	/* badly overloaded here */
41#define left(v)		(v)->narg[0]
42#define right(v)	(v)->narg[1]
43#define parent(v)	(v)->nnext
44
45#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46#define ELEAF	case EMPTYRE:		/* empty string in regexp */
47#define UNARY	case STAR: case PLUS: case QUEST:
48
49/* encoding in tree Nodes:
50	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51		left is index, right contains value or pointer to value
52	unary (STAR, PLUS, QUEST): left is child, right is null
53	binary (CAT, OR): left and right are children
54	parent contains pointer to parent
55*/
56
57
58int	*setvec;
59int	*tmpset;
60int	maxsetvec = 0;
61
62int	rtok;		/* next token in current re */
63int	rlxval;
64static const uschar	*rlxstr;
65static const uschar	*prestr;	/* current position in current re */
66static const uschar	*lastre;	/* origin of last re */
67static const uschar	*lastatom;	/* origin of last Atom */
68static const uschar	*starttok;
69static const uschar 	*basestr;	/* starts with original, replaced during
70				   repetition processing */
71static const uschar 	*firstbasestr;
72
73static	int setcnt;
74static	int poscnt;
75
76const char	*patbeg;
77int	patlen;
78
79#define	NFA	128	/* cache this many dynamic fa's */
80fa	*fatab[NFA];
81int	nfatab	= 0;	/* entries in fatab */
82
83extern int u8_nextlen(const char *s);
84
85
86/* utf-8 mechanism:
87
88   For most of Awk, utf-8 strings just "work", since they look like
89   null-terminated sequences of 8-bit bytes.
90
91   Functions like length(), index(), and substr() have to operate
92   in units of utf-8 characters.  The u8_* functions in run.c
93   handle this.
94
95   Regular expressions are more complicated, since the basic
96   mechanism of the goto table used 8-bit byte indices into the
97   gototab entries to compute the next state.  Unicode is a lot
98   bigger, so the gototab entries are now structs with a character
99   and a next state. These are sorted by code point and binary
100   searched.
101
102   Throughout the RE mechanism in b.c, utf-8 characters are
103   converted to their utf-32 value.  This mostly shows up in
104   cclenter, which expands character class ranges like a-z and now
105   alpha-omega.  The size of a gototab array is still about 256.
106   This should be dynamic, but for now things work ok for a single
107   code page of Unicode, which is the most likely case.
108
109   The code changes are localized in run.c and b.c.  I have added a
110   handful of functions to somewhat better hide the implementation,
111   but a lot more could be done.
112
113 */
114
115static int entry_cmp(const void *l, const void *r);
116static int get_gototab(fa*, int, int);
117static int set_gototab(fa*, int, int, int);
118static void clear_gototab(fa*, int);
119extern int u8_rune(int *, const char *);
120
121static int *
122intalloc(size_t n, const char *f)
123{
124	int *p = (int *) calloc(n, sizeof(int));
125	if (p == NULL)
126		overflo(f);
127	return p;
128}
129
130static void
131resizesetvec(const char *f)
132{
133	if (maxsetvec == 0)
134		maxsetvec = MAXLIN;
135	else
136		maxsetvec *= 4;
137	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139	if (setvec == NULL || tmpset == NULL)
140		overflo(f);
141}
142
143static void
144resize_state(fa *f, int state)
145{
146	gtt *p;
147	uschar *p2;
148	int **p3;
149	int i, new_count;
150
151	if (++state < f->state_count)
152		return;
153
154	new_count = state + 10; /* needs to be tuned */
155
156	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157	if (p == NULL)
158		goto out;
159	f->gototab = p;
160
161	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162	if (p2 == NULL)
163		goto out;
164	f->out = p2;
165
166	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167	if (p3 == NULL)
168		goto out;
169	f->posns = p3;
170
171	for (i = f->state_count; i < new_count; ++i) {
172		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173		if (f->gototab[i].entries == NULL)
174			goto out;
175		f->gototab[i].allocated = NCHARS;
176		f->gototab[i].inuse = 0;
177		f->out[i] = 0;
178		f->posns[i] = NULL;
179	}
180	f->state_count = new_count;
181	return;
182out:
183	overflo(__func__);
184}
185
186fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
187{
188	int i, use, nuse;
189	fa *pfa;
190	static int now = 1;
191
192	if (setvec == NULL) {	/* first time through any RE */
193		resizesetvec(__func__);
194	}
195
196	if (compile_time != RUNNING)	/* a constant for sure */
197		return mkdfa(s, anchor);
198	for (i = 0; i < nfatab; i++)	/* is it there already? */
199		if (fatab[i]->anchor == anchor
200		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
201			fatab[i]->use = now++;
202			return fatab[i];
203		}
204	pfa = mkdfa(s, anchor);
205	if (nfatab < NFA) {	/* room for another */
206		fatab[nfatab] = pfa;
207		fatab[nfatab]->use = now++;
208		nfatab++;
209		return pfa;
210	}
211	use = fatab[0]->use;	/* replace least-recently used */
212	nuse = 0;
213	for (i = 1; i < nfatab; i++)
214		if (fatab[i]->use < use) {
215			use = fatab[i]->use;
216			nuse = i;
217		}
218	freefa(fatab[nuse]);
219	fatab[nuse] = pfa;
220	pfa->use = now++;
221	return pfa;
222}
223
224fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
225				/* anchor = true for anchored matches, else false */
226{
227	Node *p, *p1;
228	fa *f;
229
230	firstbasestr = (const uschar *) s;
231	basestr = firstbasestr;
232	p = reparse(s);
233	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234		/* put ALL STAR in front of reg.  exp. */
235	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236		/* put FINAL after reg.  exp. */
237
238	poscnt = 0;
239	penter(p1);	/* enter parent pointers and leaf indices */
240	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241		overflo(__func__);
242	f->accept = poscnt-1;	/* penter has computed number of positions in re */
243	cfoll(f, p1);	/* set up follow sets */
244	freetr(p1);
245	resize_state(f, 1);
246	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247	f->posns[1] = intalloc(1, __func__);
248	*f->posns[1] = 0;
249	f->initstat = makeinit(f, anchor);
250	f->anchor = anchor;
251	f->restr = (uschar *) tostring(s);
252	if (firstbasestr != basestr) {
253		if (basestr)
254			xfree(basestr);
255	}
256	return f;
257}
258
259int makeinit(fa *f, bool anchor)
260{
261	int i, k;
262
263	f->curstat = 2;
264	f->out[2] = 0;
265	k = *(f->re[0].lfollow);
266	xfree(f->posns[2]);
267	f->posns[2] = intalloc(k + 1,  __func__);
268	for (i = 0; i <= k; i++) {
269		(f->posns[2])[i] = (f->re[0].lfollow)[i];
270	}
271	if ((f->posns[2])[1] == f->accept)
272		f->out[2] = 1;
273	clear_gototab(f, 2);
274	f->curstat = cgoto(f, 2, HAT);
275	if (anchor) {
276		*f->posns[2] = k-1;	/* leave out position 0 */
277		for (i = 0; i < k; i++) {
278			(f->posns[0])[i] = (f->posns[2])[i];
279		}
280
281		f->out[0] = f->out[2];
282		if (f->curstat != 2)
283			--(*f->posns[f->curstat]);
284	}
285	return f->curstat;
286}
287
288void penter(Node *p)	/* set up parent pointers and leaf indices */
289{
290	switch (type(p)) {
291	ELEAF
292	LEAF
293		info(p) = poscnt;
294		poscnt++;
295		break;
296	UNARY
297		penter(left(p));
298		parent(left(p)) = p;
299		break;
300	case CAT:
301	case OR:
302		penter(left(p));
303		penter(right(p));
304		parent(left(p)) = p;
305		parent(right(p)) = p;
306		break;
307	case ZERO:
308		break;
309	default:	/* can't happen */
310		FATAL("can't happen: unknown type %d in penter", type(p));
311		break;
312	}
313}
314
315void freetr(Node *p)	/* free parse tree */
316{
317	switch (type(p)) {
318	ELEAF
319	LEAF
320		xfree(p);
321		break;
322	UNARY
323	case ZERO:
324		freetr(left(p));
325		xfree(p);
326		break;
327	case CAT:
328	case OR:
329		freetr(left(p));
330		freetr(right(p));
331		xfree(p);
332		break;
333	default:	/* can't happen */
334		FATAL("can't happen: unknown type %d in freetr", type(p));
335		break;
336	}
337}
338
339/* in the parsing of regular expressions, metacharacters like . have */
340/* to be seen literally;  \056 is not a metacharacter. */
341
342int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
343{			/* only pick up one 8-bit byte (2 chars) */
344	const uschar *p;
345	int n = 0;
346	int i;
347
348	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349		if (isdigit((int) *p))
350			n = 16 * n + *p - '0';
351		else if (*p >= 'a' && *p <= 'f')
352			n = 16 * n + *p - 'a' + 10;
353		else if (*p >= 'A' && *p <= 'F')
354			n = 16 * n + *p - 'A' + 10;
355	}
356	*pp = p;
357	return n;
358}
359
360
361
362#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
363
364int quoted(const uschar **pp)	/* pick up next thing after a \\ */
365			/* and increment *pp */
366{
367	const uschar *p = *pp;
368	int c;
369
370/* BUG: should advance by utf-8 char even if makes no sense */
371
372	if ((c = *p++) == 't') {
373		c = '\t';
374	} else if (c == 'n') {
375		c = '\n';
376	} else if (c == 'f') {
377		c = '\f';
378	} else if (c == 'r') {
379		c = '\r';
380	} else if (c == 'b') {
381		c = '\b';
382	} else if (c == 'v') {
383		c = '\v';
384	} else if (c == 'a') {
385		c = '\a';
386	} else if (c == '\\') {
387		c = '\\';
388	} else if (c == 'x') {	/* 2 hex digits follow */
389		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
390	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
391		c = hexstr(&p, 8);
392	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
393		int n = c - '0';
394		if (isoctdigit(*p)) {
395			n = 8 * n + *p++ - '0';
396			if (isoctdigit(*p))
397				n = 8 * n + *p++ - '0';
398		}
399		c = n;
400	} /* else */
401		/* c = c; */
402	*pp = p;
403	return c;
404}
405
406int *cclenter(const char *argp)	/* add a character class */
407{
408	int i, c, c2;
409	int n;
410	const uschar *p = (const uschar *) argp;
411	int *bp, *retp;
412	static int *buf = NULL;
413	static int bufsz = 100;
414
415	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
416		FATAL("out of space for character class [%.10s...] 1", p);
417	bp = buf;
418	for (i = 0; *p != 0; ) {
419		n = u8_rune(&c, (const char *) p);
420		p += n;
421		if (c == '\\') {
422			c = quoted(&p);
423		} else if (c == '-' && i > 0 && bp[-1] != 0) {
424			if (*p != 0) {
425				c = bp[-1];
426				/* c2 = *p++; */
427				n = u8_rune(&c2, (const char *) p);
428				p += n;
429				if (c2 == '\\')
430					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
431				if (c > c2) {	/* empty; ignore */
432					bp--;
433					i--;
434					continue;
435				}
436				while (c < c2) {
437					if (i >= bufsz) {
438						bufsz *= 2;
439						buf = (int *) realloc(buf, bufsz * sizeof(int));
440						if (buf == NULL)
441							FATAL("out of space for character class [%.10s...] 2", p);
442						bp = buf + i;
443					}
444					*bp++ = ++c;
445					i++;
446				}
447				continue;
448			}
449		}
450		if (i >= bufsz) {
451			bufsz *= 2;
452			buf = (int *) realloc(buf, bufsz * sizeof(int));
453			if (buf == NULL)
454				FATAL("out of space for character class [%.10s...] 2", p);
455			bp = buf + i;
456		}
457		*bp++ = c;
458		i++;
459	}
460	*bp = 0;
461	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
462	/* xfree(op);  BUG: what are we freeing here? */
463	retp = (int *) calloc(bp-buf+1, sizeof(int));
464	for (i = 0; i < bp-buf+1; i++)
465		retp[i] = buf[i];
466	return retp;
467}
468
469void overflo(const char *s)
470{
471	FATAL("regular expression too big: out of space in %.30s...", s);
472}
473
474void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
475{
476	int i;
477	int *p;
478
479	switch (type(v)) {
480	ELEAF
481	LEAF
482		f->re[info(v)].ltype = type(v);
483		f->re[info(v)].lval.np = right(v);
484		while (f->accept >= maxsetvec) {	/* guessing here! */
485			resizesetvec(__func__);
486		}
487		for (i = 0; i <= f->accept; i++)
488			setvec[i] = 0;
489		setcnt = 0;
490		follow(v);	/* computes setvec and setcnt */
491		p = intalloc(setcnt + 1, __func__);
492		f->re[info(v)].lfollow = p;
493		*p = setcnt;
494		for (i = f->accept; i >= 0; i--)
495			if (setvec[i] == 1)
496				*++p = i;
497		break;
498	UNARY
499		cfoll(f,left(v));
500		break;
501	case CAT:
502	case OR:
503		cfoll(f,left(v));
504		cfoll(f,right(v));
505		break;
506	case ZERO:
507		break;
508	default:	/* can't happen */
509		FATAL("can't happen: unknown type %d in cfoll", type(v));
510	}
511}
512
513int first(Node *p)	/* collects initially active leaves of p into setvec */
514			/* returns 0 if p matches empty string */
515{
516	int b, lp;
517
518	switch (type(p)) {
519	ELEAF
520	LEAF
521		lp = info(p);	/* look for high-water mark of subscripts */
522		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
523			resizesetvec(__func__);
524		}
525		if (type(p) == EMPTYRE) {
526			setvec[lp] = 0;
527			return(0);
528		}
529		if (setvec[lp] != 1) {
530			setvec[lp] = 1;
531			setcnt++;
532		}
533		if (type(p) == CCL && (*(int *) right(p)) == 0)
534			return(0);		/* empty CCL */
535		return(1);
536	case PLUS:
537		if (first(left(p)) == 0)
538			return(0);
539		return(1);
540	case STAR:
541	case QUEST:
542		first(left(p));
543		return(0);
544	case CAT:
545		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
546		return(1);
547	case OR:
548		b = first(right(p));
549		if (first(left(p)) == 0 || b == 0) return(0);
550		return(1);
551	case ZERO:
552		return 0;
553	}
554	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
555	return(-1);
556}
557
558void follow(Node *v)	/* collects leaves that can follow v into setvec */
559{
560	Node *p;
561
562	if (type(v) == FINAL)
563		return;
564	p = parent(v);
565	switch (type(p)) {
566	case STAR:
567	case PLUS:
568		first(v);
569		follow(p);
570		return;
571
572	case OR:
573	case QUEST:
574		follow(p);
575		return;
576
577	case CAT:
578		if (v == left(p)) {	/* v is left child of p */
579			if (first(right(p)) == 0) {
580				follow(p);
581				return;
582			}
583		} else		/* v is right child */
584			follow(p);
585		return;
586	}
587}
588
589int member(int c, int *sarg)	/* is c in s? */
590{
591	int *s = (int *) sarg;
592
593	while (*s)
594		if (c == *s++)
595			return(1);
596	return(0);
597}
598
599static void resize_gototab(fa *f, int state)
600{
601	size_t new_size = f->gototab[state].allocated * 2;
602	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
603	if (p == NULL)
604		overflo(__func__);
605
606	// need to initialized the new memory to zero
607	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
608	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
609
610	f->gototab[state].allocated = new_size;			// update gototab info
611	f->gototab[state].entries = p;
612}
613
614static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
615{
616	gtte key;
617	gtte *item;
618
619	key.ch = ch;
620	key.state = 0;	/* irrelevant */
621	item = (gtte *) bsearch(& key, f->gototab[state].entries,
622			f->gototab[state].inuse, sizeof(gtte),
623			entry_cmp);
624
625	if (item == NULL)
626		return 0;
627	else
628		return item->state;
629}
630
631static int entry_cmp(const void *l, const void *r)
632{
633	const gtte *left, *right;
634
635	left = (const gtte *) l;
636	right = (const gtte *) r;
637
638	return left->ch - right->ch;
639}
640
641static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
642{
643	if (f->gototab[state].inuse == 0) {
644		f->gototab[state].entries[0].ch = ch;
645		f->gototab[state].entries[0].state = val;
646		f->gototab[state].inuse++;
647		return val;
648	} else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
649		// not seen yet, insert and return
650		gtt *tab = & f->gototab[state];
651		if (tab->inuse + 1 >= tab->allocated)
652			resize_gototab(f, state);
653
654		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
655		f->gototab[state].entries[f->gototab[state].inuse].state = val;
656		f->gototab[state].inuse++;
657		return val;
658	} else {
659		// maybe we have it, maybe we don't
660		gtte key;
661		gtte *item;
662
663		key.ch = ch;
664		key.state = 0;	/* irrelevant */
665		item = (gtte *) bsearch(& key, f->gototab[state].entries,
666				f->gototab[state].inuse, sizeof(gtte),
667				entry_cmp);
668
669		if (item != NULL) {
670			// we have it, update state and return
671			item->state = val;
672			return item->state;
673		}
674		// otherwise, fall through to insert and reallocate.
675	}
676
677	gtt *tab = & f->gototab[state];
678	if (tab->inuse + 1 >= tab->allocated)
679		resize_gototab(f, state);
680	f->gototab[state].entries[tab->inuse].ch = ch;
681	f->gototab[state].entries[tab->inuse].state = val;
682	++tab->inuse;
683
684	qsort(f->gototab[state].entries,
685		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
686
687	return val; /* not used anywhere at the moment */
688}
689
690static void clear_gototab(fa *f, int state)
691{
692	memset(f->gototab[state].entries, 0,
693		f->gototab[state].allocated * sizeof(gtte));
694	f->gototab[state].inuse = 0;
695}
696
697int match(fa *f, const char *p0)	/* shortest match ? */
698{
699	int s, ns;
700	int n;
701	int rune;
702	const uschar *p = (const uschar *) p0;
703
704	/* return pmatch(f, p0); does it matter whether longest or shortest? */
705
706	s = f->initstat;
707	assert (s < f->state_count);
708
709	if (f->out[s])
710		return(1);
711	do {
712		/* assert(*p < NCHARS); */
713		n = u8_rune(&rune, (const char *) p);
714		if ((ns = get_gototab(f, s, rune)) != 0)
715			s = ns;
716		else
717			s = cgoto(f, s, rune);
718		if (f->out[s])
719			return(1);
720		if (*p == 0)
721			break;
722		p += n;
723	} while (1);  /* was *p++ != 0 */
724	return(0);
725}
726
727int pmatch(fa *f, const char *p0)	/* longest match, for sub */
728{
729	int s, ns;
730	int n;
731	int rune;
732	const uschar *p = (const uschar *) p0;
733	const uschar *q;
734
735	s = f->initstat;
736	assert(s < f->state_count);
737
738	patbeg = (const char *)p;
739	patlen = -1;
740	do {
741		q = p;
742		do {
743			if (f->out[s])		/* final state */
744				patlen = q-p;
745			/* assert(*q < NCHARS); */
746			n = u8_rune(&rune, (const char *) q);
747			if ((ns = get_gototab(f, s, rune)) != 0)
748				s = ns;
749			else
750				s = cgoto(f, s, rune);
751
752			assert(s < f->state_count);
753
754			if (s == 1) {	/* no transition */
755				if (patlen >= 0) {
756					patbeg = (const char *) p;
757					return(1);
758				}
759				else
760					goto nextin;	/* no match */
761			}
762			if (*q == 0)
763				break;
764			q += n;
765		} while (1);
766		q++;  /* was *q++ */
767		if (f->out[s])
768			patlen = q-p-1;	/* don't count $ */
769		if (patlen >= 0) {
770			patbeg = (const char *) p;
771			return(1);
772		}
773	nextin:
774		s = 2;
775		if (*p == 0)
776			break;
777		n = u8_rune(&rune, (const char *) p);
778		p += n;
779	} while (1); /* was *p++ */
780	return (0);
781}
782
783int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
784{
785	int s, ns;
786        int n;
787        int rune;
788	const uschar *p = (const uschar *) p0;
789	const uschar *q;
790
791	s = f->initstat;
792	assert(s < f->state_count);
793
794	patbeg = (const char *)p;
795	patlen = -1;
796	while (*p) {
797		q = p;
798		do {
799			if (f->out[s])		/* final state */
800				patlen = q-p;
801			/* assert(*q < NCHARS); */
802			n = u8_rune(&rune, (const char *) q);
803			if ((ns = get_gototab(f, s, rune)) != 0)
804				s = ns;
805			else
806				s = cgoto(f, s, rune);
807			if (s == 1) {	/* no transition */
808				if (patlen > 0) {
809					patbeg = (const char *) p;
810					return(1);
811				} else
812					goto nnextin;	/* no nonempty match */
813			}
814			if (*q == 0)
815				break;
816			q += n;
817		} while (1);
818		q++;
819		if (f->out[s])
820			patlen = q-p-1;	/* don't count $ */
821		if (patlen > 0 ) {
822			patbeg = (const char *) p;
823			return(1);
824		}
825	nnextin:
826		s = 2;
827		p++;
828	}
829	return (0);
830}
831
832
833/*
834 * NAME
835 *     fnematch
836 *
837 * DESCRIPTION
838 *     A stream-fed version of nematch which transfers characters to a
839 *     null-terminated buffer. All characters up to and including the last
840 *     character of the matching text or EOF are placed in the buffer. If
841 *     a match is found, patbeg and patlen are set appropriately.
842 *
843 * RETURN VALUES
844 *     false    No match found.
845 *     true     Match found.
846 */
847
848bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
849{
850	char *i, *j, *k, *buf = *pbuf;
851	int bufsize = *pbufsize;
852	int c, n, ns, s;
853
854	s = pfa->initstat;
855	patlen = 0;
856
857	/*
858	 * buf <= i <= j <= k <= buf+bufsize
859	 *
860	 * i: origin of active substring
861	 * j: current character
862	 * k: destination of the next getc
863	 */
864
865	i = j = k = buf;
866
867	do {
868		/*
869		 * Call u8_rune with at least awk_mb_cur_max ahead in
870		 * the buffer until EOF interferes.
871		 */
872		if (k - j < awk_mb_cur_max) {
873			if (k + awk_mb_cur_max > buf + bufsize) {
874				char *obuf = buf;
875				adjbuf((char **) &buf, &bufsize,
876				    bufsize + awk_mb_cur_max,
877				    quantum, 0, "fnematch");
878
879				/* buf resized, maybe moved. update pointers */
880				*pbufsize = bufsize;
881				if (obuf != buf) {
882					i = buf + (i - obuf);
883					j = buf + (j - obuf);
884					k = buf + (k - obuf);
885					*pbuf = buf;
886					if (patlen)
887						patbeg = buf + (patbeg - obuf);
888				}
889			}
890			for (n = awk_mb_cur_max ; n > 0; n--) {
891				*k++ = (c = getc(f)) != EOF ? c : 0;
892				if (c == EOF) {
893					if (ferror(f))
894						FATAL("fnematch: getc error");
895					break;
896				}
897			}
898		}
899
900		j += u8_rune(&c, j);
901
902		if ((ns = get_gototab(pfa, s, c)) != 0)
903			s = ns;
904		else
905			s = cgoto(pfa, s, c);
906
907		if (pfa->out[s]) {	/* final state */
908			patbeg = i;
909			patlen = j - i;
910			if (c == 0)	/* don't count $ */
911				patlen--;
912		}
913
914		if (c && s != 1)
915			continue;  /* origin i still viable, next j */
916		if (patlen)
917			break;     /* best match found */
918
919		/* no match at origin i, next i and start over */
920		i += u8_rune(&c, i);
921		if (c == 0)
922			break;    /* no match */
923		j = i;
924		s = 2;
925	} while (1);
926
927	if (patlen) {
928		/*
929		 * Under no circumstances is the last character fed to
930		 * the automaton part of the match. It is EOF's nullbyte,
931		 * or it sent the automaton into a state with no further
932		 * transitions available (s==1), or both. Room for a
933		 * terminating nullbyte is guaranteed.
934		 *
935		 * ungetc any chars after the end of matching text
936		 * (except for EOF's nullbyte, if present) and null
937		 * terminate the buffer.
938		 */
939		do
940			if (*--k && ungetc(*k, f) == EOF)
941				FATAL("unable to ungetc '%c'", *k);
942		while (k > patbeg + patlen);
943		*k = '\0';
944		return true;
945	}
946	else
947		return false;
948}
949
950Node *reparse(const char *p)	/* parses regular expression pointed to by p */
951{			/* uses relex() to scan regular expression */
952	Node *np;
953
954	DPRINTF("reparse <%s>\n", p);
955	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
956	rtok = relex();
957	/* GNU compatibility: an empty regexp matches anything */
958	if (rtok == '\0') {
959		/* FATAL("empty regular expression"); previous */
960		return(op2(EMPTYRE, NIL, NIL));
961	}
962	np = regexp();
963	if (rtok != '\0')
964		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
965	return(np);
966}
967
968Node *regexp(void)	/* top-level parse of reg expr */
969{
970	return (alt(concat(primary())));
971}
972
973Node *primary(void)
974{
975	Node *np;
976	int savelastatom;
977
978	switch (rtok) {
979	case CHAR:
980		lastatom = starttok;
981		np = op2(CHAR, NIL, itonp(rlxval));
982		rtok = relex();
983		return (unary(np));
984	case ALL:
985		rtok = relex();
986		return (unary(op2(ALL, NIL, NIL)));
987	case EMPTYRE:
988		rtok = relex();
989		return (unary(op2(EMPTYRE, NIL, NIL)));
990	case DOT:
991		lastatom = starttok;
992		rtok = relex();
993		return (unary(op2(DOT, NIL, NIL)));
994	case CCL:
995		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
996		lastatom = starttok;
997		rtok = relex();
998		return (unary(np));
999	case NCCL:
1000		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1001		lastatom = starttok;
1002		rtok = relex();
1003		return (unary(np));
1004	case '^':
1005		rtok = relex();
1006		return (unary(op2(CHAR, NIL, itonp(HAT))));
1007	case '$':
1008		rtok = relex();
1009		return (unary(op2(CHAR, NIL, NIL)));
1010	case '(':
1011		lastatom = starttok;
1012		savelastatom = starttok - basestr; /* Retain over recursion */
1013		rtok = relex();
1014		if (rtok == ')') {	/* special pleading for () */
1015			rtok = relex();
1016			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1017		}
1018		np = regexp();
1019		if (rtok == ')') {
1020			lastatom = basestr + savelastatom; /* Restore */
1021			rtok = relex();
1022			return (unary(np));
1023		}
1024		else
1025			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1026	default:
1027		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1028	}
1029	return 0;	/*NOTREACHED*/
1030}
1031
1032Node *concat(Node *np)
1033{
1034	switch (rtok) {
1035	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1036		return (concat(op2(CAT, np, primary())));
1037	case EMPTYRE:
1038		rtok = relex();
1039		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1040				primary())));
1041	}
1042	return (np);
1043}
1044
1045Node *alt(Node *np)
1046{
1047	if (rtok == OR) {
1048		rtok = relex();
1049		return (alt(op2(OR, np, concat(primary()))));
1050	}
1051	return (np);
1052}
1053
1054Node *unary(Node *np)
1055{
1056	switch (rtok) {
1057	case STAR:
1058		rtok = relex();
1059		return (unary(op2(STAR, np, NIL)));
1060	case PLUS:
1061		rtok = relex();
1062		return (unary(op2(PLUS, np, NIL)));
1063	case QUEST:
1064		rtok = relex();
1065		return (unary(op2(QUEST, np, NIL)));
1066	case ZERO:
1067		rtok = relex();
1068		return (unary(op2(ZERO, np, NIL)));
1069	default:
1070		return (np);
1071	}
1072}
1073
1074/*
1075 * Character class definitions conformant to the POSIX locale as
1076 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1077 * and operating character sets are both ASCII (ISO646) or supersets
1078 * thereof.
1079 *
1080 * Note that to avoid overflowing the temporary buffer used in
1081 * relex(), the expanded character class (prior to range expansion)
1082 * must be less than twice the size of their full name.
1083 */
1084
1085/* Because isblank doesn't show up in any of the header files on any
1086 * system i use, it's defined here.  if some other locale has a richer
1087 * definition of "blank", define HAS_ISBLANK and provide your own
1088 * version.
1089 * the parentheses here are an attempt to find a path through the maze
1090 * of macro definition and/or function and/or version provided.  thanks
1091 * to nelson beebe for the suggestion; let's see if it works everywhere.
1092 */
1093
1094/* #define HAS_ISBLANK */
1095#ifndef HAS_ISBLANK
1096
1097int (xisblank)(int c)
1098{
1099	return c==' ' || c=='\t';
1100}
1101
1102#endif
1103
1104static const struct charclass {
1105	const char *cc_name;
1106	int cc_namelen;
1107	int (*cc_func)(int);
1108} charclasses[] = {
1109	{ "alnum",	5,	isalnum },
1110	{ "alpha",	5,	isalpha },
1111#ifndef HAS_ISBLANK
1112	{ "blank",	5,	xisblank },
1113#else
1114	{ "blank",	5,	isblank },
1115#endif
1116	{ "cntrl",	5,	iscntrl },
1117	{ "digit",	5,	isdigit },
1118	{ "graph",	5,	isgraph },
1119	{ "lower",	5,	islower },
1120	{ "print",	5,	isprint },
1121	{ "punct",	5,	ispunct },
1122	{ "space",	5,	isspace },
1123	{ "upper",	5,	isupper },
1124	{ "xdigit",	6,	isxdigit },
1125	{ NULL,		0,	NULL },
1126};
1127
1128#define REPEAT_SIMPLE		0
1129#define REPEAT_PLUS_APPENDED	1
1130#define REPEAT_WITH_Q		2
1131#define REPEAT_ZERO		3
1132
1133static int
1134replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1135	       int atomlen, int firstnum, int secondnum, int special_case)
1136{
1137	int i, j;
1138	uschar *buf = 0;
1139	int ret = 1;
1140	int init_q = (firstnum == 0);		/* first added char will be ? */
1141	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1142	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1143	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1144	int size = prefix_length +  suffix_length;
1145
1146	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1147		size += atomlen*(firstnum-1);
1148	}
1149
1150	/* Adjust size of buffer for special cases */
1151	if (special_case == REPEAT_PLUS_APPENDED) {
1152		size++;		/* for the final + */
1153	} else if (special_case == REPEAT_WITH_Q) {
1154		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1155	} else if (special_case == REPEAT_ZERO) {
1156		size += 2;	/* just a null ERE: () */
1157	}
1158	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1159		FATAL("out of space in reg expr %.10s..", lastre);
1160	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1161	j = prefix_length;
1162	if (special_case == REPEAT_ZERO) {
1163		j -= atomlen;
1164		buf[j++] = '(';
1165		buf[j++] = ')';
1166	}
1167	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1168		memcpy(&buf[j], atom, atomlen);
1169		j += atomlen;
1170	}
1171	if (special_case == REPEAT_PLUS_APPENDED) {
1172		buf[j++] = '+';
1173	} else if (special_case == REPEAT_WITH_Q) {
1174		if (init_q)
1175			buf[j++] = '?';
1176		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1177			memcpy(&buf[j], atom, atomlen);
1178			j += atomlen;
1179			buf[j++] = '?';
1180		}
1181	}
1182	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1183	j += suffix_length;
1184	buf[j] = '\0';
1185	/* free old basestr */
1186	if (firstbasestr != basestr) {
1187		if (basestr)
1188			xfree(basestr);
1189	}
1190	basestr = buf;
1191	prestr  = buf + prefix_length;
1192	if (special_case == REPEAT_ZERO) {
1193		prestr  -= atomlen;
1194		ret++;
1195	}
1196	return ret;
1197}
1198
1199static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1200		  int atomlen, int firstnum, int secondnum)
1201{
1202	/*
1203	   In general, the repetition specifier or "bound" is replaced here
1204	   by an equivalent ERE string, repeating the immediately previous atom
1205	   and appending ? and + as needed. Note that the first copy of the
1206	   atom is left in place, except in the special_case of a zero-repeat
1207	   (i.e., {0}).
1208	 */
1209	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1210		if (firstnum < 2) {
1211			/* 0 or 1: should be handled before you get here */
1212			FATAL("internal error");
1213		} else {
1214			return replace_repeat(reptok, reptoklen, atom, atomlen,
1215				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1216		}
1217	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1218		if (firstnum == 0) {	/* {0} or {0,0} */
1219			/* This case is unusual because the resulting
1220			   replacement string might actually be SMALLER than
1221			   the original ERE */
1222			return replace_repeat(reptok, reptoklen, atom, atomlen,
1223					firstnum, secondnum, REPEAT_ZERO);
1224		} else {		/* (firstnum >= 1) */
1225			return replace_repeat(reptok, reptoklen, atom, atomlen,
1226					firstnum, secondnum, REPEAT_SIMPLE);
1227		}
1228	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1229		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1230		return replace_repeat(reptok, reptoklen, atom, atomlen,
1231					firstnum, secondnum, REPEAT_WITH_Q);
1232	} else {	/* Error - shouldn't be here (n>m) */
1233		FATAL("internal error");
1234	}
1235	return 0;
1236}
1237
1238int relex(void)		/* lexical analyzer for reparse */
1239{
1240	int c, n;
1241	int cflag;
1242	static uschar *buf = NULL;
1243	static int bufsz = 100;
1244	uschar *bp;
1245	const struct charclass *cc;
1246	int i;
1247	int num, m;
1248	bool commafound, digitfound;
1249	const uschar *startreptok;
1250	static int parens = 0;
1251
1252rescan:
1253	starttok = prestr;
1254
1255	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1256		prestr += n;
1257		starttok = prestr;
1258		return CHAR;
1259	}
1260
1261	switch (c = *prestr++) {
1262	case '|': return OR;
1263	case '*': return STAR;
1264	case '+': return PLUS;
1265	case '?': return QUEST;
1266	case '.': return DOT;
1267	case '\0': prestr--; return '\0';
1268	case '^':
1269	case '$':
1270		return c;
1271	case '(':
1272		parens++;
1273		return c;
1274	case ')':
1275		if (parens) {
1276			parens--;
1277			return c;
1278		}
1279		/* unmatched close parenthesis; per POSIX, treat as literal */
1280		rlxval = c;
1281		return CHAR;
1282	case '\\':
1283		rlxval = quoted(&prestr);
1284		return CHAR;
1285	default:
1286		rlxval = c;
1287		return CHAR;
1288	case '[':
1289		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1290			FATAL("out of space in reg expr %.10s..", lastre);
1291		bp = buf;
1292		if (*prestr == '^') {
1293			cflag = 1;
1294			prestr++;
1295		}
1296		else
1297			cflag = 0;
1298		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1299		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1300			FATAL("out of space for reg expr %.10s...", lastre);
1301		for (; ; ) {
1302			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1303				for (i = 0; i < n; i++)
1304					*bp++ = *prestr++;
1305				continue;
1306			}
1307			if ((c = *prestr++) == '\\') {
1308				*bp++ = '\\';
1309				if ((c = *prestr++) == '\0')
1310					FATAL("nonterminated character class %.20s...", lastre);
1311				*bp++ = c;
1312			/* } else if (c == '\n') { */
1313			/* 	FATAL("newline in character class %.20s...", lastre); */
1314			} else if (c == '[' && *prestr == ':') {
1315				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1316				for (cc = charclasses; cc->cc_name; cc++)
1317					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1318						break;
1319				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1320				    prestr[2 + cc->cc_namelen] == ']') {
1321					prestr += cc->cc_namelen + 3;
1322					/*
1323					 * BUG: We begin at 1, instead of 0, since we
1324					 * would otherwise prematurely terminate the
1325					 * string for classes like [[:cntrl:]]. This
1326					 * means that we can't match the NUL character,
1327					 * not without first adapting the entire
1328					 * program to track each string's length.
1329					 */
1330					for (i = 1; i <= UCHAR_MAX; i++) {
1331						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1332						    FATAL("out of space for reg expr %.10s...", lastre);
1333						if (cc->cc_func(i)) {
1334							/* escape backslash */
1335							if (i == '\\') {
1336								*bp++ = '\\';
1337								n++;
1338							}
1339
1340							*bp++ = i;
1341							n++;
1342						}
1343					}
1344				} else
1345					*bp++ = c;
1346			} else if (c == '[' && *prestr == '.') {
1347				char collate_char;
1348				prestr++;
1349				collate_char = *prestr++;
1350				if (*prestr == '.' && prestr[1] == ']') {
1351					prestr += 2;
1352					/* Found it: map via locale TBD: for
1353					   now, simply return this char.  This
1354					   is sufficient to pass conformance
1355					   test awk.ex 156
1356					 */
1357					if (*prestr == ']') {
1358						prestr++;
1359						rlxval = collate_char;
1360						return CHAR;
1361					}
1362				}
1363			} else if (c == '[' && *prestr == '=') {
1364				char equiv_char;
1365				prestr++;
1366				equiv_char = *prestr++;
1367				if (*prestr == '=' && prestr[1] == ']') {
1368					prestr += 2;
1369					/* Found it: map via locale TBD: for now
1370					   simply return this char. This is
1371					   sufficient to pass conformance test
1372					   awk.ex 156
1373					 */
1374					if (*prestr == ']') {
1375						prestr++;
1376						rlxval = equiv_char;
1377						return CHAR;
1378					}
1379				}
1380			} else if (c == '\0') {
1381				FATAL("nonterminated character class %.20s", lastre);
1382			} else if (bp == buf) {	/* 1st char is special */
1383				*bp++ = c;
1384			} else if (c == ']') {
1385				*bp++ = 0;
1386				rlxstr = (uschar *) tostring((char *) buf);
1387				if (cflag == 0)
1388					return CCL;
1389				else
1390					return NCCL;
1391			} else
1392				*bp++ = c;
1393		}
1394		break;
1395	case '{':
1396		if (isdigit((int) *(prestr))) {
1397			num = 0;	/* Process as a repetition */
1398			n = -1; m = -1;
1399			commafound = false;
1400			digitfound = false;
1401			startreptok = prestr-1;
1402			/* Remember start of previous atom here ? */
1403		} else {        	/* just a { char, not a repetition */
1404			rlxval = c;
1405			return CHAR;
1406                }
1407		for (; ; ) {
1408			if ((c = *prestr++) == '}') {
1409				if (commafound) {
1410					if (digitfound) { /* {n,m} */
1411						m = num;
1412						if (m < n)
1413							FATAL("illegal repetition expression: class %.20s",
1414								lastre);
1415						if (n == 0 && m == 1) {
1416							return QUEST;
1417						}
1418					} else {	/* {n,} */
1419						if (n == 0)
1420							return STAR;
1421						else if (n == 1)
1422							return PLUS;
1423					}
1424				} else {
1425					if (digitfound) { /* {n} same as {n,n} */
1426						n = num;
1427						m = num;
1428					} else {	/* {} */
1429						FATAL("illegal repetition expression: class %.20s",
1430							lastre);
1431					}
1432				}
1433				if (repeat(starttok, prestr-starttok, lastatom,
1434					   startreptok - lastatom, n, m) > 0) {
1435					if (n == 0 && m == 0) {
1436						return ZERO;
1437					}
1438					/* must rescan input for next token */
1439					goto rescan;
1440				}
1441				/* Failed to replace: eat up {...} characters
1442				   and treat like just PLUS */
1443				return PLUS;
1444			} else if (c == '\0') {
1445				FATAL("nonterminated character class %.20s",
1446					lastre);
1447			} else if (isdigit(c)) {
1448				num = 10 * num + c - '0';
1449				digitfound = true;
1450			} else if (c == ',') {
1451				if (commafound)
1452					FATAL("illegal repetition expression: class %.20s",
1453						lastre);
1454				/* looking for {n,} or {n,m} */
1455				commafound = true;
1456				n = num;
1457				digitfound = false; /* reset */
1458				num = 0;
1459			} else {
1460				FATAL("illegal repetition expression: class %.20s",
1461					lastre);
1462			}
1463		}
1464		break;
1465	}
1466}
1467
1468int cgoto(fa *f, int s, int c)
1469{
1470	int *p, *q;
1471	int i, j, k;
1472
1473	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1474	while (f->accept >= maxsetvec) {	/* guessing here! */
1475		resizesetvec(__func__);
1476	}
1477	for (i = 0; i <= f->accept; i++)
1478		setvec[i] = 0;
1479	setcnt = 0;
1480	resize_state(f, s);
1481	/* compute positions of gototab[s,c] into setvec */
1482	p = f->posns[s];
1483	for (i = 1; i <= *p; i++) {
1484		if ((k = f->re[p[i]].ltype) != FINAL) {
1485			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1486			 || (k == DOT && c != 0 && c != HAT)
1487			 || (k == ALL && c != 0)
1488			 || (k == EMPTYRE && c != 0)
1489			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1490			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1491				q = f->re[p[i]].lfollow;
1492				for (j = 1; j <= *q; j++) {
1493					if (q[j] >= maxsetvec) {
1494						resizesetvec(__func__);
1495					}
1496					if (setvec[q[j]] == 0) {
1497						setcnt++;
1498						setvec[q[j]] = 1;
1499					}
1500				}
1501			}
1502		}
1503	}
1504	/* determine if setvec is a previous state */
1505	tmpset[0] = setcnt;
1506	j = 1;
1507	for (i = f->accept; i >= 0; i--)
1508		if (setvec[i]) {
1509			tmpset[j++] = i;
1510		}
1511	resize_state(f, f->curstat > s ? f->curstat : s);
1512	/* tmpset == previous state? */
1513	for (i = 1; i <= f->curstat; i++) {
1514		p = f->posns[i];
1515		if ((k = tmpset[0]) != p[0])
1516			goto different;
1517		for (j = 1; j <= k; j++)
1518			if (tmpset[j] != p[j])
1519				goto different;
1520		/* setvec is state i */
1521		if (c != HAT)
1522			set_gototab(f, s, c, i);
1523		return i;
1524	  different:;
1525	}
1526
1527	/* add tmpset to current set of states */
1528	++(f->curstat);
1529	resize_state(f, f->curstat);
1530	clear_gototab(f, f->curstat);
1531	xfree(f->posns[f->curstat]);
1532	p = intalloc(setcnt + 1, __func__);
1533
1534	f->posns[f->curstat] = p;
1535	if (c != HAT)
1536		set_gototab(f, s, c, f->curstat);
1537	for (i = 0; i <= setcnt; i++)
1538		p[i] = tmpset[i];
1539	if (setvec[f->accept])
1540		f->out[f->curstat] = 1;
1541	else
1542		f->out[f->curstat] = 0;
1543	return f->curstat;
1544}
1545
1546
1547void freefa(fa *f)	/* free a finite automaton */
1548{
1549	int i;
1550
1551	if (f == NULL)
1552		return;
1553	for (i = 0; i < f->state_count; i++)
1554		xfree(f->gototab[i].entries);
1555	xfree(f->gototab);
1556	for (i = 0; i <= f->curstat; i++)
1557		xfree(f->posns[i]);
1558	for (i = 0; i <= f->accept; i++) {
1559		xfree(f->re[i].lfollow);
1560		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1561			xfree(f->re[i].lval.np);
1562	}
1563	xfree(f->restr);
1564	xfree(f->out);
1565	xfree(f->posns);
1566	xfree(f->gototab);
1567	xfree(f);
1568}
1569