1254225Speter/*	$NetBSD: regex2.h,v 1.5 2011/11/23 15:43:39 tnozaki Exp $ */
2254225Speter
3254225Speter/*-
4254225Speter * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5254225Speter * Copyright (c) 1992, 1993, 1994
6254225Speter *	The Regents of the University of California.  All rights reserved.
7254225Speter *
8254225Speter * This code is derived from software contributed to Berkeley by
9254225Speter * Henry Spencer of the University of Toronto.
10254225Speter *
11254225Speter * Redistribution and use in source and binary forms, with or without
12254225Speter * modification, are permitted provided that the following conditions
13254225Speter * are met:
14254225Speter * 1. Redistributions of source code must retain the above copyright
15254225Speter *    notice, this list of conditions and the following disclaimer.
16254225Speter * 2. Redistributions in binary form must reproduce the above copyright
17254225Speter *    notice, this list of conditions and the following disclaimer in the
18254225Speter *    documentation and/or other materials provided with the distribution.
19254225Speter * 3. All advertising materials mentioning features or use of this software
20254225Speter *    must display the following acknowledgement:
21254225Speter *	This product includes software developed by the University of
22254225Speter *	California, Berkeley and its contributors.
23254225Speter * 4. Neither the name of the University nor the names of its contributors
24254225Speter *    may be used to endorse or promote products derived from this software
25254225Speter *    without specific prior written permission.
26254225Speter *
27254225Speter * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28254225Speter * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29254225Speter * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30254225Speter * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31254225Speter * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32254225Speter * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33254225Speter * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34254225Speter * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35254225Speter * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36254225Speter * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37254225Speter * SUCH DAMAGE.
38254225Speter *
39254225Speter *	@(#)regex2.h	8.3 (Berkeley) 3/16/94
40254225Speter */
41254225Speter
42254225Speter/*
43254225Speter * First, the stuff that ends up in the outside-world include file
44254225Speter = typedef off_t regoff_t;
45254225Speter = typedef struct {
46254225Speter = 	int re_magic;
47254225Speter = 	size_t re_nsub;		// number of parenthesized subexpressions
48254225Speter = 	const char *re_endp;	// end pointer for REG_PEND
49254225Speter = 	struct re_guts *re_g;	// none of your business :-)
50254225Speter = } regex_t;
51254225Speter = typedef struct {
52254225Speter = 	regoff_t rm_so;		// start of match
53254225Speter = 	regoff_t rm_eo;		// end of match
54254225Speter = } regmatch_t;
55254225Speter */
56254225Speter/*
57254225Speter * internals of regex_t
58254225Speter */
59254225Speter#define	MAGIC1	((('r'^0200)<<8) | 'e')
60254225Speter
61254225Speter/*
62254225Speter * The internal representation is a *strip*, a sequence of
63254225Speter * operators ending with an endmarker.  (Some terminology etc. is a
64254225Speter * historical relic of earlier versions which used multiple strips.)
65254225Speter * Certain oddities in the representation are there to permit running
66254225Speter * the machinery backwards; in particular, any deviation from sequential
67254225Speter * flow must be marked at both its source and its destination.  Some
68254225Speter * fine points:
69254225Speter *
70254225Speter * - OPLUS_ and O_PLUS are *inside* the loop they create.
71254225Speter * - OQUEST_ and O_QUEST are *outside* the bypass they create.
72254225Speter * - OCH_ and O_CH are *outside* the multi-way branch they create, while
73254225Speter *   OOR1 and OOR2 are respectively the end and the beginning of one of
74254225Speter *   the branches.  Note that there is an implicit OOR2 following OCH_
75254225Speter *   and an implicit OOR1 preceding O_CH.
76254225Speter *
77254225Speter * In state representations, an operator's bit is on to signify a state
78254225Speter * immediately *preceding* "execution" of that operator.
79254225Speter */
80254225Spetertypedef char sop;	/* strip operator */
81254225Spetertypedef size_t sopno;
82254225Speter/* operators			   meaning	operand			*/
83254225Speter/*						(back, fwd are offsets)	*/
84254225Speter#define	OEND	(1)		/* endmarker	-			*/
85254225Speter#define	OCHAR	(2)		/* character	unsigned char		*/
86254225Speter#define	OBOL	(3)		/* left anchor	-			*/
87254225Speter#define	OEOL	(4)		/* right anchor	-			*/
88254225Speter#define	OANY	(5)		/* .		-			*/
89254225Speter#define	OANYOF	(6)		/* [...]	set number		*/
90254225Speter#define	OBACK_	(7)		/* begin \d	paren number		*/
91254225Speter#define	O_BACK	(8)		/* end \d	paren number		*/
92254225Speter#define	OPLUS_	(9)		/* + prefix	fwd to suffix		*/
93254225Speter#define	O_PLUS	(10)		/* + suffix	back to prefix		*/
94254225Speter#define	OQUEST_	(11)		/* ? prefix	fwd to suffix		*/
95254225Speter#define	O_QUEST	(12)		/* ? suffix	back to prefix		*/
96254225Speter#define	OLPAREN	(13)		/* (		fwd to )		*/
97254225Speter#define	ORPAREN	(14)		/* )		back to (		*/
98254225Speter#define	OCH_	(15)		/* begin choice	fwd to OOR2		*/
99254225Speter#define	OOR1	(16)		/* | pt. 1	back to OOR1 or OCH_	*/
100254225Speter#define	OOR2	(17)		/* | pt. 2	fwd to OOR2 or O_CH	*/
101254225Speter#define	O_CH	(18)		/* end choice	back to OOR1		*/
102254225Speter#define	OBOW	(19)		/* begin word	-			*/
103254225Speter#define	OEOW	(20)		/* end word	-			*/
104254225Speter
105254225Speter/*
106254225Speter * Structure for [] character-set representation.  Character sets are
107254225Speter * done as bit vectors, grouped 8 to a byte vector for compactness.
108254225Speter * The individual set therefore has both a pointer to the byte vector
109254225Speter * and a mask to pick out the relevant bit of each byte.  A hash code
110254225Speter * simplifies testing whether two sets could be identical.
111254225Speter *
112254225Speter * This will get trickier for multicharacter collating elements.  As
113254225Speter * preliminary hooks for dealing with such things, we also carry along
114254225Speter * a string of multi-character elements, and decide the size of the
115254225Speter * vectors at run time.
116254225Speter */
117254225Spetertypedef struct {
118254225Speter	uch *ptr;		/* -> uch [csetsize] */
119254225Speter	uch mask;		/* bit within array */
120254225Speter	uch hash;		/* hash code */
121254225Speter	size_t smultis;
122254225Speter	char *multis;		/* -> char[smulti]  ab\0cd\0ef\0\0 */
123254225Speter} cset;
124254225Speter/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
125254225Speter#define	CHadd(cs, c)	((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
126254225Speter#define	CHsub(cs, c)	((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
127254225Speter#define	CHIN(cs, c)	((cs)->ptr[(uch)(c)] & (cs)->mask)
128254225Speter#define	MCadd(p, cs, cp)	mcadd(p, cs, cp)	/* regcomp() internal fns */
129254225Speter#define	MCsub(p, cs, cp)	mcsub(p, cs, cp)
130254225Speter#define	MCin(p, cs, cp)	mcin(p, cs, cp)
131254225Speter
132254225Speter/* stuff for character categories */
133254225Spetertypedef RCHAR_T cat_t;
134254225Speter
135254225Speter/*
136254225Speter * main compiled-expression structure
137254225Speter */
138254225Speterstruct re_guts {
139254225Speter	int magic;
140254225Speter#		define	MAGIC2	((('R'^0200)<<8)|'E')
141254225Speter	sop *strip;		/* malloced area for strip */
142254225Speter	RCHAR_T *stripdata;	/* malloced area for stripdata */
143254225Speter	size_t csetsize;	/* number of bits in a cset vector */
144254225Speter	size_t ncsets;		/* number of csets in use */
145254225Speter	cset *sets;		/* -> cset [ncsets] */
146254225Speter	uch *setbits;		/* -> uch[csetsize][ncsets/CHAR_BIT] */
147254225Speter	int cflags;		/* copy of regcomp() cflags argument */
148254225Speter	sopno nstates;		/* = number of sops */
149254225Speter	sopno firststate;	/* the initial OEND (normally 0) */
150254225Speter	sopno laststate;	/* the final OEND */
151254225Speter	int iflags;		/* internal flags */
152254225Speter#		define	USEBOL	01	/* used ^ */
153254225Speter#		define	USEEOL	02	/* used $ */
154254225Speter#		define	BAD	04	/* something wrong */
155254225Speter	size_t nbol;		/* number of ^ used */
156254225Speter	size_t neol;		/* number of $ used */
157254225Speter#if 0
158254225Speter	size_t ncategories;	/* how many character categories */
159254225Speter	cat_t *categories;	/* ->catspace[-CHAR_MIN] */
160254225Speter#endif
161254225Speter	RCHAR_T *must;		/* match must contain this string */
162254225Speter	size_t mlen;		/* length of must */
163254225Speter	size_t nsub;		/* copy of re_nsub */
164254225Speter	int backrefs;		/* does it use back references? */
165254225Speter	sopno nplus;		/* how deep does it nest +s? */
166254225Speter	/* catspace must be last */
167254225Speter#if 0
168254225Speter	cat_t catspace[1];	/* actually [NC] */
169254225Speter#endif
170254225Speter};
171254225Speter
172254225Speter/* misc utilities */
173254225Speter#define OUT	REOF	/* a non-character value */
174254225Speter#define	ISWORD(c) ((c) == '_' || (ISGRAPH((UCHAR_T)c) && !ISPUNCT((UCHAR_T)c)))
175