/*
 * Copyright (C) 2013  Internet Systems Consortium, Inc. ("ISC")
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 */
16253592Serwin
17253592Serwin#include <config.h>
18253592Serwin
19253592Serwin#include <isc/file.h>
20253592Serwin#include <isc/regex.h>
21253592Serwin#include <isc/string.h>
22253592Serwin
/*
 * FAIL(x) abandons the current function by jumping to its 'error' label.
 * When VALREGEX_REPORT_REASON is non-zero the message 'x' is first stored
 * in the local variable 'reason'; otherwise 'x' is discarded.  Both
 * variants expand to exactly one statement, so FAIL() can be used as the
 * unbraced body of an 'if'.
 */
#if VALREGEX_REPORT_REASON
#define FAIL(x) do { reason = (x); goto error; } while (0)
#else
#define FAIL(x) do { goto error; } while (0)
#endif
28253592Serwin
29253592Serwin/*
30253592Serwin * Validate the regular expression 'C' locale.
31253592Serwin */
32253592Serwinint
33253592Serwinisc_regex_validate(const char *c) {
34253592Serwin	enum {
35253592Serwin		none, parse_bracket, parse_bound,
36253592Serwin		parse_ce, parse_ec, parse_cc
37253592Serwin	} state = none;
38253592Serwin	/* Well known character classes. */
39253592Serwin	const char *cc[] = {
40253592Serwin		":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
41253592Serwin		":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
42253592Serwin		":print:", ":xdigit:"
43253592Serwin	};
44253592Serwin	isc_boolean_t seen_comma = ISC_FALSE;
45253592Serwin	isc_boolean_t seen_high = ISC_FALSE;
46253592Serwin	isc_boolean_t seen_char = ISC_FALSE;
47253592Serwin	isc_boolean_t seen_ec = ISC_FALSE;
48253592Serwin	isc_boolean_t seen_ce = ISC_FALSE;
49253592Serwin	isc_boolean_t have_atom = ISC_FALSE;
50253592Serwin	int group = 0;
51253592Serwin	int range = 0;
52253592Serwin	int sub = 0;
53253592Serwin	isc_boolean_t empty_ok = ISC_FALSE;
54253592Serwin	isc_boolean_t neg = ISC_FALSE;
55253592Serwin	isc_boolean_t was_multiple = ISC_FALSE;
56253592Serwin	unsigned int low = 0;
57253592Serwin	unsigned int high = 0;
58253592Serwin	const char *ccname = NULL;
59253592Serwin	int range_start = 0;
60253592Serwin#if VALREGEX_REPORT_REASON
61253592Serwin	const char *reason = "";
62253592Serwin#endif
63253592Serwin
64253592Serwin	if (c == NULL || *c == 0)
65253592Serwin		FAIL("empty string");
66253592Serwin
67253592Serwin	while (c != NULL && *c != 0) {
68253592Serwin		switch (state) {
69253592Serwin		case none:
70253592Serwin			switch (*c) {
71253592Serwin			case '\\':	/* make literal */
72253592Serwin				++c;
73253592Serwin				switch (*c) {
74253592Serwin				case '1': case '2': case '3':
75253592Serwin				case '4': case '5': case '6':
76253592Serwin				case '7': case '8': case '9':
77253592Serwin					if ((*c - '0') > sub)
78253592Serwin						FAIL("bad back reference");
79253592Serwin					have_atom = ISC_TRUE;
80253592Serwin					was_multiple = ISC_FALSE;
81253592Serwin					break;
82253592Serwin				case 0:
83253592Serwin					FAIL("escaped end-of-string");
84253592Serwin				default:
85253592Serwin					goto literal;
86253592Serwin				}
87253592Serwin				++c;
88253592Serwin				break;
89253592Serwin			case '[':	/* bracket start */
90253592Serwin				++c;
91253592Serwin				neg = ISC_FALSE;
92253592Serwin				was_multiple = ISC_FALSE;
93253592Serwin				seen_char = ISC_FALSE;
94253592Serwin				state = parse_bracket;
95253592Serwin				break;
96253592Serwin			case '{': 	/* bound start */
97253592Serwin				switch (c[1]) {
98253592Serwin				case '0': case '1': case '2': case '3':
99253592Serwin				case '4': case '5': case '6': case '7':
100253592Serwin				case '8': case '9':
101253592Serwin					if (!have_atom)
102253592Serwin						FAIL("no atom");
103253592Serwin					if (was_multiple)
104253592Serwin						FAIL("was multiple");
105253592Serwin					seen_comma = ISC_FALSE;
106253592Serwin					seen_high = ISC_FALSE;
107253592Serwin					low = high = 0;
108253592Serwin					state = parse_bound;
109253592Serwin					break;
110253592Serwin				default:
111253592Serwin					goto literal;
112253592Serwin				}
113253592Serwin				++c;
114253592Serwin				have_atom = ISC_TRUE;
115253592Serwin				was_multiple = ISC_TRUE;
116253592Serwin				break;
117253592Serwin			case '}':
118253592Serwin				goto literal;
119253592Serwin			case '(':	/* group start */
120253592Serwin				have_atom = ISC_FALSE;
121253592Serwin				was_multiple = ISC_FALSE;
122253592Serwin				empty_ok = ISC_TRUE;
123253592Serwin				++group;
124253592Serwin				++sub;
125253592Serwin				++c;
126253592Serwin				break;
127253592Serwin			case ')':	/* group end */
128253592Serwin				if (group && !have_atom && !empty_ok)
129253592Serwin					FAIL("empty alternative");
130253592Serwin				have_atom = ISC_TRUE;
131253592Serwin				was_multiple = ISC_FALSE;
132253592Serwin				if (group != 0)
133253592Serwin					--group;
134253592Serwin				++c;
135253592Serwin				break;
136253592Serwin			case '|':	/* alternative seperator */
137253592Serwin				if (!have_atom)
138253592Serwin					FAIL("no atom");
139253592Serwin				have_atom = ISC_FALSE;
140253592Serwin				empty_ok = ISC_FALSE;
141253592Serwin				was_multiple = ISC_FALSE;
142253592Serwin				++c;
143253592Serwin				break;
144253592Serwin			case '^':
145253592Serwin			case '$':
146253592Serwin				have_atom = ISC_TRUE;
147253592Serwin				was_multiple = ISC_TRUE;
148253592Serwin				++c;
149253592Serwin				break;
150253592Serwin			case '+':
151253592Serwin			case '*':
152253592Serwin			case '?':
153253592Serwin				if (was_multiple)
154253592Serwin					FAIL("was multiple");
155253592Serwin				if (!have_atom)
156253592Serwin					FAIL("no atom");
157253592Serwin				have_atom = ISC_TRUE;
158253592Serwin				was_multiple = ISC_TRUE;
159253592Serwin				++c;
160253592Serwin				break;
161253592Serwin			case '.':
162253592Serwin			default:
163253592Serwin			literal:
164253592Serwin				have_atom = ISC_TRUE;
165253592Serwin				was_multiple = ISC_FALSE;
166253592Serwin				++c;
167253592Serwin				break;
168253592Serwin			}
169253592Serwin			break;
170253592Serwin		case parse_bound:
171253592Serwin			switch (*c) {
172253592Serwin			case '0': case '1': case '2': case '3': case '4':
173253592Serwin			case '5': case '6': case '7': case '8': case '9':
174253592Serwin				if (!seen_comma) {
175253592Serwin					low = low * 10 + *c - '0';
176253592Serwin					if (low > 255)
177253592Serwin						FAIL("lower bound too big");
178253592Serwin				} else {
179253592Serwin					seen_high = ISC_TRUE;
180253592Serwin					high = high * 10 + *c - '0';
181253592Serwin					if (high > 255)
182253592Serwin						FAIL("upper bound too big");
183253592Serwin				}
184253592Serwin				++c;
185253592Serwin				break;
186253592Serwin			case ',':
187253592Serwin				if (seen_comma)
188253592Serwin					FAIL("multiple commas");
189253592Serwin				seen_comma = ISC_TRUE;
190253592Serwin				++c;
191253592Serwin				break;
192253592Serwin			default:
193253592Serwin			case '{':
194253592Serwin				FAIL("non digit/comma");
195253592Serwin			case '}':
196253592Serwin				if (seen_high && low > high)
197253592Serwin					FAIL("bad parse bound");
198253592Serwin				seen_comma = ISC_FALSE;
199253592Serwin				state = none;
200253592Serwin				++c;
201253592Serwin				break;
202253592Serwin			}
203253592Serwin			break;
204253592Serwin		case parse_bracket:
205253592Serwin			switch (*c) {
206253592Serwin			case '^':
207253592Serwin				if (seen_char || neg) goto inside;
208253592Serwin				neg = ISC_TRUE;
209253592Serwin				++c;
210253592Serwin				break;
211253592Serwin			case '-':
212253592Serwin				if (range == 2) goto inside;
213253592Serwin				if (!seen_char) goto inside;
214253592Serwin				if (range == 1)
215253592Serwin					FAIL("bad range");
216253592Serwin				range = 2;
217253592Serwin				++c;
218253592Serwin				break;
219253592Serwin			case '[':
220253592Serwin				++c;
221253592Serwin				switch (*c) {
222253592Serwin				case '.':	/* collating element */
223253592Serwin					if (range) --range;
224253592Serwin					++c;
225253592Serwin					state = parse_ce;
226253592Serwin					seen_ce = ISC_FALSE;
227253592Serwin					break;
228253592Serwin				case '=':	/* equivalence class */
229253592Serwin					if (range == 2)
230253592Serwin					    FAIL("equivalence class in range");
231253592Serwin					++c;
232253592Serwin					state = parse_ec;
233253592Serwin					seen_ec = ISC_FALSE;
234253592Serwin					break;
235253592Serwin				case ':':	/* character class */
236253592Serwin					if (range == 2)
237253592Serwin					      FAIL("character class in range");
238253592Serwin					ccname = c;
239253592Serwin					++c;
240253592Serwin					state = parse_cc;
241253592Serwin					break;
242253592Serwin				}
243253592Serwin				seen_char = ISC_TRUE;
244253592Serwin				break;
245253592Serwin			case ']':
246253592Serwin				if (!c[1] && !seen_char)
247253592Serwin					FAIL("unfinished brace");
248253592Serwin				if (!seen_char)
249253592Serwin					goto inside;
250253592Serwin				++c;
251253592Serwin				range = 0;
252253592Serwin				have_atom = ISC_TRUE;
253253592Serwin				state = none;
254253592Serwin				break;
255253592Serwin			default:
256253592Serwin			inside:
257253592Serwin				seen_char = ISC_TRUE;
258253592Serwin				if (range == 2 && *c < range_start)
259253592Serwin					FAIL("out of order range");
260253592Serwin				if (range != 0)
261253592Serwin					--range;
262253592Serwin				range_start = *c;
263253592Serwin				++c;
264253592Serwin				break;
265253592Serwin			};
266253592Serwin			break;
267253592Serwin		case parse_ce:
268253592Serwin			switch (*c) {
269253592Serwin			case '.':
270253592Serwin				++c;
271253592Serwin				switch (*c) {
272253592Serwin				case ']':
273253592Serwin					if (!seen_ce)
274253592Serwin						 FAIL("empty ce");
275253592Serwin					++c;
276253592Serwin					state = parse_bracket;
277253592Serwin					break;
278253592Serwin				default:
279253592Serwin					if (seen_ce)
280253592Serwin						range_start = 256;
281253592Serwin					else
282253592Serwin						range_start = '.';
283253592Serwin					seen_ce = ISC_TRUE;
284253592Serwin					break;
285253592Serwin				}
286253592Serwin				break;
287253592Serwin			default:
288253592Serwin				if (seen_ce)
289253592Serwin					range_start = 256;
290253592Serwin				else
291253592Serwin					range_start = *c;
292253592Serwin				seen_ce = ISC_TRUE;
293253592Serwin				++c;
294253592Serwin				break;
295253592Serwin			}
296253592Serwin			break;
297253592Serwin		case parse_ec:
298253592Serwin			switch (*c) {
299253592Serwin			case '=':
300253592Serwin				++c;
301253592Serwin				switch (*c) {
302253592Serwin				case ']':
303253592Serwin					if (!seen_ec)
304253592Serwin						FAIL("no ec");
305253592Serwin					++c;
306253592Serwin					state = parse_bracket;
307253592Serwin					break;
308253592Serwin				default:
309253592Serwin					seen_ec = ISC_TRUE;
310253592Serwin					break;
311253592Serwin				}
312253592Serwin				break;
313253592Serwin			default:
314253592Serwin				seen_ec = ISC_TRUE;
315253592Serwin				++c;
316253592Serwin				break;
317253592Serwin			}
318253592Serwin			break;
319253592Serwin		case parse_cc:
320253592Serwin			switch (*c) {
321253592Serwin			case ':':
322253592Serwin				++c;
323253592Serwin				switch (*c) {
324253592Serwin				case ']': {
325253592Serwin					unsigned int i;
326253592Serwin					isc_boolean_t found = ISC_FALSE;
327253592Serwin					for (i = 0;
328253592Serwin					     i < sizeof(cc)/sizeof(*cc);
329253592Serwin					     i++)
330253592Serwin					{
331253592Serwin						unsigned int len;
332253592Serwin						len = strlen(cc[i]);
333253592Serwin						if (len !=
334253592Serwin						    (unsigned int)(c - ccname))
335253592Serwin							continue;
336253592Serwin						if (strncmp(cc[i], ccname, len))
337253592Serwin							continue;
338253592Serwin						found = ISC_TRUE;
339253592Serwin					}
340253592Serwin					if (!found)
341253592Serwin						FAIL("unknown cc");
342253592Serwin					++c;
343253592Serwin					state = parse_bracket;
344253592Serwin					break;
345253592Serwin					}
346253592Serwin				default:
347253592Serwin					break;
348253592Serwin				}
349253592Serwin				break;
350253592Serwin			default:
351253592Serwin				++c;
352253592Serwin				break;
353253592Serwin			}
354253592Serwin			break;
355253592Serwin		}
356253592Serwin	}
357253592Serwin	if (group != 0)
358253592Serwin		FAIL("group open");
359253592Serwin	if (state != none)
360253592Serwin		FAIL("incomplete");
361253592Serwin	if (!have_atom)
362253592Serwin		FAIL("no atom");
363253592Serwin	return (sub);
364253592Serwin
365253592Serwin error:
366253592Serwin#if VALREGEX_REPORT_REASON
367253592Serwin	fprintf(stderr, "%s\n", reason);
368253592Serwin#endif
369253592Serwin	return (-1);
370253592Serwin}
371