regexp.c revision 293290
1/*
2 * Copyright (c) 1980, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32
33__FBSDID("$FreeBSD: stable/10/usr.bin/vgrind/regexp.c 293290 2016-01-07 00:40:51Z bdrewery $");
34
35#ifndef lint
36static const char copyright[] =
37"@(#) Copyright (c) 1980, 1993\n\
38	The Regents of the University of California.  All rights reserved.\n";
39#endif
40
41#ifndef lint
42static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
43#endif
44
45#include <ctype.h>
46#include <stdlib.h>
47#include <stdbool.h>
48#include <string.h>
49
50#include "extern.h"
51
52static void	expconv(void);
53
54bool	 _escaped;	/* true if we are currently x_escaped */
55char	*s_start;	/* start of string */
56bool	 l_onecase;	/* true if upper and lower equivalent */
57
58#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
59
60/*  STRNCMP -	like strncmp except that we convert the
61 *	 	first string to lower case before comparing
62 *		if l_onecase is set.
63 */
64
65int
66STRNCMP(register char *s1, register char *s2, register int len)
67{
68	if (l_onecase) {
69	    do
70		if (*s2 - makelower(*s1))
71			return (*s2 - makelower(*s1));
72		else {
73			s2++;
74			s1++;
75		}
76	    while (--len);
77	} else {
78	    do
79		if (*s2 - *s1)
80			return (*s2 - *s1);
81		else {
82			s2++;
83			s1++;
84		}
85	    while (--len);
86	}
87	return(0);
88}
89
90/*	The following routine converts an irregular expression to
91 *	internal format.
92 *
93 *	Either meta symbols (\a \d or \p) or character strings or
94 *	operations ( alternation or perenthesizing ) can be
95 *	specified.  Each starts with a descriptor byte.  The descriptor
96 *	byte has STR set for strings, META set for meta symbols
97 *	and OPER set for operations.
98 *	The descriptor byte can also have the OPT bit set if the object
99 *	defined is optional.  Also ALT can be set to indicate an alternation.
100 *
101 *	For metasymbols the byte following the descriptor byte identities
102 *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
103 *	strings the byte after the descriptor is a character count for
104 *	the string:
105 *
106 *		meta symbols := descriptor
107 *				symbol
108 *
109 *		strings :=	descriptor
110 *				character count
111 *				the string
112 *
113 *		operatins :=	descriptor
114 *				symbol
115 *				character count
116 */
117
118/*
119 *  handy macros for accessing parts of match blocks
120 */
121#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
122#define MNEXT(A) (A+2)		/* character following a metasymbol block */
123
124#define OSYM(A) (*(A+1))	/* symbol in an operation block */
125#define OCNT(A) (*(A+2))	/* character count */
126#define ONEXT(A) (A+3)		/* next character after the operation */
127#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
128
129#define SCNT(A) (*(A+1))	/* byte count of a string */
130#define SSTR(A) (A+2)		/* address of the string */
131#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
132
133/*
134 *  bit flags in the descriptor
135 */
136#define OPT 1
137#define STR 2
138#define META 4
139#define ALT 8
140#define OPER 16
141
142static char *ccre;	/* pointer to current position in converted exp*/
143static char *ure;	/* pointer current position in unconverted exp */
144
145/* re: unconverted irregular expression */
146char *
147convexp(char *re)
148{
149    register char *cre;		/* pointer to converted regular expression */
150
151    /* allocate room for the converted expression */
152    if (re == NULL)
153	return (NULL);
154    if (*re == '\0')
155	return (NULL);
156    cre = malloc(4 * strlen(re) + 3);
157    ccre = cre;
158    ure = re;
159
160    /* start the conversion with a \a */
161    *cre = META | OPT;
162    MSYM(cre) = 'a';
163    ccre = MNEXT(cre);
164
165    /* start the conversion (its recursive) */
166    expconv ();
167    *ccre = 0;
168    return (cre);
169}
170
171static void
172expconv()
173{
174    register char *cs;		/* pointer to current symbol in converted exp */
175    register char c;		/* character being processed */
176    register char *acs;		/* pinter to last alternate */
177    register int temp;
178
179    /* let the conversion begin */
180    acs = NULL;
181    cs = NULL;
182    while (*ure) {
183	switch (c = *ure++) {
184
185	case '\\':
186	    switch (c = *ure++) {
187
188	    /* escaped characters are just characters */
189	    default:
190		if (cs == NULL || (*cs & STR) == 0) {
191		    cs = ccre;
192		    *cs = STR;
193		    SCNT(cs) = 1;
194		    ccre += 2;
195		} else
196		    SCNT(cs)++;
197		*ccre++ = c;
198		break;
199
200	    /* normal(?) metacharacters */
201	    case 'a':
202	    case 'd':
203	    case 'e':
204	    case 'p':
205		if (acs != NULL && acs != cs) {
206		    do {
207			temp = OCNT(acs);
208			OCNT(acs) = ccre - acs;
209			acs -= temp;
210		    } while (temp != 0);
211		    acs = NULL;
212		}
213		cs = ccre;
214		*cs = META;
215		MSYM(cs) = c;
216		ccre = MNEXT(cs);
217		break;
218	    }
219	    break;
220
221	/* just put the symbol in */
222	case '^':
223	case '$':
224	    if (acs != NULL && acs != cs) {
225		do {
226		    temp = OCNT(acs);
227		    OCNT(acs) = ccre - acs;
228		    acs -= temp;
229		} while (temp != 0);
230		acs = NULL;
231	    }
232	    cs = ccre;
233	    *cs = META;
234	    MSYM(cs) = c;
235	    ccre = MNEXT(cs);
236	    break;
237
238	/* mark the last match sequence as optional */
239	case '?':
240	    if (cs)
241	    	*cs = *cs | OPT;
242	    break;
243
244	/* recurse and define a subexpression */
245	case '(':
246	    if (acs != NULL && acs != cs) {
247		do {
248		    temp = OCNT(acs);
249		    OCNT(acs) = ccre - acs;
250		    acs -= temp;
251		} while (temp != 0);
252		acs = NULL;
253	    }
254	    cs = ccre;
255	    *cs = OPER;
256	    OSYM(cs) = '(';
257	    ccre = ONEXT(cs);
258	    expconv();
259	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
260	    break;
261
262	/* return from a recursion */
263	case ')':
264	    if (acs != NULL) {
265		do {
266		    temp = OCNT(acs);
267		    OCNT(acs) = ccre - acs;
268		    acs -= temp;
269		} while (temp != 0);
270		acs = NULL;
271	    }
272	    cs = ccre;
273	    *cs = META;
274	    MSYM(cs) = c;
275	    ccre = MNEXT(cs);
276	    return;
277
278	/* mark the last match sequence as having an alternate */
279	/* the third byte will contain an offset to jump over the */
280	/* alternate match in case the first did not fail */
281	case '|':
282	    if (acs != NULL && acs != cs)
283		OCNT(ccre) = ccre - acs;	/* make a back pointer */
284	    else
285		OCNT(ccre) = 0;
286	    *cs |= ALT;
287	    cs = ccre;
288	    *cs = OPER;
289	    OSYM(cs) = '|';
290	    ccre = ONEXT(cs);
291	    acs = cs;	/* remember that the pointer is to be filles */
292	    break;
293
294	/* if its not a metasymbol just build a scharacter string */
295	default:
296	    if (cs == NULL || (*cs & STR) == 0) {
297		cs = ccre;
298		*cs = STR;
299		SCNT(cs) = 1;
300		ccre = SSTR(cs);
301	    } else
302		SCNT(cs)++;
303	    *ccre++ = c;
304	    break;
305	}
306    }
307    if (acs != NULL) {
308	do {
309	    temp = OCNT(acs);
310	    OCNT(acs) = ccre - acs;
311	    acs -= temp;
312	} while (temp != 0);
313	acs = NULL;
314    }
315    return;
316}
317/* end of convertre */
318
319
320/*
321 *	The following routine recognises an irregular expression
322 *	with the following special characters:
323 *
324 *		\?	-	means last match was optional
325 *		\a	-	matches any number of characters
326 *		\d	-	matches any number of spaces and tabs
327 *		\p	-	matches any number of alphanumeric
328 *				characters. The
329 *				characters matched will be copied into
330 *				the area pointed to by 'name'.
331 *		\|	-	alternation
332 *		\( \)	-	grouping used mostly for alternation and
333 *				optionality
334 *
335 *	The irregular expression must be translated to internal form
336 *	prior to calling this routine
337 *
338 *	The value returned is the pointer to the first non \a
339 *	character matched.
340 */
341
342/*
343 *  s: string to check for a match in
344 *  re: a converted irregular expression
345 *  mstring: where to put whatever matches a \p
346 */
347char *
348expmatch (register char *s, register char *re, register char *mstring)
349{
350    register char *cs;		/* the current symbol */
351    register char *ptr,*s1;	/* temporary pointer */
352    bool matched;	/* a temporary bool */
353
354    /* initial conditions */
355    if (re == NULL)
356	return (NULL);
357    cs = re;
358    matched = false;
359
360    /* loop till expression string is exhausted (or at least pretty tired) */
361    while (*cs) {
362	switch (*cs & (OPER | STR | META)) {
363
364	/* try to match a string */
365	case STR:
366	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
367	    if (matched) {
368
369		/* hoorah it matches */
370		s += SCNT(cs);
371		cs = SNEXT(cs);
372	    } else if (*cs & ALT) {
373
374		/* alternation, skip to next expression */
375		cs = SNEXT(cs);
376	    } else if (*cs & OPT) {
377
378		/* the match is optional */
379		cs = SNEXT(cs);
380		matched = 1;		/* indicate a successful match */
381	    } else {
382
383		/* no match, error return */
384		return (NULL);
385	    }
386	    break;
387
388	/* an operator, do something fancy */
389	case OPER:
390	    switch (OSYM(cs)) {
391
392	    /* this is an alternation */
393	    case '|':
394		if (matched)
395
396		    /* last thing in the alternation was a match, skip ahead */
397		    cs = OPTR(cs);
398		else
399
400		    /* no match, keep trying */
401		    cs = ONEXT(cs);
402		break;
403
404	    /* this is a grouping, recurse */
405	    case '(':
406		ptr = expmatch(s, ONEXT(cs), mstring);
407		if (ptr != NULL) {
408
409		    /* the subexpression matched */
410		    matched = 1;
411		    s = ptr;
412		} else if (*cs & ALT) {
413
414		    /* alternation, skip to next expression */
415		    matched = 0;
416		} else if (*cs & OPT) {
417
418		    /* the match is optional */
419		    matched = 1;	/* indicate a successful match */
420		} else {
421
422		    /* no match, error return */
423		    return (NULL);
424		}
425		cs = OPTR(cs);
426		break;
427	    }
428	    break;
429
430	/* try to match a metasymbol */
431	case META:
432	    switch (MSYM(cs)) {
433
434	    /* try to match anything and remember what was matched */
435	    case 'p':
436		/*
437		 *  This is really the same as trying the match the
438		 *  remaining parts of the expression to any subset
439		 *  of the string.
440		 */
441		s1 = s;
442		do {
443		    ptr = expmatch(s1, MNEXT(cs), mstring);
444		    if (ptr != NULL && s1 != s) {
445
446			/* we have a match, remember the match */
447			strncpy (mstring, s, s1 - s);
448			mstring[s1 - s] = '\0';
449			return (ptr);
450		    } else if (ptr != NULL && (*cs & OPT)) {
451
452			/* it was aoptional so no match is ok */
453			return (ptr);
454		    } else if (ptr != NULL) {
455
456			/* not optional and we still matched */
457			return (NULL);
458		    }
459		    if (!(isalnum(*s1) || *s1 == '_' ||
460			  /* C++ destructor */
461			  *s1 == '~' ||
462			  /* C++ scope operator */
463			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
464			   (s1++, true))))
465			return (NULL);
466		    if (*s1 == '\\')
467			_escaped = _escaped ? false : true;
468		    else
469			_escaped = false;
470		} while (*s1++);
471		return (NULL);
472
473	    /* try to match anything */
474	    case 'a':
475		/*
476		 *  This is really the same as trying the match the
477		 *  remaining parts of the expression to any subset
478		 *  of the string.
479		 */
480		s1 = s;
481		do {
482		    ptr = expmatch(s1, MNEXT(cs), mstring);
483		    if (ptr != NULL && s1 != s) {
484
485			/* we have a match */
486			return (ptr);
487		    } else if (ptr != NULL && (*cs & OPT)) {
488
489			/* it was aoptional so no match is ok */
490			return (ptr);
491		    } else if (ptr != NULL) {
492
493			/* not optional and we still matched */
494			return (NULL);
495		    }
496		    if (*s1 == '\\')
497			_escaped = _escaped ? false : true;
498		    else
499			_escaped = false;
500		} while (*s1++);
501		return (NULL);
502
503	    /* fail if we are currently _escaped */
504	    case 'e':
505		if (_escaped)
506		    return(NULL);
507		cs = MNEXT(cs);
508		break;
509
510	    /* match any number of tabs and spaces */
511	    case 'd':
512		ptr = s;
513		while (*s == ' ' || *s == '\t')
514		    s++;
515		if (s != ptr || s == s_start) {
516
517		    /* match, be happy */
518		    matched = 1;
519		    cs = MNEXT(cs);
520		} else if (*s == '\n' || *s == '\0') {
521
522		    /* match, be happy */
523		    matched = 1;
524		    cs = MNEXT(cs);
525		} else if (*cs & ALT) {
526
527		    /* try the next part */
528		    matched = 0;
529		    cs = MNEXT(cs);
530		} else if (*cs & OPT) {
531
532		    /* doesn't matter */
533		    matched = 1;
534		    cs = MNEXT(cs);
535		} else
536
537		    /* no match, error return */
538		    return (NULL);
539		break;
540
541	    /* check for end of line */
542	    case '$':
543		if (*s == '\0' || *s == '\n') {
544
545		    /* match, be happy */
546		    s++;
547		    matched = 1;
548		    cs = MNEXT(cs);
549		} else if (*cs & ALT) {
550
551		    /* try the next part */
552		    matched = 0;
553		    cs = MNEXT(cs);
554		} else if (*cs & OPT) {
555
556		    /* doesn't matter */
557		    matched = 1;
558		    cs = MNEXT(cs);
559		} else
560
561		    /* no match, error return */
562		    return (NULL);
563		break;
564
565	    /* check for start of line */
566	    case '^':
567		if (s == s_start) {
568
569		    /* match, be happy */
570		    matched = 1;
571		    cs = MNEXT(cs);
572		} else if (*cs & ALT) {
573
574		    /* try the next part */
575		    matched = 0;
576		    cs = MNEXT(cs);
577		} else if (*cs & OPT) {
578
579		    /* doesn't matter */
580		    matched = 1;
581		    cs = MNEXT(cs);
582		} else
583
584		    /* no match, error return */
585		    return (NULL);
586		break;
587
588	    /* end of a subexpression, return success */
589	    case ')':
590		return (s);
591	    }
592	    break;
593	}
594    }
595    return (s);
596}
597