1/*
2 * Copyright (c) 1980, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32
33__FBSDID("$FreeBSD$");
34
35#ifndef lint
36static const char copyright[] =
37"@(#) Copyright (c) 1980, 1993\n\
38	The Regents of the University of California.  All rights reserved.\n";
39#endif
40
41#ifndef lint
42static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
43#endif
44
45#include <ctype.h>
46#include <stdlib.h>
47#include <string.h>
48
49#include "extern.h"
50
51#define FALSE	0
52#define TRUE	!(FALSE)
53#define NIL	0
54
55static void	expconv(void);
56
57boolean	 _escaped;	/* true if we are currently _escaped */
58char	*s_start;	/* start of string */
59boolean	 l_onecase;	/* true if upper and lower equivalent */
60
61#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
62
63/*  STRNCMP -	like strncmp except that we convert the
64 *	 	first string to lower case before comparing
65 *		if l_onecase is set.
66 */
67
68int
69STRNCMP(s1, s2, len)
70	register char *s1,*s2;
71	register int len;
72{
73	if (l_onecase) {
74	    do
75		if (*s2 - makelower(*s1))
76			return (*s2 - makelower(*s1));
77		else {
78			s2++;
79			s1++;
80		}
81	    while (--len);
82	} else {
83	    do
84		if (*s2 - *s1)
85			return (*s2 - *s1);
86		else {
87			s2++;
88			s1++;
89		}
90	    while (--len);
91	}
92	return(0);
93}
94
95/*	The following routine converts an irregular expression to
96 *	internal format.
97 *
98 *	Either meta symbols (\a \d or \p) or character strings or
99 *	operations ( alternation or perenthesizing ) can be
100 *	specified.  Each starts with a descriptor byte.  The descriptor
101 *	byte has STR set for strings, META set for meta symbols
102 *	and OPER set for operations.
103 *	The descriptor byte can also have the OPT bit set if the object
104 *	defined is optional.  Also ALT can be set to indicate an alternation.
105 *
106 *	For metasymbols the byte following the descriptor byte identities
107 *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
108 *	strings the byte after the descriptor is a character count for
109 *	the string:
110 *
111 *		meta symbols := descriptor
112 *				symbol
113 *
114 *		strings :=	descriptor
115 *				character count
116 *				the string
117 *
118 *		operatins :=	descriptor
119 *				symbol
120 *				character count
121 */
122
123/*
124 *  handy macros for accessing parts of match blocks
125 */
126#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
127#define MNEXT(A) (A+2)		/* character following a metasymbol block */
128
129#define OSYM(A) (*(A+1))	/* symbol in an operation block */
130#define OCNT(A) (*(A+2))	/* character count */
131#define ONEXT(A) (A+3)		/* next character after the operation */
132#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
133
134#define SCNT(A) (*(A+1))	/* byte count of a string */
135#define SSTR(A) (A+2)		/* address of the string */
136#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
137
138/*
139 *  bit flags in the descriptor
140 */
141#define OPT 1
142#define STR 2
143#define META 4
144#define ALT 8
145#define OPER 16
146
147static char *ccre;	/* pointer to current position in converted exp*/
148static char *ure;	/* pointer current position in unconverted exp */
149
150char *
151convexp(re)
152    char *re;		/* unconverted irregular expression */
153{
154    register char *cre;		/* pointer to converted regular expression */
155
156    /* allocate room for the converted expression */
157    if (re == NIL)
158	return (NIL);
159    if (*re == '\0')
160	return (NIL);
161    cre = malloc (4 * strlen(re) + 3);
162    ccre = cre;
163    ure = re;
164
165    /* start the conversion with a \a */
166    *cre = META | OPT;
167    MSYM(cre) = 'a';
168    ccre = MNEXT(cre);
169
170    /* start the conversion (its recursive) */
171    expconv ();
172    *ccre = 0;
173    return (cre);
174}
175
176static void
177expconv()
178{
179    register char *cs;		/* pointer to current symbol in converted exp */
180    register char c;		/* character being processed */
181    register char *acs;		/* pinter to last alternate */
182    register int temp;
183
184    /* let the conversion begin */
185    acs = NIL;
186    cs = NIL;
187    while (*ure != NIL) {
188	switch (c = *ure++) {
189
190	case '\\':
191	    switch (c = *ure++) {
192
193	    /* escaped characters are just characters */
194	    default:
195		if (cs == NIL || (*cs & STR) == 0) {
196		    cs = ccre;
197		    *cs = STR;
198		    SCNT(cs) = 1;
199		    ccre += 2;
200		} else
201		    SCNT(cs)++;
202		*ccre++ = c;
203		break;
204
205	    /* normal(?) metacharacters */
206	    case 'a':
207	    case 'd':
208	    case 'e':
209	    case 'p':
210		if (acs != NIL && acs != cs) {
211		    do {
212			temp = OCNT(acs);
213			OCNT(acs) = ccre - acs;
214			acs -= temp;
215		    } while (temp != 0);
216		    acs = NIL;
217		}
218		cs = ccre;
219		*cs = META;
220		MSYM(cs) = c;
221		ccre = MNEXT(cs);
222		break;
223	    }
224	    break;
225
226	/* just put the symbol in */
227	case '^':
228	case '$':
229	    if (acs != NIL && acs != cs) {
230		do {
231		    temp = OCNT(acs);
232		    OCNT(acs) = ccre - acs;
233		    acs -= temp;
234		} while (temp != 0);
235		acs = NIL;
236	    }
237	    cs = ccre;
238	    *cs = META;
239	    MSYM(cs) = c;
240	    ccre = MNEXT(cs);
241	    break;
242
243	/* mark the last match sequence as optional */
244	case '?':
245	    if (cs)
246	    	*cs = *cs | OPT;
247	    break;
248
249	/* recurse and define a subexpression */
250	case '(':
251	    if (acs != NIL && acs != cs) {
252		do {
253		    temp = OCNT(acs);
254		    OCNT(acs) = ccre - acs;
255		    acs -= temp;
256		} while (temp != 0);
257		acs = NIL;
258	    }
259	    cs = ccre;
260	    *cs = OPER;
261	    OSYM(cs) = '(';
262	    ccre = ONEXT(cs);
263	    expconv ();
264	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
265	    break;
266
267	/* return from a recursion */
268	case ')':
269	    if (acs != NIL) {
270		do {
271		    temp = OCNT(acs);
272		    OCNT(acs) = ccre - acs;
273		    acs -= temp;
274		} while (temp != 0);
275		acs = NIL;
276	    }
277	    cs = ccre;
278	    *cs = META;
279	    MSYM(cs) = c;
280	    ccre = MNEXT(cs);
281	    return;
282
283	/* mark the last match sequence as having an alternate */
284	/* the third byte will contain an offset to jump over the */
285	/* alternate match in case the first did not fail */
286	case '|':
287	    if (acs != NIL && acs != cs)
288		OCNT(ccre) = ccre - acs;	/* make a back pointer */
289	    else
290		OCNT(ccre) = 0;
291	    *cs |= ALT;
292	    cs = ccre;
293	    *cs = OPER;
294	    OSYM(cs) = '|';
295	    ccre = ONEXT(cs);
296	    acs = cs;	/* remember that the pointer is to be filles */
297	    break;
298
299	/* if its not a metasymbol just build a scharacter string */
300	default:
301	    if (cs == NIL || (*cs & STR) == 0) {
302		cs = ccre;
303		*cs = STR;
304		SCNT(cs) = 1;
305		ccre = SSTR(cs);
306	    } else
307		SCNT(cs)++;
308	    *ccre++ = c;
309	    break;
310	}
311    }
312    if (acs != NIL) {
313	do {
314	    temp = OCNT(acs);
315	    OCNT(acs) = ccre - acs;
316	    acs -= temp;
317	} while (temp != 0);
318	acs = NIL;
319    }
320    return;
321}
322/* end of convertre */
323
324
325/*
326 *	The following routine recognises an irregular expresion
327 *	with the following special characters:
328 *
329 *		\?	-	means last match was optional
330 *		\a	-	matches any number of characters
331 *		\d	-	matches any number of spaces and tabs
332 *		\p	-	matches any number of alphanumeric
333 *				characters. The
334 *				characters matched will be copied into
335 *				the area pointed to by 'name'.
336 *		\|	-	alternation
337 *		\( \)	-	grouping used mostly for alternation and
338 *				optionality
339 *
340 *	The irregular expression must be translated to internal form
341 *	prior to calling this routine
342 *
343 *	The value returned is the pointer to the first non \a
344 *	character matched.
345 */
346
347char *
348expmatch (s, re, mstring)
349    register char *s;		/* string to check for a match in */
350    register char *re;		/* a converted irregular expression */
351    register char *mstring;	/* where to put whatever matches a \p */
352{
353    register char *cs;		/* the current symbol */
354    register char *ptr,*s1;	/* temporary pointer */
355    boolean matched;		/* a temporary boolean */
356
357    /* initial conditions */
358    if (re == NIL)
359	return (NIL);
360    cs = re;
361    matched = FALSE;
362
363    /* loop till expression string is exhausted (or at least pretty tired) */
364    while (*cs) {
365	switch (*cs & (OPER | STR | META)) {
366
367	/* try to match a string */
368	case STR:
369	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
370	    if (matched) {
371
372		/* hoorah it matches */
373		s += SCNT(cs);
374		cs = SNEXT(cs);
375	    } else if (*cs & ALT) {
376
377		/* alternation, skip to next expression */
378		cs = SNEXT(cs);
379	    } else if (*cs & OPT) {
380
381		/* the match is optional */
382		cs = SNEXT(cs);
383		matched = 1;		/* indicate a successful match */
384	    } else {
385
386		/* no match, error return */
387		return (NIL);
388	    }
389	    break;
390
391	/* an operator, do something fancy */
392	case OPER:
393	    switch (OSYM(cs)) {
394
395	    /* this is an alternation */
396	    case '|':
397		if (matched)
398
399		    /* last thing in the alternation was a match, skip ahead */
400		    cs = OPTR(cs);
401		else
402
403		    /* no match, keep trying */
404		    cs = ONEXT(cs);
405		break;
406
407	    /* this is a grouping, recurse */
408	    case '(':
409		ptr = expmatch (s, ONEXT(cs), mstring);
410		if (ptr != NIL) {
411
412		    /* the subexpression matched */
413		    matched = 1;
414		    s = ptr;
415		} else if (*cs & ALT) {
416
417		    /* alternation, skip to next expression */
418		    matched = 0;
419		} else if (*cs & OPT) {
420
421		    /* the match is optional */
422		    matched = 1;	/* indicate a successful match */
423		} else {
424
425		    /* no match, error return */
426		    return (NIL);
427		}
428		cs = OPTR(cs);
429		break;
430	    }
431	    break;
432
433	/* try to match a metasymbol */
434	case META:
435	    switch (MSYM(cs)) {
436
437	    /* try to match anything and remember what was matched */
438	    case 'p':
439		/*
440		 *  This is really the same as trying the match the
441		 *  remaining parts of the expression to any subset
442		 *  of the string.
443		 */
444		s1 = s;
445		do {
446		    ptr = expmatch (s1, MNEXT(cs), mstring);
447		    if (ptr != NIL && s1 != s) {
448
449			/* we have a match, remember the match */
450			strncpy (mstring, s, s1 - s);
451			mstring[s1 - s] = '\0';
452			return (ptr);
453		    } else if (ptr != NIL && (*cs & OPT)) {
454
455			/* it was aoptional so no match is ok */
456			return (ptr);
457		    } else if (ptr != NIL) {
458
459			/* not optional and we still matched */
460			return (NIL);
461		    }
462		    if (!(isalnum(*s1) || *s1 == '_' ||
463			  /* C++ destructor */
464			  *s1 == '~' ||
465			  /* C++ scope operator */
466			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
467			   (s1++, TRUE))))
468			return (NIL);
469		    if (*s1 == '\\')
470			_escaped = _escaped ? FALSE : TRUE;
471		    else
472			_escaped = FALSE;
473		} while (*s1++);
474		return (NIL);
475
476	    /* try to match anything */
477	    case 'a':
478		/*
479		 *  This is really the same as trying the match the
480		 *  remaining parts of the expression to any subset
481		 *  of the string.
482		 */
483		s1 = s;
484		do {
485		    ptr = expmatch (s1, MNEXT(cs), mstring);
486		    if (ptr != NIL && s1 != s) {
487
488			/* we have a match */
489			return (ptr);
490		    } else if (ptr != NIL && (*cs & OPT)) {
491
492			/* it was aoptional so no match is ok */
493			return (ptr);
494		    } else if (ptr != NIL) {
495
496			/* not optional and we still matched */
497			return (NIL);
498		    }
499		    if (*s1 == '\\')
500			_escaped = _escaped ? FALSE : TRUE;
501		    else
502			_escaped = FALSE;
503		} while (*s1++);
504		return (NIL);
505
506	    /* fail if we are currently _escaped */
507	    case 'e':
508		if (_escaped)
509		    return(NIL);
510		cs = MNEXT(cs);
511		break;
512
513	    /* match any number of tabs and spaces */
514	    case 'd':
515		ptr = s;
516		while (*s == ' ' || *s == '\t')
517		    s++;
518		if (s != ptr || s == s_start) {
519
520		    /* match, be happy */
521		    matched = 1;
522		    cs = MNEXT(cs);
523		} else if (*s == '\n' || *s == '\0') {
524
525		    /* match, be happy */
526		    matched = 1;
527		    cs = MNEXT(cs);
528		} else if (*cs & ALT) {
529
530		    /* try the next part */
531		    matched = 0;
532		    cs = MNEXT(cs);
533		} else if (*cs & OPT) {
534
535		    /* doesn't matter */
536		    matched = 1;
537		    cs = MNEXT(cs);
538		} else
539
540		    /* no match, error return */
541		    return (NIL);
542		break;
543
544	    /* check for end of line */
545	    case '$':
546		if (*s == '\0' || *s == '\n') {
547
548		    /* match, be happy */
549		    s++;
550		    matched = 1;
551		    cs = MNEXT(cs);
552		} else if (*cs & ALT) {
553
554		    /* try the next part */
555		    matched = 0;
556		    cs = MNEXT(cs);
557		} else if (*cs & OPT) {
558
559		    /* doesn't matter */
560		    matched = 1;
561		    cs = MNEXT(cs);
562		} else
563
564		    /* no match, error return */
565		    return (NIL);
566		break;
567
568	    /* check for start of line */
569	    case '^':
570		if (s == s_start) {
571
572		    /* match, be happy */
573		    matched = 1;
574		    cs = MNEXT(cs);
575		} else if (*cs & ALT) {
576
577		    /* try the next part */
578		    matched = 0;
579		    cs = MNEXT(cs);
580		} else if (*cs & OPT) {
581
582		    /* doesn't matter */
583		    matched = 1;
584		    cs = MNEXT(cs);
585		} else
586
587		    /* no match, error return */
588		    return (NIL);
589		break;
590
591	    /* end of a subexpression, return success */
592	    case ')':
593		return (s);
594	    }
595	    break;
596	}
597    }
598    return (s);
599}
600