1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1980, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33#include <ctype.h>
34#include <stdlib.h>
35#include <stdbool.h>
36#include <string.h>
37
38#include "extern.h"
39
40static void	expconv(void);
41
42bool	 _escaped;	/* true if we are currently x_escaped */
43char	*s_start;	/* start of string */
44bool	 l_onecase;	/* true if upper and lower equivalent */
45
46#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
47
48/*  STRNCMP -	like strncmp except that we convert the
49 *	 	first string to lower case before comparing
50 *		if l_onecase is set.
51 */
52
53int
54STRNCMP(register char *s1, register char *s2, register int len)
55{
56	if (l_onecase) {
57	    do
58		if (*s2 - makelower(*s1))
59			return (*s2 - makelower(*s1));
60		else {
61			s2++;
62			s1++;
63		}
64	    while (--len);
65	} else {
66	    do
67		if (*s2 - *s1)
68			return (*s2 - *s1);
69		else {
70			s2++;
71			s1++;
72		}
73	    while (--len);
74	}
75	return(0);
76}
77
78/*	The following routine converts an irregular expression to
79 *	internal format.
80 *
81 *	Either meta symbols (\a \d or \p) or character strings or
82 *	operations ( alternation or parenthesizing ) can be
83 *	specified.  Each starts with a descriptor byte.  The descriptor
84 *	byte has STR set for strings, META set for meta symbols
85 *	and OPER set for operations.
86 *	The descriptor byte can also have the OPT bit set if the object
87 *	defined is optional.  Also ALT can be set to indicate an alternation.
88 *
89 *	For metasymbols the byte following the descriptor byte identities
90 *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
91 *	strings the byte after the descriptor is a character count for
92 *	the string:
93 *
94 *		meta symbols := descriptor
95 *				symbol
96 *
97 *		strings :=	descriptor
98 *				character count
99 *				the string
100 *
101 *		operations :=	descriptor
102 *				symbol
103 *				character count
104 */
105
106/*
107 *  handy macros for accessing parts of match blocks
108 */
109#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
110#define MNEXT(A) (A+2)		/* character following a metasymbol block */
111
112#define OSYM(A) (*(A+1))	/* symbol in an operation block */
113#define OCNT(A) (*(A+2))	/* character count */
114#define ONEXT(A) (A+3)		/* next character after the operation */
115#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
116
117#define SCNT(A) (*(A+1))	/* byte count of a string */
118#define SSTR(A) (A+2)		/* address of the string */
119#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
120
121/*
122 *  bit flags in the descriptor
123 */
124#define OPT 1
125#define STR 2
126#define META 4
127#define ALT 8
128#define OPER 16
129
130static char *ccre;	/* pointer to current position in converted exp*/
131static char *ure;	/* pointer current position in unconverted exp */
132
133/* re: unconverted irregular expression */
134char *
135convexp(char *re)
136{
137    register char *cre;		/* pointer to converted regular expression */
138
139    /* allocate room for the converted expression */
140    if (re == NULL)
141	return (NULL);
142    if (*re == '\0')
143	return (NULL);
144    cre = malloc(4 * strlen(re) + 3);
145    ccre = cre;
146    ure = re;
147
148    /* start the conversion with a \a */
149    *cre = META | OPT;
150    MSYM(cre) = 'a';
151    ccre = MNEXT(cre);
152
153    /* start the conversion (its recursive) */
154    expconv ();
155    *ccre = 0;
156    return (cre);
157}
158
159static void
160expconv()
161{
162    register char *cs;		/* pointer to current symbol in converted exp */
163    register char c;		/* character being processed */
164    register char *acs;		/* pinter to last alternate */
165    register int temp;
166
167    /* let the conversion begin */
168    acs = NULL;
169    cs = NULL;
170    while (*ure) {
171	switch (c = *ure++) {
172
173	case '\\':
174	    switch (c = *ure++) {
175
176	    /* escaped characters are just characters */
177	    default:
178		if (cs == NULL || (*cs & STR) == 0) {
179		    cs = ccre;
180		    *cs = STR;
181		    SCNT(cs) = 1;
182		    ccre += 2;
183		} else
184		    SCNT(cs)++;
185		*ccre++ = c;
186		break;
187
188	    /* normal(?) metacharacters */
189	    case 'a':
190	    case 'd':
191	    case 'e':
192	    case 'p':
193		if (acs != NULL && acs != cs) {
194		    do {
195			temp = OCNT(acs);
196			OCNT(acs) = ccre - acs;
197			acs -= temp;
198		    } while (temp != 0);
199		    acs = NULL;
200		}
201		cs = ccre;
202		*cs = META;
203		MSYM(cs) = c;
204		ccre = MNEXT(cs);
205		break;
206	    }
207	    break;
208
209	/* just put the symbol in */
210	case '^':
211	case '$':
212	    if (acs != NULL && acs != cs) {
213		do {
214		    temp = OCNT(acs);
215		    OCNT(acs) = ccre - acs;
216		    acs -= temp;
217		} while (temp != 0);
218		acs = NULL;
219	    }
220	    cs = ccre;
221	    *cs = META;
222	    MSYM(cs) = c;
223	    ccre = MNEXT(cs);
224	    break;
225
226	/* mark the last match sequence as optional */
227	case '?':
228	    if (cs)
229	    	*cs = *cs | OPT;
230	    break;
231
232	/* recurse and define a subexpression */
233	case '(':
234	    if (acs != NULL && acs != cs) {
235		do {
236		    temp = OCNT(acs);
237		    OCNT(acs) = ccre - acs;
238		    acs -= temp;
239		} while (temp != 0);
240		acs = NULL;
241	    }
242	    cs = ccre;
243	    *cs = OPER;
244	    OSYM(cs) = '(';
245	    ccre = ONEXT(cs);
246	    expconv();
247	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
248	    break;
249
250	/* return from a recursion */
251	case ')':
252	    if (acs != NULL) {
253		do {
254		    temp = OCNT(acs);
255		    OCNT(acs) = ccre - acs;
256		    acs -= temp;
257		} while (temp != 0);
258		acs = NULL;
259	    }
260	    cs = ccre;
261	    *cs = META;
262	    MSYM(cs) = c;
263	    ccre = MNEXT(cs);
264	    return;
265
266	/* mark the last match sequence as having an alternate */
267	/* the third byte will contain an offset to jump over the */
268	/* alternate match in case the first did not fail */
269	case '|':
270	    if (acs != NULL && acs != cs)
271		OCNT(ccre) = ccre - acs;	/* make a back pointer */
272	    else
273		OCNT(ccre) = 0;
274	    *cs |= ALT;
275	    cs = ccre;
276	    *cs = OPER;
277	    OSYM(cs) = '|';
278	    ccre = ONEXT(cs);
279	    acs = cs;	/* remember that the pointer is to be filles */
280	    break;
281
282	/* if its not a metasymbol just build a scharacter string */
283	default:
284	    if (cs == NULL || (*cs & STR) == 0) {
285		cs = ccre;
286		*cs = STR;
287		SCNT(cs) = 1;
288		ccre = SSTR(cs);
289	    } else
290		SCNT(cs)++;
291	    *ccre++ = c;
292	    break;
293	}
294    }
295    if (acs != NULL) {
296	do {
297	    temp = OCNT(acs);
298	    OCNT(acs) = ccre - acs;
299	    acs -= temp;
300	} while (temp != 0);
301	acs = NULL;
302    }
303    return;
304}
305/* end of convertre */
306
307
308/*
309 *	The following routine recognises an irregular expression
310 *	with the following special characters:
311 *
312 *		\?	-	means last match was optional
313 *		\a	-	matches any number of characters
314 *		\d	-	matches any number of spaces and tabs
315 *		\p	-	matches any number of alphanumeric
316 *				characters. The
317 *				characters matched will be copied into
318 *				the area pointed to by 'name'.
319 *		\|	-	alternation
320 *		\( \)	-	grouping used mostly for alternation and
321 *				optionality
322 *
323 *	The irregular expression must be translated to internal form
324 *	prior to calling this routine
325 *
326 *	The value returned is the pointer to the first non \a
327 *	character matched.
328 */
329
330/*
331 *  s: string to check for a match in
332 *  re: a converted irregular expression
333 *  mstring: where to put whatever matches a \p
334 */
335char *
336expmatch (register char *s, register char *re, register char *mstring)
337{
338    register char *cs;		/* the current symbol */
339    register char *ptr,*s1;	/* temporary pointer */
340    bool matched;	/* a temporary bool */
341
342    /* initial conditions */
343    if (re == NULL)
344	return (NULL);
345    cs = re;
346    matched = false;
347
348    /* loop till expression string is exhausted (or at least pretty tired) */
349    while (*cs) {
350	switch (*cs & (OPER | STR | META)) {
351
352	/* try to match a string */
353	case STR:
354	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
355	    if (matched) {
356
357		/* hoorah it matches */
358		s += SCNT(cs);
359		cs = SNEXT(cs);
360	    } else if (*cs & ALT) {
361
362		/* alternation, skip to next expression */
363		cs = SNEXT(cs);
364	    } else if (*cs & OPT) {
365
366		/* the match is optional */
367		cs = SNEXT(cs);
368		matched = 1;		/* indicate a successful match */
369	    } else {
370
371		/* no match, error return */
372		return (NULL);
373	    }
374	    break;
375
376	/* an operator, do something fancy */
377	case OPER:
378	    switch (OSYM(cs)) {
379
380	    /* this is an alternation */
381	    case '|':
382		if (matched)
383
384		    /* last thing in the alternation was a match, skip ahead */
385		    cs = OPTR(cs);
386		else
387
388		    /* no match, keep trying */
389		    cs = ONEXT(cs);
390		break;
391
392	    /* this is a grouping, recurse */
393	    case '(':
394		ptr = expmatch(s, ONEXT(cs), mstring);
395		if (ptr != NULL) {
396
397		    /* the subexpression matched */
398		    matched = 1;
399		    s = ptr;
400		} else if (*cs & ALT) {
401
402		    /* alternation, skip to next expression */
403		    matched = 0;
404		} else if (*cs & OPT) {
405
406		    /* the match is optional */
407		    matched = 1;	/* indicate a successful match */
408		} else {
409
410		    /* no match, error return */
411		    return (NULL);
412		}
413		cs = OPTR(cs);
414		break;
415	    }
416	    break;
417
418	/* try to match a metasymbol */
419	case META:
420	    switch (MSYM(cs)) {
421
422	    /* try to match anything and remember what was matched */
423	    case 'p':
424		/*
425		 *  This is really the same as trying the match the
426		 *  remaining parts of the expression to any subset
427		 *  of the string.
428		 */
429		s1 = s;
430		do {
431		    ptr = expmatch(s1, MNEXT(cs), mstring);
432		    if (ptr != NULL && s1 != s) {
433
434			/* we have a match, remember the match */
435			strncpy (mstring, s, s1 - s);
436			mstring[s1 - s] = '\0';
437			return (ptr);
438		    } else if (ptr != NULL && (*cs & OPT)) {
439
440			/* it was aoptional so no match is ok */
441			return (ptr);
442		    } else if (ptr != NULL) {
443
444			/* not optional and we still matched */
445			return (NULL);
446		    }
447		    if (!(isalnum(*s1) || *s1 == '_' ||
448			  /* C++ destructor */
449			  *s1 == '~' ||
450			  /* C++ scope operator */
451			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
452			   (s1++, true))))
453			return (NULL);
454		    if (*s1 == '\\')
455			_escaped = _escaped ? false : true;
456		    else
457			_escaped = false;
458		} while (*s1++);
459		return (NULL);
460
461	    /* try to match anything */
462	    case 'a':
463		/*
464		 *  This is really the same as trying the match the
465		 *  remaining parts of the expression to any subset
466		 *  of the string.
467		 */
468		s1 = s;
469		do {
470		    ptr = expmatch(s1, MNEXT(cs), mstring);
471		    if (ptr != NULL && s1 != s) {
472
473			/* we have a match */
474			return (ptr);
475		    } else if (ptr != NULL && (*cs & OPT)) {
476
477			/* it was aoptional so no match is ok */
478			return (ptr);
479		    } else if (ptr != NULL) {
480
481			/* not optional and we still matched */
482			return (NULL);
483		    }
484		    if (*s1 == '\\')
485			_escaped = _escaped ? false : true;
486		    else
487			_escaped = false;
488		} while (*s1++);
489		return (NULL);
490
491	    /* fail if we are currently _escaped */
492	    case 'e':
493		if (_escaped)
494		    return(NULL);
495		cs = MNEXT(cs);
496		break;
497
498	    /* match any number of tabs and spaces */
499	    case 'd':
500		ptr = s;
501		while (*s == ' ' || *s == '\t')
502		    s++;
503		if (s != ptr || s == s_start) {
504
505		    /* match, be happy */
506		    matched = 1;
507		    cs = MNEXT(cs);
508		} else if (*s == '\n' || *s == '\0') {
509
510		    /* match, be happy */
511		    matched = 1;
512		    cs = MNEXT(cs);
513		} else if (*cs & ALT) {
514
515		    /* try the next part */
516		    matched = 0;
517		    cs = MNEXT(cs);
518		} else if (*cs & OPT) {
519
520		    /* doesn't matter */
521		    matched = 1;
522		    cs = MNEXT(cs);
523		} else
524
525		    /* no match, error return */
526		    return (NULL);
527		break;
528
529	    /* check for end of line */
530	    case '$':
531		if (*s == '\0' || *s == '\n') {
532
533		    /* match, be happy */
534		    s++;
535		    matched = 1;
536		    cs = MNEXT(cs);
537		} else if (*cs & ALT) {
538
539		    /* try the next part */
540		    matched = 0;
541		    cs = MNEXT(cs);
542		} else if (*cs & OPT) {
543
544		    /* doesn't matter */
545		    matched = 1;
546		    cs = MNEXT(cs);
547		} else
548
549		    /* no match, error return */
550		    return (NULL);
551		break;
552
553	    /* check for start of line */
554	    case '^':
555		if (s == s_start) {
556
557		    /* match, be happy */
558		    matched = 1;
559		    cs = MNEXT(cs);
560		} else if (*cs & ALT) {
561
562		    /* try the next part */
563		    matched = 0;
564		    cs = MNEXT(cs);
565		} else if (*cs & OPT) {
566
567		    /* doesn't matter */
568		    matched = 1;
569		    cs = MNEXT(cs);
570		} else
571
572		    /* no match, error return */
573		    return (NULL);
574		break;
575
576	    /* end of a subexpression, return success */
577	    case ')':
578		return (s);
579	    }
580	    break;
581	}
582    }
583    return (s);
584}
585