1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1991, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32
33
34#include <sys/types.h>
35
36#include <ctype.h>
37#include <err.h>
38#include <errno.h>
39#include <stddef.h>
40#include <stdio.h>
41#include <stdlib.h>
42#include <string.h>
43#include <wchar.h>
44#include <wctype.h>
45
46#include "extern.h"
47
48static int      backslash(STR *, int *);
49static int	bracket(STR *);
50static void	genclass(STR *);
51static void	genequiv(STR *);
52static int      genrange(STR *, int);
53static void	genseq(STR *);
54
55wint_t
56next(STR *s)
57{
58	int is_octal;
59	wint_t ch;
60	wchar_t wch;
61	size_t clen;
62
63	switch (s->state) {
64	case EOS:
65		return (0);
66	case INFINITE:
67		return (1);
68	case NORMAL:
69		switch (*s->str) {
70		case '\0':
71			s->state = EOS;
72			return (0);
73		case '\\':
74			s->lastch = backslash(s, &is_octal);
75			break;
76		case '[':
77			if (bracket(s))
78				return (next(s));
79			/* FALLTHROUGH */
80		default:
81			clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
82			if (clen == (size_t)-1 || clen == (size_t)-2 ||
83			    clen == 0)
84				errc(1, EILSEQ, NULL);
85			is_octal = 0;
86			s->lastch = wch;
87			s->str += clen;
88			break;
89		}
90
91		/* We can start a range at any time. */
92		if (s->str[0] == '-' && genrange(s, is_octal))
93			return (next(s));
94		return (1);
95	case RANGE:
96		if (s->cnt-- == 0) {
97			s->state = NORMAL;
98			return (next(s));
99		}
100		++s->lastch;
101		return (1);
102	case SEQUENCE:
103		if (s->cnt-- == 0) {
104			s->state = NORMAL;
105			return (next(s));
106		}
107		return (1);
108	case CCLASS:
109	case CCLASS_UPPER:
110	case CCLASS_LOWER:
111		s->cnt++;
112		ch = nextwctype(s->lastch, s->cclass);
113		if (ch == -1) {
114			s->state = NORMAL;
115			return (next(s));
116		}
117		s->lastch = ch;
118		return (1);
119	case SET:
120		if ((ch = s->set[s->cnt++]) == OOBCH) {
121			s->state = NORMAL;
122			return (next(s));
123		}
124		s->lastch = ch;
125		return (1);
126	default:
127		return (0);
128	}
129	/* NOTREACHED */
130}
131
132static int
133bracket(STR *s)
134{
135	char *p;
136
137	switch (s->str[1]) {
138	case ':':				/* "[:class:]" */
139		if ((p = strchr(s->str + 2, ']')) == NULL)
140			return (0);
141		if (*(p - 1) != ':' || p - s->str < 4)
142			goto repeat;
143		*(p - 1) = '\0';
144		s->str += 2;
145		genclass(s);
146		s->str = p + 1;
147		return (1);
148	case '=':				/* "[=equiv=]" */
149		if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL)
150			return (0);
151		if (*(p - 1) != '=' || p - s->str < 4)
152			goto repeat;
153		s->str += 2;
154		genequiv(s);
155		return (1);
156	default:				/* "[\###*n]" or "[#*n]" */
157	repeat:
158		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
159			return (0);
160		if (p[0] != '*' || strchr(p, ']') == NULL)
161			return (0);
162		s->str += 1;
163		genseq(s);
164		return (1);
165	}
166	/* NOTREACHED */
167}
168
169static void
170genclass(STR *s)
171{
172
173	if ((s->cclass = wctype(s->str)) == 0)
174		errx(1, "unknown class %s", s->str);
175	s->cnt = 0;
176	s->lastch = -1;		/* incremented before check in next() */
177	if (strcmp(s->str, "upper") == 0)
178		s->state = CCLASS_UPPER;
179	else if (strcmp(s->str, "lower") == 0)
180		s->state = CCLASS_LOWER;
181	else
182		s->state = CCLASS;
183}
184
185static void
186genequiv(STR *s)
187{
188	int i, p, pri;
189	char src[2], dst[3];
190	size_t clen;
191	wchar_t wc;
192
193	if (*s->str == '\\') {
194		s->equiv[0] = backslash(s, NULL);
195		if (*s->str != '=')
196			errx(1, "misplaced equivalence equals sign");
197		s->str += 2;
198	} else {
199		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
200		if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
201			errc(1, EILSEQ, NULL);
202		s->equiv[0] = wc;
203		if (s->str[clen] != '=')
204			errx(1, "misplaced equivalence equals sign");
205		s->str += clen + 2;
206	}
207
208	/*
209	 * Calculate the set of all characters in the same equivalence class
210	 * as the specified character (they will have the same primary
211	 * collation weights).
212	 * XXX Knows too much about how strxfrm() is implemented. Assumes
213	 * it fills the string with primary collation weight bytes. Only one-
214	 * to-one mappings are supported.
215	 * XXX Equivalence classes not supported in multibyte locales.
216	 */
217	src[0] = (char)s->equiv[0];
218	src[1] = '\0';
219	if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
220		pri = (unsigned char)*dst;
221		for (p = 1, i = 1; i < NCHARS_SB; i++) {
222			*src = i;
223			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
224			    pri == (unsigned char)*dst)
225				s->equiv[p++] = i;
226		}
227		s->equiv[p] = OOBCH;
228	}
229
230	s->cnt = 0;
231	s->state = SET;
232	s->set = s->equiv;
233}
234
235static int
236genrange(STR *s, int was_octal)
237{
238	int stopval, octal;
239	char *savestart;
240	int n, cnt, *p;
241	size_t clen;
242	wchar_t wc;
243
244	octal = 0;
245	savestart = s->str;
246	if (*++s->str == '\\')
247		stopval = backslash(s, &octal);
248	else {
249		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
250		if (clen == (size_t)-1 || clen == (size_t)-2)
251			errc(1, EILSEQ, NULL);
252		stopval = wc;
253		s->str += clen;
254	}
255	/*
256	 * XXX Characters are not ordered according to collating sequence in
257	 * multibyte locales.
258	 */
259	if (octal || was_octal || MB_CUR_MAX > 1) {
260		if (stopval < s->lastch) {
261			s->str = savestart;
262			return (0);
263		}
264		s->cnt = stopval - s->lastch + 1;
265		s->state = RANGE;
266		--s->lastch;
267		return (1);
268	}
269	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
270		s->str = savestart;
271		return (0);
272	}
273	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
274		err(1, "genrange() malloc");
275	for (cnt = 0; cnt < NCHARS_SB; cnt++)
276		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
277		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
278			*p++ = cnt;
279	*p = OOBCH;
280	n = p - s->set;
281
282	s->cnt = 0;
283	s->state = SET;
284	if (n > 1)
285		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
286	return (1);
287}
288
289static void
290genseq(STR *s)
291{
292	char *ep;
293	wchar_t wc;
294	size_t clen;
295
296	if (s->which == STRING1)
297		errx(1, "sequences only valid in string2");
298
299	if (*s->str == '\\')
300		s->lastch = backslash(s, NULL);
301	else {
302		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
303		if (clen == (size_t)-1 || clen == (size_t)-2)
304			errc(1, EILSEQ, NULL);
305		s->lastch = wc;
306		s->str += clen;
307	}
308	if (*s->str != '*')
309		errx(1, "misplaced sequence asterisk");
310
311	switch (*++s->str) {
312	case '\\':
313		s->cnt = backslash(s, NULL);
314		break;
315	case ']':
316		s->cnt = 0;
317		++s->str;
318		break;
319	default:
320		if (isdigit((u_char)*s->str)) {
321			s->cnt = strtol(s->str, &ep, 0);
322			if (*ep == ']') {
323				s->str = ep + 1;
324				break;
325			}
326		}
327		errx(1, "illegal sequence count");
328		/* NOTREACHED */
329	}
330
331	s->state = s->cnt ? SEQUENCE : INFINITE;
332}
333
334/*
335 * Translate \??? into a character.  Up to 3 octal digits, if no digits either
336 * an escape code or a literal character.
337 */
338static int
339backslash(STR *s, int *is_octal)
340{
341	int ch, cnt, val;
342
343	if (is_octal != NULL)
344		*is_octal = 0;
345	for (cnt = val = 0;;) {
346		ch = (u_char)*++s->str;
347		if (!isdigit(ch) || ch > '7')
348			break;
349		val = val * 8 + ch - '0';
350		if (++cnt == 3) {
351			++s->str;
352			break;
353		}
354	}
355	if (cnt) {
356		if (is_octal != NULL)
357			*is_octal = 1;
358		return (val);
359	}
360	if (ch != '\0')
361		++s->str;
362	switch (ch) {
363		case 'a':			/* escape characters */
364			return ('\7');
365		case 'b':
366			return ('\b');
367		case 'f':
368			return ('\f');
369		case 'n':
370			return ('\n');
371		case 'r':
372			return ('\r');
373		case 't':
374			return ('\t');
375		case 'v':
376			return ('\13');
377		case '\0':			/*  \" -> \ */
378			s->state = EOS;
379			return ('\\');
380		default:			/* \x" -> x */
381			return (ch);
382	}
383}
384