11590Srgrimes/*-
21590Srgrimes * Copyright (c) 1991, 1993
31590Srgrimes *	The Regents of the University of California.  All rights reserved.
41590Srgrimes *
51590Srgrimes * Redistribution and use in source and binary forms, with or without
61590Srgrimes * modification, are permitted provided that the following conditions
71590Srgrimes * are met:
81590Srgrimes * 1. Redistributions of source code must retain the above copyright
91590Srgrimes *    notice, this list of conditions and the following disclaimer.
101590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111590Srgrimes *    notice, this list of conditions and the following disclaimer in the
121590Srgrimes *    documentation and/or other materials provided with the distribution.
131590Srgrimes * 4. Neither the name of the University nor the names of its contributors
141590Srgrimes *    may be used to endorse or promote products derived from this software
151590Srgrimes *    without specific prior written permission.
161590Srgrimes *
171590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
181590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
191590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
201590Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
211590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
221590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
231590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
241590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
251590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
261590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
271590Srgrimes * SUCH DAMAGE.
281590Srgrimes */
291590Srgrimes
3087705Smarkm#include <sys/cdefs.h>
3187705Smarkm
3287705Smarkm__FBSDID("$FreeBSD$");
3387705Smarkm
341590Srgrimes#ifndef lint
3587705Smarkmstatic const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
3628368Scharnier#endif
371590Srgrimes
381590Srgrimes#include <sys/types.h>
391590Srgrimes
4028368Scharnier#include <ctype.h>
4128368Scharnier#include <err.h>
42131846Stjr#include <errno.h>
43200462Sdelphij#include <stddef.h>
44200462Sdelphij#include <stdio.h>
451590Srgrimes#include <stdlib.h>
461590Srgrimes#include <string.h>
47131846Stjr#include <wchar.h>
48131846Stjr#include <wctype.h>
491590Srgrimes
501590Srgrimes#include "extern.h"
511590Srgrimes
52118412Sachestatic int      backslash(STR *, int *);
5392922Simpstatic int	bracket(STR *);
5492922Simpstatic void	genclass(STR *);
5592922Simpstatic void	genequiv(STR *);
56118412Sachestatic int      genrange(STR *, int);
5792922Simpstatic void	genseq(STR *);
581590Srgrimes
59131846Stjrwint_t
60226360Sednext(STR *s)
611590Srgrimes{
62131846Stjr	int is_octal;
63131846Stjr	wint_t ch;
64131846Stjr	wchar_t wch;
65131846Stjr	size_t clen;
661590Srgrimes
671590Srgrimes	switch (s->state) {
681590Srgrimes	case EOS:
691590Srgrimes		return (0);
701590Srgrimes	case INFINITE:
711590Srgrimes		return (1);
721590Srgrimes	case NORMAL:
73131846Stjr		switch (*s->str) {
741590Srgrimes		case '\0':
751590Srgrimes			s->state = EOS;
761590Srgrimes			return (0);
771590Srgrimes		case '\\':
78118412Sache			s->lastch = backslash(s, &is_octal);
791590Srgrimes			break;
801590Srgrimes		case '[':
811590Srgrimes			if (bracket(s))
821590Srgrimes				return (next(s));
831590Srgrimes			/* FALLTHROUGH */
841590Srgrimes		default:
85131846Stjr			clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
86131846Stjr			if (clen == (size_t)-1 || clen == (size_t)-2 ||
87131846Stjr			    clen == 0)
88131846Stjr				errc(1, EILSEQ, NULL);
89118412Sache			is_octal = 0;
90131846Stjr			s->lastch = wch;
91131846Stjr			s->str += clen;
921590Srgrimes			break;
931590Srgrimes		}
941590Srgrimes
951590Srgrimes		/* We can start a range at any time. */
96118412Sache		if (s->str[0] == '-' && genrange(s, is_octal))
971590Srgrimes			return (next(s));
981590Srgrimes		return (1);
99118415Sache	case RANGE:
100118415Sache		if (s->cnt-- == 0) {
101118415Sache			s->state = NORMAL;
102118415Sache			return (next(s));
103118415Sache		}
104118415Sache		++s->lastch;
105118415Sache		return (1);
1061590Srgrimes	case SEQUENCE:
1071590Srgrimes		if (s->cnt-- == 0) {
1081590Srgrimes			s->state = NORMAL;
1091590Srgrimes			return (next(s));
1101590Srgrimes		}
1111590Srgrimes		return (1);
112131846Stjr	case CCLASS:
113131846Stjr	case CCLASS_UPPER:
114131846Stjr	case CCLASS_LOWER:
115131846Stjr		s->cnt++;
116131846Stjr		ch = nextwctype(s->lastch, s->cclass);
117131846Stjr		if (ch == -1) {
118131846Stjr			s->state = NORMAL;
119131846Stjr			return (next(s));
120131846Stjr		}
121131846Stjr		s->lastch = ch;
122131846Stjr		return (1);
1231590Srgrimes	case SET:
124118399Sache		if ((ch = s->set[s->cnt++]) == OOBCH) {
1251590Srgrimes			s->state = NORMAL;
1261590Srgrimes			return (next(s));
1271590Srgrimes		}
128118399Sache		s->lastch = ch;
1291590Srgrimes		return (1);
13087705Smarkm	default:
13187705Smarkm		return (0);
1321590Srgrimes	}
1331590Srgrimes	/* NOTREACHED */
1341590Srgrimes}
1351590Srgrimes
1361590Srgrimesstatic int
137226360Sedbracket(STR *s)
1381590Srgrimes{
13987705Smarkm	char *p;
1401590Srgrimes
1411590Srgrimes	switch (s->str[1]) {
1421590Srgrimes	case ':':				/* "[:class:]" */
14398242Stjr		if ((p = strchr(s->str + 2, ']')) == NULL)
1441590Srgrimes			return (0);
14598242Stjr		if (*(p - 1) != ':' || p - s->str < 4)
14698242Stjr			goto repeat;
14798242Stjr		*(p - 1) = '\0';
1481590Srgrimes		s->str += 2;
1491590Srgrimes		genclass(s);
15098242Stjr		s->str = p + 1;
1511590Srgrimes		return (1);
1521590Srgrimes	case '=':				/* "[=equiv=]" */
153213284Sjilles		if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL)
1541590Srgrimes			return (0);
15598242Stjr		if (*(p - 1) != '=' || p - s->str < 4)
15698242Stjr			goto repeat;
1571590Srgrimes		s->str += 2;
1581590Srgrimes		genequiv(s);
1591590Srgrimes		return (1);
1601590Srgrimes	default:				/* "[\###*n]" or "[#*n]" */
16198242Stjr	repeat:
1621590Srgrimes		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
1631590Srgrimes			return (0);
164229403Sed		if (p[0] != '*' || strchr(p, ']') == NULL)
1651590Srgrimes			return (0);
1661590Srgrimes		s->str += 1;
1671590Srgrimes		genseq(s);
1681590Srgrimes		return (1);
1691590Srgrimes	}
1701590Srgrimes	/* NOTREACHED */
1711590Srgrimes}
1721590Srgrimes
1731590Srgrimesstatic void
174226360Sedgenclass(STR *s)
1751590Srgrimes{
1761590Srgrimes
177131846Stjr	if ((s->cclass = wctype(s->str)) == 0)
17828368Scharnier		errx(1, "unknown class %s", s->str);
1791590Srgrimes	s->cnt = 0;
180131846Stjr	s->lastch = -1;		/* incremented before check in next() */
181118371Sache	if (strcmp(s->str, "upper") == 0)
182131846Stjr		s->state = CCLASS_UPPER;
183118475Sache	else if (strcmp(s->str, "lower") == 0)
184131846Stjr		s->state = CCLASS_LOWER;
185118475Sache	else
186131846Stjr		s->state = CCLASS;
1871590Srgrimes}
1881590Srgrimes
1891590Srgrimesstatic void
190226360Sedgenequiv(STR *s)
1911590Srgrimes{
19298210Stjr	int i, p, pri;
19398210Stjr	char src[2], dst[3];
194131846Stjr	size_t clen;
195131846Stjr	wchar_t wc;
19698210Stjr
1971590Srgrimes	if (*s->str == '\\') {
198118412Sache		s->equiv[0] = backslash(s, NULL);
1991590Srgrimes		if (*s->str != '=')
20028368Scharnier			errx(1, "misplaced equivalence equals sign");
20198215Stjr		s->str += 2;
2021590Srgrimes	} else {
203131846Stjr		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
204131846Stjr		if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
205131846Stjr			errc(1, EILSEQ, NULL);
206131846Stjr		s->equiv[0] = wc;
207131846Stjr		if (s->str[clen] != '=')
20828368Scharnier			errx(1, "misplaced equivalence equals sign");
209131846Stjr		s->str += clen + 2;
2101590Srgrimes	}
21198210Stjr
21298210Stjr	/*
21398210Stjr	 * Calculate the set of all characters in the same equivalence class
21498210Stjr	 * as the specified character (they will have the same primary
21598210Stjr	 * collation weights).
21698210Stjr	 * XXX Knows too much about how strxfrm() is implemented. Assumes
21798210Stjr	 * it fills the string with primary collation weight bytes. Only one-
21898210Stjr	 * to-one mappings are supported.
219131846Stjr	 * XXX Equivalence classes not supported in multibyte locales.
22098210Stjr	 */
221131846Stjr	src[0] = (char)s->equiv[0];
22298210Stjr	src[1] = '\0';
223131846Stjr	if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
22498210Stjr		pri = (unsigned char)*dst;
225131846Stjr		for (p = 1, i = 1; i < NCHARS_SB; i++) {
22698210Stjr			*src = i;
22798210Stjr			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
22898210Stjr			    pri == (unsigned char)*dst)
22998210Stjr				s->equiv[p++] = i;
23098210Stjr		}
23198210Stjr		s->equiv[p] = OOBCH;
23298210Stjr	}
23398210Stjr
2341590Srgrimes	s->cnt = 0;
2351590Srgrimes	s->state = SET;
2361590Srgrimes	s->set = s->equiv;
2371590Srgrimes}
2381590Srgrimes
2391590Srgrimesstatic int
240118412Sachegenrange(STR *s, int was_octal)
2411590Srgrimes{
242118412Sache	int stopval, octal;
2431590Srgrimes	char *savestart;
244118372Sache	int n, cnt, *p;
245131846Stjr	size_t clen;
246131846Stjr	wchar_t wc;
2471590Srgrimes
248118412Sache	octal = 0;
2491590Srgrimes	savestart = s->str;
250131846Stjr	if (*++s->str == '\\')
251131846Stjr		stopval = backslash(s, &octal);
252131846Stjr	else {
253131846Stjr		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
254131846Stjr		if (clen == (size_t)-1 || clen == (size_t)-2)
255131846Stjr			errc(1, EILSEQ, NULL);
256131846Stjr		stopval = wc;
257131846Stjr		s->str += clen;
2581590Srgrimes	}
259131846Stjr	/*
260131846Stjr	 * XXX Characters are not ordered according to collating sequence in
261131846Stjr	 * multibyte locales.
262131846Stjr	 */
263131846Stjr	if (octal || was_octal || MB_CUR_MAX > 1) {
264131846Stjr		if (stopval < s->lastch) {
265131846Stjr			s->str = savestart;
266131846Stjr			return (0);
267131846Stjr		}
268118415Sache		s->cnt = stopval - s->lastch + 1;
269118415Sache		s->state = RANGE;
270118415Sache		--s->lastch;
271118415Sache		return (1);
272118415Sache	}
273131846Stjr	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
274131846Stjr		s->str = savestart;
275131846Stjr		return (0);
276131846Stjr	}
277131846Stjr	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
278118372Sache		err(1, "genrange() malloc");
279131846Stjr	for (cnt = 0; cnt < NCHARS_SB; cnt++)
280118415Sache		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
281118415Sache		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
282118372Sache			*p++ = cnt;
283118372Sache	*p = OOBCH;
284118372Sache	n = p - s->set;
285118372Sache
286118372Sache	s->cnt = 0;
287118372Sache	s->state = SET;
288118415Sache	if (n > 1)
289118372Sache		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
2901590Srgrimes	return (1);
2911590Srgrimes}
2921590Srgrimes
2931590Srgrimesstatic void
294226360Sedgenseq(STR *s)
2951590Srgrimes{
2961590Srgrimes	char *ep;
297131846Stjr	wchar_t wc;
298131846Stjr	size_t clen;
2991590Srgrimes
3001590Srgrimes	if (s->which == STRING1)
30128368Scharnier		errx(1, "sequences only valid in string2");
3021590Srgrimes
3031590Srgrimes	if (*s->str == '\\')
304118412Sache		s->lastch = backslash(s, NULL);
305131846Stjr	else {
306131846Stjr		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
307131846Stjr		if (clen == (size_t)-1 || clen == (size_t)-2)
308131846Stjr			errc(1, EILSEQ, NULL);
309131846Stjr		s->lastch = wc;
310131846Stjr		s->str += clen;
311131846Stjr	}
3121590Srgrimes	if (*s->str != '*')
31328368Scharnier		errx(1, "misplaced sequence asterisk");
3141590Srgrimes
3151590Srgrimes	switch (*++s->str) {
3161590Srgrimes	case '\\':
317118412Sache		s->cnt = backslash(s, NULL);
3181590Srgrimes		break;
3191590Srgrimes	case ']':
3201590Srgrimes		s->cnt = 0;
3211590Srgrimes		++s->str;
3221590Srgrimes		break;
3231590Srgrimes	default:
32414720Sjoerg		if (isdigit((u_char)*s->str)) {
3251590Srgrimes			s->cnt = strtol(s->str, &ep, 0);
3261590Srgrimes			if (*ep == ']') {
3271590Srgrimes				s->str = ep + 1;
3281590Srgrimes				break;
3291590Srgrimes			}
3301590Srgrimes		}
33128368Scharnier		errx(1, "illegal sequence count");
3321590Srgrimes		/* NOTREACHED */
3331590Srgrimes	}
3341590Srgrimes
3351590Srgrimes	s->state = s->cnt ? SEQUENCE : INFINITE;
3361590Srgrimes}
3371590Srgrimes
3381590Srgrimes/*
3391590Srgrimes * Translate \??? into a character.  Up to 3 octal digits, if no digits either
3401590Srgrimes * an escape code or a literal character.
3411590Srgrimes */
3421590Srgrimesstatic int
343118412Sachebackslash(STR *s, int *is_octal)
3441590Srgrimes{
34587705Smarkm	int ch, cnt, val;
3461590Srgrimes
347118412Sache	if (is_octal != NULL)
348118412Sache		*is_octal = 0;
3491590Srgrimes	for (cnt = val = 0;;) {
35014720Sjoerg		ch = (u_char)*++s->str;
351137685Sjkh		if (!isdigit(ch) || ch > '7')
3521590Srgrimes			break;
3531590Srgrimes		val = val * 8 + ch - '0';
3541590Srgrimes		if (++cnt == 3) {
3551590Srgrimes			++s->str;
3561590Srgrimes			break;
3571590Srgrimes		}
3581590Srgrimes	}
359118412Sache	if (cnt) {
360118412Sache		if (is_octal != NULL)
361118412Sache			*is_octal = 1;
3621590Srgrimes		return (val);
363118412Sache	}
3641590Srgrimes	if (ch != '\0')
3651590Srgrimes		++s->str;
3661590Srgrimes	switch (ch) {
3671590Srgrimes		case 'a':			/* escape characters */
3681590Srgrimes			return ('\7');
3691590Srgrimes		case 'b':
3701590Srgrimes			return ('\b');
3711590Srgrimes		case 'f':
3721590Srgrimes			return ('\f');
3731590Srgrimes		case 'n':
3741590Srgrimes			return ('\n');
3751590Srgrimes		case 'r':
3761590Srgrimes			return ('\r');
3771590Srgrimes		case 't':
3781590Srgrimes			return ('\t');
3791590Srgrimes		case 'v':
3801590Srgrimes			return ('\13');
3811590Srgrimes		case '\0':			/*  \" -> \ */
3821590Srgrimes			s->state = EOS;
3831590Srgrimes			return ('\\');
3841590Srgrimes		default:			/* \x" -> x */
3851590Srgrimes			return (ch);
3861590Srgrimes	}
3871590Srgrimes}
388