1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1992 Diomidis Spinellis.
5 * Copyright (c) 1992, 1993
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Diomidis Spinellis of Imperial College, University of London.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#include <sys/cdefs.h>
37
38#include <sys/types.h>
39#include <sys/stat.h>
40
41#include <ctype.h>
42#include <err.h>
43#include <errno.h>
44#include <fcntl.h>
45#include <limits.h>
46#include <regex.h>
47#include <stdbool.h>
48#include <stdio.h>
49#include <stdlib.h>
50#include <string.h>
51#include <wchar.h>
52
53#include "defs.h"
54#include "extern.h"
55
/*
 * Label hash table.  Labels defined with ':' are entered by enterlabel()
 * and looked up by findlabel(); uselabel() reports unreferenced labels and
 * frees the entries.  LHSZ is a power of two so that LHMASK can be used to
 * reduce a hash value to a bucket index.
 */
#define LHSZ	128
#define	LHMASK	(LHSZ - 1)
static struct labhash {
	struct	labhash *lh_next;	/* Next entry in the same bucket */
	u_int	lh_hash;		/* Full (unmasked) hash of the label */
	struct	s_command *lh_cmd;	/* The ':' command; its t is the name */
	int	lh_ref;			/* Set to 1 once findlabel() hits it */
} *labels[LHSZ];
64
/* Forward declarations of the file-local helpers defined further down. */
static char	 *compile_addr(char *, struct s_addr *);
static char	 *compile_ccl(char **, char *);
static char	 *compile_delimited(char *, char *, int);
static char	 *compile_flags(char *, struct s_subst *);
static regex_t	 *compile_re(char *, int);
static char	 *compile_subst(char *, struct s_subst *);
static char	 *compile_text(void);
static char	 *compile_tr(char *, struct s_tr **);
static struct s_command
		**compile_stream(struct s_command **);
static char	 *duptoeol(char *, const char *);
static void	  enterlabel(struct s_command *);
static struct s_command
		 *findlabel(char *);
static void	  fixuplabel(struct s_command *, struct s_command *);
static void	  uselabel(void);
81
/*
 * Command specification.  This is used to drive the command parser:
 * compile_stream() looks a command character up in cmd_fmts, rejects the
 * command if more than naddr addresses were given, and dispatches on args
 * to parse whatever follows the command character.
 */
struct s_format {
	char code;				/* Command code */
	int naddr;				/* Number of address args */
	enum e_args args;			/* Argument type */
};
90
/*
 * Table of known commands: { command character, maximum number of
 * addresses accepted, kind of argument that follows }.  The final entry
 * has code '\0'; the lookup loop in compile_stream() stops there, which is
 * how an unknown command character is detected.
 */
static struct s_format cmd_fmts[] = {
	{'{', 2, GROUP},
	{'}', 0, ENDGROUP},
	{'a', 1, TEXT},
	{'b', 2, BRANCH},
	{'c', 2, TEXT},
	{'d', 2, EMPTY},
	{'D', 2, EMPTY},
	{'g', 2, EMPTY},
	{'G', 2, EMPTY},
	{'h', 2, EMPTY},
	{'H', 2, EMPTY},
	{'i', 1, TEXT},
	{'l', 2, EMPTY},
	{'n', 2, EMPTY},
	{'N', 2, EMPTY},
	{'p', 2, EMPTY},
	{'P', 2, EMPTY},
	{'q', 1, EMPTY},
	{'r', 1, RFILE},
	{'s', 2, SUBST},
	{'t', 2, BRANCH},
	{'w', 2, WFILE},
	{'x', 2, EMPTY},
	{'y', 2, TR},
	{'!', 2, NONSEL},
	{':', 0, LABEL},
	{'#', 0, COMMENT},
	{'=', 1, EMPTY},
	{'\0', 0, COMMENT},		/* Terminator for the lookup loop */
};
122
/*
 * The compiled program: head of the command list that compile() builds
 * by passing &prog to compile_stream().
 */
struct s_command *prog;
125
126/*
127 * Compile the program into prog.
128 * Initialise appends.
129 */
130void
131compile(void)
132{
133	*compile_stream(&prog) = NULL;
134	fixuplabel(prog, NULL);
135	uselabel();
136	if (appendnum == 0)
137		appends = NULL;
138	else if ((appends = malloc(sizeof(struct s_appends) * appendnum)) ==
139	    NULL)
140		err(1, "malloc");
141	if ((match = malloc((maxnsub + 1) * sizeof(regmatch_t))) == NULL)
142		err(1, "malloc");
143}
144
/*
 * Advance the caller's local pointer variable named `p' past any run of
 * whitespace (as classified by isspace()).  Does nothing when p is NULL.
 * The macro takes no argument: a variable called p must be in scope at
 * every point of use.
 */
#define EATSPACE() do {							\
	if (p)								\
		while (*p && isspace((unsigned char)*p))                \
			p++;						\
	} while (0)
150
151static struct s_command **
152compile_stream(struct s_command **link)
153{
154	char *p;
155	static char lbuf[_POSIX2_LINE_MAX + 1];	/* To save stack */
156	struct s_command *cmd, *cmd2, *stack;
157	struct s_format *fp;
158	char re[_POSIX2_LINE_MAX + 1];
159	int naddr;				/* Number of addresses */
160
161	stack = NULL;
162	for (;;) {
163		if ((p = cu_fgets(lbuf, sizeof(lbuf), NULL)) == NULL) {
164			if (stack != NULL)
165				errx(1, "%lu: %s: unexpected EOF (pending }'s)",
166							linenum, fname);
167			return (link);
168		}
169
170semicolon:	EATSPACE();
171		if (p) {
172			if (*p == '#' || *p == '\0')
173				continue;
174			else if (*p == ';') {
175				p++;
176				goto semicolon;
177			}
178		}
179		if ((*link = cmd = malloc(sizeof(struct s_command))) == NULL)
180			err(1, "malloc");
181		link = &cmd->next;
182		cmd->startline = cmd->nonsel = 0;
183		/* First parse the addresses */
184		naddr = 0;
185
186/* Valid characters to start an address */
187#define	addrchar(c)	(strchr("0123456789/\\$", (c)))
188		if (addrchar(*p)) {
189			naddr++;
190			if ((cmd->a1 = malloc(sizeof(struct s_addr))) == NULL)
191				err(1, "malloc");
192			p = compile_addr(p, cmd->a1);
193			EATSPACE();				/* EXTENSION */
194			if (*p == ',') {
195				p++;
196				EATSPACE();			/* EXTENSION */
197				naddr++;
198				if ((cmd->a2 = malloc(sizeof(struct s_addr)))
199				    == NULL)
200					err(1, "malloc");
201				p = compile_addr(p, cmd->a2);
202				EATSPACE();
203			} else
204				cmd->a2 = NULL;
205		} else
206			cmd->a1 = cmd->a2 = NULL;
207
208nonsel:		/* Now parse the command */
209		if (!*p)
210			errx(1, "%lu: %s: command expected", linenum, fname);
211		cmd->code = *p;
212		for (fp = cmd_fmts; fp->code; fp++)
213			if (fp->code == *p)
214				break;
215		if (!fp->code)
216			errx(1, "%lu: %s: invalid command code %c", linenum, fname, *p);
217		if (naddr > fp->naddr)
218			errx(1,
219				"%lu: %s: command %c expects up to %d address(es), found %d",
220				linenum, fname, *p, fp->naddr, naddr);
221		switch (fp->args) {
222		case NONSEL:			/* ! */
223			p++;
224			EATSPACE();
225			cmd->nonsel = 1;
226			goto nonsel;
227		case GROUP:			/* { */
228			p++;
229			EATSPACE();
230			cmd->next = stack;
231			stack = cmd;
232			link = &cmd->u.c;
233			if (*p)
234				goto semicolon;
235			break;
236		case ENDGROUP:
237			/*
238			 * Short-circuit command processing, since end of
239			 * group is really just a noop.
240			 */
241			cmd->nonsel = 1;
242			if (stack == NULL)
243				errx(1, "%lu: %s: unexpected }", linenum, fname);
244			cmd2 = stack;
245			stack = cmd2->next;
246			cmd2->next = cmd;
247			/*FALLTHROUGH*/
248		case EMPTY:		/* d D g G h H l n N p P q x = \0 */
249			p++;
250			EATSPACE();
251			if (*p == ';') {
252				p++;
253				link = &cmd->next;
254				goto semicolon;
255			}
256			if (*p)
257				errx(1, "%lu: %s: extra characters at the end of %c command",
258						linenum, fname, cmd->code);
259			break;
260		case TEXT:			/* a c i */
261			p++;
262			EATSPACE();
263			if (*p != '\\')
264				errx(1,
265"%lu: %s: command %c expects \\ followed by text", linenum, fname, cmd->code);
266			p++;
267			EATSPACE();
268			if (*p)
269				errx(1,
270				"%lu: %s: extra characters after \\ at the end of %c command",
271				linenum, fname, cmd->code);
272			cmd->t = compile_text();
273			break;
274		case COMMENT:			/* \0 # */
275			break;
276		case WFILE:			/* w */
277			p++;
278			EATSPACE();
279			if (*p == '\0')
280				errx(1, "%lu: %s: filename expected", linenum, fname);
281			cmd->t = duptoeol(p, "w command");
282			if (aflag)
283				cmd->u.fd = -1;
284			else if ((cmd->u.fd = open(p,
285			    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
286			    DEFFILEMODE)) == -1)
287				err(1, "%s", p);
288			break;
289		case RFILE:			/* r */
290			p++;
291			EATSPACE();
292			if (*p == '\0')
293				errx(1, "%lu: %s: filename expected", linenum, fname);
294			else
295				cmd->t = duptoeol(p, "read command");
296			break;
297		case BRANCH:			/* b t */
298			p++;
299			EATSPACE();
300			if (*p == '\0')
301				cmd->t = NULL;
302			else
303				cmd->t = duptoeol(p, "branch");
304			break;
305		case LABEL:			/* : */
306			p++;
307			EATSPACE();
308			cmd->t = duptoeol(p, "label");
309			if (strlen(p) == 0)
310				errx(1, "%lu: %s: empty label", linenum, fname);
311			enterlabel(cmd);
312			break;
313		case SUBST:			/* s */
314			p++;
315			if (*p == '\0' || *p == '\\')
316				errx(1,
317"%lu: %s: substitute pattern can not be delimited by newline or backslash",
318					linenum, fname);
319			if ((cmd->u.s = calloc(1, sizeof(struct s_subst))) == NULL)
320				err(1, "malloc");
321			p = compile_delimited(p, re, 0);
322			if (p == NULL)
323				errx(1,
324				"%lu: %s: unterminated substitute pattern", linenum, fname);
325
326			/* Compile RE with no case sensitivity temporarily */
327			if (*re == '\0')
328				cmd->u.s->re = NULL;
329			else
330				cmd->u.s->re = compile_re(re, 0);
331			--p;
332			p = compile_subst(p, cmd->u.s);
333			p = compile_flags(p, cmd->u.s);
334
335			/* Recompile RE with case sensitivity from "I" flag if any */
336			if (*re == '\0')
337				cmd->u.s->re = NULL;
338			else
339				cmd->u.s->re = compile_re(re, cmd->u.s->icase);
340			EATSPACE();
341			if (*p == ';') {
342				p++;
343				link = &cmd->next;
344				goto semicolon;
345			}
346			break;
347		case TR:			/* y */
348			p++;
349			p = compile_tr(p, &cmd->u.y);
350			EATSPACE();
351			if (*p == ';') {
352				p++;
353				link = &cmd->next;
354				goto semicolon;
355			}
356			if (*p)
357				errx(1,
358"%lu: %s: extra text at the end of a transform command", linenum, fname);
359			break;
360		}
361	}
362}
363
/*
 * Convert one or two hexadecimal digits to a character.
 *
 * in points at the digits; len says how many of them to use (anything
 * greater than 1 means two).  On success the value is stored in *out and
 * 0 is returned.  If strtol() does not consume every selected character,
 * or reports an error, ERANGE is returned and *out is left untouched.
 */
static int
hex2char(const char *in, char *out, int len)
{
	char digits[3] = { in[0], '\0', '\0' };
	char *stop;
	long value;

	/* in[1] is only read when the caller asked for two digits. */
	if (len > 1)
		digits[1] = in[1];

	errno = 0;
	value = strtol(digits, &stop, 16);
	if (errno != 0 || *stop != '\0')
		return (ERANGE);

	*out = (char)value;
	return (0);
}
381
/*
 * Return true if c is one of 0-9, a-f or A-F.
 *
 * c is converted to unsigned char before it is handed to the <ctype.h>
 * functions: plain char may be signed, and passing a negative value other
 * than EOF to tolower()/isdigit() is undefined behaviour.
 */
static bool
hexdigit(char c)
{
	int lc;

	lc = tolower((unsigned char)c);
	return (isdigit(lc) || (lc >= 'a' && lc <= 'f'));
}
390
/*
 * Decode the one or two hex digits at the start of in.
 *
 * Returns false, touching neither *out nor *len, if in[0] is not a hex
 * digit or the conversion fails.  Otherwise stores the decoded character
 * in *out, the number of digits consumed (1 or 2) in *len, and returns
 * true.
 */
static bool
dohex(const char *in, char *out, int *len)
{
	int ndigits;

	if (!hexdigit(in[0]))
		return (false);

	/* Take a second digit only if one is actually there. */
	ndigits = hexdigit(in[1]) ? 2 : 1;
	if (hex2char(in, out, ndigits) != 0)
		return (false);

	*len = ndigits;
	return (true);
}
408
409/*
410 * Get a delimited string.  P points to the delimiter of the string; d points
411 * to a buffer area.  Newline and delimiter escapes are processed; other
412 * escapes are ignored.
413 *
414 * Returns a pointer to the first character after the final delimiter or NULL
415 * in the case of a non-terminated string.  The character array d is filled
416 * with the processed string.
417 */
418static char *
419compile_delimited(char *p, char *d, int is_tr)
420{
421	int hexlen;
422	char c;
423
424	c = *p++;
425	if (c == '\0')
426		return (NULL);
427	else if (c == '\\')
428		errx(1, "%lu: %s: \\ can not be used as a string delimiter",
429				linenum, fname);
430	else if (c == '\n')
431		errx(1, "%lu: %s: newline can not be used as a string delimiter",
432				linenum, fname);
433	while (*p) {
434		if (*p == '[' && *p != c) {
435			if (!is_tr) {
436				if ((d = compile_ccl(&p, d)) == NULL) {
437					errx(1,
438					    "%lu: %s: unbalanced brackets ([])",
439					    linenum, fname);
440				}
441				continue;
442			}
443		} else if (*p == '\\' && p[1] == '[') {
444			if (is_tr)
445				p++;
446			else
447				*d++ = *p++;
448		} else if (*p == '\\' && p[1] == c) {
449			p++;
450		} else if (*p == '\\' &&
451		    (p[1] == 'n' || p[1] == 'r' || p[1] == 't')) {
452			switch (p[1]) {
453			case 'n':
454				*d++ = '\n';
455				break;
456			case 'r':
457				*d++ = '\r';
458				break;
459			case 't':
460				*d++ = '\t';
461				break;
462			}
463			p += 2;
464			continue;
465		} else if (*p == '\\' && p[1] == 'x') {
466			if (dohex(&p[2], d, &hexlen)) {
467				++d;
468				p += hexlen + 2;
469				continue;
470			}
471		} else if (*p == '\\' && p[1] == '\\') {
472			if (is_tr)
473				p++;
474			else
475				*d++ = *p++;
476		} else if (*p == c) {
477			*d = '\0';
478			return (p + 1);
479		}
480		*d++ = *p++;
481	}
482	return (NULL);
483}
484
485
/*
 * compile_ccl: copy a bracket expression, expanding escapes inside it.
 *
 * *sp points at the opening '['; t is the output position.  The text up to
 * and including the closing ']' is copied to t.  A '^' right after the '['
 * and a ']' in first position are copied literally.  Nested [. .], [: :]
 * and [= =] constructs are copied through unchanged up to their own
 * closing sequence.  \n, \r, \t and \xH / \xHH are replaced by the
 * character they denote.
 *
 * On success *sp is advanced past the closing ']' and the new output
 * position (just past the copied ']') is returned.  NULL is returned, with
 * *sp unchanged, if the expression is not terminated.
 */
static char *
compile_ccl(char **sp, char *t)
{
	int c, d, hexlen;
	char *s = *sp;

	/* Copy the '[' and an optional leading '^' and/or ']'. */
	*t++ = *s++;
	if (*s == '^')
		*t++ = *s++;
	if (*s == ']')
		*t++ = *s++;
	/* The loop condition itself copies *s to *t on every iteration. */
	for (; *s && (*t = *s) != ']'; s++, t++) {
		if (*s == '[' && ((d = *(s+1)) == '.' || d == ':' || d == '=')) {
			/* Copy the opener, then scan for the matching d']'. */
			*++t = *++s, t++, s++;
			/* c trails one character behind s. */
			for (c = *s; (*t = *s) != ']' || c != d; s++, t++)
				if ((c = *s) == '\0')
					return NULL;
		} else if (*s == '\\') {
			/*
			 * Overwrite the backslash just stored in *t and skip
			 * the escape; the loop increment does the rest.
			 */
			switch (s[1]) {
			case 'n':
				*t = '\n';
				s++;
				break;
			case 'r':
				*t = '\r';
				s++;
				break;
			case 't':
				*t = '\t';
				s++;
				break;
			case 'x':
				if (dohex(&s[2], t, &hexlen))
					s += hexlen + 1;
				break;
			}
		}
	}
	/* On success publish the new input position and return past ']'. */
	return (*s == ']') ? *sp = ++s, ++t : NULL;
}
527
528/*
529 * Compiles the regular expression in RE and returns a pointer to the compiled
530 * regular expression.
531 * Cflags are passed to regcomp.
532 */
533static regex_t *
534compile_re(char *re, int case_insensitive)
535{
536	regex_t *rep;
537	int eval, flags;
538
539
540	flags = rflags;
541	if (case_insensitive)
542		flags |= REG_ICASE;
543	if ((rep = malloc(sizeof(regex_t))) == NULL)
544		err(1, "malloc");
545	if ((eval = regcomp(rep, re, flags)) != 0)
546		errx(1, "%lu: %s: RE error: %s",
547				linenum, fname, strregerror(eval, rep));
548	if (maxnsub < rep->re_nsub)
549		maxnsub = rep->re_nsub;
550	return (rep);
551}
552
553/*
554 * Compile the substitution string of a regular expression and set res to
555 * point to a saved copy of it.  Nsub is the number of parenthesized regular
556 * expressions.
557 */
558static char *
559compile_subst(char *p, struct s_subst *s)
560{
561	static char lbuf[_POSIX2_LINE_MAX + 1];
562	int asize, hexlen, size;
563	u_char ref;
564	char c, *text, *op, *sp;
565	int more = 1, sawesc = 0;
566
567	c = *p++;			/* Terminator character */
568	if (c == '\0')
569		return (NULL);
570
571	s->maxbref = 0;
572	s->linenum = linenum;
573	asize = 2 * _POSIX2_LINE_MAX + 1;
574	if ((text = malloc(asize)) == NULL)
575		err(1, "malloc");
576	size = 0;
577	do {
578		op = sp = text + size;
579		for (; *p; p++) {
580			if (*p == '\\' || sawesc) {
581				/*
582				 * If this is a continuation from the last
583				 * buffer, we won't have a character to
584				 * skip over.
585				 */
586				if (sawesc)
587					sawesc = 0;
588				else
589					p++;
590
591				if (*p == '\0') {
592					/*
593					 * This escaped character is continued
594					 * in the next part of the line.  Note
595					 * this fact, then cause the loop to
596					 * exit w/ normal EOL case and reenter
597					 * above with the new buffer.
598					 */
599					sawesc = 1;
600					p--;
601					continue;
602				} else if (strchr("123456789", *p) != NULL) {
603					*sp++ = '\\';
604					ref = *p - '0';
605					if (s->re != NULL &&
606					    ref > s->re->re_nsub)
607						errx(1, "%lu: %s: \\%c not defined in the RE",
608								linenum, fname, *p);
609					if (s->maxbref < ref)
610						s->maxbref = ref;
611				} else {
612					switch (*p) {
613					case '&':
614					case '\\':
615						*sp++ = '\\';
616						break;
617					case 'n':
618						*p = '\n';
619						break;
620					case 'r':
621						*p = '\r';
622						break;
623					case 't':
624						*p = '\t';
625						break;
626					case 'x':
627#define	ADVANCE_N(s, n)					\
628	do {						\
629		char *adv = (s);			\
630		while (*(adv + (n) - 1) != '\0') {	\
631			*adv = *(adv + (n));		\
632			++adv;				\
633		}					\
634		*adv = '\0';				\
635	} while (0);
636						if (dohex(&p[1], p, &hexlen)) {
637							ADVANCE_N(p + 1,
638							    hexlen);
639						}
640						break;
641					}
642				}
643			} else if (*p == c) {
644				if (*++p == '\0' && more) {
645					if (cu_fgets(lbuf, sizeof(lbuf), &more))
646						p = lbuf;
647				}
648				*sp++ = '\0';
649				size += sp - op;
650				if ((s->new = realloc(text, size)) == NULL)
651					err(1, "realloc");
652				return (p);
653			} else if (*p == '\n') {
654				errx(1,
655"%lu: %s: unescaped newline inside substitute pattern", linenum, fname);
656				/* NOTREACHED */
657			}
658			*sp++ = *p;
659		}
660		size += sp - op;
661		if (asize - size < _POSIX2_LINE_MAX + 1) {
662			asize *= 2;
663			if ((text = realloc(text, asize)) == NULL)
664				err(1, "realloc");
665		}
666	} while (cu_fgets(p = lbuf, sizeof(lbuf), &more) != NULL);
667	errx(1, "%lu: %s: unterminated substitute in regular expression",
668			linenum, fname);
669	/* NOTREACHED */
670}
671
672/*
673 * Compile the flags of the s command
674 */
675static char *
676compile_flags(char *p, struct s_subst *s)
677{
678	int gn;			/* True if we have seen g or n */
679	unsigned long nval;
680	char wfile[_POSIX2_LINE_MAX + 1], *q, *eq;
681
682	s->n = 1;				/* Default */
683	s->p = 0;
684	s->wfile = NULL;
685	s->wfd = -1;
686	s->icase = 0;
687	for (gn = 0;;) {
688		EATSPACE();			/* EXTENSION */
689		switch (*p) {
690		case 'g':
691			if (gn)
692				errx(1,
693"%lu: %s: more than one number or 'g' in substitute flags", linenum, fname);
694			gn = 1;
695			s->n = 0;
696			break;
697		case '\0':
698		case '\n':
699		case ';':
700			return (p);
701		case 'p':
702			s->p = 1;
703			break;
704		case 'i':
705		case 'I':
706			s->icase = 1;
707			break;
708		case '1': case '2': case '3':
709		case '4': case '5': case '6':
710		case '7': case '8': case '9':
711			if (gn)
712				errx(1,
713"%lu: %s: more than one number or 'g' in substitute flags", linenum, fname);
714			gn = 1;
715			errno = 0;
716			nval = strtol(p, &p, 10);
717			if (errno == ERANGE || nval > INT_MAX)
718				errx(1,
719"%lu: %s: overflow in the 'N' substitute flag", linenum, fname);
720			s->n = nval;
721			p--;
722			break;
723		case 'w':
724			p++;
725#ifdef HISTORIC_PRACTICE
726			if (*p != ' ') {
727				warnx("%lu: %s: space missing before w wfile", linenum, fname);
728				return (p);
729			}
730#endif
731			EATSPACE();
732			q = wfile;
733			eq = wfile + sizeof(wfile) - 1;
734			while (*p) {
735				if (*p == '\n')
736					break;
737				if (q >= eq)
738					err(1, "wfile too long");
739				*q++ = *p++;
740			}
741			*q = '\0';
742			if (q == wfile)
743				errx(1, "%lu: %s: no wfile specified", linenum, fname);
744			s->wfile = strdup(wfile);
745			if (!aflag && (s->wfd = open(wfile,
746			    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
747			    DEFFILEMODE)) == -1)
748				err(1, "%s", wfile);
749			return (p);
750		default:
751			errx(1, "%lu: %s: bad flag in substitute command: '%c'",
752					linenum, fname, *p);
753			break;
754		}
755		p++;
756	}
757}
758
759/*
760 * Compile a translation set of strings into a lookup table.
761 */
762static char *
763compile_tr(char *p, struct s_tr **py)
764{
765	struct s_tr *y;
766	int i;
767	const char *op, *np;
768	char old[_POSIX2_LINE_MAX + 1];
769	char new[_POSIX2_LINE_MAX + 1];
770	size_t oclen, oldlen, nclen, newlen;
771	mbstate_t mbs1, mbs2;
772
773	if ((*py = y = malloc(sizeof(*y))) == NULL)
774		err(1, NULL);
775	y->multis = NULL;
776	y->nmultis = 0;
777
778	if (*p == '\0' || *p == '\\')
779		errx(1,
780	"%lu: %s: transform pattern can not be delimited by newline or backslash",
781			linenum, fname);
782	p = compile_delimited(p, old, 1);
783	if (p == NULL)
784		errx(1, "%lu: %s: unterminated transform source string",
785				linenum, fname);
786	p = compile_delimited(p - 1, new, 1);
787	if (p == NULL)
788		errx(1, "%lu: %s: unterminated transform target string",
789				linenum, fname);
790	EATSPACE();
791	op = old;
792	oldlen = mbsrtowcs(NULL, &op, 0, NULL);
793	if (oldlen == (size_t)-1)
794		err(1, NULL);
795	np = new;
796	newlen = mbsrtowcs(NULL, &np, 0, NULL);
797	if (newlen == (size_t)-1)
798		err(1, NULL);
799	if (newlen != oldlen)
800		errx(1, "%lu: %s: transform strings are not the same length",
801				linenum, fname);
802	if (MB_CUR_MAX == 1) {
803		/*
804		 * The single-byte encoding case is easy: generate a
805		 * lookup table.
806		 */
807		for (i = 0; i <= UCHAR_MAX; i++)
808			y->bytetab[i] = (char)i;
809		for (; *op; op++, np++)
810			y->bytetab[(u_char)*op] = *np;
811	} else {
812		/*
813		 * Multi-byte encoding case: generate a lookup table as
814		 * above, but only for single-byte characters. The first
815		 * bytes of multi-byte characters have their lookup table
816		 * entries set to 0, which causes do_tr() to search through
817		 * an auxiliary vector of multi-byte mappings.
818		 */
819		memset(&mbs1, 0, sizeof(mbs1));
820		memset(&mbs2, 0, sizeof(mbs2));
821		for (i = 0; i <= UCHAR_MAX; i++)
822			y->bytetab[i] = (btowc(i) != WEOF) ? i : 0;
823		while (*op != '\0') {
824			oclen = mbrlen(op, MB_LEN_MAX, &mbs1);
825			if (oclen == (size_t)-1 || oclen == (size_t)-2)
826				errc(1, EILSEQ, NULL);
827			nclen = mbrlen(np, MB_LEN_MAX, &mbs2);
828			if (nclen == (size_t)-1 || nclen == (size_t)-2)
829				errc(1, EILSEQ, NULL);
830			if (oclen == 1 && nclen == 1)
831				y->bytetab[(u_char)*op] = *np;
832			else {
833				y->bytetab[(u_char)*op] = 0;
834				y->multis = realloc(y->multis,
835				    (y->nmultis + 1) * sizeof(*y->multis));
836				if (y->multis == NULL)
837					err(1, NULL);
838				i = y->nmultis++;
839				y->multis[i].fromlen = oclen;
840				memcpy(y->multis[i].from, op, oclen);
841				y->multis[i].tolen = nclen;
842				memcpy(y->multis[i].to, np, nclen);
843			}
844			op += oclen;
845			np += nclen;
846		}
847	}
848	return (p);
849}
850
851/*
852 * Compile the text following an a, c, or i command.
853 */
854static char *
855compile_text(void)
856{
857	int asize, esc_nl, size;
858	char *text, *p, *op, *s;
859	char lbuf[_POSIX2_LINE_MAX + 1];
860
861	asize = 2 * _POSIX2_LINE_MAX + 1;
862	if ((text = malloc(asize)) == NULL)
863		err(1, "malloc");
864	size = 0;
865	while (cu_fgets(lbuf, sizeof(lbuf), NULL) != NULL) {
866		op = s = text + size;
867		p = lbuf;
868#ifdef LEGACY_BSDSED_COMPAT
869		EATSPACE();
870#endif
871		for (esc_nl = 0; *p != '\0'; p++) {
872			if (*p == '\\' && p[1] != '\0' && *++p == '\n')
873				esc_nl = 1;
874			*s++ = *p;
875		}
876		size += s - op;
877		if (!esc_nl) {
878			*s = '\0';
879			break;
880		}
881		if (asize - size < _POSIX2_LINE_MAX + 1) {
882			asize *= 2;
883			if ((text = realloc(text, asize)) == NULL)
884				err(1, "realloc");
885		}
886	}
887	text[size] = '\0';
888	if ((p = realloc(text, size + 1)) == NULL)
889		err(1, "realloc");
890	return (p);
891}
892
893/*
894 * Get an address and return a pointer to the first character after
895 * it.  Fill the structure pointed to according to the address.
896 */
897static char *
898compile_addr(char *p, struct s_addr *a)
899{
900	char *end, re[_POSIX2_LINE_MAX + 1];
901	int icase;
902
903	icase = 0;
904
905	a->type = 0;
906	switch (*p) {
907	case '\\':				/* Context address */
908		++p;
909		/* FALLTHROUGH */
910	case '/':				/* Context address */
911		p = compile_delimited(p, re, 0);
912		if (p == NULL)
913			errx(1, "%lu: %s: unterminated regular expression", linenum, fname);
914		/* Check for case insensitive regexp flag */
915		if (*p == 'I') {
916			icase = 1;
917			p++;
918		}
919		if (*re == '\0')
920			a->u.r = NULL;
921		else
922			a->u.r = compile_re(re, icase);
923		a->type = AT_RE;
924		return (p);
925
926	case '$':				/* Last line */
927		a->type = AT_LAST;
928		return (p + 1);
929
930	case '+':				/* Relative line number */
931		a->type = AT_RELLINE;
932		p++;
933		/* FALLTHROUGH */
934						/* Line number */
935	case '0': case '1': case '2': case '3': case '4':
936	case '5': case '6': case '7': case '8': case '9':
937		if (a->type == 0)
938			a->type = AT_LINE;
939		a->u.l = strtol(p, &end, 10);
940		return (end);
941	default:
942		errx(1, "%lu: %s: expected context address", linenum, fname);
943		return (NULL);
944	}
945}
946
947/*
948 * duptoeol --
949 *	Return a copy of all the characters up to \n or \0.
950 */
951static char *
952duptoeol(char *s, const char *ctype)
953{
954	size_t len;
955	int ws;
956	char *p, *start;
957
958	ws = 0;
959	for (start = s; *s != '\0' && *s != '\n'; ++s)
960		ws = isspace((unsigned char)*s);
961	*s = '\0';
962	if (ws)
963		warnx("%lu: %s: whitespace after %s", linenum, fname, ctype);
964	len = s - start + 1;
965	if ((p = malloc(len)) == NULL)
966		err(1, "malloc");
967	return (memmove(p, start, len));
968}
969
970/*
971 * Convert goto label names to addresses, and count a and r commands, in
972 * the given subset of the script.  Free the memory used by labels in b
973 * and t commands (but not by :).
974 *
975 * TODO: Remove } nodes
976 */
977static void
978fixuplabel(struct s_command *cp, struct s_command *end)
979{
980
981	for (; cp != end; cp = cp->next)
982		switch (cp->code) {
983		case 'a':
984		case 'r':
985			appendnum++;
986			break;
987		case 'b':
988		case 't':
989			/* Resolve branch target. */
990			if (cp->t == NULL) {
991				cp->u.c = NULL;
992				break;
993			}
994			if ((cp->u.c = findlabel(cp->t)) == NULL)
995				errx(1, "%lu: %s: undefined label '%s'", linenum, fname, cp->t);
996			free(cp->t);
997			break;
998		case '{':
999			/* Do interior commands. */
1000			fixuplabel(cp->u.c, cp->next);
1001			break;
1002		}
1003}
1004
1005/*
1006 * Associate the given command label for later lookup.
1007 */
1008static void
1009enterlabel(struct s_command *cp)
1010{
1011	struct labhash **lhp, *lh;
1012	u_char *p;
1013	u_int h, c;
1014
1015	for (h = 0, p = (u_char *)cp->t; (c = *p) != 0; p++)
1016		h = (h << 5) + h + c;
1017	lhp = &labels[h & LHMASK];
1018	for (lh = *lhp; lh != NULL; lh = lh->lh_next)
1019		if (lh->lh_hash == h && strcmp(cp->t, lh->lh_cmd->t) == 0)
1020			errx(1, "%lu: %s: duplicate label '%s'", linenum, fname, cp->t);
1021	if ((lh = malloc(sizeof *lh)) == NULL)
1022		err(1, "malloc");
1023	lh->lh_next = *lhp;
1024	lh->lh_hash = h;
1025	lh->lh_cmd = cp;
1026	lh->lh_ref = 0;
1027	*lhp = lh;
1028}
1029
1030/*
1031 * Find the label contained in the command l in the command linked
1032 * list cp.  L is excluded from the search.  Return NULL if not found.
1033 */
1034static struct s_command *
1035findlabel(char *name)
1036{
1037	struct labhash *lh;
1038	u_char *p;
1039	u_int h, c;
1040
1041	for (h = 0, p = (u_char *)name; (c = *p) != 0; p++)
1042		h = (h << 5) + h + c;
1043	for (lh = labels[h & LHMASK]; lh != NULL; lh = lh->lh_next) {
1044		if (lh->lh_hash == h && strcmp(name, lh->lh_cmd->t) == 0) {
1045			lh->lh_ref = 1;
1046			return (lh->lh_cmd);
1047		}
1048	}
1049	return (NULL);
1050}
1051
1052/*
1053 * Warn about any unused labels.  As a side effect, release the label hash
1054 * table space.
1055 */
1056static void
1057uselabel(void)
1058{
1059	struct labhash *lh, *next;
1060	int i;
1061
1062	for (i = 0; i < LHSZ; i++) {
1063		for (lh = labels[i]; lh != NULL; lh = next) {
1064			next = lh->lh_next;
1065			if (!lh->lh_ref)
1066				warnx("%lu: %s: unused label '%s'",
1067				    linenum, fname, lh->lh_cmd->t);
1068			free(lh);
1069		}
1070	}
1071}
1072