process.c revision 170608
/*-
 * Copyright (c) 1992 Diomidis Spinellis.
 * Copyright (c) 1992, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Diomidis Spinellis of Imperial College, University of London.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/usr.bin/sed/process.c 170608 2007-06-12 12:05:24Z yar $");
36
37#ifndef lint
38static const char sccsid[] = "@(#)process.c	8.6 (Berkeley) 4/20/94";
39#endif
40
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <sys/ioctl.h>
44#include <sys/uio.h>
45
46#include <ctype.h>
47#include <err.h>
48#include <errno.h>
49#include <fcntl.h>
50#include <limits.h>
51#include <regex.h>
52#include <stdio.h>
53#include <stdlib.h>
54#include <string.h>
55#include <unistd.h>
56#include <wchar.h>
57#include <wctype.h>
58
59#include "defs.h"
60#include "extern.h"
61
62static SPACE HS, PS, SS, YS;
63#define	pd		PS.deleted
64#define	ps		PS.space
65#define	psl		PS.len
66#define	hs		HS.space
67#define	hsl		HS.len
68
69static __inline int	 applies(struct s_command *);
70static void		 do_tr(struct s_tr *);
71static void		 flush_appends(void);
72static void		 lputs(char *, size_t);
73static __inline int	 regexec_e(regex_t *, const char *, int, int, size_t);
74static void		 regsub(SPACE *, char *, char *);
75static int		 substitute(struct s_command *);
76
77struct s_appends *appends;	/* Array of pointers to strings to append. */
78static int appendx;		/* Index into appends array. */
79int appendnum;			/* Size of appends array. */
80
81static int lastaddr;		/* Set by applies if last address of a range. */
82static int sdone;		/* If any substitutes since last line input. */
83				/* Iov structure for 'w' commands. */
84static regex_t *defpreg;
85size_t maxnsub;
86regmatch_t *match;
87
88#define OUT(s) { fwrite(s, sizeof(u_char), psl, outfile); fputc('\n', outfile); }
89
90void
91process(void)
92{
93	struct s_command *cp;
94	SPACE tspace;
95	size_t oldpsl = 0;
96	char *p;
97
98	p = NULL;
99
100	for (linenum = 0; mf_fgets(&PS, REPLACE);) {
101		pd = 0;
102top:
103		cp = prog;
104redirect:
105		while (cp != NULL) {
106			if (!applies(cp)) {
107				cp = cp->next;
108				continue;
109			}
110			switch (cp->code) {
111			case '{':
112				cp = cp->u.c;
113				goto redirect;
114			case 'a':
115				if (appendx >= appendnum)
116					if ((appends = realloc(appends,
117					    sizeof(struct s_appends) *
118					    (appendnum *= 2))) == NULL)
119						err(1, "realloc");
120				appends[appendx].type = AP_STRING;
121				appends[appendx].s = cp->t;
122				appends[appendx].len = strlen(cp->t);
123				appendx++;
124				break;
125			case 'b':
126				cp = cp->u.c;
127				goto redirect;
128			case 'c':
129				pd = 1;
130				psl = 0;
131				if (cp->a2 == NULL || lastaddr || lastline())
132					(void)fprintf(outfile, "%s", cp->t);
133				break;
134			case 'd':
135				pd = 1;
136				goto new;
137			case 'D':
138				if (pd)
139					goto new;
140				if (psl == 0 ||
141				    (p = memchr(ps, '\n', psl)) == NULL) {
142					pd = 1;
143					goto new;
144				} else {
145					psl -= (p + 1) - ps;
146					memmove(ps, p + 1, psl);
147					goto top;
148				}
149			case 'g':
150				cspace(&PS, hs, hsl, REPLACE);
151				break;
152			case 'G':
153				cspace(&PS, "\n", 1, APPEND);
154				cspace(&PS, hs, hsl, APPEND);
155				break;
156			case 'h':
157				cspace(&HS, ps, psl, REPLACE);
158				break;
159			case 'H':
160				cspace(&HS, "\n", 1, APPEND);
161				cspace(&HS, ps, psl, APPEND);
162				break;
163			case 'i':
164				(void)fprintf(outfile, "%s", cp->t);
165				break;
166			case 'l':
167				lputs(ps, psl);
168				break;
169			case 'n':
170				if (!nflag && !pd)
171					OUT(ps)
172				flush_appends();
173				if (!mf_fgets(&PS, REPLACE))
174					exit(0);
175				pd = 0;
176				break;
177			case 'N':
178				flush_appends();
179				cspace(&PS, "\n", 1, APPEND);
180				if (!mf_fgets(&PS, APPEND))
181					exit(0);
182				break;
183			case 'p':
184				if (pd)
185					break;
186				OUT(ps)
187				break;
188			case 'P':
189				if (pd)
190					break;
191				if ((p = memchr(ps, '\n', psl)) != NULL) {
192					oldpsl = psl;
193					psl = p - ps;
194				}
195				OUT(ps)
196				if (p != NULL)
197					psl = oldpsl;
198				break;
199			case 'q':
200				if (!nflag && !pd)
201					OUT(ps)
202				flush_appends();
203				exit(0);
204			case 'r':
205				if (appendx >= appendnum)
206					if ((appends = realloc(appends,
207					    sizeof(struct s_appends) *
208					    (appendnum *= 2))) == NULL)
209						err(1, "realloc");
210				appends[appendx].type = AP_FILE;
211				appends[appendx].s = cp->t;
212				appends[appendx].len = strlen(cp->t);
213				appendx++;
214				break;
215			case 's':
216				sdone |= substitute(cp);
217				break;
218			case 't':
219				if (sdone) {
220					sdone = 0;
221					cp = cp->u.c;
222					goto redirect;
223				}
224				break;
225			case 'w':
226				if (pd)
227					break;
228				if (cp->u.fd == -1 && (cp->u.fd = open(cp->t,
229				    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
230				    DEFFILEMODE)) == -1)
231					err(1, "%s", cp->t);
232				if (write(cp->u.fd, ps, psl) != psl ||
233				    write(cp->u.fd, "\n", 1) != 1)
234					err(1, "%s", cp->t);
235				break;
236			case 'x':
237				/*
238				 * If the hold space is null, make it empty
239				 * but not null.  Otherwise the pattern space
240				 * will become null after the swap, which is
241				 * an abnormal condition.
242				 */
243				if (hs == NULL)
244					cspace(&HS, "", 0, REPLACE);
245				tspace = PS;
246				PS = HS;
247				HS = tspace;
248				break;
249			case 'y':
250				if (pd || psl == 0)
251					break;
252				do_tr(cp->u.y);
253				break;
254			case ':':
255			case '}':
256				break;
257			case '=':
258				(void)fprintf(outfile, "%lu\n", linenum);
259			}
260			cp = cp->next;
261		} /* for all cp */
262
263new:		if (!nflag && !pd)
264			OUT(ps)
265		flush_appends();
266	} /* for all lines */
267}
268
269/*
270 * TRUE if the address passed matches the current program state
271 * (lastline, linenumber, ps).
272 */
273#define	MATCH(a)							\
274	((a)->type == AT_RE ? regexec_e((a)->u.r, ps, 0, 1, psl) :	\
275	    (a)->type == AT_LINE ? linenum == (a)->u.l : lastline())
276
277/*
278 * Return TRUE if the command applies to the current line.  Sets the inrange
279 * flag to process ranges.  Interprets the non-select (``!'') flag.
280 */
281static __inline int
282applies(struct s_command *cp)
283{
284	int r;
285
286	lastaddr = 0;
287	if (cp->a1 == NULL && cp->a2 == NULL)
288		r = 1;
289	else if (cp->a2)
290		if (cp->inrange) {
291			if (MATCH(cp->a2)) {
292				cp->inrange = 0;
293				lastaddr = 1;
294				r = 1;
295			} else if (cp->a2->type == AT_LINE &&
296				   linenum > cp->a2->u.l) {
297				/*
298				 * We missed the 2nd address due to a branch,
299				 * so just close the range and return false.
300				 */
301				cp->inrange = 0;
302				r = 0;
303			} else
304				r = 1;
305		} else if (MATCH(cp->a1)) {
306			/*
307			 * If the second address is a number less than or
308			 * equal to the line number first selected, only
309			 * one line shall be selected.
310			 *	-- POSIX 1003.2
311			 */
312			if (cp->a2->type == AT_LINE &&
313			    linenum >= cp->a2->u.l)
314				lastaddr = 1;
315			else
316				cp->inrange = 1;
317			r = 1;
318		} else
319			r = 0;
320	else
321		r = MATCH(cp->a1);
322	return (cp->nonsel ? ! r : r);
323}
324
325/*
326 * Reset the sed processor to its initial state.
327 */
328void
329resetstate(void)
330{
331	struct s_command *cp;
332
333	/*
334	 * Reset all inrange markers.
335	 */
336	for (cp = prog; cp; cp = cp->code == '{' ? cp->u.c : cp->next)
337		if (cp->a2)
338			cp->inrange = 0;
339
340	/*
341	 * Clear out the hold space.
342	 */
343	cspace(&HS, "", 0, REPLACE);
344}
345
346/*
347 * substitute --
348 *	Do substitutions in the pattern space.  Currently, we build a
349 *	copy of the new pattern space in the substitute space structure
350 *	and then swap them.
351 */
352static int
353substitute(struct s_command *cp)
354{
355	SPACE tspace;
356	regex_t *re;
357	regoff_t re_off, slen;
358	int lastempty, n;
359	char *s;
360
361	s = ps;
362	re = cp->u.s->re;
363	if (re == NULL) {
364		if (defpreg != NULL && cp->u.s->maxbref > defpreg->re_nsub) {
365			linenum = cp->u.s->linenum;
366			errx(1, "%lu: %s: \\%d not defined in the RE",
367					linenum, fname, cp->u.s->maxbref);
368		}
369	}
370	if (!regexec_e(re, s, 0, 0, psl))
371		return (0);
372
373	SS.len = 0;				/* Clean substitute space. */
374	slen = psl;
375	n = cp->u.s->n;
376	lastempty = 1;
377
378	switch (n) {
379	case 0:					/* Global */
380		do {
381			if (lastempty || match[0].rm_so != match[0].rm_eo) {
382				/* Locate start of replaced string. */
383				re_off = match[0].rm_so;
384				/* Copy leading retained string. */
385				cspace(&SS, s, re_off, APPEND);
386				/* Add in regular expression. */
387				regsub(&SS, s, cp->u.s->new);
388			}
389
390			/* Move past this match. */
391			if (match[0].rm_so != match[0].rm_eo) {
392				s += match[0].rm_eo;
393				slen -= match[0].rm_eo;
394				lastempty = 0;
395			} else {
396				if (match[0].rm_so < slen)
397					cspace(&SS, s + match[0].rm_so, 1,
398					    APPEND);
399				s += match[0].rm_so + 1;
400				slen -= match[0].rm_so + 1;
401				lastempty = 1;
402			}
403		} while (slen >= 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
404		/* Copy trailing retained string. */
405		if (slen > 0)
406			cspace(&SS, s, slen, APPEND);
407		break;
408	default:				/* Nth occurrence */
409		while (--n) {
410			if (match[0].rm_eo == match[0].rm_so)
411				match[0].rm_eo = match[0].rm_so + 1;
412			s += match[0].rm_eo;
413			slen -= match[0].rm_eo;
414			if (slen < 0)
415				return (0);
416			if (!regexec_e(re, s, REG_NOTBOL, 0, slen))
417				return (0);
418		}
419		/* FALLTHROUGH */
420	case 1:					/* 1st occurrence */
421		/* Locate start of replaced string. */
422		re_off = match[0].rm_so + (s - ps);
423		/* Copy leading retained string. */
424		cspace(&SS, ps, re_off, APPEND);
425		/* Add in regular expression. */
426		regsub(&SS, s, cp->u.s->new);
427		/* Copy trailing retained string. */
428		s += match[0].rm_eo;
429		slen -= match[0].rm_eo;
430		cspace(&SS, s, slen, APPEND);
431		break;
432	}
433
434	/*
435	 * Swap the substitute space and the pattern space, and make sure
436	 * that any leftover pointers into stdio memory get lost.
437	 */
438	tspace = PS;
439	PS = SS;
440	SS = tspace;
441	SS.space = SS.back;
442
443	/* Handle the 'p' flag. */
444	if (cp->u.s->p)
445		OUT(ps)
446
447	/* Handle the 'w' flag. */
448	if (cp->u.s->wfile && !pd) {
449		if (cp->u.s->wfd == -1 && (cp->u.s->wfd = open(cp->u.s->wfile,
450		    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, DEFFILEMODE)) == -1)
451			err(1, "%s", cp->u.s->wfile);
452		if (write(cp->u.s->wfd, ps, psl) != psl ||
453		    write(cp->u.s->wfd, "\n", 1) != 1)
454			err(1, "%s", cp->u.s->wfile);
455	}
456	return (1);
457}
458
459/*
460 * do_tr --
461 *	Perform translation ('y' command) in the pattern space.
462 */
463static void
464do_tr(struct s_tr *y)
465{
466	SPACE tmp;
467	char c, *p;
468	size_t clen, left;
469	int i;
470
471	if (MB_CUR_MAX == 1) {
472		/*
473		 * Single-byte encoding: perform in-place translation
474		 * of the pattern space.
475		 */
476		for (p = ps; p < &ps[psl]; p++)
477			*p = y->bytetab[(u_char)*p];
478	} else {
479		/*
480		 * Multi-byte encoding: perform translation into the
481		 * translation space, then swap the translation and
482		 * pattern spaces.
483		 */
484		/* Clean translation space. */
485		YS.len = 0;
486		for (p = ps, left = psl; left > 0; p += clen, left -= clen) {
487			if ((c = y->bytetab[(u_char)*p]) != '\0') {
488				cspace(&YS, &c, 1, APPEND);
489				clen = 1;
490				continue;
491			}
492			for (i = 0; i < y->nmultis; i++)
493				if (left >= y->multis[i].fromlen &&
494				    memcmp(p, y->multis[i].from,
495				    y->multis[i].fromlen) == 0)
496					break;
497			if (i < y->nmultis) {
498				cspace(&YS, y->multis[i].to,
499				    y->multis[i].tolen, APPEND);
500				clen = y->multis[i].fromlen;
501			} else {
502				cspace(&YS, p, 1, APPEND);
503				clen = 1;
504			}
505		}
506		/* Swap the translation space and the pattern space. */
507		tmp = PS;
508		PS = YS;
509		YS = tmp;
510		YS.space = YS.back;
511	}
512}
513
514/*
515 * Flush append requests.  Always called before reading a line,
516 * therefore it also resets the substitution done (sdone) flag.
517 */
518static void
519flush_appends(void)
520{
521	FILE *f;
522	int count, i;
523	char buf[8 * 1024];
524
525	for (i = 0; i < appendx; i++)
526		switch (appends[i].type) {
527		case AP_STRING:
528			fwrite(appends[i].s, sizeof(char), appends[i].len,
529			    outfile);
530			break;
531		case AP_FILE:
532			/*
533			 * Read files probably shouldn't be cached.  Since
534			 * it's not an error to read a non-existent file,
535			 * it's possible that another program is interacting
536			 * with the sed script through the filesystem.  It
537			 * would be truly bizarre, but possible.  It's probably
538			 * not that big a performance win, anyhow.
539			 */
540			if ((f = fopen(appends[i].s, "r")) == NULL)
541				break;
542			while ((count = fread(buf, sizeof(char), sizeof(buf), f)))
543				(void)fwrite(buf, sizeof(char), count, outfile);
544			(void)fclose(f);
545			break;
546		}
547	if (ferror(outfile))
548		errx(1, "%s: %s", outfname, strerror(errno ? errno : EIO));
549	appendx = sdone = 0;
550}
551
552static void
553lputs(char *s, size_t len)
554{
555	static const char escapes[] = "\\\a\b\f\r\t\v";
556	int c, col, width;
557	char *p;
558	struct winsize win;
559	static int termwidth = -1;
560	size_t clen, i;
561	wchar_t wc;
562	mbstate_t mbs;
563
564	if (outfile != stdout)
565		termwidth = 60;
566	if (termwidth == -1) {
567		if ((p = getenv("COLUMNS")) && *p != '\0')
568			termwidth = atoi(p);
569		else if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) == 0 &&
570		    win.ws_col > 0)
571			termwidth = win.ws_col;
572		else
573			termwidth = 60;
574	}
575
576	memset(&mbs, 0, sizeof(mbs));
577	col = 0;
578	while (len != 0) {
579		clen = mbrtowc(&wc, s, len, &mbs);
580		if (clen == 0)
581			clen = 1;
582		if (clen == (size_t)-1 || clen == (size_t)-2) {
583			wc = (unsigned char)*s;
584			clen = 1;
585			memset(&mbs, 0, sizeof(mbs));
586		}
587		if (wc == '\n') {
588			if (col + 1 >= termwidth)
589				fprintf(outfile, "\\\n");
590			fputc('$', outfile);
591			fputc('\n', outfile);
592			col = 0;
593		} else if (iswprint(wc)) {
594			width = wcwidth(wc);
595			if (col + width >= termwidth) {
596				fprintf(outfile, "\\\n");
597				col = 0;
598			}
599			fwrite(s, 1, clen, outfile);
600			col += width;
601		} else if (wc != L'\0' && (c = wctob(wc)) != EOF &&
602		    (p = strchr(escapes, c)) != NULL) {
603			if (col + 2 >= termwidth) {
604				fprintf(outfile, "\\\n");
605				col = 0;
606			}
607			fprintf(outfile, "\\%c", "\\abfrtv"[p - escapes]);
608			col += 2;
609		} else {
610			if (col + 4 * clen >= termwidth) {
611				fprintf(outfile, "\\\n");
612				col = 0;
613			}
614			for (i = 0; i < clen; i++)
615				fprintf(outfile, "\\%03o",
616				    (int)(unsigned char)s[i]);
617			col += 4 * clen;
618		}
619		s += clen;
620		len -= clen;
621	}
622	if (col + 1 >= termwidth)
623		fprintf(outfile, "\\\n");
624	(void)fputc('$', outfile);
625	(void)fputc('\n', outfile);
626	if (ferror(outfile))
627		errx(1, "%s: %s", outfname, strerror(errno ? errno : EIO));
628}
629
630static __inline int
631regexec_e(regex_t *preg, const char *string, int eflags, int nomatch,
632	size_t slen)
633{
634	int eval;
635
636	if (preg == NULL) {
637		if (defpreg == NULL)
638			errx(1, "first RE may not be empty");
639	} else
640		defpreg = preg;
641
642	/* Set anchors */
643	match[0].rm_so = 0;
644	match[0].rm_eo = slen;
645
646	eval = regexec(defpreg, string,
647	    nomatch ? 0 : maxnsub + 1, match, eflags | REG_STARTEND);
648	switch(eval) {
649	case 0:
650		return (1);
651	case REG_NOMATCH:
652		return (0);
653	}
654	errx(1, "RE error: %s", strregerror(eval, defpreg));
655	/* NOTREACHED */
656}
657
658/*
659 * regsub - perform substitutions after a regexp match
660 * Based on a routine by Henry Spencer
661 */
662static void
663regsub(SPACE *sp, char *string, char *src)
664{
665	int len, no;
666	char c, *dst;
667
668#define	NEEDSP(reqlen)							\
669	/* XXX What is the +1 for? */					\
670	if (sp->len + (reqlen) + 1 >= sp->blen) {			\
671		sp->blen += (reqlen) + 1024;				\
672		if ((sp->space = sp->back = realloc(sp->back, sp->blen)) \
673		    == NULL)						\
674			err(1, "realloc");				\
675		dst = sp->space + sp->len;				\
676	}
677
678	dst = sp->space + sp->len;
679	while ((c = *src++) != '\0') {
680		if (c == '&')
681			no = 0;
682		else if (c == '\\' && isdigit((unsigned char)*src))
683			no = *src++ - '0';
684		else
685			no = -1;
686		if (no < 0) {		/* Ordinary character. */
687			if (c == '\\' && (*src == '\\' || *src == '&'))
688				c = *src++;
689			NEEDSP(1);
690			*dst++ = c;
691			++sp->len;
692		} else if (match[no].rm_so != -1 && match[no].rm_eo != -1) {
693			len = match[no].rm_eo - match[no].rm_so;
694			NEEDSP(len);
695			memmove(dst, string + match[no].rm_so, len);
696			dst += len;
697			sp->len += len;
698		}
699	}
700	NEEDSP(1);
701	*dst = '\0';
702}
703
704/*
705 * cspace --
706 *	Concatenate space: append the source space to the destination space,
707 *	allocating new space as necessary.
708 */
709void
710cspace(SPACE *sp, const char *p, size_t len, enum e_spflag spflag)
711{
712	size_t tlen;
713
714	/* Make sure SPACE has enough memory and ramp up quickly. */
715	tlen = sp->len + len + 1;
716	if (tlen > sp->blen) {
717		sp->blen = tlen + 1024;
718		if ((sp->space = sp->back = realloc(sp->back, sp->blen)) ==
719		    NULL)
720			err(1, "realloc");
721	}
722
723	if (spflag == REPLACE)
724		sp->len = 0;
725
726	memmove(sp->space + sp->len, p, len);
727
728	sp->space[sp->len += len] = '\0';
729}
730
731/*
732 * Close all cached opened files and report any errors
733 */
734void
735cfclose(struct s_command *cp, struct s_command *end)
736{
737
738	for (; cp != end; cp = cp->next)
739		switch(cp->code) {
740		case 's':
741			if (cp->u.s->wfd != -1 && close(cp->u.s->wfd))
742				err(1, "%s", cp->u.s->wfile);
743			cp->u.s->wfd = -1;
744			break;
745		case 'w':
746			if (cp->u.fd != -1 && close(cp->u.fd))
747				err(1, "%s", cp->t);
748			cp->u.fd = -1;
749			break;
750		case '{':
751			cfclose(cp->u.c, cp->next);
752			break;
753		}
754}
755