1/* $Id: read.c,v 1.220 2021/06/27 17:57:54 schwarze Exp $ */
2/*
3 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org>
4 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 *
19 * Top-level functions of the mandoc(3) parser:
20 * Parser and input encoding selection, decompression,
21 * handling of input bytes, characters, lines, and files,
22 * handling of roff(7) loops and file inclusion,
23 * and steering of the various parsers.
24 */
25#include "config.h"
26
27#include <sys/types.h>
28#include <sys/mman.h>
29#include <sys/stat.h>
30
31#include <assert.h>
32#include <ctype.h>
33#include <errno.h>
34#include <fcntl.h>
35#include <stdarg.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <unistd.h>
40#include <zlib.h>
41
42#include "mandoc_aux.h"
43#include "mandoc.h"
44#include "roff.h"
45#include "mdoc.h"
46#include "man.h"
47#include "mandoc_parse.h"
48#include "libmandoc.h"
49#include "roff_int.h"
50#include "tag.h"
51
/*
 * Upper bound for mparse.reparse_count: once more than this many
 * ROFF_REPARSE results have been seen since the counter was last
 * reset, mparse_buf_r() reports MANDOCERR_ROFFLOOP and gives up.
 */
#define	REPARSE_LIMIT	1000
53
/*
 * State of one parser instance, allocated by mparse_alloc()
 * and released by mparse_free().
 */
struct	mparse {
	struct roff	 *roff; /* roff parser (!NULL) */
	struct roff_man	 *man; /* man parser */
	struct buf	 *primary; /* buffer currently being parsed */
	struct buf	 *secondary; /* copy of top level input */
	struct buf	 *loop; /* open .while request line */
	const char	 *os_s; /* default operating system */
	int		  options; /* parser options */
	int		  gzip; /* current input file is gzipped */
	int		  filenc; /* encoding of the current file */
	int		  reparse_count; /* finite interp. stack */
	int		  line; /* line number in the file */
};
67
68static	void	  choose_parser(struct mparse *);
69static	void	  free_buf_list(struct buf *);
70static	void	  resize_buf(struct buf *, size_t);
71static	int	  mparse_buf_r(struct mparse *, struct buf, size_t, int);
72static	int	  read_whole_file(struct mparse *, int, struct buf *, int *);
73static	void	  mparse_end(struct mparse *);
74
75
76static void
77resize_buf(struct buf *buf, size_t initial)
78{
79
80	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
81	buf->buf = mandoc_realloc(buf->buf, buf->sz);
82}
83
84static void
85free_buf_list(struct buf *buf)
86{
87	struct buf *tmp;
88
89	while (buf != NULL) {
90		tmp = buf;
91		buf = tmp->next;
92		free(tmp->buf);
93		free(tmp);
94	}
95}
96
97static void
98choose_parser(struct mparse *curp)
99{
100	char		*cp, *ep;
101	int		 format;
102
103	/*
104	 * If neither command line arguments -mdoc or -man select
105	 * a parser nor the roff parser found a .Dd or .TH macro
106	 * yet, look ahead in the main input buffer.
107	 */
108
109	if ((format = roff_getformat(curp->roff)) == 0) {
110		cp = curp->primary->buf;
111		ep = cp + curp->primary->sz;
112		while (cp < ep) {
113			if (*cp == '.' || *cp == '\'') {
114				cp++;
115				if (cp[0] == 'D' && cp[1] == 'd') {
116					format = MPARSE_MDOC;
117					break;
118				}
119				if (cp[0] == 'T' && cp[1] == 'H') {
120					format = MPARSE_MAN;
121					break;
122				}
123			}
124			cp = memchr(cp, '\n', ep - cp);
125			if (cp == NULL)
126				break;
127			cp++;
128		}
129	}
130
131	if (format == MPARSE_MDOC) {
132		curp->man->meta.macroset = MACROSET_MDOC;
133		if (curp->man->mdocmac == NULL)
134			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
135	} else {
136		curp->man->meta.macroset = MACROSET_MAN;
137		if (curp->man->manmac == NULL)
138			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
139	}
140	curp->man->meta.first->tok = TOKEN_NONE;
141}
142
143/*
144 * Main parse routine for a buffer.
145 * It assumes encoding and line numbering are already set up.
146 * It can recurse directly (for invocations of user-defined
147 * macros, inline equations, and input line traps)
148 * and indirectly (for .so file inclusion).
149 */
150static int
151mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
152{
153	struct buf	 ln;
154	struct buf	*firstln, *lastln, *thisln, *loop;
155	char		*cp;
156	size_t		 pos; /* byte number in the ln buffer */
157	size_t		 spos; /* at the start of the current line parse */
158	int		 line_result, result;
159	int		 of;
160	int		 lnn; /* line number in the real file */
161	int		 fd;
162	int		 inloop; /* Saw .while on this level. */
163	unsigned char	 c;
164
165	ln.sz = 256;
166	ln.buf = mandoc_malloc(ln.sz);
167	ln.next = NULL;
168	firstln = lastln = loop = NULL;
169	lnn = curp->line;
170	pos = 0;
171	inloop = 0;
172	result = ROFF_CONT;
173
174	while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) {
175		if (start) {
176			curp->line = lnn;
177			curp->reparse_count = 0;
178
179			if (lnn < 3 &&
180			    curp->filenc & MPARSE_UTF8 &&
181			    curp->filenc & MPARSE_LATIN1)
182				curp->filenc = preconv_cue(&blk, i);
183		}
184		spos = pos;
185
186		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
187
188			/*
189			 * When finding an unescaped newline character,
190			 * leave the character loop to process the line.
191			 * Skip a preceding carriage return, if any.
192			 */
193
194			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
195			    '\n' == blk.buf[i + 1])
196				++i;
197			if ('\n' == blk.buf[i]) {
198				++i;
199				++lnn;
200				break;
201			}
202
203			/*
204			 * Make sure we have space for the worst
205			 * case of 12 bytes: "\\[u10ffff]\n\0"
206			 */
207
208			if (pos + 12 > ln.sz)
209				resize_buf(&ln, 256);
210
211			/*
212			 * Encode 8-bit input.
213			 */
214
215			c = blk.buf[i];
216			if (c & 0x80) {
217				if ( ! (curp->filenc && preconv_encode(
218				    &blk, &i, &ln, &pos, &curp->filenc))) {
219					mandoc_msg(MANDOCERR_CHAR_BAD,
220					    curp->line, pos, "0x%x", c);
221					ln.buf[pos++] = '?';
222					i++;
223				}
224				continue;
225			}
226
227			/*
228			 * Exclude control characters.
229			 */
230
231			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
232				mandoc_msg(c == 0x00 || c == 0x04 ||
233				    c > 0x0a ? MANDOCERR_CHAR_BAD :
234				    MANDOCERR_CHAR_UNSUPP,
235				    curp->line, pos, "0x%x", c);
236				i++;
237				if (c != '\r')
238					ln.buf[pos++] = '?';
239				continue;
240			}
241
242			ln.buf[pos++] = blk.buf[i++];
243		}
244		ln.buf[pos] = '\0';
245
246		/*
247		 * Maintain a lookaside buffer of all lines.
248		 * parsed from this input source.
249		 */
250
251		thisln = mandoc_malloc(sizeof(*thisln));
252		thisln->buf = mandoc_strdup(ln.buf);
253		thisln->sz = strlen(ln.buf) + 1;
254		thisln->next = NULL;
255		if (firstln == NULL) {
256			firstln = lastln = thisln;
257			if (curp->secondary == NULL)
258				curp->secondary = firstln;
259		} else {
260			lastln->next = thisln;
261			lastln = thisln;
262		}
263
264		/* XXX Ugly hack to mark the end of the input. */
265
266		if (i == blk.sz || blk.buf[i] == '\0') {
267			if (pos + 2 > ln.sz)
268				resize_buf(&ln, 256);
269			ln.buf[pos++] = '\n';
270			ln.buf[pos] = '\0';
271		}
272
273		/*
274		 * A significant amount of complexity is contained by
275		 * the roff preprocessor.  It's line-oriented but can be
276		 * expressed on one line, so we need at times to
277		 * readjust our starting point and re-run it.  The roff
278		 * preprocessor can also readjust the buffers with new
279		 * data, so we pass them in wholesale.
280		 */
281
282		of = 0;
283rerun:
284		line_result = roff_parseln(curp->roff, curp->line,
285		    &ln, &of, start && spos == 0 ? pos : 0);
286
287		/* Process options. */
288
289		if (line_result & ROFF_APPEND)
290			assert(line_result == (ROFF_IGN | ROFF_APPEND));
291
292		if (line_result & ROFF_USERCALL)
293			assert((line_result & ROFF_MASK) == ROFF_REPARSE);
294
295		if (line_result & ROFF_USERRET) {
296			assert(line_result == (ROFF_IGN | ROFF_USERRET));
297			if (start == 0) {
298				/* Return from the current macro. */
299				result = ROFF_USERRET;
300				goto out;
301			}
302		}
303
304		switch (line_result & ROFF_LOOPMASK) {
305		case ROFF_IGN:
306			break;
307		case ROFF_WHILE:
308			if (curp->loop != NULL) {
309				if (loop == curp->loop)
310					break;
311				mandoc_msg(MANDOCERR_WHILE_NEST,
312				    curp->line, pos, NULL);
313			}
314			curp->loop = thisln;
315			loop = NULL;
316			inloop = 1;
317			break;
318		case ROFF_LOOPCONT:
319		case ROFF_LOOPEXIT:
320			if (curp->loop == NULL) {
321				mandoc_msg(MANDOCERR_WHILE_FAIL,
322				    curp->line, pos, NULL);
323				break;
324			}
325			if (inloop == 0) {
326				mandoc_msg(MANDOCERR_WHILE_INTO,
327				    curp->line, pos, NULL);
328				curp->loop = loop = NULL;
329				break;
330			}
331			if (line_result & ROFF_LOOPCONT)
332				loop = curp->loop;
333			else {
334				curp->loop = loop = NULL;
335				inloop = 0;
336			}
337			break;
338		default:
339			abort();
340		}
341
342		/* Process the main instruction from the roff parser. */
343
344		switch (line_result & ROFF_MASK) {
345		case ROFF_IGN:
346			break;
347		case ROFF_CONT:
348			if (curp->man->meta.macroset == MACROSET_NONE)
349				choose_parser(curp);
350			if ((curp->man->meta.macroset == MACROSET_MDOC ?
351			     mdoc_parseln(curp->man, curp->line, ln.buf, of) :
352			     man_parseln(curp->man, curp->line, ln.buf, of)
353			    ) == 2)
354				goto out;
355			break;
356		case ROFF_RERUN:
357			goto rerun;
358		case ROFF_REPARSE:
359			if (++curp->reparse_count > REPARSE_LIMIT) {
360				/* Abort and return to the top level. */
361				result = ROFF_IGN;
362				mandoc_msg(MANDOCERR_ROFFLOOP,
363				    curp->line, pos, NULL);
364				goto out;
365			}
366			result = mparse_buf_r(curp, ln, of, 0);
367			if (line_result & ROFF_USERCALL) {
368				roff_userret(curp->roff);
369				/* Continue normally. */
370				if (result & ROFF_USERRET)
371					result = ROFF_CONT;
372			}
373			if (start == 0 && result != ROFF_CONT)
374				goto out;
375			break;
376		case ROFF_SO:
377			if ( ! (curp->options & MPARSE_SO) &&
378			    (i >= blk.sz || blk.buf[i] == '\0')) {
379				curp->man->meta.sodest =
380				    mandoc_strdup(ln.buf + of);
381				goto out;
382			}
383			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
384				mparse_readfd(curp, fd, ln.buf + of);
385				close(fd);
386			} else {
387				mandoc_msg(MANDOCERR_SO_FAIL,
388				    curp->line, of, ".so %s: %s",
389				    ln.buf + of, strerror(errno));
390				ln.sz = mandoc_asprintf(&cp,
391				    ".sp\nSee the file %s.\n.sp",
392				    ln.buf + of);
393				free(ln.buf);
394				ln.buf = cp;
395				of = 0;
396				mparse_buf_r(curp, ln, of, 0);
397			}
398			break;
399		default:
400			abort();
401		}
402
403		/* Start the next input line. */
404
405		if (loop != NULL &&
406		    (line_result & ROFF_LOOPMASK) == ROFF_IGN)
407			loop = loop->next;
408
409		if (loop != NULL) {
410			if ((line_result & ROFF_APPEND) == 0)
411				*ln.buf = '\0';
412			if (ln.sz < loop->sz)
413				resize_buf(&ln, loop->sz);
414			(void)strlcat(ln.buf, loop->buf, ln.sz);
415			of = 0;
416			goto rerun;
417		}
418
419		pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0;
420	}
421out:
422	if (inloop) {
423		if (result != ROFF_USERRET)
424			mandoc_msg(MANDOCERR_WHILE_OUTOF,
425			    curp->line, pos, NULL);
426		curp->loop = NULL;
427	}
428	free(ln.buf);
429	if (firstln != curp->secondary)
430		free_buf_list(firstln);
431	return result;
432}
433
434static int
435read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap)
436{
437	struct stat	 st;
438	gzFile		 gz;
439	size_t		 off;
440	ssize_t		 ssz;
441	int		 gzerrnum, retval;
442
443	if (fstat(fd, &st) == -1) {
444		mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno));
445		return -1;
446	}
447
448	/*
449	 * If we're a regular file, try just reading in the whole entry
450	 * via mmap().  This is faster than reading it into blocks, and
451	 * since each file is only a few bytes to begin with, I'm not
452	 * concerned that this is going to tank any machines.
453	 */
454
455	if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
456		if (st.st_size > 0x7fffffff) {
457			mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
458			return -1;
459		}
460		*with_mmap = 1;
461		fb->sz = (size_t)st.st_size;
462		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
463		if (fb->buf != MAP_FAILED)
464			return 0;
465	}
466
467	if (curp->gzip) {
468		/*
469		 * Duplicating the file descriptor is required
470		 * because we will have to call gzclose(3)
471		 * to free memory used internally by zlib,
472		 * but that will also close the file descriptor,
473		 * which this function must not do.
474		 */
475		if ((fd = dup(fd)) == -1) {
476			mandoc_msg(MANDOCERR_DUP, 0, 0,
477			    "%s", strerror(errno));
478			return -1;
479		}
480		if ((gz = gzdopen(fd, "rb")) == NULL) {
481			mandoc_msg(MANDOCERR_GZDOPEN, 0, 0,
482			    "%s", strerror(errno));
483			close(fd);
484			return -1;
485		}
486	} else
487		gz = NULL;
488
489	/*
490	 * If this isn't a regular file (like, say, stdin), then we must
491	 * go the old way and just read things in bit by bit.
492	 */
493
494	*with_mmap = 0;
495	off = 0;
496	retval = -1;
497	fb->sz = 0;
498	fb->buf = NULL;
499	for (;;) {
500		if (off == fb->sz) {
501			if (fb->sz == (1U << 31)) {
502				mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
503				break;
504			}
505			resize_buf(fb, 65536);
506		}
507		ssz = curp->gzip ?
508		    gzread(gz, fb->buf + (int)off, fb->sz - off) :
509		    read(fd, fb->buf + (int)off, fb->sz - off);
510		if (ssz == 0) {
511			fb->sz = off;
512			retval = 0;
513			break;
514		}
515		if (ssz == -1) {
516			if (curp->gzip)
517				(void)gzerror(gz, &gzerrnum);
518			mandoc_msg(MANDOCERR_READ, 0, 0, "%s",
519			    curp->gzip && gzerrnum != Z_ERRNO ?
520			    zError(gzerrnum) : strerror(errno));
521			break;
522		}
523		off += (size_t)ssz;
524	}
525
526	if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK)
527		mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s",
528		    gzerrnum == Z_ERRNO ? strerror(errno) :
529		    zError(gzerrnum));
530	if (retval == -1) {
531		free(fb->buf);
532		fb->buf = NULL;
533	}
534	return retval;
535}
536
537static void
538mparse_end(struct mparse *curp)
539{
540	if (curp->man->meta.macroset == MACROSET_NONE)
541		curp->man->meta.macroset = MACROSET_MAN;
542	if (curp->man->meta.macroset == MACROSET_MDOC)
543		mdoc_endparse(curp->man);
544	else
545		man_endparse(curp->man);
546	roff_endparse(curp->roff);
547}
548
549/*
550 * Read the whole file into memory and call the parsers.
551 * Called recursively when an .so request is encountered.
552 */
553void
554mparse_readfd(struct mparse *curp, int fd, const char *filename)
555{
556	static int	 recursion_depth;
557
558	struct buf	 blk;
559	struct buf	*save_primary;
560	const char	*save_filename, *cp;
561	size_t		 offset;
562	int		 save_filenc, save_lineno;
563	int		 with_mmap;
564
565	if (recursion_depth > 64) {
566		mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL);
567		return;
568	} else if (recursion_depth == 0 &&
569	    (cp = strrchr(filename, '.')) != NULL &&
570            cp[1] >= '1' && cp[1] <= '9')
571                curp->man->filesec = cp[1];
572        else
573                curp->man->filesec = '\0';
574
575	if (read_whole_file(curp, fd, &blk, &with_mmap) == -1)
576		return;
577
578	/*
579	 * Save some properties of the parent file.
580	 */
581
582	save_primary = curp->primary;
583	save_filenc = curp->filenc;
584	save_lineno = curp->line;
585	save_filename = mandoc_msg_getinfilename();
586
587	curp->primary = &blk;
588	curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1);
589	curp->line = 1;
590	mandoc_msg_setinfilename(filename);
591
592	/* Skip an UTF-8 byte order mark. */
593	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
594	    (unsigned char)blk.buf[0] == 0xef &&
595	    (unsigned char)blk.buf[1] == 0xbb &&
596	    (unsigned char)blk.buf[2] == 0xbf) {
597		offset = 3;
598		curp->filenc &= ~MPARSE_LATIN1;
599	} else
600		offset = 0;
601
602	recursion_depth++;
603	mparse_buf_r(curp, blk, offset, 1);
604	if (--recursion_depth == 0)
605		mparse_end(curp);
606
607	/*
608	 * Clean up and restore saved parent properties.
609	 */
610
611	if (with_mmap)
612		munmap(blk.buf, blk.sz);
613	else
614		free(blk.buf);
615
616	curp->primary = save_primary;
617	curp->filenc = save_filenc;
618	curp->line = save_lineno;
619	if (save_filename != NULL)
620		mandoc_msg_setinfilename(save_filename);
621}
622
623int
624mparse_open(struct mparse *curp, const char *file)
625{
626	char		 *cp;
627	int		  fd, save_errno;
628
629	cp = strrchr(file, '.');
630	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
631
632	/* First try to use the filename as it is. */
633
634	if ((fd = open(file, O_RDONLY)) != -1)
635		return fd;
636
637	/*
638	 * If that doesn't work and the filename doesn't
639	 * already  end in .gz, try appending .gz.
640	 */
641
642	if ( ! curp->gzip) {
643		save_errno = errno;
644		mandoc_asprintf(&cp, "%s.gz", file);
645		fd = open(cp, O_RDONLY);
646		free(cp);
647		errno = save_errno;
648		if (fd != -1) {
649			curp->gzip = 1;
650			return fd;
651		}
652	}
653
654	/* Neither worked, give up. */
655
656	return -1;
657}
658
659struct mparse *
660mparse_alloc(int options, enum mandoc_os os_e, const char *os_s)
661{
662	struct mparse	*curp;
663
664	curp = mandoc_calloc(1, sizeof(struct mparse));
665
666	curp->options = options;
667	curp->os_s = os_s;
668
669	curp->roff = roff_alloc(options);
670	curp->man = roff_man_alloc(curp->roff, curp->os_s,
671		curp->options & MPARSE_QUICK ? 1 : 0);
672	if (curp->options & MPARSE_MDOC) {
673		curp->man->meta.macroset = MACROSET_MDOC;
674		if (curp->man->mdocmac == NULL)
675			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
676	} else if (curp->options & MPARSE_MAN) {
677		curp->man->meta.macroset = MACROSET_MAN;
678		if (curp->man->manmac == NULL)
679			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
680	}
681	curp->man->meta.first->tok = TOKEN_NONE;
682	curp->man->meta.os_e = os_e;
683	tag_alloc();
684	return curp;
685}
686
687void
688mparse_reset(struct mparse *curp)
689{
690	tag_free();
691	roff_reset(curp->roff);
692	roff_man_reset(curp->man);
693	free_buf_list(curp->secondary);
694	curp->secondary = NULL;
695	curp->gzip = 0;
696	tag_alloc();
697}
698
699void
700mparse_free(struct mparse *curp)
701{
702	tag_free();
703	roffhash_free(curp->man->mdocmac);
704	roffhash_free(curp->man->manmac);
705	roff_man_free(curp->man);
706	roff_free(curp->roff);
707	free_buf_list(curp->secondary);
708	free(curp);
709}
710
711struct roff_meta *
712mparse_result(struct mparse *curp)
713{
714	roff_state_reset(curp->man);
715	if (curp->options & MPARSE_VALIDATE) {
716		if (curp->man->meta.macroset == MACROSET_MDOC)
717			mdoc_validate(curp->man);
718		else
719			man_validate(curp->man);
720		tag_postprocess(curp->man, curp->man->meta.first);
721	}
722	return &curp->man->meta;
723}
724
725void
726mparse_copy(const struct mparse *p)
727{
728	struct buf	*buf;
729
730	for (buf = p->secondary; buf != NULL; buf = buf->next)
731		puts(buf->buf);
732}
733