/* mdoc.c revision 322249 */
/*	$Id: mdoc.c,v 1.267 2017/06/17 13:06:16 schwarze Exp $ */
/*
 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2010, 2012-2017 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18#include "config.h"
19
20#include <sys/types.h>
21
22#include <assert.h>
23#include <ctype.h>
24#include <stdarg.h>
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <time.h>
29
30#include "mandoc_aux.h"
31#include "mandoc.h"
32#include "roff.h"
33#include "mdoc.h"
34#include "libmandoc.h"
35#include "roff_int.h"
36#include "libmdoc.h"
37
38const	char *const __mdoc_argnames[MDOC_ARG_MAX] = {
39	"split",		"nosplit",		"ragged",
40	"unfilled",		"literal",		"file",
41	"offset",		"bullet",		"dash",
42	"hyphen",		"item",			"enum",
43	"tag",			"diag",			"hang",
44	"ohang",		"inset",		"column",
45	"width",		"compact",		"std",
46	"filled",		"words",		"emphasis",
47	"symbolic",		"nested",		"centered"
48};
49const	char * const *mdoc_argnames = __mdoc_argnames;
50
51static	int		  mdoc_ptext(struct roff_man *, int, char *, int);
52static	int		  mdoc_pmacro(struct roff_man *, int, char *, int);
53
54
55/*
56 * Main parse routine.  Parses a single line -- really just hands off to
57 * the macro (mdoc_pmacro()) or text parser (mdoc_ptext()).
58 */
59int
60mdoc_parseln(struct roff_man *mdoc, int ln, char *buf, int offs)
61{
62
63	if (mdoc->last->type != ROFFT_EQN || ln > mdoc->last->line)
64		mdoc->flags |= MDOC_NEWLINE;
65
66	/*
67	 * Let the roff nS register switch SYNOPSIS mode early,
68	 * such that the parser knows at all times
69	 * whether this mode is on or off.
70	 * Note that this mode is also switched by the Sh macro.
71	 */
72	if (roff_getreg(mdoc->roff, "nS"))
73		mdoc->flags |= MDOC_SYNOPSIS;
74	else
75		mdoc->flags &= ~MDOC_SYNOPSIS;
76
77	return roff_getcontrol(mdoc->roff, buf, &offs) ?
78	    mdoc_pmacro(mdoc, ln, buf, offs) :
79	    mdoc_ptext(mdoc, ln, buf, offs);
80}
81
82void
83mdoc_macro(MACRO_PROT_ARGS)
84{
85	assert(tok >= MDOC_Dd && tok < MDOC_MAX);
86	(*mdoc_macros[tok].fp)(mdoc, tok, line, ppos, pos, buf);
87}
88
89void
90mdoc_tail_alloc(struct roff_man *mdoc, int line, int pos, enum roff_tok tok)
91{
92	struct roff_node *p;
93
94	p = roff_node_alloc(mdoc, line, pos, ROFFT_TAIL, tok);
95	roff_node_append(mdoc, p);
96	mdoc->next = ROFF_NEXT_CHILD;
97}
98
99struct roff_node *
100mdoc_endbody_alloc(struct roff_man *mdoc, int line, int pos,
101    enum roff_tok tok, struct roff_node *body)
102{
103	struct roff_node *p;
104
105	body->flags |= NODE_ENDED;
106	body->parent->flags |= NODE_ENDED;
107	p = roff_node_alloc(mdoc, line, pos, ROFFT_BODY, tok);
108	p->body = body;
109	p->norm = body->norm;
110	p->end = ENDBODY_SPACE;
111	roff_node_append(mdoc, p);
112	mdoc->next = ROFF_NEXT_SIBLING;
113	return p;
114}
115
116struct roff_node *
117mdoc_block_alloc(struct roff_man *mdoc, int line, int pos,
118    enum roff_tok tok, struct mdoc_arg *args)
119{
120	struct roff_node *p;
121
122	p = roff_node_alloc(mdoc, line, pos, ROFFT_BLOCK, tok);
123	p->args = args;
124	if (p->args)
125		(args->refcnt)++;
126
127	switch (tok) {
128	case MDOC_Bd:
129	case MDOC_Bf:
130	case MDOC_Bl:
131	case MDOC_En:
132	case MDOC_Rs:
133		p->norm = mandoc_calloc(1, sizeof(union mdoc_data));
134		break;
135	default:
136		break;
137	}
138	roff_node_append(mdoc, p);
139	mdoc->next = ROFF_NEXT_CHILD;
140	return p;
141}
142
143void
144mdoc_elem_alloc(struct roff_man *mdoc, int line, int pos,
145     enum roff_tok tok, struct mdoc_arg *args)
146{
147	struct roff_node *p;
148
149	p = roff_node_alloc(mdoc, line, pos, ROFFT_ELEM, tok);
150	p->args = args;
151	if (p->args)
152		(args->refcnt)++;
153
154	switch (tok) {
155	case MDOC_An:
156		p->norm = mandoc_calloc(1, sizeof(union mdoc_data));
157		break;
158	default:
159		break;
160	}
161	roff_node_append(mdoc, p);
162	mdoc->next = ROFF_NEXT_CHILD;
163}
164
165void
166mdoc_node_relink(struct roff_man *mdoc, struct roff_node *p)
167{
168
169	roff_node_unlink(mdoc, p);
170	p->prev = p->next = NULL;
171	roff_node_append(mdoc, p);
172}
173
174/*
175 * Parse free-form text, that is, a line that does not begin with the
176 * control character.
177 */
178static int
179mdoc_ptext(struct roff_man *mdoc, int line, char *buf, int offs)
180{
181	struct roff_node *n;
182	const char	 *cp, *sp;
183	char		 *c, *ws, *end;
184
185	n = mdoc->last;
186
187	/*
188	 * If a column list contains plain text, assume an implicit item
189	 * macro.  This can happen one or more times at the beginning
190	 * of such a list, intermixed with non-It mdoc macros and with
191	 * nodes generated on the roff level, for example by tbl.
192	 */
193
194	if ((n->tok == MDOC_Bl && n->type == ROFFT_BODY &&
195	     n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) ||
196	    (n->parent != NULL && n->parent->tok == MDOC_Bl &&
197	     n->parent->norm->Bl.type == LIST_column)) {
198		mdoc->flags |= MDOC_FREECOL;
199		mdoc_macro(mdoc, MDOC_It, line, offs, &offs, buf);
200		return 1;
201	}
202
203	/*
204	 * Search for the beginning of unescaped trailing whitespace (ws)
205	 * and for the first character not to be output (end).
206	 */
207
208	/* FIXME: replace with strcspn(). */
209	ws = NULL;
210	for (c = end = buf + offs; *c; c++) {
211		switch (*c) {
212		case ' ':
213			if (NULL == ws)
214				ws = c;
215			continue;
216		case '\t':
217			/*
218			 * Always warn about trailing tabs,
219			 * even outside literal context,
220			 * where they should be put on the next line.
221			 */
222			if (NULL == ws)
223				ws = c;
224			/*
225			 * Strip trailing tabs in literal context only;
226			 * outside, they affect the next line.
227			 */
228			if (MDOC_LITERAL & mdoc->flags)
229				continue;
230			break;
231		case '\\':
232			/* Skip the escaped character, too, if any. */
233			if (c[1])
234				c++;
235			/* FALLTHROUGH */
236		default:
237			ws = NULL;
238			break;
239		}
240		end = c + 1;
241	}
242	*end = '\0';
243
244	if (ws)
245		mandoc_msg(MANDOCERR_SPACE_EOL, mdoc->parse,
246		    line, (int)(ws-buf), NULL);
247
248	/*
249	 * Blank lines are allowed in no-fill mode
250	 * and cancel preceding \c,
251	 * but add a single vertical space elsewhere.
252	 */
253
254	if (buf[offs] == '\0' && ! (mdoc->flags & MDOC_LITERAL)) {
255		switch (mdoc->last->type) {
256		case ROFFT_TEXT:
257			sp = mdoc->last->string;
258			cp = end = strchr(sp, '\0') - 2;
259			if (cp < sp || cp[0] != '\\' || cp[1] != 'c')
260				break;
261			while (cp > sp && cp[-1] == '\\')
262				cp--;
263			if ((end - cp) % 2)
264				break;
265			*end = '\0';
266			return 1;
267		default:
268			break;
269		}
270		mandoc_msg(MANDOCERR_FI_BLANK, mdoc->parse,
271		    line, (int)(c - buf), NULL);
272		roff_elem_alloc(mdoc, line, offs, ROFF_sp);
273		mdoc->last->flags |= NODE_VALID | NODE_ENDED;
274		mdoc->next = ROFF_NEXT_SIBLING;
275		return 1;
276	}
277
278	roff_word_alloc(mdoc, line, offs, buf+offs);
279
280	if (mdoc->flags & MDOC_LITERAL)
281		return 1;
282
283	/*
284	 * End-of-sentence check.  If the last character is an unescaped
285	 * EOS character, then flag the node as being the end of a
286	 * sentence.  The front-end will know how to interpret this.
287	 */
288
289	assert(buf < end);
290
291	if (mandoc_eos(buf+offs, (size_t)(end-buf-offs)))
292		mdoc->last->flags |= NODE_EOS;
293
294	for (c = buf + offs; c != NULL; c = strchr(c + 1, '.')) {
295		if (c - buf < offs + 2)
296			continue;
297		if (end - c < 3)
298			break;
299		if (c[1] != ' ' ||
300		    isalpha((unsigned char)c[-2]) == 0 ||
301		    isalpha((unsigned char)c[-1]) == 0 ||
302		    (c[-2] == 'n' && c[-1] == 'c') ||
303		    (c[-2] == 'v' && c[-1] == 's'))
304			continue;
305		c += 2;
306		if (*c == ' ')
307			c++;
308		if (*c == ' ')
309			c++;
310		if (isupper((unsigned char)(*c)))
311			mandoc_msg(MANDOCERR_EOS, mdoc->parse,
312			    line, (int)(c - buf), NULL);
313	}
314
315	return 1;
316}
317
318/*
319 * Parse a macro line, that is, a line beginning with the control
320 * character.
321 */
322static int
323mdoc_pmacro(struct roff_man *mdoc, int ln, char *buf, int offs)
324{
325	struct roff_node *n;
326	const char	 *cp;
327	size_t		  sz;
328	enum roff_tok	  tok;
329	int		  sv;
330
331	/* Determine the line macro. */
332
333	sv = offs;
334	tok = TOKEN_NONE;
335	for (sz = 0; sz < 4 && strchr(" \t\\", buf[offs]) == NULL; sz++)
336		offs++;
337	if (sz == 2 || sz == 3)
338		tok = roffhash_find(mdoc->mdocmac, buf + sv, sz);
339	if (tok == TOKEN_NONE) {
340		mandoc_msg(MANDOCERR_MACRO, mdoc->parse,
341		    ln, sv, buf + sv - 1);
342		return 1;
343	}
344
345	/* Skip a leading escape sequence or tab. */
346
347	switch (buf[offs]) {
348	case '\\':
349		cp = buf + offs + 1;
350		mandoc_escape(&cp, NULL, NULL);
351		offs = cp - buf;
352		break;
353	case '\t':
354		offs++;
355		break;
356	default:
357		break;
358	}
359
360	/* Jump to the next non-whitespace word. */
361
362	while (buf[offs] == ' ')
363		offs++;
364
365	/*
366	 * Trailing whitespace.  Note that tabs are allowed to be passed
367	 * into the parser as "text", so we only warn about spaces here.
368	 */
369
370	if ('\0' == buf[offs] && ' ' == buf[offs - 1])
371		mandoc_msg(MANDOCERR_SPACE_EOL, mdoc->parse,
372		    ln, offs - 1, NULL);
373
374	/*
375	 * If an initial macro or a list invocation, divert directly
376	 * into macro processing.
377	 */
378
379	n = mdoc->last;
380	if (n == NULL || tok == MDOC_It || tok == MDOC_El) {
381		mdoc_macro(mdoc, tok, ln, sv, &offs, buf);
382		return 1;
383	}
384
385	/*
386	 * If a column list contains a non-It macro, assume an implicit
387	 * item macro.  This can happen one or more times at the
388	 * beginning of such a list, intermixed with text lines and
389	 * with nodes generated on the roff level, for example by tbl.
390	 */
391
392	if ((n->tok == MDOC_Bl && n->type == ROFFT_BODY &&
393	     n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) ||
394	    (n->parent != NULL && n->parent->tok == MDOC_Bl &&
395	     n->parent->norm->Bl.type == LIST_column)) {
396		mdoc->flags |= MDOC_FREECOL;
397		mdoc_macro(mdoc, MDOC_It, ln, sv, &sv, buf);
398		return 1;
399	}
400
401	/* Normal processing of a macro. */
402
403	mdoc_macro(mdoc, tok, ln, sv, &offs, buf);
404
405	/* In quick mode (for mandocdb), abort after the NAME section. */
406
407	if (mdoc->quick && MDOC_Sh == tok &&
408	    SEC_NAME != mdoc->last->sec)
409		return 2;
410
411	return 1;
412}
413
414enum mdelim
415mdoc_isdelim(const char *p)
416{
417
418	if ('\0' == p[0])
419		return DELIM_NONE;
420
421	if ('\0' == p[1])
422		switch (p[0]) {
423		case '(':
424		case '[':
425			return DELIM_OPEN;
426		case '|':
427			return DELIM_MIDDLE;
428		case '.':
429		case ',':
430		case ';':
431		case ':':
432		case '?':
433		case '!':
434		case ')':
435		case ']':
436			return DELIM_CLOSE;
437		default:
438			return DELIM_NONE;
439		}
440
441	if ('\\' != p[0])
442		return DELIM_NONE;
443
444	if (0 == strcmp(p + 1, "."))
445		return DELIM_CLOSE;
446	if (0 == strcmp(p + 1, "fR|\\fP"))
447		return DELIM_MIDDLE;
448
449	return DELIM_NONE;
450}
451
452void
453mdoc_validate(struct roff_man *mdoc)
454{
455
456	mdoc->last = mdoc->first;
457	mdoc_node_validate(mdoc);
458	mdoc_state_reset(mdoc);
459}
460