1/*	$Id: mandoc.c,v 1.119 2021/08/10 12:55:03 schwarze Exp $ */
2/*
3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011-2015, 2017-2021 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18#include "config.h"
19
20#include <sys/types.h>
21
22#include <assert.h>
23#include <ctype.h>
24#include <errno.h>
25#include <limits.h>
26#include <stdlib.h>
27#include <stdio.h>
28#include <string.h>
29#include <time.h>
30
31#include "mandoc_aux.h"
32#include "mandoc.h"
33#include "roff.h"
34#include "libmandoc.h"
35#include "roff_int.h"
36
/* Forward declarations of the date conversion helpers defined below. */
static	int	 a2time(time_t *, const char *, const char *);
static	char	*time2a(time_t);
39
40
41enum mandoc_esc
42mandoc_font(const char *cp, int sz)
43{
44	switch (sz) {
45	case 0:
46		return ESCAPE_FONTPREV;
47	case 1:
48		switch (cp[0]) {
49		case 'B':
50		case '3':
51			return ESCAPE_FONTBOLD;
52		case 'I':
53		case '2':
54			return ESCAPE_FONTITALIC;
55		case 'P':
56			return ESCAPE_FONTPREV;
57		case 'R':
58		case '1':
59			return ESCAPE_FONTROMAN;
60		case '4':
61			return ESCAPE_FONTBI;
62		default:
63			return ESCAPE_ERROR;
64		}
65	case 2:
66		switch (cp[0]) {
67		case 'B':
68			switch (cp[1]) {
69			case 'I':
70				return ESCAPE_FONTBI;
71			default:
72				return ESCAPE_ERROR;
73			}
74		case 'C':
75			switch (cp[1]) {
76			case 'B':
77				return ESCAPE_FONTCB;
78			case 'I':
79				return ESCAPE_FONTCI;
80			case 'R':
81			case 'W':
82				return ESCAPE_FONTCR;
83			default:
84				return ESCAPE_ERROR;
85			}
86		default:
87			return ESCAPE_ERROR;
88		}
89	default:
90		return ESCAPE_ERROR;
91	}
92}
93
/*
 * Parse one escape sequence and classify it.
 *
 * On entry, *end points to the character right after the backslash
 * that introduced the escape sequence.  When the sequence is parsed
 * successfully, *end is left pointing after it, while *start and *sz
 * describe the argument substring of the sequence, if any.
 * Both start and sz may be NULL if the caller does not need them.
 *
 * Returns the class of the escape sequence, or ESCAPE_ERROR if the
 * sequence is malformed or incomplete.  In the error case, how far
 * *end has been advanced depends on where parsing stopped.
 */
enum mandoc_esc
mandoc_escape(const char **end, const char **start, int *sz)
{
	const char	*local_start;
	int		 local_sz, c, i;
	char		 term;
	enum mandoc_esc	 gly;

	/*
	 * When the caller doesn't provide return storage,
	 * use local storage.
	 */

	if (NULL == start)
		start = &local_start;
	if (NULL == sz)
		sz = &local_sz;

	/*
	 * Treat "\E" just like "\";
	 * it only makes a difference in copy mode.
	 */

	if (**end == 'E')
		++*end;

	/*
	 * Beyond the backslash, at least one input character
	 * is part of the escape sequence.  With one exception
	 * (see below), that character won't be returned.
	 */

	gly = ESCAPE_ERROR;
	*start = ++*end;
	*sz = 0;
	term = '\0';

	/*
	 * Dispatch on the escape name, which is the character
	 * that was just stepped over.  Each branch either returns
	 * right away or selects how the argument is delimited:
	 * a fixed length in *sz or a terminating character in term.
	 */

	switch ((*start)[-1]) {
	/*
	 * First the glyphs.  There are several different forms of
	 * these, but each eventually returns a substring of the glyph
	 * name.
	 */
	case '(':
		gly = ESCAPE_SPECIAL;
		*sz = 2;
		break;
	case '[':
		/* A blank right after the bracket is skipped and rejected. */
		if (**start == ' ') {
			++*end;
			return ESCAPE_ERROR;
		}
		gly = ESCAPE_SPECIAL;
		term = ']';
		break;
	case 'C':
		/* The name has to be enclosed in single quotes. */
		if ('\'' != **start)
			return ESCAPE_ERROR;
		*start = ++*end;
		gly = ESCAPE_SPECIAL;
		term = '\'';
		break;

	/*
	 * Escapes taking no arguments at all.
	 */
	case '!':
	case '?':
		return ESCAPE_UNSUPP;
	case '%':
	case '&':
	case ')':
	case ',':
	case '/':
	case '^':
	case 'a':
	case 'd':
	case 'r':
	case 't':
	case 'u':
	case '{':
	case '|':
	case '}':
		return ESCAPE_IGNORE;
	case 'c':
		return ESCAPE_NOSPACE;
	case 'p':
		return ESCAPE_BREAK;

	/*
	 * The \z escape is supposed to output the following
	 * character without advancing the cursor position.
	 * Since we are mostly dealing with terminal mode,
	 * let us just skip the next character.
	 */
	case 'z':
		return ESCAPE_SKIPCHAR;

	/*
	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
	 * 'X' is the trigger.  These have opaque sub-strings.
	 */
	case 'F':
	case 'f':
	case 'g':
	case 'k':
	case 'M':
	case 'm':
	case 'n':
	case 'O':
	case 'V':
	case 'Y':
	case '*':
		/* Only \f and \* get classes of their own. */
		switch ((*start)[-1]) {
		case 'f':
			gly = ESCAPE_FONT;
			break;
		case '*':
			gly = ESCAPE_DEVICE;
			break;
		default:
			gly = ESCAPE_IGNORE;
			break;
		}
		/*
		 * Select the argument form.  For \O, the class is
		 * adjusted along the way: \O0 and \O[5...] are
		 * unsupported, \O1 to \O4 are ignored, and all
		 * other forms are errors.
		 */
		switch (**start) {
		case '(':
			if ((*start)[-1] == 'O')
				gly = ESCAPE_ERROR;
			*start = ++*end;
			*sz = 2;
			break;
		case '[':
			if ((*start)[-1] == 'O')
				gly = (*start)[1] == '5' ?
				    ESCAPE_UNSUPP : ESCAPE_ERROR;
			*start = ++*end;
			term = ']';
			break;
		default:
			if ((*start)[-1] == 'O') {
				switch (**start) {
				case '0':
					gly = ESCAPE_UNSUPP;
					break;
				case '1':
				case '2':
				case '3':
				case '4':
					break;
				default:
					gly = ESCAPE_ERROR;
					break;
				}
			}
			*sz = 1;
			break;
		}
		break;

	/*
	 * These escapes are of the form \X'Y', where 'X' is the trigger
	 * and 'Y' is any string.  These have opaque sub-strings.
	 * The \B and \w escapes are handled in roff.c, roff_res().
	 */
	case 'A':
	case 'b':
	case 'D':
	case 'R':
	case 'X':
	case 'Z':
		gly = ESCAPE_IGNORE;
		/* FALLTHROUGH */
	case 'o':
		if (**start == '\0')
			return ESCAPE_ERROR;
		/* gly is still ESCAPE_ERROR only when entering via \o. */
		if (gly == ESCAPE_ERROR)
			gly = ESCAPE_OVERSTRIKE;
		/* Whatever character comes first serves as the delimiter. */
		term = **start;
		*start = ++*end;
		break;

	/*
	 * These escapes are of the form \X'N', where 'X' is the trigger
	 * and 'N' resolves to a numerical expression.
	 */
	case 'h':
	case 'H':
	case 'L':
	case 'l':
	case 'S':
	case 'v':
	case 'x':
		/*
		 * Reject characters that are not accepted as delimiters.
		 * Note that strchr(3) also matches the terminating NUL,
		 * so the end of the input is rejected here, too, but
		 * without advancing beyond it.
		 */
		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
			if ('\0' != **start)
				++*end;
			return ESCAPE_ERROR;
		}
		switch ((*start)[-1]) {
		case 'h':
			gly = ESCAPE_HORIZ;
			break;
		case 'l':
			gly = ESCAPE_HLINE;
			break;
		default:
			gly = ESCAPE_IGNORE;
			break;
		}
		term = **start;
		*start = ++*end;
		break;

	/*
	 * Special handling for the numbered character escape.
	 * XXX Do any other escapes need similar handling?
	 */
	case 'N':
		if ('\0' == **start)
			return ESCAPE_ERROR;
		(*end)++;
		/* A digit instead of a delimiter: consume it and ignore. */
		if (isdigit((unsigned char)**start)) {
			*sz = 1;
			return ESCAPE_IGNORE;
		}
		/*
		 * Skip the opening delimiter, collect the run of digits,
		 * and step over the character following the digits
		 * unless the input ends there.
		 */
		(*start)++;
		while (isdigit((unsigned char)**end))
			(*end)++;
		*sz = *end - *start;
		if ('\0' != **end)
			(*end)++;
		return ESCAPE_NUMBERED;

	/*
	 * Sizes get a special category of their own.
	 */
	case 's':
		gly = ESCAPE_IGNORE;

		/* A leading +, -, or hyphen counts as a sign; skip it. */
		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
			*start = ++*end;

		switch (**end) {
		case '(':
			*start = ++*end;
			*sz = 2;
			break;
		case '[':
			*start = ++*end;
			term = ']';
			break;
		case '\'':
			*start = ++*end;
			term = '\'';
			break;
		case '3':
		case '2':
		case '1':
			/*
			 * Two digits are only consumed when the first
			 * digit directly follows the 's', i.e. when
			 * no sign was given.
			 */
			*sz = (*end)[-1] == 's' &&
			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
			break;
		default:
			*sz = 1;
			break;
		}

		break;

	/*
	 * Several special characters can be encoded as
	 * one-byte escape sequences without using \[].
	 */
	case ' ':
	case '\'':
	case '-':
	case '.':
	case '0':
	case ':':
	case '_':
	case '`':
	case 'e':
	case '~':
		gly = ESCAPE_SPECIAL;
		/* FALLTHROUGH */
	default:
		if (gly == ESCAPE_ERROR)
			gly = ESCAPE_UNDEF;
		/*
		 * This is the exception mentioned above: back up,
		 * such that the escape name itself is returned
		 * as the one-byte argument.
		 */
		*start = --*end;
		*sz = 1;
		break;
	}

	/*
	 * Read up to the terminating character,
	 * paying attention to nested escapes.
	 */

	if ('\0' != term) {
		while (**end != term) {
			switch (**end) {
			case '\0':
				return ESCAPE_ERROR;
			case '\\':
				/* Parse the nested escape recursively. */
				(*end)++;
				if (ESCAPE_ERROR ==
				    mandoc_escape(end, NULL, NULL))
					return ESCAPE_ERROR;
				break;
			default:
				(*end)++;
				break;
			}
		}
		/* The length excludes the terminator; *end moves past it. */
		*sz = (*end)++ - *start;

		/*
		 * The file chars.c only provides one common list
		 * of character names, but \[-] == \- is the only
		 * one of the characters with one-byte names that
		 * allows enclosing the name in brackets.
		 */
		if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
			return ESCAPE_ERROR;
	} else {
		/* Fixed-length argument: the input must be long enough. */
		assert(*sz > 0);
		if ((size_t)*sz > strlen(*start))
			return ESCAPE_ERROR;
		*end += *sz;
	}

	/* Run post-processors. */

	switch (gly) {
	case ESCAPE_FONT:
		gly = mandoc_font(*start, *sz);
		break;
	case ESCAPE_SPECIAL:
		if (**start == 'c') {
			/*
			 * Names of the form "char" followed by two or
			 * three decimal digits, with a value in the
			 * range 0x21 to 0x7e or 0xa0 to 0xff, are turned
			 * into numbered characters; *start and *sz then
			 * cover the digits only.  Other names starting
			 * with 'c' remain ordinary special characters.
			 */
			if (*sz < 6 || *sz > 7 ||
			    strncmp(*start, "char", 4) != 0 ||
			    (int)strspn(*start + 4, "0123456789") + 4 < *sz)
				break;
			c = 0;
			for (i = 4; i < *sz; i++)
				c = 10 * c + ((*start)[i] - '0');
			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
				break;
			*start += 4;
			*sz -= 4;
			gly = ESCAPE_NUMBERED;
			break;
		}

		/*
		 * Unicode escapes are defined in groff as \[u0000]
		 * to \[u10FFFF], where the contained value must be
		 * a valid Unicode codepoint.  Here, however, only
		 * check the length and range.
		 */
		if (**start != 'u' || *sz < 5 || *sz > 7)
			break;
		/* Six hex digits are only accepted with the prefix "10". */
		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
			break;
		/* Five hex digits must not start with a zero. */
		if (*sz == 6 && (*start)[1] == '0')
			break;
		/* Four digits: reject D800 to DFFF written in upper case. */
		if (*sz == 5 && (*start)[1] == 'D' &&
		    strchr("89ABCDEF", (*start)[2]) != NULL)
			break;
		/* Everything after the 'u' has to be a hex digit. */
		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
		    + 1 == *sz)
			gly = ESCAPE_UNICODE;
		break;
	case ESCAPE_DEVICE:
		/* Any \* reaching this point is expected to be \*(.T only. */
		assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
		break;
	default:
		break;
	}

	return gly;
}
475
/*
 * Try to parse the complete string p according to the strptime(3)
 * format fmt.  On success, store the result of mktime(3) in *t and
 * return 1.  Return 0 and leave *t alone if the string does not match,
 * if anything follows the match, or if strptime(3) is not available.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest;

	memset(&parsed, 0, sizeof(struct tm));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Neither a failed parse nor trailing garbage is acceptable. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
495
/*
 * Format the time t in the local time zone as "Month day, year",
 * with the day printed without padding.  The result is allocated
 * and has to be freed by the caller.  If any step of the conversion
 * fails, an allocated empty string is returned instead.
 */
static char *
time2a(time_t t)
{
	/*
	 * Space needed for the parts of the output:
	 * up to 9 characters for the month (September) + blank,
	 * up to 2 characters for the day + comma + blank,
	 * 4 characters for the year.
	 */
	enum { MONTH_LEN = 10, DAY_LEN = 4, YEAR_LEN = 4 };

	struct tm	*broken;
	char		*out, *cur;
	size_t		 nmonth;
	int		 nday;

	out = NULL;
	if ((broken = localtime(&t)) == NULL)
		goto fail;

	/* One additional byte for the terminating '\0'. */
	cur = out = mandoc_malloc(10 + 4 + 4 + 1);

	nmonth = strftime(cur, MONTH_LEN + 1, "%B ", broken);
	if (nmonth == 0)
		goto fail;
	cur += (int)nmonth;

	/*
	 * The day is printed with plain "%d", neither "%2d" nor "%02d",
	 * which is why the whole date cannot be formatted with a single
	 * "%B %e, %Y" or "%B %d, %Y".  Bounding each part separately
	 * also keeps the buffer safe even if month names should ever
	 * become locale dependent.
	 */
	nday = snprintf(cur, DAY_LEN + 1, "%d, ", broken->tm_mday);
	if (nday < 0 || nday > DAY_LEN)
		goto fail;
	cur += nday;

	if (strftime(cur, YEAR_LEN + 1, "%Y", broken) != 0)
		return out;

fail:
	free(out);
	return mandoc_strdup("");
}
544
545char *
546mandoc_normdate(struct roff_node *nch, struct roff_node *nbl)
547{
548	char		*cp;
549	time_t		 t;
550
551	/* No date specified. */
552
553	if (nch == NULL) {
554		if (nbl == NULL)
555			mandoc_msg(MANDOCERR_DATE_MISSING, 0, 0, NULL);
556		else
557			mandoc_msg(MANDOCERR_DATE_MISSING, nbl->line,
558			    nbl->pos, "%s", roff_name[nbl->tok]);
559		return mandoc_strdup("");
560	}
561	if (*nch->string == '\0') {
562		mandoc_msg(MANDOCERR_DATE_MISSING, nch->line,
563		    nch->pos, "%s", roff_name[nbl->tok]);
564		return mandoc_strdup("");
565	}
566	if (strcmp(nch->string, "$" "Mdocdate$") == 0)
567		return time2a(time(NULL));
568
569	/* Valid mdoc(7) date format. */
570
571	if (a2time(&t, "$" "Mdocdate: %b %d %Y $", nch->string) ||
572	    a2time(&t, "%b %d, %Y", nch->string)) {
573		cp = time2a(t);
574		if (t > time(NULL) + 86400)
575			mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line,
576			    nch->pos, "%s %s", roff_name[nbl->tok], cp);
577		else if (*nch->string != '$' &&
578		    strcmp(nch->string, cp) != 0)
579			mandoc_msg(MANDOCERR_DATE_NORM, nch->line,
580			    nch->pos, "%s %s", roff_name[nbl->tok], cp);
581		return cp;
582	}
583
584	/* In man(7), do not warn about the legacy format. */
585
586	if (a2time(&t, "%Y-%m-%d", nch->string) == 0)
587		mandoc_msg(MANDOCERR_DATE_BAD, nch->line, nch->pos,
588		    "%s %s", roff_name[nbl->tok], nch->string);
589	else if (t > time(NULL) + 86400)
590		mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line, nch->pos,
591		    "%s %s", roff_name[nbl->tok], nch->string);
592	else if (nbl->tok == MDOC_Dd)
593		mandoc_msg(MANDOCERR_DATE_LEGACY, nch->line, nch->pos,
594		    "Dd %s", nch->string);
595
596	/* Use any non-mdoc(7) date verbatim. */
597
598	return mandoc_strdup(nch->string);
599}
600
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * The bytes are inspected from the end.  Trailing closing delimiters
 * (double quote, single quote, ']' and ')') and sentence punctuation
 * ('.', '!' and '?') are skipped until some other character is found.
 * Return 1 if sentence punctuation was seen and either no closing
 * delimiter followed it or the character in front of the punctuation
 * is alphanumeric.  If the whole input consists of delimiters and
 * punctuation, return 1 only if punctuation was seen that is not
 * followed by a closing delimiter.  Return 0 otherwise, in particular
 * for empty input.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count an index down rather than a pointer: stepping a
	 * pointer to p - 1 in order to end the loop would leave the
	 * object, and the index also works for any size_t length.
	 * Empty input simply skips the loop.
	 */

	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	return found && !enclosed;
}
639
/*
 * Convert the first sz bytes of p to a non-negative int,
 * interpreting them as a number in the given base with strtol(3).
 * Return -1 if the input is empty, longer than 31 bytes, not
 * completely consumed by the conversion, or if the value is
 * less than 0.  Values larger than INT_MAX are clamped to INT_MAX.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 buf[32];
	char		*ep;
	long		 v;

	/* The input needs to fit into buf including the '\0'. */

	if (sz == 0 || sz >= sizeof(buf))
		return -1;

	/* strtol(3) needs a terminated string, so work on a copy. */

	memcpy(buf, p, sz);
	buf[sz] = '\0';

	v = strtol(buf, &ep, base);

	/* Nothing was converted, or trailing garbage follows. */

	if (ep == buf || *ep != '\0')
		return -1;

	/* Negative numbers are invalid, as documented above. */

	if (v < 0)
		return -1;

	/* This also covers overflow, where strtol(3) returns LONG_MAX. */

	if (v > INT_MAX)
		v = INT_MAX;

	return (int)v;
}
670