1323136Sdes/* $OpenBSD: utf8.c,v 1.5 2017/02/19 00:10:57 djm Exp $ */
2313010Sdes/*
3313010Sdes * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4313010Sdes *
5313010Sdes * Permission to use, copy, modify, and distribute this software for any
6313010Sdes * purpose with or without fee is hereby granted, provided that the above
7313010Sdes * copyright notice and this permission notice appear in all copies.
8313010Sdes *
9313010Sdes * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10313010Sdes * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11313010Sdes * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12313010Sdes * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13313010Sdes * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14313010Sdes * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15313010Sdes * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16313010Sdes */
17313010Sdes
/*
 * Utility functions for multibyte-character handling,
 * in particular to sanitize untrusted strings for terminal output.
 */
22313010Sdes
23313010Sdes#include "includes.h"
24313010Sdes
25313010Sdes#include <sys/types.h>
26313010Sdes#ifdef HAVE_LANGINFO_H
27313010Sdes# include <langinfo.h>
28313010Sdes#endif
29313010Sdes#include <limits.h>
30323134Sdes#include <locale.h>
31313010Sdes#include <stdarg.h>
32313010Sdes#include <stdio.h>
33313010Sdes#include <stdlib.h>
34313010Sdes#include <string.h>
35313010Sdes#if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
36313010Sdes# include <vis.h>
37313010Sdes#endif
38313010Sdes#ifdef HAVE_WCHAR_H
39313010Sdes# include <wchar.h>
40313010Sdes#endif
41313010Sdes
42313010Sdes#include "utf8.h"
43313010Sdes
/* File-local helpers; parameter names mirror the definitions below. */
static int	 dangerous_locale(void);
static int	 grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp,
		    size_t need);
static int	 vasnmprintf(char **str, size_t maxsz, int *wp,
		    const char *fmt, va_list ap);
47313010Sdes
48313010Sdes
/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * Returns 0 if nl_langinfo(CODESET) names one of the encodings
 * listed below, 1 for everything else.
 */

static int
dangerous_locale(void)
{
	/* Codeset names for which recovery is considered safe. */
	static const char *const safe_codesets[] = {
		"US-ASCII",
		"UTF-8",
		"ANSI_X3.4-1968",
		"646"
	};
	const char	*codeset;
	size_t		 i;

	codeset = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]);
	    i++) {
		if (strcmp(codeset, safe_codesets[i]) == 0)
			return 0;
	}
	return 1;
}
66313010Sdes
/*
 * Ensure that need bytes plus one spare byte fit into the buffer *dst
 * (currently *sz bytes long) starting at the write cursor *dp.
 *
 * If there is already enough room, nothing is changed.  Otherwise the
 * buffer is reallocated to *sz + 128 bytes, clamped to maxsz, and
 * *dst, *dp and *sz are updated to describe the new buffer.  The new
 * size is not re-checked against need; callers are expected to have
 * tested the maxsz limit before asking for more room.
 *
 * Returns 0 on success and -1 if realloc() fails, in which case
 * *dst, *dp and *sz are left untouched and the old buffer stays valid.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 tsz;
	size_t	 used;

	/*
	 * Work with the cursor offset instead of raw pointers: forming
	 * *dp + need may point past the end of the allocation, and the
	 * old *dst value must not be used once realloc() has succeeded.
	 */
	used = (size_t)(*dp - *dst);
	if (used < *sz && need < *sz - used)
		return 0;
	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	if ((tp = realloc(*dst, tsz)) == NULL)
		return -1;
	*dst = tp;
	*dp = tp + used;
	*sz = tsz;
	return 0;
}
85313010Sdes
86313010Sdes/*
87313010Sdes * The following two functions limit the number of bytes written,
88313010Sdes * including the terminating '\0', to sz.  Unless wp is NULL,
89313010Sdes * they limit the number of display columns occupied to *wp.
90313010Sdes * Whichever is reached first terminates the output string.
91313010Sdes * To stay close to the standard interfaces, they return the number of
92313010Sdes * non-NUL bytes that would have been written if both were unlimited.
93313010Sdes * If wp is NULL, newline, carriage return, and tab are allowed;
94313010Sdes * otherwise, the actual number of columns occupied by what was
95313010Sdes * written is returned in *wp.
96313010Sdes */
97313010Sdes
98313010Sdesstatic int
99313010Sdesvasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
100313010Sdes{
101313010Sdes	char	*src;	/* Source string returned from vasprintf. */
102313010Sdes	char	*sp;	/* Pointer into src. */
103313010Sdes	char	*dst;	/* Destination string to be returned. */
104313010Sdes	char	*dp;	/* Pointer into dst. */
105313010Sdes	char	*tp;	/* Temporary pointer for dst. */
106313010Sdes	size_t	 sz;	/* Number of bytes allocated for dst. */
107313010Sdes	wchar_t	 wc;	/* Wide character at sp. */
108313010Sdes	int	 len;	/* Number of bytes in the character at sp. */
109313010Sdes	int	 ret;	/* Number of bytes needed to format src. */
110313010Sdes	int	 width;	/* Display width of the character wc. */
111313010Sdes	int	 total_width, max_width, print;
112313010Sdes
113313010Sdes	src = NULL;
114313010Sdes	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
115313010Sdes		goto fail;
116313010Sdes
117313010Sdes	sz = strlen(src) + 1;
118313010Sdes	if ((dst = malloc(sz)) == NULL) {
119313010Sdes		free(src);
120323136Sdes		ret = -1;
121313010Sdes		goto fail;
122313010Sdes	}
123313010Sdes
124313010Sdes	if (maxsz > INT_MAX)
125313010Sdes		maxsz = INT_MAX;
126313010Sdes
127313010Sdes	sp = src;
128313010Sdes	dp = dst;
129313010Sdes	ret = 0;
130313010Sdes	print = 1;
131313010Sdes	total_width = 0;
132313010Sdes	max_width = wp == NULL ? INT_MAX : *wp;
133313010Sdes	while (*sp != '\0') {
134313010Sdes		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
135313010Sdes			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
136313010Sdes			if (dangerous_locale()) {
137313010Sdes				ret = -1;
138313010Sdes				break;
139313010Sdes			}
140313010Sdes			len = 1;
141313010Sdes			width = -1;
142313010Sdes		} else if (wp == NULL &&
143313010Sdes		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
144313010Sdes			/*
145313010Sdes			 * Don't use width uninitialized; the actual
146313010Sdes			 * value doesn't matter because total_width
147313010Sdes			 * is only returned for wp != NULL.
148313010Sdes			 */
149313010Sdes			width = 0;
150313010Sdes		} else if ((width = wcwidth(wc)) == -1 &&
151313010Sdes		    dangerous_locale()) {
152313010Sdes			ret = -1;
153313010Sdes			break;
154313010Sdes		}
155313010Sdes
156313010Sdes		/* Valid, printable character. */
157313010Sdes
158313010Sdes		if (width >= 0) {
159313010Sdes			if (print && (dp - dst >= (int)maxsz - len ||
160313010Sdes			    total_width > max_width - width))
161313010Sdes				print = 0;
162313010Sdes			if (print) {
163313010Sdes				if (grow_dst(&dst, &sz, maxsz,
164313010Sdes				    &dp, len) == -1) {
165313010Sdes					ret = -1;
166313010Sdes					break;
167313010Sdes				}
168313010Sdes				total_width += width;
169313010Sdes				memcpy(dp, sp, len);
170313010Sdes				dp += len;
171313010Sdes			}
172313010Sdes			sp += len;
173313010Sdes			if (ret >= 0)
174313010Sdes				ret += len;
175313010Sdes			continue;
176313010Sdes		}
177313010Sdes
178313010Sdes		/* Escaping required. */
179313010Sdes
180313010Sdes		while (len > 0) {
181313010Sdes			if (print && (dp - dst >= (int)maxsz - 4 ||
182313010Sdes			    total_width > max_width - 4))
183313010Sdes				print = 0;
184313010Sdes			if (print) {
185313010Sdes				if (grow_dst(&dst, &sz, maxsz,
186313010Sdes				    &dp, 4) == -1) {
187313010Sdes					ret = -1;
188313010Sdes					break;
189313010Sdes				}
190313010Sdes				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
191313010Sdes				width = tp - dp;
192313010Sdes				total_width += width;
193313010Sdes				dp = tp;
194313010Sdes			} else
195313010Sdes				width = 4;
196313010Sdes			len--;
197313010Sdes			sp++;
198313010Sdes			if (ret >= 0)
199313010Sdes				ret += width;
200313010Sdes		}
201313010Sdes		if (len > 0)
202313010Sdes			break;
203313010Sdes	}
204313010Sdes	free(src);
205313010Sdes	*dp = '\0';
206313010Sdes	*str = dst;
207313010Sdes	if (wp != NULL)
208313010Sdes		*wp = total_width;
209313010Sdes
210313010Sdes	/*
211313010Sdes	 * If the string was truncated by the width limit but
212313010Sdes	 * would have fit into the size limit, the only sane way
213313010Sdes	 * to report the problem is using the return value, such
214313010Sdes	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
215313010Sdes	 * works as expected.
216313010Sdes	 */
217313010Sdes
218313010Sdes	if (ret < (int)maxsz && !print)
219313010Sdes		ret = -1;
220313010Sdes	return ret;
221313010Sdes
222313010Sdesfail:
223313010Sdes	if (wp != NULL)
224313010Sdes		*wp = 0;
225313010Sdes	if (ret == 0) {
226313010Sdes		*str = src;
227313010Sdes		return 0;
228313010Sdes	} else {
229313010Sdes		*str = NULL;
230313010Sdes		return -1;
231313010Sdes	}
232313010Sdes}
233313010Sdes
/*
 * Format into the caller-supplied buffer str of sz bytes, sanitizing
 * the result via vasnmprintf().  At most sz bytes, including the
 * terminating '\0', are written; nothing is written when sz is 0.
 * Unless wp is NULL, *wp limits the number of display columns on
 * input and receives the number of columns actually used on output.
 *
 * Returns whatever vasnmprintf() returned: the number of non-NUL bytes
 * the full result would need, or -1 on failure.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* No result at all: hand back an empty string if it fits. */
		*str = '\0';
	}
	return ret;
}
251313010Sdes
/*
 * To stay close to the standard interfaces, the following functions
 * return the number of non-NUL bytes written.
 */

/*
 * Format according to fmt and ap, sanitize the result via
 * vasnmprintf() without a width limit, and write it to stream.
 * Returns the byte count reported by vasnmprintf(), or -1 if
 * formatting fails or fputs() reports EOF.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * A partially sanitized string may still have been
		 * handed back together with the error; release it.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
270313010Sdes
/*
 * Variadic front end for vfmprintf(): format the arguments and write
 * the sanitized result to stream.  Returns what vfmprintf() returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nwritten;

	va_start(args, fmt);
	nwritten = vfmprintf(stream, fmt, args);
	va_end(args);

	return nwritten;
}
282313010Sdes
/*
 * Variadic front end for vfmprintf() that always writes to stdout.
 * Returns what vfmprintf() returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nwritten;

	va_start(args, fmt);
	nwritten = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nwritten;
}
294323134Sdes
/*
 * Set up libc for multibyte output in the user's chosen locale.
 *
 * The first of LC_ALL, LC_CTYPE and LANG that is set to a non-empty
 * value decides.  If that value starts with "TR" (compared without
 * regard to case), LC_CTYPE is set to a UTF-8 flavour of the C locale
 * when the value mentions UTF-8 and such a locale exists, and to "C"
 * otherwise.  In every other case LC_CTYPE follows the environment.
 *
 * XXX: we are known to have problems with Turkish (i/I confusion) so we
 *      deliberately fall back to the C locale for now. Longer term we should
 *      always prefer to select C.[encoding] if possible, but there's no
 *      standardisation in locales between systems, so we'll need to survey
 *      what's out there first.
 */
void
msetlocale(void)
{
	const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL };
	char *cp;
	int i;

	/*
	 * We can't yet cope with dotless/dotted I in Turkish locales,
	 * so fall back to the C locale for these.
	 */
	for (i = 0; vars[i] != NULL; i++) {
		cp = getenv(vars[i]);
		/*
		 * A variable that is set but empty counts as unset, so
		 * the next one gets consulted.  Stopping here would let
		 * e.g. LC_ALL="" hide a Turkish LANG from the check
		 * below while setlocale(LC_CTYPE, "") still honours it.
		 */
		if (cp == NULL || *cp == '\0')
			continue;
		if (strncasecmp(cp, "TR", 2) != 0)
			break;
		/*
		 * If we're in a UTF-8 locale then prefer to use
		 * the C.UTF-8 locale (or equivalent) if it exists.
		 */
		if ((strcasestr(cp, "UTF-8") != NULL ||
		    strcasestr(cp, "UTF8") != NULL) &&
		    (setlocale(LC_CTYPE, "C.UTF-8") != NULL ||
		    setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL))
			return;
		setlocale(LC_CTYPE, "C");
		return;
	}
	/* We can handle this locale */
	setlocale(LC_CTYPE, "");
}
335