1323136Sdes/* $OpenBSD: utf8.c,v 1.5 2017/02/19 00:10:57 djm Exp $ */ 2313010Sdes/* 3313010Sdes * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org> 4313010Sdes * 5313010Sdes * Permission to use, copy, modify, and distribute this software for any 6313010Sdes * purpose with or without fee is hereby granted, provided that the above 7313010Sdes * copyright notice and this permission notice appear in all copies. 8313010Sdes * 9313010Sdes * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10313010Sdes * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11313010Sdes * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12313010Sdes * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13313010Sdes * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14313010Sdes * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15313010Sdes * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16313010Sdes */ 17313010Sdes 18313010Sdes/* 19313010Sdes * Utility functions for multibyte-character handling, 20313010Sdes * in particular to sanitize untrusted strings for terminal output. 21313010Sdes */ 22313010Sdes 23313010Sdes#include "includes.h" 24313010Sdes 25313010Sdes#include <sys/types.h> 26313010Sdes#ifdef HAVE_LANGINFO_H 27313010Sdes# include <langinfo.h> 28313010Sdes#endif 29313010Sdes#include <limits.h> 30323134Sdes#include <locale.h> 31313010Sdes#include <stdarg.h> 32313010Sdes#include <stdio.h> 33313010Sdes#include <stdlib.h> 34313010Sdes#include <string.h> 35313010Sdes#if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS) 36313010Sdes# include <vis.h> 37313010Sdes#endif 38313010Sdes#ifdef HAVE_WCHAR_H 39313010Sdes# include <wchar.h> 40313010Sdes#endif 41313010Sdes 42313010Sdes#include "utf8.h" 43313010Sdes 44313010Sdesstatic int dangerous_locale(void); 45313010Sdesstatic int grow_dst(char **, size_t *, size_t, char **, size_t); 46313010Sdesstatic int vasnmprintf(char **, size_t, int *, const char *, va_list); 47313010Sdes 48313010Sdes 49313010Sdes/* 50313010Sdes * For US-ASCII and UTF-8 encodings, we can safely recover from 51313010Sdes * encoding errors and from non-printable characters. For any 52313010Sdes * other encodings, err to the side of caution and abort parsing: 53313010Sdes * For state-dependent encodings, recovery is impossible. 54313010Sdes * For arbitrary encodings, replacement of non-printable 55313010Sdes * characters would be non-trivial and too fragile. 56313010Sdes */ 57313010Sdes 58313010Sdesstatic int 59313010Sdesdangerous_locale(void) { 60313010Sdes char *loc; 61313010Sdes 62313010Sdes loc = nl_langinfo(CODESET); 63323136Sdes return strcmp(loc, "US-ASCII") != 0 && strcmp(loc, "UTF-8") != 0 && 64323136Sdes strcmp(loc, "ANSI_X3.4-1968") != 0 && strcmp(loc, "646") != 0; 65313010Sdes} 66313010Sdes 67313010Sdesstatic int 68313010Sdesgrow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need) 69313010Sdes{ 70313010Sdes char *tp; 71313010Sdes size_t tsz; 72313010Sdes 73313010Sdes if (*dp + need < *dst + *sz) 74313010Sdes return 0; 75313010Sdes tsz = *sz + 128; 76313010Sdes if (tsz > maxsz) 77313010Sdes tsz = maxsz; 78313010Sdes if ((tp = realloc(*dst, tsz)) == NULL) 79313010Sdes return -1; 80313010Sdes *dp = tp + (*dp - *dst); 81313010Sdes *dst = tp; 82313010Sdes *sz = tsz; 83313010Sdes return 0; 84313010Sdes} 85313010Sdes 86313010Sdes/* 87313010Sdes * The following two functions limit the number of bytes written, 88313010Sdes * including the terminating '\0', to sz. Unless wp is NULL, 89313010Sdes * they limit the number of display columns occupied to *wp. 90313010Sdes * Whichever is reached first terminates the output string. 91313010Sdes * To stay close to the standard interfaces, they return the number of 92313010Sdes * non-NUL bytes that would have been written if both were unlimited. 93313010Sdes * If wp is NULL, newline, carriage return, and tab are allowed; 94313010Sdes * otherwise, the actual number of columns occupied by what was 95313010Sdes * written is returned in *wp. 96313010Sdes */ 97313010Sdes 98313010Sdesstatic int 99313010Sdesvasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap) 100313010Sdes{ 101313010Sdes char *src; /* Source string returned from vasprintf. */ 102313010Sdes char *sp; /* Pointer into src. */ 103313010Sdes char *dst; /* Destination string to be returned. */ 104313010Sdes char *dp; /* Pointer into dst. */ 105313010Sdes char *tp; /* Temporary pointer for dst. */ 106313010Sdes size_t sz; /* Number of bytes allocated for dst. */ 107313010Sdes wchar_t wc; /* Wide character at sp. */ 108313010Sdes int len; /* Number of bytes in the character at sp. */ 109313010Sdes int ret; /* Number of bytes needed to format src. */ 110313010Sdes int width; /* Display width of the character wc. */ 111313010Sdes int total_width, max_width, print; 112313010Sdes 113313010Sdes src = NULL; 114313010Sdes if ((ret = vasprintf(&src, fmt, ap)) <= 0) 115313010Sdes goto fail; 116313010Sdes 117313010Sdes sz = strlen(src) + 1; 118313010Sdes if ((dst = malloc(sz)) == NULL) { 119313010Sdes free(src); 120323136Sdes ret = -1; 121313010Sdes goto fail; 122313010Sdes } 123313010Sdes 124313010Sdes if (maxsz > INT_MAX) 125313010Sdes maxsz = INT_MAX; 126313010Sdes 127313010Sdes sp = src; 128313010Sdes dp = dst; 129313010Sdes ret = 0; 130313010Sdes print = 1; 131313010Sdes total_width = 0; 132313010Sdes max_width = wp == NULL ? INT_MAX : *wp; 133313010Sdes while (*sp != '\0') { 134313010Sdes if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) { 135313010Sdes (void)mbtowc(NULL, NULL, MB_CUR_MAX); 136313010Sdes if (dangerous_locale()) { 137313010Sdes ret = -1; 138313010Sdes break; 139313010Sdes } 140313010Sdes len = 1; 141313010Sdes width = -1; 142313010Sdes } else if (wp == NULL && 143313010Sdes (wc == L'\n' || wc == L'\r' || wc == L'\t')) { 144313010Sdes /* 145313010Sdes * Don't use width uninitialized; the actual 146313010Sdes * value doesn't matter because total_width 147313010Sdes * is only returned for wp != NULL. 148313010Sdes */ 149313010Sdes width = 0; 150313010Sdes } else if ((width = wcwidth(wc)) == -1 && 151313010Sdes dangerous_locale()) { 152313010Sdes ret = -1; 153313010Sdes break; 154313010Sdes } 155313010Sdes 156313010Sdes /* Valid, printable character. */ 157313010Sdes 158313010Sdes if (width >= 0) { 159313010Sdes if (print && (dp - dst >= (int)maxsz - len || 160313010Sdes total_width > max_width - width)) 161313010Sdes print = 0; 162313010Sdes if (print) { 163313010Sdes if (grow_dst(&dst, &sz, maxsz, 164313010Sdes &dp, len) == -1) { 165313010Sdes ret = -1; 166313010Sdes break; 167313010Sdes } 168313010Sdes total_width += width; 169313010Sdes memcpy(dp, sp, len); 170313010Sdes dp += len; 171313010Sdes } 172313010Sdes sp += len; 173313010Sdes if (ret >= 0) 174313010Sdes ret += len; 175313010Sdes continue; 176313010Sdes } 177313010Sdes 178313010Sdes /* Escaping required. */ 179313010Sdes 180313010Sdes while (len > 0) { 181313010Sdes if (print && (dp - dst >= (int)maxsz - 4 || 182313010Sdes total_width > max_width - 4)) 183313010Sdes print = 0; 184313010Sdes if (print) { 185313010Sdes if (grow_dst(&dst, &sz, maxsz, 186313010Sdes &dp, 4) == -1) { 187313010Sdes ret = -1; 188313010Sdes break; 189313010Sdes } 190313010Sdes tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0); 191313010Sdes width = tp - dp; 192313010Sdes total_width += width; 193313010Sdes dp = tp; 194313010Sdes } else 195313010Sdes width = 4; 196313010Sdes len--; 197313010Sdes sp++; 198313010Sdes if (ret >= 0) 199313010Sdes ret += width; 200313010Sdes } 201313010Sdes if (len > 0) 202313010Sdes break; 203313010Sdes } 204313010Sdes free(src); 205313010Sdes *dp = '\0'; 206313010Sdes *str = dst; 207313010Sdes if (wp != NULL) 208313010Sdes *wp = total_width; 209313010Sdes 210313010Sdes /* 211313010Sdes * If the string was truncated by the width limit but 212313010Sdes * would have fit into the size limit, the only sane way 213313010Sdes * to report the problem is using the return value, such 214313010Sdes * that the usual idiom "if (ret < 0 || ret >= sz) error" 215313010Sdes * works as expected. 216313010Sdes */ 217313010Sdes 218313010Sdes if (ret < (int)maxsz && !print) 219313010Sdes ret = -1; 220313010Sdes return ret; 221313010Sdes 222313010Sdesfail: 223313010Sdes if (wp != NULL) 224313010Sdes *wp = 0; 225313010Sdes if (ret == 0) { 226313010Sdes *str = src; 227313010Sdes return 0; 228313010Sdes } else { 229313010Sdes *str = NULL; 230313010Sdes return -1; 231313010Sdes } 232313010Sdes} 233313010Sdes 234313010Sdesint 235313010Sdessnmprintf(char *str, size_t sz, int *wp, const char *fmt, ...) 236313010Sdes{ 237313010Sdes va_list ap; 238313010Sdes char *cp; 239313010Sdes int ret; 240313010Sdes 241313010Sdes va_start(ap, fmt); 242313010Sdes ret = vasnmprintf(&cp, sz, wp, fmt, ap); 243313010Sdes va_end(ap); 244313010Sdes if (cp != NULL) { 245313010Sdes (void)strlcpy(str, cp, sz); 246313010Sdes free(cp); 247313010Sdes } else 248313010Sdes *str = '\0'; 249313010Sdes return ret; 250313010Sdes} 251313010Sdes 252313010Sdes/* 253313010Sdes * To stay close to the standard interfaces, the following functions 254313010Sdes * return the number of non-NUL bytes written. 255313010Sdes */ 256313010Sdes 257313010Sdesint 258313010Sdesvfmprintf(FILE *stream, const char *fmt, va_list ap) 259313010Sdes{ 260313010Sdes char *str; 261313010Sdes int ret; 262313010Sdes 263313010Sdes if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) 264313010Sdes return -1; 265313010Sdes if (fputs(str, stream) == EOF) 266313010Sdes ret = -1; 267313010Sdes free(str); 268313010Sdes return ret; 269313010Sdes} 270313010Sdes 271313010Sdesint 272313010Sdesfmprintf(FILE *stream, const char *fmt, ...) 273313010Sdes{ 274313010Sdes va_list ap; 275313010Sdes int ret; 276313010Sdes 277313010Sdes va_start(ap, fmt); 278313010Sdes ret = vfmprintf(stream, fmt, ap); 279313010Sdes va_end(ap); 280313010Sdes return ret; 281313010Sdes} 282313010Sdes 283313010Sdesint 284313010Sdesmprintf(const char *fmt, ...) 285313010Sdes{ 286313010Sdes va_list ap; 287313010Sdes int ret; 288313010Sdes 289313010Sdes va_start(ap, fmt); 290313010Sdes ret = vfmprintf(stdout, fmt, ap); 291313010Sdes va_end(ap); 292313010Sdes return ret; 293313010Sdes} 294323134Sdes 295323134Sdes/* 296323134Sdes * Set up libc for multibyte output in the user's chosen locale. 297323134Sdes * 298323134Sdes * XXX: we are known to have problems with Turkish (i/I confusion) so we 299323134Sdes * deliberately fall back to the C locale for now. Longer term we should 300323134Sdes * always prefer to select C.[encoding] if possible, but there's no 301323134Sdes * standardisation in locales between systems, so we'll need to survey 302323134Sdes * what's out there first. 303323134Sdes */ 304323134Sdesvoid 305323134Sdesmsetlocale(void) 306323134Sdes{ 307323134Sdes const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL }; 308323134Sdes char *cp; 309323134Sdes int i; 310323134Sdes 311323134Sdes /* 312323134Sdes * We can't yet cope with dotless/dotted I in Turkish locales, 313323134Sdes * so fall back to the C locale for these. 314323134Sdes */ 315323134Sdes for (i = 0; vars[i] != NULL; i++) { 316323134Sdes if ((cp = getenv(vars[i])) == NULL) 317323134Sdes continue; 318323134Sdes if (strncasecmp(cp, "TR", 2) != 0) 319323134Sdes break; 320323134Sdes /* 321323134Sdes * If we're in a UTF-8 locale then prefer to use 322323134Sdes * the C.UTF-8 locale (or equivalent) if it exists. 323323134Sdes */ 324323134Sdes if ((strcasestr(cp, "UTF-8") != NULL || 325323134Sdes strcasestr(cp, "UTF8") != NULL) && 326323134Sdes (setlocale(LC_CTYPE, "C.UTF-8") != NULL || 327323134Sdes setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL)) 328323134Sdes return; 329323134Sdes setlocale(LC_CTYPE, "C"); 330323134Sdes return; 331323134Sdes } 332323134Sdes /* We can handle this locale */ 333323134Sdes setlocale(LC_CTYPE, ""); 334323134Sdes} 335