1170530Ssam/*- 2178354Ssam * Copyright (c) 2003-2011 Tim Kientzle 3170530Ssam * Copyright (c) 2011-2012 Michihiro NAKAJIMA 4170530Ssam * All rights reserved. 5170530Ssam * 6170530Ssam * Redistribution and use in source and binary forms, with or without 7170530Ssam * modification, are permitted provided that the following conditions 8170530Ssam * are met: 9170530Ssam * 1. Redistributions of source code must retain the above copyright 10170530Ssam * notice, this list of conditions and the following disclaimer. 11170530Ssam * 2. Redistributions in binary form must reproduce the above copyright 12170530Ssam * notice, this list of conditions and the following disclaimer in the 13170530Ssam * documentation and/or other materials provided with the distribution. 14170530Ssam * 15170530Ssam * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 16170530Ssam * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17170530Ssam * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18170530Ssam * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 19170530Ssam * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20170530Ssam * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21170530Ssam * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22170530Ssam * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23170530Ssam * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24170530Ssam * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25170530Ssam */ 26170530Ssam 27170530Ssam#include "archive_platform.h" 28170530Ssam 29170530Ssam/* 30170530Ssam * Basic resizable string support, to simplify manipulating arbitrary-sized 31170530Ssam * strings while minimizing heap activity. 32170530Ssam * 33170530Ssam * In particular, the buffer used by a string object is only grown, it 34170530Ssam * never shrinks, so you can clear and reuse the same string object 35170530Ssam * without incurring additional memory allocations. 36178354Ssam */ 37170530Ssam 38170530Ssam#ifdef HAVE_ERRNO_H 39170530Ssam#include <errno.h> 40170530Ssam#endif 41170530Ssam#ifdef HAVE_ICONV_H 42170530Ssam#include <iconv.h> 43170530Ssam#endif 44170530Ssam#ifdef HAVE_LANGINFO_H 45170530Ssam#include <langinfo.h> 46170530Ssam#endif 47170530Ssam#ifdef HAVE_LOCALCHARSET_H 48170530Ssam#include <localcharset.h> 49170530Ssam#endif 50195377Ssam#ifdef HAVE_STDLIB_H 51178354Ssam#include <stdlib.h> 52170530Ssam#endif 53170530Ssam#ifdef HAVE_STRING_H 54170530Ssam#include <string.h> 55170530Ssam#endif 56170530Ssam#ifdef HAVE_WCHAR_H 57219456Sbschmidt#include <wchar.h> 58219456Sbschmidt#endif 59219456Sbschmidt#if defined(_WIN32) && !defined(__CYGWIN__) 60219456Sbschmidt#include <windows.h> 61219456Sbschmidt#include <locale.h> 62219456Sbschmidt#endif 63219456Sbschmidt 64219456Sbschmidt#include "archive_endian.h" 65219456Sbschmidt#include "archive_private.h" 66219456Sbschmidt#include "archive_string.h" 67219456Sbschmidt#include "archive_string_composition.h" 68219456Sbschmidt 69219456Sbschmidt#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy) 70219456Sbschmidt#define wmemcpy(a,b,i) (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t)) 71219456Sbschmidt#endif 72219456Sbschmidt 73219456Sbschmidt#if !defined(HAVE_WMEMMOVE) && !defined(wmemmove) 74219456Sbschmidt#define wmemmove(a,b,i) (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t)) 75219456Sbschmidt#endif 76219456Sbschmidt 77219456Sbschmidt#undef max 78219456Sbschmidt#define max(a, b) ((a)>(b)?(a):(b)) 79219456Sbschmidt 80219456Sbschmidtstruct archive_string_conv { 81219456Sbschmidt struct archive_string_conv *next; 82219456Sbschmidt char *from_charset; 83219456Sbschmidt char *to_charset; 84219456Sbschmidt unsigned from_cp; 85219456Sbschmidt unsigned to_cp; 86219456Sbschmidt /* Set 1 if from_charset and to_charset are the same. */ 87219456Sbschmidt int same; 88219456Sbschmidt int flag; 89219456Sbschmidt#define SCONV_TO_CHARSET 1 /* MBS is being converted to specified 90219456Sbschmidt * charset. */ 91219456Sbschmidt#define SCONV_FROM_CHARSET (1<<1) /* MBS is being converted from 92219456Sbschmidt * specified charset. */ 93219456Sbschmidt#define SCONV_BEST_EFFORT (1<<2) /* Copy at least ASCII code. */ 94219456Sbschmidt#define SCONV_WIN_CP (1<<3) /* Use Windows API for converting 95219456Sbschmidt * MBS. */ 96219456Sbschmidt#define SCONV_UTF8_LIBARCHIVE_2 (1<<4) /* Incorrect UTF-8 made by libarchive 97219456Sbschmidt * 2.x in the wrong assumption. */ 98219456Sbschmidt#define SCONV_NORMALIZATION_C (1<<6) /* Need normalization to be Form C. 99219456Sbschmidt * Before UTF-8 characters are actually 100219456Sbschmidt * processed. */ 101219456Sbschmidt#define SCONV_NORMALIZATION_D (1<<7) /* Need normalization to be Form D. 102219456Sbschmidt * Before UTF-8 characters are actually 103219456Sbschmidt * processed. 104219456Sbschmidt * Currently this only for MAC OS X. */ 105219456Sbschmidt#define SCONV_TO_UTF8 (1<<8) /* "to charset" side is UTF-8. */ 106219456Sbschmidt#define SCONV_FROM_UTF8 (1<<9) /* "from charset" side is UTF-8. */ 107219456Sbschmidt#define SCONV_TO_UTF16BE (1<<10) /* "to charset" side is UTF-16BE. */ 108219456Sbschmidt#define SCONV_FROM_UTF16BE (1<<11) /* "from charset" side is UTF-16BE. */ 109219456Sbschmidt#define SCONV_TO_UTF16LE (1<<12) /* "to charset" side is UTF-16LE. */ 110219456Sbschmidt#define SCONV_FROM_UTF16LE (1<<13) /* "from charset" side is UTF-16LE. */ 111219456Sbschmidt#define SCONV_TO_UTF16 (SCONV_TO_UTF16BE | SCONV_TO_UTF16LE) 112219456Sbschmidt#define SCONV_FROM_UTF16 (SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE) 113219456Sbschmidt 114219456Sbschmidt#if HAVE_ICONV 115219456Sbschmidt iconv_t cd; 116219456Sbschmidt iconv_t cd_w;/* Use at archive_mstring on 117219456Sbschmidt * Windows. */ 118219456Sbschmidt#endif 119219456Sbschmidt /* A temporary buffer for normalization. */ 120219456Sbschmidt struct archive_string utftmp; 121219456Sbschmidt int (*converter[2])(struct archive_string *, const void *, size_t, 122219456Sbschmidt struct archive_string_conv *); 123219456Sbschmidt int nconverter; 124219456Sbschmidt}; 125219456Sbschmidt 126219456Sbschmidt#define CP_C_LOCALE 0 /* "C" locale only for this file. */ 127219456Sbschmidt#define CP_UTF16LE 1200 128219456Sbschmidt#define CP_UTF16BE 1201 129219456Sbschmidt 130219456Sbschmidt#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF) 131219456Sbschmidt#define IS_LOW_SURROGATE_LA(uc) ((uc) >= 0xDC00 && (uc) <= 0xDFFF) 132219456Sbschmidt#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF) 133219456Sbschmidt#define UNICODE_MAX 0x10FFFF 134219456Sbschmidt#define UNICODE_R_CHAR 0xFFFD /* Replacement character. */ 135170530Ssam/* Set U+FFFD(Replacement character) in UTF-8. */ 136170530Ssamstatic const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd}; 137170530Ssam 138170530Ssamstatic struct archive_string_conv *find_sconv_object(struct archive *, 139170530Ssam const char *, const char *); 140170530Ssamstatic void add_sconv_object(struct archive *, struct archive_string_conv *); 141170530Ssamstatic struct archive_string_conv *create_sconv_object(const char *, 142170530Ssam const char *, unsigned, int); 143173273Ssamstatic void free_sconv_object(struct archive_string_conv *); 144193115Ssamstatic struct archive_string_conv *get_sconv_object(struct archive *, 145193115Ssam const char *, const char *, int); 146193115Ssamstatic unsigned make_codepage_from_charset(const char *); 147193115Ssamstatic unsigned get_current_codepage(void); 148173273Ssamstatic unsigned get_current_oemcp(void); 149173273Ssamstatic size_t mbsnbytes(const void *, size_t); 150193115Ssamstatic size_t utf16nbytes(const void *, size_t); 151193115Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 152193115Ssamstatic int archive_wstring_append_from_mbs_in_codepage( 153193115Ssam struct archive_wstring *, const char *, size_t, 154193115Ssam struct archive_string_conv *); 155193115Ssamstatic int archive_string_append_from_wcs_in_codepage(struct archive_string *, 156193115Ssam const wchar_t *, size_t, struct archive_string_conv *); 157193115Ssamstatic int is_big_endian(void); 158193115Ssamstatic int strncat_in_codepage(struct archive_string *, const void *, 159193115Ssam size_t, struct archive_string_conv *); 160193115Ssamstatic int win_strncat_from_utf16be(struct archive_string *, const void *, 161193115Ssam size_t, struct archive_string_conv *); 162193115Ssamstatic int win_strncat_from_utf16le(struct archive_string *, const void *, 163193115Ssam size_t, struct archive_string_conv *); 164193115Ssamstatic int win_strncat_to_utf16be(struct archive_string *, const void *, 165193115Ssam size_t, struct archive_string_conv *); 166193115Ssamstatic int win_strncat_to_utf16le(struct archive_string *, const void *, 167193115Ssam size_t, struct archive_string_conv *); 168193115Ssam#endif 169195377Ssamstatic int best_effort_strncat_from_utf16be(struct archive_string *, 170195377Ssam const void *, size_t, struct archive_string_conv *); 171195377Ssamstatic int best_effort_strncat_from_utf16le(struct archive_string *, 172195377Ssam const void *, size_t, struct archive_string_conv *); 173195377Ssamstatic int best_effort_strncat_to_utf16be(struct archive_string *, 174195377Ssam const void *, size_t, struct archive_string_conv *); 175195377Ssamstatic int best_effort_strncat_to_utf16le(struct archive_string *, 176195377Ssam const void *, size_t, struct archive_string_conv *); 177195377Ssam#if defined(HAVE_ICONV) 178195377Ssamstatic int iconv_strncat_in_locale(struct archive_string *, const void *, 179178354Ssam size_t, struct archive_string_conv *); 180195377Ssam#endif 181178354Ssamstatic int best_effort_strncat_in_locale(struct archive_string *, 182195377Ssam const void *, size_t, struct archive_string_conv *); 183195377Ssamstatic int _utf8_to_unicode(uint32_t *, const char *, size_t); 184195377Ssamstatic int utf8_to_unicode(uint32_t *, const char *, size_t); 185178354Ssamstatic inline uint32_t combine_surrogate_pair(uint32_t, uint32_t); 186178354Ssamstatic int cesu8_to_unicode(uint32_t *, const char *, size_t); 187178354Ssamstatic size_t unicode_to_utf8(char *, size_t, uint32_t); 188178354Ssamstatic int utf16_to_unicode(uint32_t *, const char *, size_t, int); 189178354Ssamstatic size_t unicode_to_utf16be(char *, size_t, uint32_t); 190184280Ssamstatic size_t unicode_to_utf16le(char *, size_t, uint32_t); 191195377Ssamstatic int strncat_from_utf8_libarchive2(struct archive_string *, 192195377Ssam const void *, size_t, struct archive_string_conv *); 193195377Ssamstatic int strncat_from_utf8_to_utf8(struct archive_string *, const void *, 194195377Ssam size_t, struct archive_string_conv *); 195195377Ssamstatic int archive_string_normalize_C(struct archive_string *, const void *, 196195377Ssam size_t, struct archive_string_conv *); 197195377Ssamstatic int archive_string_normalize_D(struct archive_string *, const void *, 198195377Ssam size_t, struct archive_string_conv *); 199195377Ssamstatic int archive_string_append_unicode(struct archive_string *, 200195377Ssam const void *, size_t, struct archive_string_conv *); 201195377Ssam 202195377Ssamstatic struct archive_string * 203195377Ssamarchive_string_append(struct archive_string *as, const char *p, size_t s) 204195377Ssam{ 205195377Ssam if (archive_string_ensure(as, as->length + s + 1) == NULL) 206195377Ssam return (NULL); 207195377Ssam if (s) 208195377Ssam memmove(as->s + as->length, p, s); 209195377Ssam as->length += s; 210195377Ssam as->s[as->length] = 0; 211195377Ssam return (as); 212195377Ssam} 213178354Ssam 214195377Ssamstatic struct archive_wstring * 215170530Ssamarchive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s) 216178354Ssam{ 217178354Ssam if (archive_wstring_ensure(as, as->length + s + 1) == NULL) 218170530Ssam return (NULL); 219170530Ssam if (s) 220170530Ssam wmemmove(as->s + as->length, p, s); 221170530Ssam as->length += s; 222170530Ssam as->s[as->length] = 0; 223170530Ssam return (as); 224170530Ssam} 225170530Ssam 226184280Ssamstruct archive_string * 227184280Ssamarchive_array_append(struct archive_string *as, const char *p, size_t s) 228184280Ssam{ 229184280Ssam return archive_string_append(as, p, s); 230191552Ssam} 231191552Ssam 232191552Ssamvoid 233170530Ssamarchive_string_concat(struct archive_string *dest, struct archive_string *src) 234170530Ssam{ 235170530Ssam if (archive_string_append(dest, src->s, src->length) == NULL) 236170530Ssam __archive_errx(1, "Out of memory"); 237170530Ssam} 238195377Ssam 239170530Ssamvoid 240178354Ssamarchive_wstring_concat(struct archive_wstring *dest, 241170530Ssam struct archive_wstring *src) 242170530Ssam{ 243170530Ssam if (archive_wstring_append(dest, src->s, src->length) == NULL) 244184280Ssam __archive_errx(1, "Out of memory"); 245191552Ssam} 246191552Ssam 247170530Ssamvoid 248173273Ssamarchive_string_free(struct archive_string *as) 249173273Ssam{ 250178354Ssam as->length = 0; 251173273Ssam as->buffer_length = 0; 252178354Ssam free(as->s); 253178354Ssam as->s = NULL; 254178354Ssam} 255178354Ssam 256173273Ssamvoid 257178354Ssamarchive_wstring_free(struct archive_wstring *as) 258178354Ssam{ 259178354Ssam as->length = 0; 260178354Ssam as->buffer_length = 0; 261178354Ssam free(as->s); 262178354Ssam as->s = NULL; 263178354Ssam} 264178354Ssam 265178354Ssamstruct archive_wstring * 266178354Ssamarchive_wstring_ensure(struct archive_wstring *as, size_t s) 267178354Ssam{ 268178354Ssam return (struct archive_wstring *) 269178354Ssam archive_string_ensure((struct archive_string *)as, 270178354Ssam s * sizeof(wchar_t)); 271178354Ssam} 272178354Ssam 273170530Ssam/* Returns NULL on any allocation failure. */ 274173273Ssamstruct archive_string * 275173273Ssamarchive_string_ensure(struct archive_string *as, size_t s) 276170530Ssam{ 277170530Ssam char *p; 278193655Ssam size_t new_length; 279193655Ssam 280193655Ssam /* If buffer is already big enough, don't reallocate. */ 281178354Ssam if (as->s && (s <= as->buffer_length)) 282193655Ssam return (as); 283173273Ssam 284178354Ssam /* 285193655Ssam * Growing the buffer at least exponentially ensures that 286178354Ssam * append operations are always linear in the number of 287193655Ssam * characters appended. Using a smaller growth rate for 288170530Ssam * larger buffers reduces memory waste somewhat at the cost of 289183256Ssam * a larger constant factor. 290183256Ssam */ 291193655Ssam if (as->buffer_length < 32) 292183256Ssam /* Start with a minimum 32-character buffer. */ 293170530Ssam new_length = 32; 294193655Ssam else if (as->buffer_length < 8192) 295178354Ssam /* Buffers under 8k are doubled for speed. */ 296193655Ssam new_length = as->buffer_length + as->buffer_length; 297193655Ssam else { 298178354Ssam /* Buffers 8k and over grow by at least 25% each time. */ 299193655Ssam new_length = as->buffer_length + as->buffer_length / 4; 300170530Ssam /* Be safe: If size wraps, fail. */ 301178354Ssam if (new_length < as->buffer_length) { 302178354Ssam /* On failure, wipe the string and return NULL. */ 303193655Ssam archive_string_free(as); 304170530Ssam errno = ENOMEM;/* Make sure errno has ENOMEM. */ 305170530Ssam return (NULL); 306170530Ssam } 307178354Ssam } 308170530Ssam /* 309170530Ssam * The computation above is a lower limit to how much we'll 310170530Ssam * grow the buffer. In any case, we have to grow it enough to 311170530Ssam * hold the request. 312205277Srpaulo */ 313205277Srpaulo if (new_length < s) 314170530Ssam new_length = s; 315170530Ssam /* Now we can reallocate the buffer. */ 316170530Ssam p = (char *)realloc(as->s, new_length); 317205277Srpaulo if (p == NULL) { 318172226Ssam /* On failure, wipe the string and return NULL. */ 319172226Ssam archive_string_free(as); 320170530Ssam errno = ENOMEM;/* Make sure errno has ENOMEM. */ 321170530Ssam return (NULL); 322205277Srpaulo } 323205277Srpaulo 324205277Srpaulo as->s = p; 325205277Srpaulo as->buffer_length = new_length; 326205277Srpaulo return (as); 327205277Srpaulo} 328205277Srpaulo 329205277Srpaulo/* 330205277Srpaulo * TODO: See if there's a way to avoid scanning 331205277Srpaulo * the source string twice. Then test to see 332205277Srpaulo * if it actually helps (remember that we're almost 333205277Srpaulo * always called with pretty short arguments, so 334205277Srpaulo * such an optimization might not help). 335205277Srpaulo */ 336205277Srpaulostruct archive_string * 337205277Srpauloarchive_strncat(struct archive_string *as, const void *_p, size_t n) 338205277Srpaulo{ 339205277Srpaulo size_t s; 340170530Ssam const char *p, *pp; 341170530Ssam 342170530Ssam p = (const char *)_p; 343170530Ssam 344170530Ssam /* Like strlen(p), except won't examine positions beyond p[n]. */ 345170530Ssam s = 0; 346205277Srpaulo pp = p; 347205277Srpaulo while (s < n && *pp) { 348205277Srpaulo pp++; 349205277Srpaulo s++; 350205277Srpaulo } 351205281Srpaulo if ((as = archive_string_append(as, p, s)) == NULL) 352205277Srpaulo __archive_errx(1, "Out of memory"); 353205277Srpaulo return (as); 354205277Srpaulo} 355205277Srpaulo 356205277Srpaulostruct archive_wstring * 357205277Srpauloarchive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n) 358205277Srpaulo{ 359205277Srpaulo size_t s; 360205277Srpaulo const wchar_t *pp; 361205277Srpaulo 362205277Srpaulo /* Like strlen(p), except won't examine positions beyond p[n]. */ 363205277Srpaulo s = 0; 364205277Srpaulo pp = p; 365205277Srpaulo while (s < n && *pp) { 366170530Ssam pp++; 367170530Ssam s++; 368170530Ssam } 369170530Ssam if ((as = archive_wstring_append(as, p, s)) == NULL) 370170530Ssam __archive_errx(1, "Out of memory"); 371170530Ssam return (as); 372170530Ssam} 373170530Ssam 374170530Ssamstruct archive_string * 375170530Ssamarchive_strcat(struct archive_string *as, const void *p) 376170530Ssam{ 377170530Ssam /* strcat is just strncat without an effective limit. 378170530Ssam * Assert that we'll never get called with a source 379173273Ssam * string over 16MB. 380170530Ssam * TODO: Review all uses of strcat in the source 381170530Ssam * and try to replace them with strncat(). 382170530Ssam */ 383170530Ssam return archive_strncat(as, p, 0x1000000); 384170530Ssam} 385170530Ssam 386170530Ssamstruct archive_wstring * 387170530Ssamarchive_wstrcat(struct archive_wstring *as, const wchar_t *p) 388170530Ssam{ 389170530Ssam /* Ditto. */ 390170530Ssam return archive_wstrncat(as, p, 0x1000000); 391170530Ssam} 392170530Ssam 393170530Ssamstruct archive_string * 394178354Ssamarchive_strappend_char(struct archive_string *as, char c) 395173462Ssam{ 396170530Ssam if ((as = archive_string_append(as, &c, 1)) == NULL) 397170530Ssam __archive_errx(1, "Out of memory"); 398170530Ssam return (as); 399170530Ssam} 400170530Ssam 401178354Ssamstruct archive_wstring * 402170530Ssamarchive_wstrappend_wchar(struct archive_wstring *as, wchar_t c) 403170530Ssam{ 404170530Ssam if ((as = archive_wstring_append(as, &c, 1)) == NULL) 405170530Ssam __archive_errx(1, "Out of memory"); 406170530Ssam return (as); 407170530Ssam} 408170530Ssam 409170530Ssam/* 410170530Ssam * Get the "current character set" name to use with iconv. 411170530Ssam * On FreeBSD, the empty character set name "" chooses 412178354Ssam * the correct character encoding for the current locale, 413173462Ssam * so this isn't necessary. 414178354Ssam * But iconv on Mac OS 10.6 doesn't seem to handle this correctly; 415170530Ssam * on that system, we have to explicitly call nl_langinfo() 416170530Ssam * to get the right name. Not sure about other platforms. 417173462Ssam * 418170530Ssam * NOTE: GNU libiconv does not recognize the character-set name 419170530Ssam * which some platform nl_langinfo(CODESET) returns, so we should 420170530Ssam * use locale_charset() instead of nl_langinfo(CODESET) for GNU libiconv. 421178354Ssam */ 422170530Ssamstatic const char * 423170530Ssamdefault_iconv_charset(const char *charset) { 424178354Ssam if (charset != NULL && charset[0] != '\0') 425170530Ssam return charset; 426170530Ssam#if HAVE_LOCALE_CHARSET && !defined(__APPLE__) 427170530Ssam /* locale_charset() is broken on Mac OS */ 428178354Ssam return locale_charset(); 429170530Ssam#elif HAVE_NL_LANGINFO 430170530Ssam return nl_langinfo(CODESET); 431170530Ssam#else 432170530Ssam return ""; 433170530Ssam#endif 434170530Ssam} 435170530Ssam 436170530Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 437170530Ssam 438170530Ssam/* 439170530Ssam * Convert MBS to WCS. 440170530Ssam * Note: returns -1 if conversion fails. 441170530Ssam */ 442170530Ssamint 443170530Ssamarchive_wstring_append_from_mbs(struct archive_wstring *dest, 444170530Ssam const char *p, size_t len) 445170530Ssam{ 446170530Ssam return archive_wstring_append_from_mbs_in_codepage(dest, p, len, NULL); 447170530Ssam} 448170530Ssam 449170530Ssamstatic int 450170530Ssamarchive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest, 451170530Ssam const char *s, size_t length, struct archive_string_conv *sc) 452170530Ssam{ 453170530Ssam int count, ret = 0; 454170530Ssam UINT from_cp; 455170530Ssam 456170530Ssam if (sc != NULL) 457170530Ssam from_cp = sc->from_cp; 458170530Ssam else 459170530Ssam from_cp = get_current_codepage(); 460170530Ssam 461170530Ssam if (from_cp == CP_C_LOCALE) { 462170530Ssam /* 463170530Ssam * "C" locale special processing. 464170530Ssam */ 465178354Ssam wchar_t *ws; 466178354Ssam const unsigned char *mp; 467191552Ssam 468191552Ssam if (NULL == archive_wstring_ensure(dest, 469191552Ssam dest->length + length + 1)) 470178354Ssam return (-1); 471191552Ssam 472191552Ssam ws = dest->s + dest->length; 473178354Ssam mp = (const unsigned char *)s; 474178354Ssam count = 0; 475178354Ssam while (count < (int)length && *mp) { 476178354Ssam *ws++ = (wchar_t)*mp++; 477178354Ssam count++; 478178354Ssam } 479178354Ssam } else if (sc != NULL && 480178354Ssam (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) { 481178354Ssam /* 482178354Ssam * Normalize UTF-8 and UTF-16BE and convert it directly 483191552Ssam * to UTF-16 as wchar_t. 484178354Ssam */ 485191552Ssam struct archive_string u16; 486191552Ssam int saved_flag = sc->flag;/* save current flag. */ 487178354Ssam 488178354Ssam if (is_big_endian()) 489178354Ssam sc->flag |= SCONV_TO_UTF16BE; 490170530Ssam else 491170530Ssam sc->flag |= SCONV_TO_UTF16LE; 492170530Ssam 493191552Ssam if (sc->flag & SCONV_FROM_UTF16) { 494170530Ssam /* 495205277Srpaulo * UTF-16BE/LE NFD ===> UTF-16 NFC 496170530Ssam * UTF-16BE/LE NFC ===> UTF-16 NFD 497178354Ssam */ 498170530Ssam count = (int)utf16nbytes(s, length); 499170530Ssam } else { 500170530Ssam /* 501170530Ssam * UTF-8 NFD ===> UTF-16 NFC 502170530Ssam * UTF-8 NFC ===> UTF-16 NFD 503183247Ssam */ 504170530Ssam count = (int)mbsnbytes(s, length); 505170530Ssam } 506170530Ssam u16.s = (char *)dest->s; 507170530Ssam u16.length = dest->length << 1;; 508170530Ssam u16.buffer_length = dest->buffer_length; 509183247Ssam if (sc->flag & SCONV_NORMALIZATION_C) 510192468Ssam ret = archive_string_normalize_C(&u16, s, count, sc); 511192468Ssam else 512170530Ssam ret = archive_string_normalize_D(&u16, s, count, sc); 513170530Ssam dest->s = (wchar_t *)u16.s; 514170530Ssam dest->length = u16.length >> 1; 515170530Ssam dest->buffer_length = u16.buffer_length; 516170530Ssam sc->flag = saved_flag;/* restore the saved flag. */ 517170530Ssam return (ret); 518170530Ssam } else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) { 519170530Ssam count = (int)utf16nbytes(s, length); 520170530Ssam count >>= 1; /* to be WCS length */ 521170530Ssam /* Allocate memory for WCS. */ 522170530Ssam if (NULL == archive_wstring_ensure(dest, 523170530Ssam dest->length + count + 1)) 524170530Ssam return (-1); 525178354Ssam wmemcpy(dest->s + dest->length, (const wchar_t *)s, count); 526170530Ssam if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) { 527170530Ssam uint16_t *u16 = (uint16_t *)(dest->s + dest->length); 528170530Ssam int b; 529170530Ssam for (b = 0; b < count; b++) { 530170530Ssam uint16_t val = archive_le16dec(u16+b); 531170530Ssam archive_be16enc(u16+b, val); 532170530Ssam } 533170530Ssam } else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) { 534170530Ssam uint16_t *u16 = (uint16_t *)(dest->s + dest->length); 535170530Ssam int b; 536170530Ssam for (b = 0; b < count; b++) { 537170530Ssam uint16_t val = archive_be16dec(u16+b); 538170530Ssam archive_le16enc(u16+b, val); 539170530Ssam } 540170530Ssam } 541170530Ssam } else { 542170530Ssam DWORD mbflag; 543170530Ssam size_t buffsize; 544170530Ssam 545170530Ssam if (sc == NULL) 546170530Ssam mbflag = 0; 547170530Ssam else if (sc->flag & SCONV_FROM_CHARSET) { 548170530Ssam /* Do not trust the length which comes from 549170530Ssam * an archive file. */ 550170530Ssam length = mbsnbytes(s, length); 551170530Ssam mbflag = 0; 552170530Ssam } else 553170530Ssam mbflag = MB_PRECOMPOSED; 554170530Ssam 555178354Ssam mbflag |= MB_ERR_INVALID_CHARS; 556170530Ssam 557173273Ssam buffsize = dest->length + length + 1; 558173273Ssam do { 559173273Ssam /* Allocate memory for WCS. */ 560173273Ssam if (NULL == archive_wstring_ensure(dest, buffsize)) 561173273Ssam return (-1); 562178354Ssam /* Convert MBS to WCS. */ 563170530Ssam count = MultiByteToWideChar(from_cp, 564170530Ssam mbflag, s, (int)length, dest->s + dest->length, 565173273Ssam (int)(dest->buffer_length >> 1) -1); 566170530Ssam if (count == 0 && 567173273Ssam GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 568170530Ssam /* Expand the WCS buffer. */ 569170530Ssam buffsize = dest->buffer_length << 1; 570173273Ssam continue; 571170530Ssam } 572178354Ssam if (count == 0 && length != 0) 573170530Ssam ret = -1; 574170530Ssam break; 575170530Ssam } while (1); 576173273Ssam } 577170530Ssam dest->length += count; 578170530Ssam dest->s[dest->length] = L'\0'; 579170530Ssam return (ret); 580170530Ssam} 581170530Ssam 582173273Ssam#else 583178354Ssam 584173273Ssam/* 585170530Ssam * Convert MBS to WCS. 586173273Ssam * Note: returns -1 if conversion fails. 587170530Ssam */ 588170530Ssamint 589170530Ssamarchive_wstring_append_from_mbs(struct archive_wstring *dest, 590173273Ssam const char *p, size_t len) 591170530Ssam{ 592170530Ssam size_t r; 593173273Ssam int ret_val = 0; 594173273Ssam /* 595173273Ssam * No single byte will be more than one wide character, 596173273Ssam * so this length estimate will always be big enough. 597173273Ssam */ 598173273Ssam // size_t wcs_length = len; 599173273Ssam size_t mbs_length = len; 600173273Ssam const char *mbs = p; 601178354Ssam wchar_t *wcs; 602173273Ssam#if HAVE_MBRTOWC 603173273Ssam mbstate_t shift_state; 604173273Ssam 605173273Ssam memset(&shift_state, 0, sizeof(shift_state)); 606173273Ssam#endif 607173273Ssam /* 608173273Ssam * As we decided to have wcs_length == mbs_length == len 609173273Ssam * we can use len here instead of wcs_length 610173273Ssam */ 611173273Ssam if (NULL == archive_wstring_ensure(dest, dest->length + len + 1)) 612173273Ssam return (-1); 613173273Ssam wcs = dest->s + dest->length; 614173273Ssam /* 615173273Ssam * We cannot use mbsrtowcs/mbstowcs here because those may convert 616173273Ssam * extra MBS when strlen(p) > len and one wide character consists of 617173273Ssam * multi bytes. 618173273Ssam */ 619173273Ssam while (*mbs && mbs_length > 0) { 620178354Ssam /* 621173273Ssam * The buffer we allocated is always big enough. 622173273Ssam * Keep this code path in a comment if we decide to choose 623173273Ssam * smaller wcs_length in the future 624173273Ssam */ 625173273Ssam/* 626173273Ssam if (wcs_length == 0) { 627173273Ssam dest->length = wcs - dest->s; 628173273Ssam dest->s[dest->length] = L'\0'; 629173273Ssam wcs_length = mbs_length; 630173273Ssam if (NULL == archive_wstring_ensure(dest, 631173273Ssam dest->length + wcs_length + 1)) 632173273Ssam return (-1); 633173273Ssam wcs = dest->s + dest->length; 634173273Ssam } 635178354Ssam*/ 636178354Ssam#if HAVE_MBRTOWC 637178354Ssam r = mbrtowc(wcs, mbs, mbs_length, &shift_state); 638178354Ssam#else 639173273Ssam r = mbtowc(wcs, mbs, mbs_length); 640173273Ssam#endif 641173273Ssam if (r == (size_t)-1 || r == (size_t)-2) { 642173273Ssam ret_val = -1; 643173273Ssam break; 644173273Ssam } 645173273Ssam if (r == 0 || r > mbs_length) 646173273Ssam break; 647173273Ssam wcs++; 648173273Ssam // wcs_length--; 649173273Ssam mbs += r; 650173273Ssam mbs_length -= r; 651173273Ssam } 652178354Ssam dest->length = wcs - dest->s; 653173273Ssam dest->s[dest->length] = L'\0'; 654173273Ssam return (ret_val); 655173273Ssam} 656173273Ssam 657173273Ssam#endif 658173273Ssam 659173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 660173273Ssam 661173273Ssam/* 662173273Ssam * WCS ==> MBS. 663173273Ssam * Note: returns -1 if conversion fails. 664170530Ssam * 665170530Ssam * Win32 builds use WideCharToMultiByte from the Windows API. 666170530Ssam * (Maybe Cygwin should too? WideCharToMultiByte will know a 667173273Ssam * lot more about local character encodings than the wcrtomb() 668170530Ssam * wrapper is going to know.) 669170530Ssam */ 670170530Ssamint 671170530Ssamarchive_string_append_from_wcs(struct archive_string *as, 672170530Ssam const wchar_t *w, size_t len) 673170530Ssam{ 674170530Ssam return archive_string_append_from_wcs_in_codepage(as, w, len, NULL); 675170530Ssam} 676173273Ssam 677173273Ssamstatic int 678178354Ssamarchive_string_append_from_wcs_in_codepage(struct archive_string *as, 679170530Ssam const wchar_t *ws, size_t len, struct archive_string_conv *sc) 680170530Ssam{ 681170530Ssam BOOL defchar_used, *dp; 682170530Ssam int count, ret = 0; 683170530Ssam UINT to_cp; 684170530Ssam int wslen = (int)len; 685183247Ssam 686183247Ssam if (sc != NULL) 687170530Ssam to_cp = sc->to_cp; 688170530Ssam else 689170530Ssam to_cp = get_current_codepage(); 690170530Ssam 691183247Ssam if (to_cp == CP_C_LOCALE) { 692183247Ssam /* 693183247Ssam * "C" locale special processing. 694183247Ssam */ 695183247Ssam const wchar_t *wp = ws; 696183247Ssam char *p; 697183247Ssam 698193840Ssam if (NULL == archive_string_ensure(as, 699173273Ssam as->length + wslen +1)) 700173273Ssam return (-1); 701173273Ssam p = as->s + as->length; 702173273Ssam count = 0; 703170530Ssam defchar_used = 0; 704170530Ssam while (count < wslen && *wp) { 705170530Ssam if (*wp > 255) { 706170530Ssam *p++ = '?'; 707170530Ssam wp++; 708173273Ssam defchar_used = 1; 709170530Ssam } else 710182827Ssam *p++ = (char)*wp++; 711182827Ssam count++; 712182827Ssam } 713182827Ssam } else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) { 714182827Ssam uint16_t *u16; 715182827Ssam 716182827Ssam if (NULL == 717182827Ssam archive_string_ensure(as, as->length + len * 2 + 2)) 718182827Ssam return (-1); 719182827Ssam u16 = (uint16_t *)(as->s + as->length); 720182827Ssam count = 0; 721182827Ssam defchar_used = 0; 722182827Ssam if (sc->flag & SCONV_TO_UTF16BE) { 723182827Ssam while (count < (int)len && *ws) { 724182827Ssam archive_be16enc(u16+count, *ws); 725173273Ssam ws++; 726173273Ssam count++; 727170530Ssam } 728170530Ssam } else { 729170530Ssam while (count < (int)len && *ws) { 730170530Ssam archive_le16enc(u16+count, *ws); 731170530Ssam ws++; 732170530Ssam count++; 733170530Ssam } 734170530Ssam } 735170530Ssam count <<= 1; /* to be byte size */ 736170530Ssam } else { 737170530Ssam /* Make sure the MBS buffer has plenty to set. */ 738173273Ssam if (NULL == 739170530Ssam archive_string_ensure(as, as->length + len * 2 + 1)) 740170530Ssam return (-1); 741170530Ssam do { 742170530Ssam defchar_used = 0; 743170530Ssam if (to_cp == CP_UTF8 || sc == NULL) 744170530Ssam dp = NULL; 745173273Ssam else 746170530Ssam dp = &defchar_used; 747170530Ssam count = WideCharToMultiByte(to_cp, 0, ws, wslen, 748170530Ssam as->s + as->length, 749173273Ssam (int)as->buffer_length - (int)as->length - 1, NULL, dp); 750170530Ssam if (count == 0 && 751170530Ssam GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 752170530Ssam /* Expand the MBS buffer and retry. */ 753173273Ssam if (NULL == archive_string_ensure(as, 754170530Ssam as->buffer_length + len)) 755173273Ssam return (-1); 756205277Srpaulo continue; 757173273Ssam } 758173273Ssam if (count == 0) 759173273Ssam ret = -1; 760173273Ssam break; 761173273Ssam } while (1); 762173273Ssam } 763173273Ssam as->length += count; 764173273Ssam as->s[as->length] = '\0'; 765173273Ssam return (defchar_used?-1:ret); 766173273Ssam} 767173273Ssam 768173273Ssam#elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB) 769170530Ssam 770173273Ssam/* 771173273Ssam * Translates a wide character string into current locale character set 772173273Ssam * and appends to the archive_string. Note: returns -1 if conversion 773173273Ssam * fails. 774173273Ssam */ 775170530Ssamint 776173273Ssamarchive_string_append_from_wcs(struct archive_string *as, 777173273Ssam const wchar_t *w, size_t len) 778173273Ssam{ 779173273Ssam /* We cannot use the standard wcstombs() here because it 780173273Ssam * cannot tell us how big the output buffer should be. So 781173273Ssam * I've built a loop around wcrtomb() or wctomb() that 782173273Ssam * converts a character at a time and resizes the string as 783173273Ssam * needed. We prefer wcrtomb() when it's available because 784178354Ssam * it's thread-safe. */ 785173273Ssam int n, ret_val = 0; 786173273Ssam char *p; 787173273Ssam char *end; 788173273Ssam#if HAVE_WCRTOMB 789173273Ssam mbstate_t shift_state; 790173273Ssam 791173273Ssam memset(&shift_state, 0, sizeof(shift_state)); 792173273Ssam#else 793173273Ssam /* Clear the shift state before starting. */ 794173273Ssam wctomb(NULL, L'\0'); 795173273Ssam#endif 796173273Ssam /* 797173273Ssam * Allocate buffer for MBS. 798173273Ssam * We need this allocation here since it is possible that 799173273Ssam * as->s is still NULL. 800173273Ssam */ 801173273Ssam if (archive_string_ensure(as, as->length + len + 1) == NULL) 802173273Ssam return (-1); 803178354Ssam 804173273Ssam p = as->s + as->length; 805178354Ssam end = as->s + as->buffer_length - MB_CUR_MAX -1; 806173273Ssam while (*w != L'\0' && len > 0) { 807173273Ssam if (p >= end) { 808173273Ssam as->length = p - as->s; 809173273Ssam as->s[as->length] = '\0'; 810173273Ssam /* Re-allocate buffer for MBS. */ 811178354Ssam if (archive_string_ensure(as, 812173273Ssam as->length + max(len * 2, 813173273Ssam (size_t)MB_CUR_MAX) + 1) == NULL) 814173273Ssam return (-1); 815173273Ssam p = as->s + as->length; 816173273Ssam end = as->s + as->buffer_length - MB_CUR_MAX -1; 817173273Ssam } 818173273Ssam#if HAVE_WCRTOMB 819173273Ssam n = wcrtomb(p, *w++, &shift_state); 820173273Ssam#else 821205277Srpaulo n = wctomb(p, *w++); 822173273Ssam#endif 823178354Ssam if (n == -1) { 824173273Ssam if (errno == EILSEQ) { 825170530Ssam /* Skip an illegal wide char. */ 826173273Ssam *p++ = '?'; 827170530Ssam ret_val = -1; 828178354Ssam } else { 829170530Ssam ret_val = -1; 830173273Ssam break; 831173273Ssam } 832173273Ssam } else 833173273Ssam p += n; 834173273Ssam len--; 835173273Ssam } 836173273Ssam as->length = p - as->s; 837173273Ssam as->s[as->length] = '\0'; 838173273Ssam return (ret_val); 839173273Ssam} 840173273Ssam 841173273Ssam#else /* HAVE_WCTOMB || HAVE_WCRTOMB */ 842170530Ssam 843170530Ssam/* 844173273Ssam * TODO: Test if __STDC_ISO_10646__ is defined. 845205277Srpaulo * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion 846170530Ssam * one character at a time. If a non-Windows platform doesn't have 847178354Ssam * either of these, fall back to the built-in UTF8 conversion. 848173273Ssam */ 849178354Ssamint 850173273Ssamarchive_string_append_from_wcs(struct archive_string *as, 851173273Ssam const wchar_t *w, size_t len) 852173273Ssam{ 853173273Ssam (void)as;/* UNUSED */ 854178354Ssam (void)w;/* UNUSED */ 855173273Ssam (void)len;/* UNUSED */ 856170530Ssam errno = ENOSYS; 857173273Ssam return (-1); 858170530Ssam} 859173273Ssam 860173273Ssam#endif /* HAVE_WCTOMB || HAVE_WCRTOMB */ 861170530Ssam 862170530Ssam/* 863170530Ssam * Find a string conversion object by a pair of 'from' charset name 864170530Ssam * and 'to' charset name from an archive object. 865170530Ssam * Return NULL if not found. 866170530Ssam */ 867173273Ssamstatic struct archive_string_conv * 868170530Ssamfind_sconv_object(struct archive *a, const char *fc, const char *tc) 869170530Ssam{ 870170530Ssam struct archive_string_conv *sc; 871170530Ssam 872178354Ssam if (a == NULL) 873170530Ssam return (NULL); 874170530Ssam 875170530Ssam for (sc = a->sconv; sc != NULL; sc = sc->next) { 876170530Ssam if (strcmp(sc->from_charset, fc) == 0 && 877170530Ssam strcmp(sc->to_charset, tc) == 0) 878173273Ssam break; 879173273Ssam } 880178354Ssam return (sc); 881173273Ssam} 882173273Ssam 883178354Ssam/* 884173273Ssam * Register a string object to an archive object. 885173273Ssam */ 886170530Ssamstatic void 887170530Ssamadd_sconv_object(struct archive *a, struct archive_string_conv *sc) 888170530Ssam{ 889170530Ssam struct archive_string_conv **psc; 890170530Ssam 891170530Ssam /* Add a new sconv to sconv list. */ 892170530Ssam psc = &(a->sconv); 893170530Ssam while (*psc != NULL) 894178354Ssam psc = &((*psc)->next); 895170530Ssam *psc = sc; 896170530Ssam} 897178354Ssam 898170530Ssamstatic void 899170530Ssamadd_converter(struct archive_string_conv *sc, int (*converter) 900178354Ssam (struct archive_string *, const void *, size_t, 901170530Ssam struct archive_string_conv *)) 902173273Ssam{ 903173273Ssam if (sc == NULL || sc->nconverter >= 2) 904170530Ssam __archive_errx(1, "Programming error"); 905170530Ssam sc->converter[sc->nconverter++] = converter; 906173273Ssam} 907170530Ssam 908173273Ssamstatic void 909205277Srpaulosetup_converter(struct archive_string_conv *sc) 910170530Ssam{ 911178354Ssam 912173273Ssam /* Reset. */ 913170530Ssam sc->nconverter = 0; 914173273Ssam 915173273Ssam /* 916178354Ssam * Perform special sequence for the incorrect UTF-8 filenames 917173273Ssam * made by libarchive2.x. 918173273Ssam */ 919173273Ssam if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) { 920173273Ssam add_converter(sc, strncat_from_utf8_libarchive2); 921173273Ssam return; 922173273Ssam } 923173273Ssam 924173273Ssam /* 925173273Ssam * Convert a string to UTF-16BE/LE. 926170530Ssam */ 927173273Ssam if (sc->flag & SCONV_TO_UTF16) { 928170530Ssam /* 929173273Ssam * If the current locale is UTF-8, we can translate 930205277Srpaulo * a UTF-8 string into a UTF-16BE string. 931170530Ssam */ 932178354Ssam if (sc->flag & SCONV_FROM_UTF8) { 933173273Ssam add_converter(sc, archive_string_append_unicode); 934173273Ssam return; 935173273Ssam } 936173273Ssam 937173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 938173273Ssam if (sc->flag & SCONV_WIN_CP) { 939178354Ssam if (sc->flag & SCONV_TO_UTF16BE) 940173273Ssam add_converter(sc, win_strncat_to_utf16be); 941170530Ssam else 942170530Ssam add_converter(sc, win_strncat_to_utf16le); 943170530Ssam return; 944170530Ssam } 945170530Ssam#endif 946170530Ssam 947170530Ssam#if defined(HAVE_ICONV) 948170530Ssam if (sc->cd != (iconv_t)-1) { 949170530Ssam add_converter(sc, iconv_strncat_in_locale); 950183254Ssam return; 951170530Ssam } 952170530Ssam#endif 953170530Ssam 954170530Ssam if (sc->flag & SCONV_BEST_EFFORT) { 955173273Ssam if (sc->flag & SCONV_TO_UTF16BE) 956173273Ssam add_converter(sc, 957173273Ssam best_effort_strncat_to_utf16be); 958173273Ssam else 959173273Ssam add_converter(sc, 960173273Ssam best_effort_strncat_to_utf16le); 961173273Ssam } else 962173273Ssam /* Make sure we have no converter. */ 963170530Ssam sc->nconverter = 0; 964170530Ssam return; 965170530Ssam } 966184280Ssam 967173273Ssam /* 968170530Ssam * Convert a string from UTF-16BE/LE. 969173273Ssam */ 970170530Ssam if (sc->flag & SCONV_FROM_UTF16) { 971170530Ssam /* 972170530Ssam * At least we should normalize a UTF-16BE string. 973170530Ssam */ 974170530Ssam if (sc->flag & SCONV_NORMALIZATION_D) 975170530Ssam add_converter(sc,archive_string_normalize_D); 976170530Ssam else if (sc->flag & SCONV_NORMALIZATION_C) 977170530Ssam add_converter(sc, archive_string_normalize_C); 978170530Ssam 979191552Ssam if (sc->flag & SCONV_TO_UTF8) { 980170530Ssam /* 981170530Ssam * If the current locale is UTF-8, we can translate 982170530Ssam * a UTF-16BE/LE string into a UTF-8 string directly. 983170530Ssam */ 984170530Ssam if (!(sc->flag & 985170530Ssam (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) 986170530Ssam add_converter(sc, 987184280Ssam archive_string_append_unicode); 988184280Ssam return; 989170530Ssam } 990170530Ssam 991191552Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 992170530Ssam if (sc->flag & SCONV_WIN_CP) { 993170530Ssam if (sc->flag & SCONV_FROM_UTF16BE) 994182828Ssam add_converter(sc, win_strncat_from_utf16be); 995170530Ssam else 996170530Ssam add_converter(sc, win_strncat_from_utf16le); 997178354Ssam return; 998178354Ssam } 999178354Ssam#endif 1000178354Ssam 1001178354Ssam#if defined(HAVE_ICONV) 1002178354Ssam if (sc->cd != (iconv_t)-1) { 1003178354Ssam add_converter(sc, iconv_strncat_in_locale); 1004178354Ssam return; 1005178354Ssam } 1006178354Ssam#endif 1007178354Ssam 1008178354Ssam if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) 1009178354Ssam == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) 1010178354Ssam add_converter(sc, best_effort_strncat_from_utf16be); 1011178354Ssam else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) 1012178354Ssam == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) 1013178354Ssam add_converter(sc, best_effort_strncat_from_utf16le); 1014178354Ssam else 1015178354Ssam /* Make sure we have no converter. */ 1016178354Ssam sc->nconverter = 0; 1017178354Ssam return; 1018178354Ssam } 1019178354Ssam 1020178354Ssam if (sc->flag & SCONV_FROM_UTF8) { 1021178354Ssam /* 1022178354Ssam * At least we should normalize a UTF-8 string. 1023178354Ssam */ 1024178354Ssam if (sc->flag & SCONV_NORMALIZATION_D) 1025178354Ssam add_converter(sc,archive_string_normalize_D); 1026178354Ssam else if (sc->flag & SCONV_NORMALIZATION_C) 1027178354Ssam add_converter(sc, archive_string_normalize_C); 1028178354Ssam 1029178354Ssam /* 1030178354Ssam * Copy UTF-8 string with a check of CESU-8. 1031178354Ssam * Apparently, iconv does not check surrogate pairs in UTF-8 1032178354Ssam * when both from-charset and to-charset are UTF-8, and then 1033178354Ssam * we use our UTF-8 copy code. 1034178354Ssam */ 1035178354Ssam if (sc->flag & SCONV_TO_UTF8) { 1036173273Ssam /* 1037173273Ssam * If the current locale is UTF-8, we can translate 1038173273Ssam * a UTF-16BE string into a UTF-8 string directly. 1039173273Ssam */ 1040173273Ssam if (!(sc->flag & 1041173273Ssam (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) 1042173273Ssam add_converter(sc, strncat_from_utf8_to_utf8); 1043173273Ssam return; 1044173273Ssam } 1045173273Ssam } 1046173273Ssam 1047173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 1048173273Ssam /* 1049173273Ssam * On Windows we can use Windows API for a string conversion. 1050173273Ssam */ 1051173273Ssam if (sc->flag & SCONV_WIN_CP) { 1052193655Ssam add_converter(sc, strncat_in_codepage); 1053173273Ssam return; 1054193655Ssam } 1055173273Ssam#endif 1056173273Ssam 1057173273Ssam#if HAVE_ICONV 1058173273Ssam if (sc->cd != (iconv_t)-1) { 1059173273Ssam add_converter(sc, iconv_strncat_in_locale); 1060173273Ssam /* 1061173273Ssam * iconv generally does not support UTF-8-MAC and so 1062173273Ssam * we have to the output of iconv from NFC to NFD if 1063173273Ssam * need. 1064173273Ssam */ 1065173273Ssam if ((sc->flag & SCONV_FROM_CHARSET) && 1066173273Ssam (sc->flag & SCONV_TO_UTF8)) { 1067173273Ssam if (sc->flag & SCONV_NORMALIZATION_D) 1068173273Ssam add_converter(sc, archive_string_normalize_D); 1069173273Ssam } 1070173273Ssam return; 1071173273Ssam } 1072173273Ssam#endif 1073173273Ssam 1074173273Ssam /* 1075173273Ssam * Try conversion in the best effort or no conversion. 1076173273Ssam */ 1077173273Ssam if ((sc->flag & SCONV_BEST_EFFORT) || sc->same) 1078173273Ssam add_converter(sc, best_effort_strncat_in_locale); 1079173273Ssam else 1080173273Ssam /* Make sure we have no converter. */ 1081173273Ssam sc->nconverter = 0; 1082173273Ssam} 1083173273Ssam 1084173273Ssam/* 1085173273Ssam * Return canonicalized charset-name but this supports just UTF-8, UTF-16BE 1086173273Ssam * and CP932 which are referenced in create_sconv_object(). 1087173273Ssam */ 1088178354Ssamstatic const char * 1089173273Ssamcanonical_charset_name(const char *charset) 1090173273Ssam{ 1091173273Ssam char cs[16]; 1092193655Ssam char *p; 1093173273Ssam const char *s; 1094173273Ssam 1095173273Ssam if (charset == NULL || charset[0] == '\0' 1096173273Ssam || strlen(charset) > 15) 1097173273Ssam return (charset); 1098173273Ssam 1099173273Ssam /* Copy name to uppercase. */ 1100173273Ssam p = cs; 1101178354Ssam s = charset; 1102178354Ssam while (*s) { 1103173273Ssam char c = *s++; 1104173273Ssam if (c >= 'a' && c <= 'z') 1105193655Ssam c -= 'a' - 'A'; 1106173273Ssam *p++ = c; 1107173273Ssam } 1108173273Ssam *p++ = '\0'; 1109173273Ssam 1110173273Ssam if (strcmp(cs, "UTF-8") == 0 || 1111173273Ssam strcmp(cs, "UTF8") == 0) 1112173273Ssam return ("UTF-8"); 1113173273Ssam if (strcmp(cs, "UTF-16BE") == 0 || 1114193655Ssam strcmp(cs, "UTF16BE") == 0) 1115173273Ssam return ("UTF-16BE"); 1116173273Ssam if (strcmp(cs, "UTF-16LE") == 0 || 1117173273Ssam strcmp(cs, "UTF16LE") == 0) 1118173273Ssam return ("UTF-16LE"); 1119173273Ssam if (strcmp(cs, "CP932") == 0) 1120173273Ssam return ("CP932"); 1121193655Ssam return (charset); 1122183256Ssam} 1123183256Ssam 1124173273Ssam/* 1125173273Ssam * Create a string conversion object. 1126173273Ssam */ 1127173273Ssamstatic struct archive_string_conv * 1128173273Ssamcreate_sconv_object(const char *fc, const char *tc, 1129173273Ssam unsigned current_codepage, int flag) 1130173273Ssam{ 1131173273Ssam struct archive_string_conv *sc; 1132193655Ssam 1133173273Ssam sc = calloc(1, sizeof(*sc)); 1134173273Ssam if (sc == NULL) 1135173273Ssam return (NULL); 1136173273Ssam sc->next = NULL; 1137173273Ssam sc->from_charset = strdup(fc); 1138173273Ssam if (sc->from_charset == NULL) { 1139173273Ssam free(sc); 1140173273Ssam return (NULL); 1141173273Ssam } 1142178354Ssam sc->to_charset = strdup(tc); 1143178354Ssam if (sc->to_charset == NULL) { 1144178354Ssam free(sc->from_charset); 1145178354Ssam free(sc); 1146178354Ssam return (NULL); 1147178354Ssam } 1148178354Ssam archive_string_init(&sc->utftmp); 1149178354Ssam 1150183253Ssam if (flag & SCONV_TO_CHARSET) { 1151183253Ssam /* 1152183253Ssam * Convert characters from the current locale charset to 1153178354Ssam * a specified charset. 1154178354Ssam */ 1155178354Ssam sc->from_cp = current_codepage; 1156178354Ssam sc->to_cp = make_codepage_from_charset(tc); 1157178354Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 1158178354Ssam if (IsValidCodePage(sc->to_cp)) 1159178354Ssam flag |= SCONV_WIN_CP; 1160178354Ssam#endif 1161178354Ssam } else if (flag & SCONV_FROM_CHARSET) { 1162193655Ssam /* 1163178354Ssam * Convert characters from a specified charset to 1164178354Ssam * the current locale charset. 1165178354Ssam */ 1166178354Ssam sc->to_cp = current_codepage; 1167178354Ssam sc->from_cp = make_codepage_from_charset(fc); 1168178354Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 1169173273Ssam if (IsValidCodePage(sc->from_cp)) 1170173273Ssam flag |= SCONV_WIN_CP; 1171173273Ssam#endif 1172173273Ssam } 1173173273Ssam 1174173273Ssam /* 1175173273Ssam * Check if "from charset" and "to charset" are the same. 1176173273Ssam */ 1177173273Ssam if (strcmp(fc, tc) == 0 || 1178173273Ssam (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp)) 1179173273Ssam sc->same = 1; 1180178354Ssam else 1181178354Ssam sc->same = 0; 1182178354Ssam 1183193655Ssam /* 1184173273Ssam * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE. 1185178354Ssam */ 1186178354Ssam if (strcmp(tc, "UTF-8") == 0) 1187178354Ssam flag |= SCONV_TO_UTF8; 1188173273Ssam else if (strcmp(tc, "UTF-16BE") == 0) 1189173273Ssam flag |= SCONV_TO_UTF16BE; 1190173273Ssam else if (strcmp(tc, "UTF-16LE") == 0) 1191173273Ssam flag |= SCONV_TO_UTF16LE; 1192173273Ssam if (strcmp(fc, "UTF-8") == 0) 1193173273Ssam flag |= SCONV_FROM_UTF8; 1194173273Ssam else if (strcmp(fc, "UTF-16BE") == 0) 1195173273Ssam flag |= SCONV_FROM_UTF16BE; 1196173273Ssam else if (strcmp(fc, "UTF-16LE") == 0) 1197173273Ssam flag |= SCONV_FROM_UTF16LE; 1198173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 1199173273Ssam if (sc->to_cp == CP_UTF8) 1200173273Ssam flag |= SCONV_TO_UTF8; 1201173273Ssam else if (sc->to_cp == CP_UTF16BE) 1202173273Ssam flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP; 1203173273Ssam else if (sc->to_cp == CP_UTF16LE) 1204173273Ssam flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP; 1205173273Ssam if (sc->from_cp == CP_UTF8) 1206173273Ssam flag |= SCONV_FROM_UTF8; 1207173273Ssam else if (sc->from_cp == CP_UTF16BE) 1208173273Ssam flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP; 1209173273Ssam else if (sc->from_cp == CP_UTF16LE) 1210173273Ssam flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP; 1211173273Ssam#endif 1212173273Ssam 1213173273Ssam /* 1214173273Ssam * Set a flag for Unicode NFD. Usually iconv cannot correctly 1215173273Ssam * handle it. So we have to translate NFD characters to NFC ones 1216173273Ssam * ourselves before iconv handles. Another reason is to prevent 1217173273Ssam * that the same sight of two filenames, one is NFC and other 1218173273Ssam * is NFD, would be in its directory. 1219173273Ssam * On Mac OS X, although its filesystem layer automatically 1220173273Ssam * convert filenames to NFD, it would be useful for filename 1221173273Ssam * comparing to find out the same filenames that we normalize 1222173273Ssam * that to be NFD ourselves. 1223173273Ssam */ 1224173273Ssam if ((flag & SCONV_FROM_CHARSET) && 1225173273Ssam (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) { 1226173273Ssam#if defined(__APPLE__) 1227173273Ssam if (flag & SCONV_TO_UTF8) 1228173273Ssam flag |= SCONV_NORMALIZATION_D; 1229173273Ssam else 1230173273Ssam#endif 1231173273Ssam flag |= SCONV_NORMALIZATION_C; 1232173273Ssam } 1233173273Ssam#if defined(__APPLE__) 1234173273Ssam /* 1235173273Ssam * In case writing an archive file, make sure that a filename 1236173273Ssam * going to be passed to iconv is a Unicode NFC string since 1237178354Ssam * a filename in HFS Plus filesystem is a Unicode NFD one and 1238178354Ssam * iconv cannot handle it with "UTF-8" charset. It is simpler 1239178354Ssam * than a use of "UTF-8-MAC" charset. 1240178354Ssam */ 1241178354Ssam if ((flag & SCONV_TO_CHARSET) && 1242178354Ssam (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1243178354Ssam !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) 1244178354Ssam flag |= SCONV_NORMALIZATION_C; 1245178354Ssam /* 1246173273Ssam * In case reading an archive file. make sure that a filename 1247173273Ssam * will be passed to users is a Unicode NFD string in order to 1248178354Ssam * correctly compare the filename with other one which comes 1249173273Ssam * from HFS Plus filesystem. 1250178354Ssam */ 1251183246Ssam if ((flag & SCONV_FROM_CHARSET) && 1252178354Ssam !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1253178354Ssam (flag & SCONV_TO_UTF8)) 1254178354Ssam flag |= SCONV_NORMALIZATION_D; 1255183246Ssam#endif 1256193655Ssam 1257178354Ssam#if defined(HAVE_ICONV) 1258178354Ssam sc->cd_w = (iconv_t)-1; 1259183246Ssam /* 1260183246Ssam * Create an iconv object. 1261183246Ssam */ 1262183246Ssam if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) && 1263183246Ssam (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) || 1264183246Ssam (flag & SCONV_WIN_CP)) { 1265183246Ssam /* This case we won't use iconv. */ 1266181197Ssam sc->cd = (iconv_t)-1; 1267178354Ssam } else { 1268173273Ssam sc->cd = iconv_open(tc, fc); 1269173273Ssam if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) { 1270173273Ssam /* 1271173273Ssam * Unfortunately, all of iconv implements do support 1272173273Ssam * "CP932" character-set, so we should use "SJIS" 1273173273Ssam * instead if iconv_open failed. 1274173273Ssam */ 1275173273Ssam if (strcmp(tc, "CP932") == 0) 1276173273Ssam sc->cd = iconv_open("SJIS", fc); 1277173273Ssam else if (strcmp(fc, "CP932") == 0) 1278173273Ssam sc->cd = iconv_open(tc, "SJIS"); 1279173273Ssam } 1280173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 1281173273Ssam /* 1282173273Ssam * archive_mstring on Windows directly convert multi-bytes 1283193655Ssam * into archive_wstring in order not to depend on locale 1284173273Ssam * so that you can do a I18N programming. This will be 1285173273Ssam * used only in archive_mstring_copy_mbs_len_l so far. 1286178354Ssam */ 1287173273Ssam if (flag & SCONV_FROM_CHARSET) { 1288173273Ssam sc->cd_w = iconv_open("UTF-8", fc); 1289193655Ssam if (sc->cd_w == (iconv_t)-1 && 1290173273Ssam (sc->flag & SCONV_BEST_EFFORT)) { 1291173273Ssam if (strcmp(fc, "CP932") == 0) 1292173273Ssam sc->cd_w = iconv_open("UTF-8", "SJIS"); 1293173273Ssam } 1294170530Ssam } 1295170530Ssam#endif /* _WIN32 && !__CYGWIN__ */ 1296170530Ssam } 1297170530Ssam#endif /* HAVE_ICONV */ 1298170530Ssam 1299170530Ssam sc->flag = flag; 1300170530Ssam 1301170530Ssam /* 1302170530Ssam * Set up converters. 1303170530Ssam */ 1304170530Ssam setup_converter(sc); 1305170530Ssam 1306170530Ssam return (sc); 1307170530Ssam} 1308170530Ssam 1309170530Ssam/* 1310170530Ssam * Free a string conversion object. 1311170530Ssam */ 1312170530Ssamstatic void 1313170530Ssamfree_sconv_object(struct archive_string_conv *sc) 1314170530Ssam{ 1315170530Ssam free(sc->from_charset); 1316170530Ssam free(sc->to_charset); 1317170530Ssam archive_string_free(&sc->utftmp); 1318170530Ssam#if HAVE_ICONV 1319170530Ssam if (sc->cd != (iconv_t)-1) 1320170530Ssam iconv_close(sc->cd); 1321170530Ssam if (sc->cd_w != (iconv_t)-1) 1322183254Ssam iconv_close(sc->cd_w); 1323183254Ssam#endif 1324183254Ssam free(sc); 1325170530Ssam} 1326170530Ssam 1327170530Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 1328170530Ssam# if defined(WINAPI_FAMILY_PARTITION) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) 1329170530Ssam# define GetOEMCP() CP_OEMCP 1330172055Ssam# endif 1331170530Ssam 1332170530Ssamstatic unsigned 1333170530Ssammy_atoi(const char *p) 1334183254Ssam{ 1335183254Ssam unsigned cp; 1336183254Ssam 1337183254Ssam cp = 0; 1338183254Ssam while (*p) { 1339183254Ssam if (*p >= '0' && *p <= '9') 1340183254Ssam cp = cp * 10 + (*p - '0'); 1341183254Ssam else 1342183254Ssam return (-1); 1343183254Ssam p++; 1344183254Ssam } 1345183254Ssam return (cp); 1346183254Ssam} 1347183254Ssam 1348183254Ssam/* 1349183254Ssam * Translate Charset name (as used by iconv) into CodePage (as used by Windows) 1350183254Ssam * Return -1 if failed. 1351183254Ssam * 1352183254Ssam * Note: This translation code may be insufficient. 1353183254Ssam */ 1354183254Ssamstatic struct charset { 1355183254Ssam const char *name; 1356183254Ssam unsigned cp; 1357183254Ssam} charsets[] = { 1358183254Ssam /* MUST BE SORTED! */ 1359183254Ssam {"ASCII", 1252}, 1360183254Ssam {"ASMO-708", 708}, 1361183254Ssam {"BIG5", 950}, 1362183254Ssam {"CHINESE", 936}, 1363183254Ssam {"CP367", 1252}, 1364183254Ssam {"CP819", 1252}, 1365173273Ssam {"CP1025", 21025}, 1366173273Ssam {"DOS-720", 720}, 1367183254Ssam {"DOS-862", 862}, 1368173273Ssam {"EUC-CN", 51936}, 1369178354Ssam {"EUC-JP", 51932}, 1370173273Ssam {"EUC-KR", 949}, 1371173273Ssam {"EUCCN", 51936}, 1372173273Ssam {"EUCJP", 51932}, 1373173273Ssam {"EUCKR", 949}, 1374173273Ssam {"GB18030", 54936}, 1375183254Ssam {"GB2312", 936}, 1376183254Ssam {"HEBREW", 1255}, 1377173273Ssam {"HZ-GB-2312", 52936}, 1378173273Ssam {"IBM273", 20273}, 1379173273Ssam {"IBM277", 20277}, 1380183254Ssam {"IBM278", 20278}, 1381173273Ssam {"IBM280", 20280}, 1382173273Ssam {"IBM284", 20284}, 1383173273Ssam {"IBM285", 20285}, 1384183254Ssam {"IBM290", 20290}, 1385173273Ssam {"IBM297", 20297}, 1386173273Ssam {"IBM367", 1252}, 1387173273Ssam {"IBM420", 20420}, 1388173273Ssam {"IBM423", 20423}, 1389173273Ssam {"IBM424", 20424}, 1390173273Ssam {"IBM819", 1252}, 1391173273Ssam {"IBM871", 20871}, 1392173273Ssam {"IBM880", 20880}, 1393173273Ssam {"IBM905", 20905}, 1394173273Ssam {"IBM924", 20924}, 1395170530Ssam {"ISO-8859-1", 28591}, 1396170530Ssam {"ISO-8859-13", 28603}, 1397170530Ssam {"ISO-8859-15", 28605}, 1398183255Ssam {"ISO-8859-2", 28592}, 1399183255Ssam {"ISO-8859-3", 28593}, 1400183255Ssam {"ISO-8859-4", 28594}, 1401183255Ssam {"ISO-8859-5", 28595}, 1402183255Ssam {"ISO-8859-6", 28596}, 1403183255Ssam {"ISO-8859-7", 28597}, 1404183255Ssam {"ISO-8859-8", 28598}, 1405183255Ssam {"ISO-8859-9", 28599}, 1406183255Ssam {"ISO8859-1", 28591}, 1407183255Ssam {"ISO8859-13", 28603}, 1408183255Ssam {"ISO8859-15", 28605}, 1409183255Ssam {"ISO8859-2", 28592}, 1410183255Ssam {"ISO8859-3", 28593}, 1411183255Ssam {"ISO8859-4", 28594}, 1412183255Ssam {"ISO8859-5", 28595}, 1413183255Ssam {"ISO8859-6", 28596}, 1414183255Ssam {"ISO8859-7", 28597}, 1415183255Ssam {"ISO8859-8", 28598}, 1416183255Ssam {"ISO8859-9", 28599}, 1417183255Ssam {"JOHAB", 1361}, 1418183255Ssam {"KOI8-R", 20866}, 1419183255Ssam {"KOI8-U", 21866}, 1420183255Ssam {"KS_C_5601-1987", 949}, 1421183255Ssam {"LATIN1", 1252}, 1422183255Ssam {"LATIN2", 28592}, 1423183255Ssam {"MACINTOSH", 10000}, 1424183257Ssam {"SHIFT-JIS", 932}, 1425183257Ssam {"SHIFT_JIS", 932}, 1426183257Ssam {"SJIS", 932}, 1427183257Ssam {"US", 1252}, 1428183257Ssam {"US-ASCII", 1252}, 1429183257Ssam {"UTF-16", 1200}, 1430183257Ssam {"UTF-16BE", 1201}, 1431183257Ssam {"UTF-16LE", 1200}, 1432183257Ssam {"UTF-8", CP_UTF8}, 1433183257Ssam {"X-EUROPA", 29001}, 1434193655Ssam {"X-MAC-ARABIC", 10004}, 1435183257Ssam {"X-MAC-CE", 10029}, 1436183257Ssam {"X-MAC-CHINESEIMP", 10008}, 1437193655Ssam {"X-MAC-CHINESETRAD", 10002}, 1438183257Ssam {"X-MAC-CROATIAN", 10082}, 1439183257Ssam {"X-MAC-CYRILLIC", 10007}, 1440183257Ssam {"X-MAC-GREEK", 10006}, 1441183257Ssam {"X-MAC-HEBREW", 10005}, 1442183254Ssam {"X-MAC-ICELANDIC", 10079}, 1443183254Ssam {"X-MAC-JAPANESE", 10001}, 1444183254Ssam {"X-MAC-KOREAN", 10003}, 1445183254Ssam {"X-MAC-ROMANIAN", 10010}, 1446183254Ssam {"X-MAC-THAI", 10021}, 1447183254Ssam {"X-MAC-TURKISH", 10081}, 1448183254Ssam {"X-MAC-UKRAINIAN", 10017}, 1449183254Ssam}; 1450183254Ssamstatic unsigned 1451183254Ssammake_codepage_from_charset(const char *charset) 1452183254Ssam{ 1453183254Ssam char cs[16]; 1454183255Ssam char *p; 1455183255Ssam unsigned cp; 1456183257Ssam int a, b; 1457183254Ssam 1458183254Ssam if (charset == NULL || strlen(charset) > 15) 1459183254Ssam return -1; 1460183254Ssam 1461183254Ssam /* Copy name to uppercase. */ 1462183254Ssam p = cs; 1463193655Ssam while (*charset) { 1464183254Ssam char c = *charset++; 1465183254Ssam if (c >= 'a' && c <= 'z') 1466183254Ssam c -= 'a' - 'A'; 1467193655Ssam *p++ = c; 1468183254Ssam } 1469183254Ssam *p++ = '\0'; 1470183254Ssam cp = -1; 1471183254Ssam 1472183254Ssam /* Look it up in the table first, so that we can easily 1473183254Ssam * override CP367, which we map to 1252 instead of 367. */ 1474183256Ssam a = 0; 1475183256Ssam b = sizeof(charsets)/sizeof(charsets[0]); 1476193655Ssam while (b > a) { 1477183256Ssam int c = (b + a) / 2; 1478183256Ssam int r = strcmp(charsets[c].name, cs); 1479183256Ssam if (r < 0) 1480183254Ssam a = c + 1; 1481183254Ssam else if (r > 0) 1482183254Ssam b = c; 1483183254Ssam else 1484183254Ssam return charsets[c].cp; 1485183254Ssam } 1486183254Ssam 1487183254Ssam /* If it's not in the table, try to parse it. */ 1488183254Ssam switch (*cs) { 1489183254Ssam case 'C': 1490183254Ssam if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') { 1491183254Ssam cp = my_atoi(cs + 2); 1492183254Ssam } else if (strcmp(cs, "CP_ACP") == 0) 1493183255Ssam cp = get_current_codepage(); 1494183255Ssam else if (strcmp(cs, "CP_OEMCP") == 0) 1495183257Ssam cp = get_current_oemcp(); 1496183254Ssam break; 1497183254Ssam case 'I': 1498183254Ssam if (cs[1] == 'B' && cs[2] == 'M' && 1499193655Ssam cs[3] >= '0' && cs[3] <= '9') { 1500183254Ssam cp = my_atoi(cs + 3); 1501183254Ssam } 1502193655Ssam break; 1503183254Ssam case 'W': 1504183254Ssam if (strncmp(cs, "WINDOWS-", 8) == 0) { 1505183254Ssam cp = my_atoi(cs + 8); 1506183254Ssam if (cp != 874 && (cp < 1250 || cp > 1258)) 1507183254Ssam cp = -1;/* This may invalid code. */ 1508183254Ssam } 1509183254Ssam break; 1510183254Ssam } 1511183254Ssam return (cp); 1512170530Ssam} 1513170530Ssam 1514170530Ssam/* 1515170530Ssam * Return ANSI Code Page of current locale set by setlocale(). 1516170530Ssam */ 1517178354Ssamstatic unsigned 1518170530Ssamget_current_codepage(void) 1519170530Ssam{ 1520170530Ssam char *locale, *p; 1521170530Ssam unsigned cp; 1522170530Ssam 1523170530Ssam locale = setlocale(LC_CTYPE, NULL); 1524170530Ssam if (locale == NULL) 1525170530Ssam return (GetACP()); 1526170530Ssam if (locale[0] == 'C' && locale[1] == '\0') 1527170530Ssam return (CP_C_LOCALE); 1528170530Ssam p = strrchr(locale, '.'); 1529170530Ssam if (p == NULL) 1530170530Ssam return (GetACP()); 1531170530Ssam if ((strcmp(p+1, "utf8") == 0) || (strcmp(p+1, "UTF-8") == 0)) 1532178354Ssam return CP_UTF8; 1533170530Ssam cp = my_atoi(p+1); 1534170530Ssam if ((int)cp <= 0) 1535170530Ssam return (GetACP()); 1536178354Ssam return (cp); 1537170530Ssam} 1538170530Ssam 1539170530Ssam/* 1540170530Ssam * Translation table between Locale Name and ACP/OEMCP. 1541170530Ssam */ 1542170530Ssamstatic struct { 1543170530Ssam unsigned acp; 1544170530Ssam unsigned ocp; 1545170530Ssam const char *locale; 1546170530Ssam} acp_ocp_map[] = { 1547170530Ssam { 950, 950, "Chinese_Taiwan" }, 1548170530Ssam { 936, 936, "Chinese_People's Republic of China" }, 1549170530Ssam { 950, 950, "Chinese_Taiwan" }, 1550170530Ssam { 1250, 852, "Czech_Czech Republic" }, 1551170530Ssam { 1252, 850, "Danish_Denmark" }, 1552170530Ssam { 1252, 850, "Dutch_Netherlands" }, 1553170530Ssam { 1252, 850, "Dutch_Belgium" }, 1554170530Ssam { 1252, 437, "English_United States" }, 1555170530Ssam { 1252, 850, "English_Australia" }, 1556170530Ssam { 1252, 850, "English_Canada" }, 1557170530Ssam { 1252, 850, "English_New Zealand" }, 1558170530Ssam { 1252, 850, "English_United Kingdom" }, 1559170530Ssam { 1252, 437, "English_United States" }, 1560170530Ssam { 1252, 850, "Finnish_Finland" }, 1561178354Ssam { 1252, 850, "French_France" }, 1562170530Ssam { 1252, 850, "French_Belgium" }, 1563170530Ssam { 1252, 850, "French_Canada" }, 1564170530Ssam { 1252, 850, "French_Switzerland" }, 1565170530Ssam { 1252, 850, "German_Germany" }, 1566170530Ssam { 1252, 850, "German_Austria" }, 1567170530Ssam { 1252, 850, "German_Switzerland" }, 1568170530Ssam { 1253, 737, "Greek_Greece" }, 1569170530Ssam { 1250, 852, "Hungarian_Hungary" }, 1570170530Ssam { 1252, 850, "Icelandic_Iceland" }, 1571170530Ssam { 1252, 850, "Italian_Italy" }, 1572170530Ssam { 1252, 850, "Italian_Switzerland" }, 1573170530Ssam { 932, 932, "Japanese_Japan" }, 1574170530Ssam { 949, 949, "Korean_Korea" }, 1575170530Ssam { 1252, 850, "Norwegian (BokmOl)_Norway" }, 1576184280Ssam { 1252, 850, "Norwegian (BokmOl)_Norway" }, 1577184280Ssam { 1252, 850, "Norwegian-Nynorsk_Norway" }, 1578184280Ssam { 1250, 852, "Polish_Poland" }, 1579184280Ssam { 1252, 850, "Portuguese_Portugal" }, 1580184280Ssam { 1252, 850, "Portuguese_Brazil" }, 1581184280Ssam { 1251, 866, "Russian_Russia" }, 1582184280Ssam { 1250, 852, "Slovak_Slovakia" }, 1583184280Ssam { 1252, 850, "Spanish_Spain" }, 1584184280Ssam { 1252, 850, "Spanish_Mexico" }, 1585184280Ssam { 1252, 850, "Spanish_Spain" }, 1586184280Ssam { 1252, 850, "Swedish_Sweden" }, 1587184280Ssam { 1254, 857, "Turkish_Turkey" }, 1588184280Ssam { 0, 0, NULL} 1589184280Ssam}; 1590184280Ssam 1591184280Ssam/* 1592184280Ssam * Return OEM Code Page of current locale set by setlocale(). 1593184280Ssam */ 1594184280Ssamstatic unsigned 1595184280Ssamget_current_oemcp(void) 1596184280Ssam{ 1597184280Ssam int i; 1598184280Ssam char *locale, *p; 1599184280Ssam size_t len; 1600184280Ssam 1601184280Ssam locale = setlocale(LC_CTYPE, NULL); 1602184280Ssam if (locale == NULL) 1603184280Ssam return (GetOEMCP()); 1604184280Ssam if (locale[0] == 'C' && locale[1] == '\0') 1605184280Ssam return (CP_C_LOCALE); 1606184280Ssam 1607184280Ssam p = strrchr(locale, '.'); 1608170530Ssam if (p == NULL) 1609170530Ssam return (GetOEMCP()); 1610170530Ssam len = p - locale; 1611170530Ssam for (i = 0; acp_ocp_map[i].acp; i++) { 1612170530Ssam if (strncmp(acp_ocp_map[i].locale, locale, len) == 0) 1613170530Ssam return (acp_ocp_map[i].ocp); 1614170530Ssam } 1615170530Ssam return (GetOEMCP()); 1616170530Ssam} 1617170530Ssam#else 1618170530Ssam 1619170530Ssam/* 1620170530Ssam * POSIX platform does not use CodePage. 1621178354Ssam */ 1622170530Ssam 1623170530Ssamstatic unsigned 1624178354Ssamget_current_codepage(void) 1625170530Ssam{ 1626170530Ssam return (-1);/* Unknown */ 1627170530Ssam} 1628170530Ssamstatic unsigned 1629170530Ssammake_codepage_from_charset(const char *charset) 1630170530Ssam{ 1631170530Ssam (void)charset; /* UNUSED */ 1632170530Ssam return (-1);/* Unknown */ 1633170530Ssam} 1634170530Ssamstatic unsigned 1635170530Ssamget_current_oemcp(void) 1636170530Ssam{ 1637170530Ssam return (-1);/* Unknown */ 1638170530Ssam} 1639170530Ssam 1640170530Ssam#endif /* defined(_WIN32) && !defined(__CYGWIN__) */ 1641170530Ssam 1642170530Ssam/* 1643170530Ssam * Return a string conversion object. 1644170530Ssam */ 1645170530Ssamstatic struct archive_string_conv * 1646170530Ssamget_sconv_object(struct archive *a, const char *fc, const char *tc, int flag) 1647170530Ssam{ 1648170530Ssam struct archive_string_conv *sc; 1649170530Ssam unsigned current_codepage; 1650170530Ssam 1651170530Ssam /* Check if we have made the sconv object. */ 1652170530Ssam sc = find_sconv_object(a, fc, tc); 1653170530Ssam if (sc != NULL) 1654170530Ssam return (sc); 1655170530Ssam 1656170530Ssam if (a == NULL) 1657170530Ssam current_codepage = get_current_codepage(); 1658170530Ssam else 1659170530Ssam current_codepage = a->current_codepage; 1660170530Ssam 1661170530Ssam sc = create_sconv_object(canonical_charset_name(fc), 1662170530Ssam canonical_charset_name(tc), current_codepage, flag); 1663170530Ssam if (sc == NULL) { 1664170530Ssam if (a != NULL) 1665170530Ssam archive_set_error(a, ENOMEM, 1666170530Ssam "Could not allocate memory for " 1667170530Ssam "a string conversion object"); 1668170530Ssam return (NULL); 1669184280Ssam } 1670170530Ssam 1671170530Ssam /* 1672170530Ssam * If there is no converter for current string conversion object, 1673170530Ssam * we cannot handle this conversion. 1674170530Ssam */ 1675170530Ssam if (sc->nconverter == 0) { 1676170530Ssam if (a != NULL) { 1677170530Ssam#if HAVE_ICONV 1678184280Ssam archive_set_error(a, ARCHIVE_ERRNO_MISC, 1679184280Ssam "iconv_open failed : Cannot handle ``%s''", 1680170530Ssam (flag & SCONV_TO_CHARSET)?tc:fc); 1681184280Ssam#else 1682173273Ssam archive_set_error(a, ARCHIVE_ERRNO_MISC, 1683173273Ssam "A character-set conversion not fully supported " 1684173273Ssam "on this platform"); 1685170530Ssam#endif 1686170530Ssam } 1687170530Ssam /* Failed; free a sconv object. */ 1688170530Ssam free_sconv_object(sc); 1689170530Ssam return (NULL); 1690170530Ssam } 1691170530Ssam 1692170530Ssam /* 1693170530Ssam * Success! 1694170530Ssam */ 1695170530Ssam if (a != NULL) 1696170530Ssam add_sconv_object(a, sc); 1697170530Ssam return (sc); 1698170530Ssam} 1699182830Ssam 1700170530Ssamstatic const char * 1701170530Ssamget_current_charset(struct archive *a) 1702170530Ssam{ 1703170530Ssam const char *cur_charset; 1704170530Ssam 1705170530Ssam if (a == NULL) 1706170530Ssam cur_charset = default_iconv_charset(""); 1707170530Ssam else { 1708170530Ssam cur_charset = default_iconv_charset(a->current_code); 1709170530Ssam if (a->current_code == NULL) { 1710170530Ssam a->current_code = strdup(cur_charset); 1711195377Ssam a->current_codepage = get_current_codepage(); 1712195377Ssam a->current_oemcp = get_current_oemcp(); 1713195377Ssam } 1714170530Ssam } 1715170530Ssam return (cur_charset); 1716170530Ssam} 1717178354Ssam 1718170530Ssam/* 1719195377Ssam * Make and Return a string conversion object. 1720195377Ssam * Return NULL if the platform does not support the specified conversion 1721205277Srpaulo * and best_effort is 0. 1722195377Ssam * If best_effort is set, A string conversion object must be returned 1723170530Ssam * unless memory allocation for the object fails, but the conversion 1724195377Ssam * might fail when non-ASCII code is found. 1725195377Ssam */ 1726195377Ssamstruct archive_string_conv * 1727195377Ssamarchive_string_conversion_to_charset(struct archive *a, const char *charset, 1728170530Ssam int best_effort) 1729195377Ssam{ 1730170530Ssam int flag = SCONV_TO_CHARSET; 1731195377Ssam 1732195377Ssam if (best_effort) 1733195377Ssam flag |= SCONV_BEST_EFFORT; 1734195377Ssam return (get_sconv_object(a, get_current_charset(a), charset, flag)); 1735195377Ssam} 1736195377Ssam 1737195377Ssamstruct archive_string_conv * 1738195377Ssamarchive_string_conversion_from_charset(struct archive *a, const char *charset, 1739170530Ssam int best_effort) 1740195377Ssam{ 1741170530Ssam int flag = SCONV_FROM_CHARSET; 1742195377Ssam 1743195377Ssam if (best_effort) 1744195377Ssam flag |= SCONV_BEST_EFFORT; 1745195377Ssam return (get_sconv_object(a, charset, get_current_charset(a), flag)); 1746195377Ssam} 1747195377Ssam 1748195377Ssam/* 1749195377Ssam * archive_string_default_conversion_*_archive() are provided for Windows 1750195377Ssam * platform because other archiver application use CP_OEMCP for 1751195377Ssam * MultiByteToWideChar() and WideCharToMultiByte() for the filenames 1752195377Ssam * in tar or zip files. But mbstowcs/wcstombs(CRT) usually use CP_ACP 1753195377Ssam * unless you use setlocale(LC_ALL, ".OCP")(specify CP_OEMCP). 1754170530Ssam * So we should make a string conversion between CP_ACP and CP_OEMCP 1755195377Ssam * for compatibility. 1756195377Ssam */ 1757195377Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 1758195377Ssamstruct archive_string_conv * 1759195377Ssamarchive_string_default_conversion_for_read(struct archive *a) 1760195377Ssam{ 1761195377Ssam const char *cur_charset = get_current_charset(a); 1762195377Ssam char oemcp[16]; 1763195377Ssam 1764195377Ssam /* NOTE: a check of cur_charset is unneeded but we need 1765195377Ssam * that get_current_charset() has been surely called at 1766195377Ssam * this time whatever C compiler optimized. */ 1767195377Ssam if (cur_charset != NULL && 1768195377Ssam (a->current_codepage == CP_C_LOCALE || 1769195377Ssam a->current_codepage == a->current_oemcp)) 1770195377Ssam return (NULL);/* no conversion. */ 1771205277Srpaulo 1772195377Ssam _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); 1773195377Ssam /* Make sure a null termination must be set. */ 1774195377Ssam oemcp[sizeof(oemcp)-1] = '\0'; 1775195377Ssam return (get_sconv_object(a, oemcp, cur_charset, 1776170530Ssam SCONV_FROM_CHARSET)); 1777195377Ssam} 1778195377Ssam 1779195377Ssamstruct archive_string_conv * 1780195377Ssamarchive_string_default_conversion_for_write(struct archive *a) 1781195377Ssam{ 1782195377Ssam const char *cur_charset = get_current_charset(a); 1783195377Ssam char oemcp[16]; 1784195377Ssam 1785195377Ssam /* NOTE: a check of cur_charset is unneeded but we need 1786195377Ssam * that get_current_charset() has been surely called at 1787195377Ssam * this time whatever C compiler optimized. */ 1788170530Ssam if (cur_charset != NULL && 1789195377Ssam (a->current_codepage == CP_C_LOCALE || 1790195377Ssam a->current_codepage == a->current_oemcp)) 1791195377Ssam return (NULL);/* no conversion. */ 1792195377Ssam 1793195377Ssam _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); 1794195377Ssam /* Make sure a null termination must be set. */ 1795195377Ssam oemcp[sizeof(oemcp)-1] = '\0'; 1796195377Ssam return (get_sconv_object(a, cur_charset, oemcp, 1797195377Ssam SCONV_TO_CHARSET)); 1798195377Ssam} 1799195377Ssam#else 1800195377Ssamstruct archive_string_conv * 1801195377Ssamarchive_string_default_conversion_for_read(struct archive *a) 1802195377Ssam{ 1803195377Ssam (void)a; /* UNUSED */ 1804195377Ssam return (NULL); 1805195377Ssam} 1806195377Ssam 1807195377Ssamstruct archive_string_conv * 1808195377Ssamarchive_string_default_conversion_for_write(struct archive *a) 1809195377Ssam{ 1810195377Ssam (void)a; /* UNUSED */ 1811195377Ssam return (NULL); 1812195377Ssam} 1813195377Ssam#endif 1814195377Ssam 1815195377Ssam/* 1816195377Ssam * Dispose of all character conversion objects in the archive object. 1817195377Ssam */ 1818195377Ssamvoid 1819195377Ssamarchive_string_conversion_free(struct archive *a) 1820195377Ssam{ 1821195377Ssam struct archive_string_conv *sc; 1822195377Ssam struct archive_string_conv *sc_next; 1823195377Ssam 1824195377Ssam for (sc = a->sconv; sc != NULL; sc = sc_next) { 1825195377Ssam sc_next = sc->next; 1826195377Ssam free_sconv_object(sc); 1827195377Ssam } 1828195377Ssam a->sconv = NULL; 1829195377Ssam free(a->current_code); 1830182829Ssam a->current_code = NULL; 1831195377Ssam} 1832195377Ssam 1833195377Ssam/* 1834195377Ssam * Return a conversion charset name. 1835195377Ssam */ 1836195377Ssamconst char * 1837195377Ssamarchive_string_conversion_charset_name(struct archive_string_conv *sc) 1838195377Ssam{ 1839195377Ssam if (sc->flag & SCONV_TO_CHARSET) 1840195377Ssam return (sc->to_charset); 1841195377Ssam else 1842182829Ssam return (sc->from_charset); 1843195377Ssam} 1844195377Ssam 1845195377Ssam/* 1846195377Ssam * Change the behavior of a string conversion. 1847195377Ssam */ 1848195377Ssamvoid 1849195377Ssamarchive_string_conversion_set_opt(struct archive_string_conv *sc, int opt) 1850195377Ssam{ 1851170530Ssam switch (opt) { 1852195377Ssam /* 1853195377Ssam * A filename in UTF-8 was made with libarchive 2.x in a wrong 1854195377Ssam * assumption that wchar_t was Unicode. 1855195377Ssam * This option enables simulating the assumption in order to read 1856195377Ssam * that filename correctly. 1857195377Ssam */ 1858195377Ssam case SCONV_SET_OPT_UTF8_LIBARCHIVE2X: 1859195377Ssam#if (defined(_WIN32) && !defined(__CYGWIN__)) \ 1860195377Ssam || defined(__STDC_ISO_10646__) || defined(__APPLE__) 1861195377Ssam /* 1862170530Ssam * Nothing to do for it since wchar_t on these platforms 1863195377Ssam * is really Unicode. 1864195377Ssam */ 1865170530Ssam (void)sc; /* UNUSED */ 1866195377Ssam#else 1867170530Ssam if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) { 1868195377Ssam sc->flag |= SCONV_UTF8_LIBARCHIVE_2; 1869195377Ssam /* Set up string converters. */ 1870195377Ssam setup_converter(sc); 1871195377Ssam } 1872195377Ssam#endif 1873195377Ssam break; 1874195377Ssam case SCONV_SET_OPT_NORMALIZATION_C: 1875195377Ssam if ((sc->flag & SCONV_NORMALIZATION_C) == 0) { 1876195377Ssam sc->flag |= SCONV_NORMALIZATION_C; 1877195377Ssam sc->flag &= ~SCONV_NORMALIZATION_D; 1878195377Ssam /* Set up string converters. */ 1879195377Ssam setup_converter(sc); 1880170530Ssam } 1881195377Ssam break; 1882170530Ssam case SCONV_SET_OPT_NORMALIZATION_D: 1883170530Ssam#if defined(HAVE_ICONV) 1884195377Ssam /* 1885195377Ssam * If iconv will take the string, do not change the 1886195377Ssam * setting of the normalization. 1887170530Ssam */ 1888170530Ssam if (!(sc->flag & SCONV_WIN_CP) && 1889170530Ssam (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1890170530Ssam !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) 1891195377Ssam break; 1892195377Ssam#endif 1893195377Ssam if ((sc->flag & SCONV_NORMALIZATION_D) == 0) { 1894195377Ssam sc->flag |= SCONV_NORMALIZATION_D; 1895195377Ssam sc->flag &= ~SCONV_NORMALIZATION_C; 1896195377Ssam /* Set up string converters. */ 1897195377Ssam setup_converter(sc); 1898195377Ssam } 1899170530Ssam break; 1900195377Ssam default: 1901170530Ssam break; 1902170530Ssam } 1903195377Ssam} 1904195377Ssam 1905195377Ssam/* 1906195377Ssam * 1907195377Ssam * Copy one archive_string to another in locale conversion. 1908195377Ssam * 1909195377Ssam * archive_strncat_l(); 1910195377Ssam * archive_strncpy_l(); 1911195377Ssam * 1912195377Ssam */ 1913195377Ssam 1914195377Ssamstatic size_t 1915195377Ssammbsnbytes(const void *_p, size_t n) 1916195377Ssam{ 1917195377Ssam size_t s; 1918195377Ssam const char *p, *pp; 1919195377Ssam 1920195377Ssam if (_p == NULL) 1921195377Ssam return (0); 1922195377Ssam p = (const char *)_p; 1923195377Ssam 1924195377Ssam /* Like strlen(p), except won't examine positions beyond p[n]. */ 1925195377Ssam s = 0; 1926195377Ssam pp = p; 1927195377Ssam while (s < n && *pp) { 1928195377Ssam pp++; 1929170530Ssam s++; 1930170530Ssam } 1931170530Ssam return (s); 1932170530Ssam} 1933170530Ssam 1934178354Ssamstatic size_t 1935178354Ssamutf16nbytes(const void *_p, size_t n) 1936178354Ssam{ 1937178354Ssam size_t s; 1938178354Ssam const char *p, *pp; 1939178354Ssam 1940178354Ssam if (_p == NULL) 1941178354Ssam return (0); 1942178354Ssam p = (const char *)_p; 1943178354Ssam 1944178354Ssam /* Like strlen(p), except won't examine positions beyond p[n]. */ 1945178354Ssam s = 0; 1946178354Ssam pp = p; 1947178354Ssam n >>= 1; 1948178354Ssam while (s < n && (pp[0] || pp[1])) { 1949178354Ssam pp += 2; 1950178354Ssam s++; 1951178354Ssam } 1952178354Ssam return (s<<1); 1953178354Ssam} 1954178354Ssam 1955178354Ssamint 1956178354Ssamarchive_strncpy_l(struct archive_string *as, const void *_p, size_t n, 1957178354Ssam struct archive_string_conv *sc) 1958178354Ssam{ 1959178354Ssam as->length = 0; 1960183249Ssam return (archive_strncat_l(as, _p, n, sc)); 1961183249Ssam} 1962178354Ssam 1963178354Ssamint 1964178354Ssamarchive_strncat_l(struct archive_string *as, const void *_p, size_t n, 1965178354Ssam struct archive_string_conv *sc) 1966170530Ssam{ 1967170530Ssam const void *s; 1968170530Ssam size_t length = 0; 1969170530Ssam int i, r = 0, r2; 1970170530Ssam 1971170530Ssam if (_p != NULL && n > 0) { 1972170530Ssam if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) 1973170530Ssam length = utf16nbytes(_p, n); 1974170530Ssam else 1975205277Srpaulo length = mbsnbytes(_p, n); 1976170530Ssam } 1977170530Ssam 1978170530Ssam /* We must allocate memory even if there is no data for conversion 1979170530Ssam * or copy. This simulates archive_string_append behavior. */ 1980170530Ssam if (length == 0) { 1981170530Ssam int tn = 1; 1982184280Ssam if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) 1983170530Ssam tn = 2; 1984173273Ssam if (archive_string_ensure(as, as->length + tn) == NULL) 1985173273Ssam return (-1); 1986173273Ssam as->s[as->length] = 0; 1987170530Ssam if (tn == 2) 1988183245Ssam as->s[as->length+1] = 0; 1989183245Ssam return (0); 1990170530Ssam } 1991170530Ssam 1992205277Srpaulo /* 1993205277Srpaulo * If sc is NULL, we just make a copy. 1994170530Ssam */ 1995170530Ssam if (sc == NULL) { 1996170530Ssam if (archive_string_append(as, _p, length) == NULL) 1997205277Srpaulo return (-1);/* No memory */ 1998170530Ssam return (0); 1999205277Srpaulo } 2000170530Ssam 2001178354Ssam s = _p; 2002173273Ssam i = 0; 2003173273Ssam if (sc->nconverter > 1) { 2004173273Ssam sc->utftmp.length = 0; 2005178354Ssam r2 = sc->converter[0](&(sc->utftmp), s, length, sc); 2006178354Ssam if (r2 != 0 && errno == ENOMEM) 2007178354Ssam return (r2); 2008178354Ssam if (r > r2) 2009170530Ssam r = r2; 2010170530Ssam s = sc->utftmp.s; 2011170530Ssam length = sc->utftmp.length; 2012183245Ssam ++i; 2013205277Srpaulo } 2014178953Ssam r2 = sc->converter[i](as, s, length, sc); 2015178953Ssam if (r > r2) 2016170530Ssam r = r2; 2017170530Ssam return (r); 2018170530Ssam} 2019170530Ssam 2020170530Ssam#if HAVE_ICONV 2021173273Ssam 2022173273Ssam/* 2023173273Ssam * Return -1 if conversion fails. 2024173273Ssam */ 2025183250Ssamstatic int 2026183250Ssamiconv_strncat_in_locale(struct archive_string *as, const void *_p, 2027173273Ssam size_t length, struct archive_string_conv *sc) 2028173273Ssam{ 2029178354Ssam ICONV_CONST char *itp; 2030173273Ssam size_t remaining; 2031173273Ssam iconv_t cd; 2032173273Ssam char *outp; 2033184280Ssam size_t avail, bs; 2034173273Ssam int return_value = 0; /* success */ 2035178354Ssam int to_size, from_size; 2036183250Ssam 2037183250Ssam if (sc->flag & SCONV_TO_UTF16) 2038178354Ssam to_size = 2; 2039173273Ssam else 2040173273Ssam to_size = 1; 2041173273Ssam if (sc->flag & SCONV_FROM_UTF16) 2042173273Ssam from_size = 2; 2043183250Ssam else 2044195377Ssam from_size = 1; 2045173273Ssam 2046173273Ssam if (archive_string_ensure(as, as->length + length*2+to_size) == NULL) 2047178354Ssam return (-1); 2048183250Ssam 2049183250Ssam cd = sc->cd; 2050178354Ssam itp = (char *)(uintptr_t)_p; 2051173273Ssam remaining = length; 2052173273Ssam outp = as->s + as->length; 2053173273Ssam avail = as->buffer_length - as->length - to_size; 2054184280Ssam while (remaining >= (size_t)from_size) { 2055184280Ssam size_t result = iconv(cd, &itp, &remaining, &outp, &avail); 2056184280Ssam 2057184280Ssam if (result != (size_t)-1) 2058184280Ssam break; /* Conversion completed. */ 2059184280Ssam 2060184280Ssam if (errno == EILSEQ || errno == EINVAL) { 2061184280Ssam /* 2062184280Ssam * If an output charset is UTF-8 or UTF-16BE/LE, 2063184280Ssam * unknown character should be U+FFFD 2064184280Ssam * (replacement character). 2065184280Ssam */ 2066184280Ssam if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) { 2067184280Ssam size_t rbytes; 2068184280Ssam if (sc->flag & SCONV_TO_UTF8) 2069184280Ssam rbytes = sizeof(utf8_replacement_char); 2070184280Ssam else 2071184280Ssam rbytes = 2; 2072184280Ssam 2073184280Ssam if (avail < rbytes) { 2074184280Ssam as->length = outp - as->s; 2075184280Ssam bs = as->buffer_length + 2076184280Ssam (remaining * to_size) + rbytes; 2077184280Ssam if (NULL == 2078184280Ssam archive_string_ensure(as, bs)) 2079184280Ssam return (-1); 2080184280Ssam outp = as->s + as->length; 2081184280Ssam avail = as->buffer_length 2082184280Ssam - as->length - to_size; 2083184280Ssam } 2084184280Ssam if (sc->flag & SCONV_TO_UTF8) 2085184280Ssam memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char)); 2086184280Ssam else if (sc->flag & SCONV_TO_UTF16BE) 2087184280Ssam archive_be16enc(outp, UNICODE_R_CHAR); 2088184280Ssam else 2089184280Ssam archive_le16enc(outp, UNICODE_R_CHAR); 2090184280Ssam outp += rbytes; 2091184280Ssam avail -= rbytes; 2092184280Ssam } else { 2093184280Ssam /* Skip the illegal input bytes. */ 2094184280Ssam *outp++ = '?'; 2095184280Ssam avail--; 2096184280Ssam } 2097184280Ssam itp += from_size; 2098184280Ssam remaining -= from_size; 2099184280Ssam return_value = -1; /* failure */ 2100184280Ssam } else { 2101184280Ssam /* E2BIG no output buffer, 2102184280Ssam * Increase an output buffer. */ 2103184280Ssam as->length = outp - as->s; 2104184280Ssam bs = as->buffer_length + remaining * 2; 2105184280Ssam if (NULL == archive_string_ensure(as, bs)) 2106184280Ssam return (-1); 2107184280Ssam outp = as->s + as->length; 2108184280Ssam avail = as->buffer_length - as->length - to_size; 2109184280Ssam } 2110184280Ssam } 2111184280Ssam as->length = outp - as->s; 2112184280Ssam as->s[as->length] = 0; 2113184280Ssam if (to_size == 2) 2114184280Ssam as->s[as->length+1] = 0; 2115184280Ssam return (return_value); 2116184280Ssam} 2117184280Ssam 2118184280Ssam#endif /* HAVE_ICONV */ 2119184280Ssam 2120184280Ssam 2121184280Ssam#if defined(_WIN32) && !defined(__CYGWIN__) 2122184280Ssam 2123184280Ssam/* 2124184280Ssam * Translate a string from a some CodePage to an another CodePage by 2125184280Ssam * Windows APIs, and copy the result. Return -1 if conversion fails. 2126184280Ssam */ 2127184280Ssamstatic int 2128184280Ssamstrncat_in_codepage(struct archive_string *as, 2129184280Ssam const void *_p, size_t length, struct archive_string_conv *sc) 2130173273Ssam{ 2131170530Ssam const char *s = (const char *)_p; 2132170530Ssam struct archive_wstring aws; 2133170530Ssam size_t l; 2134184280Ssam int r, saved_flag; 2135184280Ssam 2136170530Ssam archive_string_init(&aws); 2137170530Ssam saved_flag = sc->flag; 2138170530Ssam sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C); 2139184280Ssam r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc); 2140170530Ssam sc->flag = saved_flag; 2141178354Ssam if (r != 0) { 2142178354Ssam archive_wstring_free(&aws); 2143170530Ssam if (errno != ENOMEM) 2144184280Ssam archive_string_append(as, s, length); 2145170530Ssam return (-1); 2146184280Ssam } 2147170530Ssam 2148170530Ssam l = as->length; 2149170530Ssam r = archive_string_append_from_wcs_in_codepage( 2150184280Ssam as, aws.s, aws.length, sc); 2151184280Ssam if (r != 0 && errno != ENOMEM && l == as->length) 2152184280Ssam archive_string_append(as, s, length); 2153184280Ssam archive_wstring_free(&aws); 2154184280Ssam return (r); 2155184280Ssam} 2156184280Ssam 2157184280Ssam/* 2158170530Ssam * Test whether MBS ==> WCS is okay. 2159170530Ssam */ 2160184280Ssamstatic int 2161170530Ssaminvalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) 2162170530Ssam{ 2163170530Ssam const char *p = (const char *)_p; 2164184280Ssam unsigned codepage; 2165184280Ssam DWORD mbflag = MB_ERR_INVALID_CHARS; 2166184280Ssam 2167184280Ssam if (sc->flag & SCONV_FROM_CHARSET) 2168184280Ssam codepage = sc->to_cp; 2169184280Ssam else 2170184280Ssam codepage = sc->from_cp; 2171184280Ssam 2172170530Ssam if (codepage == CP_C_LOCALE) 2173184280Ssam return (0); 2174184280Ssam if (codepage != CP_UTF8) 2175184280Ssam mbflag |= MB_PRECOMPOSED; 2176170530Ssam 2177170530Ssam if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0) 2178170530Ssam return (-1); /* Invalid */ 2179184280Ssam return (0); /* Okay */ 2180184280Ssam} 2181184280Ssam 2182170530Ssam#else 2183184280Ssam 2184184280Ssam/* 2185184280Ssam * Test whether MBS ==> WCS is okay. 2186184280Ssam */ 2187184280Ssamstatic int 2188170530Ssaminvalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) 2189178354Ssam{ 2190178354Ssam const char *p = (const char *)_p; 2191170530Ssam size_t r; 2192170530Ssam 2193184280Ssam#if HAVE_MBRTOWC 2194184280Ssam mbstate_t shift_state; 2195184280Ssam 2196184280Ssam memset(&shift_state, 0, sizeof(shift_state)); 2197184280Ssam#else 2198184280Ssam /* Clear the shift state before starting. */ 2199184280Ssam mbtowc(NULL, NULL, 0); 2200184280Ssam#endif 2201170530Ssam while (n) { 2202184280Ssam wchar_t wc; 2203184280Ssam 2204184280Ssam#if HAVE_MBRTOWC 2205184280Ssam r = mbrtowc(&wc, p, n, &shift_state); 2206184280Ssam#else 2207184280Ssam r = mbtowc(&wc, p, n); 2208184280Ssam#endif 2209184280Ssam if (r == (size_t)-1 || r == (size_t)-2) 2210184280Ssam return (-1);/* Invalid. */ 2211184280Ssam if (r == 0) 2212184280Ssam break; 2213184280Ssam p += r; 2214184280Ssam n -= r; 2215184280Ssam } 2216170530Ssam (void)sc; /* UNUSED */ 2217170530Ssam return (0); /* All Okey. */ 2218170530Ssam} 2219170530Ssam 2220170530Ssam#endif /* defined(_WIN32) && !defined(__CYGWIN__) */ 2221170530Ssam 2222195377Ssam/* 2223195377Ssam * Basically returns -1 because we cannot make a conversion of charset 2224195377Ssam * without iconv but in some cases this would return 0. 2225195377Ssam * Returns 0 if all copied characters are ASCII. 2226195377Ssam * Returns 0 if both from-locale and to-locale are the same and those 2227195377Ssam * can be WCS with no error. 2228195377Ssam */ 2229195377Ssamstatic int 2230195377Ssambest_effort_strncat_in_locale(struct archive_string *as, const void *_p, 2231195377Ssam size_t length, struct archive_string_conv *sc) 2232195377Ssam{ 2233195377Ssam size_t remaining; 2234195377Ssam const uint8_t *itp; 2235195377Ssam int return_value = 0; /* success */ 2236195377Ssam 2237195377Ssam /* 2238195377Ssam * If both from-locale and to-locale is the same, this makes a copy. 2239195377Ssam * And then this checks all copied MBS can be WCS if so returns 0. 2240195377Ssam */ 2241195377Ssam if (sc->same) { 2242195377Ssam if (archive_string_append(as, _p, length) == NULL) 2243170530Ssam return (-1);/* No memory */ 2244170530Ssam return (invalid_mbs(_p, length, sc)); 2245170530Ssam } 2246170530Ssam 2247170530Ssam /* 2248195377Ssam * If a character is ASCII, this just copies it. If not, this 2249195377Ssam * assigns '?' character instead but in UTF-8 locale this assigns 2250195377Ssam * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD, 2251170530Ssam * a Replacement Character in Unicode. 2252178354Ssam */ 2253170530Ssam 2254195377Ssam remaining = length; 2255170530Ssam itp = (const uint8_t *)_p; 2256170530Ssam while (*itp && remaining > 0) { 2257195377Ssam if (*itp > 127) { 2258195377Ssam // Non-ASCII: Substitute with suitable replacement 2259205277Srpaulo if (sc->flag & SCONV_TO_UTF8) { 2260195377Ssam if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) { 2261195377Ssam __archive_errx(1, "Out of memory"); 2262195377Ssam } 2263205277Srpaulo } else { 2264205277Srpaulo archive_strappend_char(as, '?'); 2265195377Ssam } 2266195377Ssam return_value = -1; 2267195377Ssam } else { 2268195377Ssam archive_strappend_char(as, *itp); 2269195377Ssam } 2270195377Ssam ++itp; 2271195377Ssam } 2272195377Ssam return (return_value); 2273195377Ssam} 2274195377Ssam 2275195377Ssam 2276195377Ssam/* 2277195377Ssam * Unicode conversion functions. 2278195377Ssam * - UTF-8 <===> UTF-8 in removing surrogate pairs. 2279195377Ssam * - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs. 2280195377Ssam * - UTF-8 made by libarchive 2.x ===> UTF-8. 2281205277Srpaulo * - UTF-16BE <===> UTF-8. 2282205277Srpaulo * 2283205277Srpaulo */ 2284205277Srpaulo 2285195377Ssam/* 2286205277Srpaulo * Utility to convert a single UTF-8 sequence. 2287195377Ssam * 2288195377Ssam * Usually return used bytes, return used byte in negative value when 2289195377Ssam * a unicode character is replaced with U+FFFD. 2290195377Ssam * See also http://unicode.org/review/pr-121.html Public Review Issue #121 2291195377Ssam * Recommended Practice for Replacement Characters. 2292195377Ssam */ 2293195377Ssamstatic int 2294195377Ssam_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2295195377Ssam{ 2296195377Ssam static const char utf8_count[256] = { 2297195377Ssam 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */ 2298195377Ssam 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */ 2299195377Ssam 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */ 2300195377Ssam 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */ 2301195377Ssam 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */ 2302195377Ssam 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */ 2303195377Ssam 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */ 2304170530Ssam 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */ 2305195377Ssam 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */ 2306170530Ssam 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */ 2307195377Ssam 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */ 2308195377Ssam 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */ 2309195377Ssam 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */ 2310195377Ssam 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */ 2311195377Ssam 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */ 2312195377Ssam 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */ 2313170530Ssam }; 2314178354Ssam int ch, i; 2315195377Ssam int cnt; 2316195377Ssam uint32_t wc; 2317170530Ssam 2318170530Ssam /* Sanity check. */ 2319170530Ssam if (n == 0) 2320195377Ssam return (0); 2321195377Ssam /* 2322195377Ssam * Decode 1-4 bytes depending on the value of the first byte. 2323195377Ssam */ 2324170530Ssam ch = (unsigned char)*s; 2325195377Ssam if (ch == 0) 2326195377Ssam return (0); /* Standard: return 0 for end-of-string. */ 2327195377Ssam cnt = utf8_count[ch]; 2328195377Ssam 2329195377Ssam /* Invalid sequence or there are not plenty bytes. */ 2330195377Ssam if ((int)n < cnt) { 2331195377Ssam cnt = (int)n; 2332195377Ssam for (i = 1; i < cnt; i++) { 2333195377Ssam if ((s[i] & 0xc0) != 0x80) { 2334195377Ssam cnt = i; 2335195377Ssam break; 2336195377Ssam } 2337195377Ssam } 2338170530Ssam goto invalid_sequence; 2339195377Ssam } 2340195377Ssam 2341195377Ssam /* Make a Unicode code point from a single UTF-8 sequence. */ 2342195377Ssam switch (cnt) { 2343195377Ssam case 1: /* 1 byte sequence. */ 2344195377Ssam *pwc = ch & 0x7f; 2345195377Ssam return (cnt); 2346195377Ssam case 2: /* 2 bytes sequence. */ 2347170530Ssam if ((s[1] & 0xc0) != 0x80) { 2348195377Ssam cnt = 1; 2349195377Ssam goto invalid_sequence; 2350195377Ssam } 2351170530Ssam *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f); 2352195377Ssam return (cnt); 2353195377Ssam case 3: /* 3 bytes sequence. */ 2354195377Ssam if ((s[1] & 0xc0) != 0x80) { 2355195377Ssam cnt = 1; 2356170530Ssam goto invalid_sequence; 2357195377Ssam } 2358195377Ssam if ((s[2] & 0xc0) != 0x80) { 2359195377Ssam cnt = 2; 2360195377Ssam goto invalid_sequence; 2361195377Ssam } 2362195377Ssam wc = ((ch & 0x0f) << 12) 2363195377Ssam | ((s[1] & 0x3f) << 6) 2364195377Ssam | (s[2] & 0x3f); 2365195377Ssam if (wc < 0x800) 2366195377Ssam goto invalid_sequence;/* Overlong sequence. */ 2367195377Ssam break; 2368195377Ssam case 4: /* 4 bytes sequence. */ 2369195377Ssam if ((s[1] & 0xc0) != 0x80) { 2370195377Ssam cnt = 1; 2371195377Ssam goto invalid_sequence; 2372195377Ssam } 2373195377Ssam if ((s[2] & 0xc0) != 0x80) { 2374195377Ssam cnt = 2; 2375170530Ssam goto invalid_sequence; 2376195377Ssam } 2377170530Ssam if ((s[3] & 0xc0) != 0x80) { 2378170530Ssam cnt = 3; 2379170530Ssam goto invalid_sequence; 2380170530Ssam } 2381170530Ssam wc = ((ch & 0x07) << 18) 2382170530Ssam | ((s[1] & 0x3f) << 12) 2383170530Ssam | ((s[2] & 0x3f) << 6) 2384170530Ssam | (s[3] & 0x3f); 2385170530Ssam if (wc < 0x10000) 2386170530Ssam goto invalid_sequence;/* Overlong sequence. */ 2387170530Ssam break; 2388170530Ssam default: /* Others are all invalid sequence. */ 2389170530Ssam if (ch == 0xc0 || ch == 0xc1) 2390170530Ssam cnt = 2; 2391170530Ssam else if (ch >= 0xf5 && ch <= 0xf7) 2392170530Ssam cnt = 4; 2393170530Ssam else if (ch >= 0xf8 && ch <= 0xfb) 2394170530Ssam cnt = 5; 2395170530Ssam else if (ch == 0xfc || ch == 0xfd) 2396170530Ssam cnt = 6; 2397170530Ssam else 2398170530Ssam cnt = 1; 2399170530Ssam if ((int)n < cnt) 2400170530Ssam cnt = (int)n; 2401170530Ssam for (i = 1; i < cnt; i++) { 2402170530Ssam if ((s[i] & 0xc0) != 0x80) { 2403170530Ssam cnt = i; 2404170530Ssam break; 2405170530Ssam } 2406170530Ssam } 2407170530Ssam goto invalid_sequence; 2408178354Ssam } 2409205513Srpaulo 2410173865Ssam /* The code point larger than 0x10FFFF is not legal 2411170530Ssam * Unicode values. */ 2412170530Ssam if (wc > UNICODE_MAX) 2413178354Ssam goto invalid_sequence; 2414173273Ssam /* Correctly gets a Unicode, returns used bytes. */ 2415173273Ssam *pwc = wc; 2416173273Ssam return (cnt); 2417173273Ssaminvalid_sequence: 2418173273Ssam *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ 2419173273Ssam return (cnt * -1); 2420173273Ssam} 2421173273Ssam 2422178354Ssamstatic int 2423173273Ssamutf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2424193655Ssam{ 2425173273Ssam int cnt; 2426173273Ssam 2427173273Ssam cnt = _utf8_to_unicode(pwc, s, n); 2428173865Ssam /* Any of Surrogate pair is not legal Unicode values. */ 2429173865Ssam if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc)) 2430173865Ssam return (-3); 2431173273Ssam return (cnt); 2432173273Ssam} 2433178354Ssam 2434173273Ssamstatic inline uint32_t 2435173273Ssamcombine_surrogate_pair(uint32_t uc, uint32_t uc2) 2436173273Ssam{ 2437178354Ssam uc -= 0xD800; 2438178354Ssam uc *= 0x400; 2439173273Ssam uc += uc2 - 0xDC00; 2440170530Ssam uc += 0x10000; 2441193655Ssam return (uc); 2442170530Ssam} 2443193655Ssam 2444170530Ssam/* 2445170530Ssam * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in 2446170530Ssam * removing surrogate pairs. 2447170530Ssam * 2448170530Ssam * CESU-8: The Compatibility Encoding Scheme for UTF-16. 2449173865Ssam * 2450173865Ssam * Usually return used bytes, return used byte in negative value when 2451173273Ssam * a unicode character is replaced with U+FFFD. 2452170530Ssam */ 2453170530Ssamstatic int 2454170530Ssamcesu8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2455170530Ssam{ 2456170530Ssam uint32_t wc = 0; 2457170530Ssam int cnt; 2458170530Ssam 2459173273Ssam cnt = _utf8_to_unicode(&wc, s, n); 2460173273Ssam if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) { 2461173273Ssam uint32_t wc2 = 0; 2462173273Ssam if (n - 3 < 3) { 2463173273Ssam /* Invalid byte sequence. */ 2464173273Ssam goto invalid_sequence; 2465173273Ssam } 2466170530Ssam cnt = _utf8_to_unicode(&wc2, s+3, n-3); 2467205513Srpaulo if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) { 2468205513Srpaulo /* Invalid byte sequence. */ 2469205513Srpaulo goto invalid_sequence; 2470205513Srpaulo } 2471205513Srpaulo wc = combine_surrogate_pair(wc, wc2); 2472205513Srpaulo cnt = 6; 2473205513Srpaulo } else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) { 2474205513Srpaulo /* Invalid byte sequence. */ 2475170530Ssam goto invalid_sequence; 2476205513Srpaulo } 2477205513Srpaulo *pwc = wc; 2478170530Ssam return (cnt); 2479170530Ssaminvalid_sequence: 2480170530Ssam *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ 2481170530Ssam if (cnt > 0) 2482170530Ssam cnt *= -1; 2483170530Ssam return (cnt); 2484170530Ssam} 2485170530Ssam 2486170530Ssam/* 2487170530Ssam * Convert a Unicode code point to a single UTF-8 sequence. 2488170530Ssam * 2489170530Ssam * NOTE:This function does not check if the Unicode is legal or not. 2490170530Ssam * Please you definitely check it before calling this. 2491170530Ssam */ 2492170530Ssamstatic size_t 2493170530Ssamunicode_to_utf8(char *p, size_t remaining, uint32_t uc) 2494170530Ssam{ 2495170530Ssam char *_p = p; 2496170530Ssam 2497170530Ssam /* Invalid Unicode char maps to Replacement character */ 2498170530Ssam if (uc > UNICODE_MAX) 2499170530Ssam uc = UNICODE_R_CHAR; 2500170530Ssam /* Translate code point to UTF8 */ 2501170530Ssam if (uc <= 0x7f) { 2502170530Ssam if (remaining == 0) 2503170530Ssam return (0); 2504170530Ssam *p++ = (char)uc; 2505170530Ssam } else if (uc <= 0x7ff) { 2506170530Ssam if (remaining < 2) 2507170530Ssam return (0); 2508170530Ssam *p++ = 0xc0 | ((uc >> 6) & 0x1f); 2509170530Ssam *p++ = 0x80 | (uc & 0x3f); 2510170530Ssam } else if (uc <= 0xffff) { 2511170530Ssam if (remaining < 3) 2512170530Ssam return (0); 2513170530Ssam *p++ = 0xe0 | ((uc >> 12) & 0x0f); 2514170530Ssam *p++ = 0x80 | ((uc >> 6) & 0x3f); 2515170530Ssam *p++ = 0x80 | (uc & 0x3f); 2516170530Ssam } else { 2517170530Ssam if (remaining < 4) 2518170530Ssam return (0); 2519170530Ssam *p++ = 0xf0 | ((uc >> 18) & 0x07); 2520170530Ssam *p++ = 0x80 | ((uc >> 12) & 0x3f); 2521170530Ssam *p++ = 0x80 | ((uc >> 6) & 0x3f); 2522170530Ssam *p++ = 0x80 | (uc & 0x3f); 2523170530Ssam } 2524170530Ssam return (p - _p); 2525170530Ssam} 2526170530Ssam 2527170530Ssamstatic int 2528170530Ssamutf16be_to_unicode(uint32_t *pwc, const char *s, size_t n) 2529172211Ssam{ 2530172211Ssam return (utf16_to_unicode(pwc, s, n, 1)); 2531172211Ssam} 2532178354Ssam 2533172211Ssamstatic int 2534172211Ssamutf16le_to_unicode(uint32_t *pwc, const char *s, size_t n) 2535172211Ssam{ 2536178354Ssam return (utf16_to_unicode(pwc, s, n, 0)); 2537178354Ssam} 2538172211Ssam 2539172211Ssamstatic int 2540172211Ssamutf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be) 2541172211Ssam{ 2542178354Ssam const char *utf16 = s; 2543193655Ssam unsigned uc; 2544183256Ssam 2545183256Ssam if (n == 0) 2546183256Ssam return (0); 2547178354Ssam if (n == 1) { 2548172211Ssam /* set the Replacement Character instead. */ 2549178354Ssam *pwc = UNICODE_R_CHAR; 2550172211Ssam return (-1); 2551172211Ssam } 2552172211Ssam 2553178354Ssam if (be) 2554172211Ssam uc = archive_be16dec(utf16); 2555172211Ssam else 2556172211Ssam uc = archive_le16dec(utf16); 2557172211Ssam utf16 += 2; 2558172211Ssam 2559172211Ssam /* If this is a surrogate pair, assemble the full code point.*/ 2560172211Ssam if (IS_HIGH_SURROGATE_LA(uc)) { 2561172211Ssam unsigned uc2; 2562172211Ssam 2563172211Ssam if (n >= 4) { 2564170530Ssam if (be) 2565173273Ssam uc2 = archive_be16dec(utf16); 2566173273Ssam else 2567173273Ssam uc2 = archive_le16dec(utf16); 2568173273Ssam } else 2569170530Ssam uc2 = 0; 2570170530Ssam if (IS_LOW_SURROGATE_LA(uc2)) { 2571170530Ssam uc = combine_surrogate_pair(uc, uc2); 2572170530Ssam utf16 += 2; 2573183256Ssam } else { 2574170530Ssam /* Undescribed code point should be U+FFFD 2575170530Ssam * (replacement character). */ 2576170530Ssam *pwc = UNICODE_R_CHAR; 2577170530Ssam return (-2); 2578170530Ssam } 2579170530Ssam } 2580178354Ssam 2581170530Ssam /* 2582193655Ssam * Surrogate pair values(0xd800 through 0xdfff) are only 2583183256Ssam * used by UTF-16, so, after above calculation, the code 2584183256Ssam * must not be surrogate values, and Unicode has no codes 2585183256Ssam * larger than 0x10ffff. Thus, those are not legal Unicode 2586178354Ssam * values. 2587170530Ssam */ 2588178354Ssam if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) { 2589170530Ssam /* Undescribed code point should be U+FFFD 2590170530Ssam * (replacement character). */ 2591170530Ssam *pwc = UNICODE_R_CHAR; 2592178354Ssam return (((int)(utf16 - s)) * -1); 2593170530Ssam } 2594170530Ssam *pwc = uc; 2595172211Ssam return ((int)(utf16 - s)); 2596170530Ssam} 2597170530Ssam 2598170530Ssamstatic size_t 2599170530Ssamunicode_to_utf16be(char *p, size_t remaining, uint32_t uc) 2600170530Ssam{ 2601170530Ssam char *utf16 = p; 2602170530Ssam 2603170530Ssam if (uc > 0xffff) { 2604170530Ssam /* We have a code point that won't fit into a 2605170530Ssam * wchar_t; convert it to a surrogate pair. */ 2606170530Ssam if (remaining < 4) 2607170530Ssam return (0); 2608170530Ssam uc -= 0x10000; 2609170530Ssam archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 2610170530Ssam archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 2611170530Ssam return (4); 2612170530Ssam } else { 2613170530Ssam if (remaining < 2) 2614170530Ssam return (0); 2615170530Ssam archive_be16enc(utf16, uc); 2616170530Ssam return (2); 2617170530Ssam } 2618170530Ssam} 2619170530Ssam 2620170530Ssamstatic size_t 2621170530Ssamunicode_to_utf16le(char *p, size_t remaining, uint32_t uc) 2622170530Ssam{ 2623170530Ssam char *utf16 = p; 2624170530Ssam 2625170530Ssam if (uc > 0xffff) { 2626170530Ssam /* We have a code point that won't fit into a 2627170530Ssam * wchar_t; convert it to a surrogate pair. */ 2628170530Ssam if (remaining < 4) 2629170530Ssam return (0); 2630170530Ssam uc -= 0x10000; 2631170530Ssam archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 2632 archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 2633 return (4); 2634 } else { 2635 if (remaining < 2) 2636 return (0); 2637 archive_le16enc(utf16, uc); 2638 return (2); 2639 } 2640} 2641 2642/* 2643 * Append new UTF-8 string to existing UTF-8 string. 2644 * Existing string is assumed to already be in proper form; 2645 * the new string will have invalid sequences replaced and 2646 * surrogate pairs canonicalized. 2647 */ 2648static int 2649strncat_from_utf8_to_utf8(struct archive_string *as, const void *_src, 2650 size_t len, struct archive_string_conv *sc) 2651{ 2652 int ret = 0; 2653 const char *src = _src; 2654 (void)sc; /* UNUSED */ 2655 2656 /* Pre-extend the destination */ 2657 if (archive_string_ensure(as, as->length + len + 1) == NULL) 2658 return (-1); 2659 2660 /* Invariant: src points to the first UTF8 byte that hasn't 2661 * been copied to the destination `as`. */ 2662 for (;;) { 2663 int n; 2664 uint32_t uc; 2665 const char *e = src; 2666 2667 /* Skip UTF-8 sequences until we reach end-of-string or 2668 * a code point that needs conversion. */ 2669 while ((n = utf8_to_unicode(&uc, e, len)) > 0) { 2670 e += n; 2671 len -= n; 2672 } 2673 /* Copy the part that doesn't need conversion */ 2674 if (e > src) { 2675 if (archive_string_append(as, src, e - src) == NULL) 2676 return (-1); 2677 src = e; 2678 } 2679 2680 if (n == 0) { 2681 /* We reached end-of-string */ 2682 return (ret); 2683 } else { 2684 /* Next code point needs conversion */ 2685 char t[4]; 2686 size_t w; 2687 2688 /* Try decoding a surrogate pair */ 2689 if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) { 2690 n = cesu8_to_unicode(&uc, src, len); 2691 } 2692 /* Not a (valid) surrogate, so use a replacement char */ 2693 if (n < 0) { 2694 ret = -1; /* Return -1 if we used any replacement */ 2695 n *= -1; 2696 } 2697 /* Consume converted code point */ 2698 src += n; 2699 len -= n; 2700 /* Convert and append new UTF-8 sequence. */ 2701 w = unicode_to_utf8(t, sizeof(t), uc); 2702 if (archive_string_append(as, t, w) == NULL) 2703 return (-1); 2704 } 2705 } 2706} 2707 2708static int 2709archive_string_append_unicode(struct archive_string *as, const void *_p, 2710 size_t len, struct archive_string_conv *sc) 2711{ 2712 const char *s; 2713 char *p, *endp; 2714 uint32_t uc; 2715 size_t w; 2716 int n, ret = 0, ts, tm; 2717 int (*parse)(uint32_t *, const char *, size_t); 2718 size_t (*unparse)(char *, size_t, uint32_t); 2719 2720 if (sc->flag & SCONV_TO_UTF16BE) { 2721 unparse = unicode_to_utf16be; 2722 ts = 2; 2723 } else if (sc->flag & SCONV_TO_UTF16LE) { 2724 unparse = unicode_to_utf16le; 2725 ts = 2; 2726 } else if (sc->flag & SCONV_TO_UTF8) { 2727 unparse = unicode_to_utf8; 2728 ts = 1; 2729 } else { 2730 /* 2731 * This case is going to be converted to another 2732 * character-set through iconv. 2733 */ 2734 if (sc->flag & SCONV_FROM_UTF16BE) { 2735 unparse = unicode_to_utf16be; 2736 ts = 2; 2737 } else if (sc->flag & SCONV_FROM_UTF16LE) { 2738 unparse = unicode_to_utf16le; 2739 ts = 2; 2740 } else { 2741 unparse = unicode_to_utf8; 2742 ts = 1; 2743 } 2744 } 2745 2746 if (sc->flag & SCONV_FROM_UTF16BE) { 2747 parse = utf16be_to_unicode; 2748 tm = 1; 2749 } else if (sc->flag & SCONV_FROM_UTF16LE) { 2750 parse = utf16le_to_unicode; 2751 tm = 1; 2752 } else { 2753 parse = cesu8_to_unicode; 2754 tm = ts; 2755 } 2756 2757 if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) 2758 return (-1); 2759 2760 s = (const char *)_p; 2761 p = as->s + as->length; 2762 endp = as->s + as->buffer_length - ts; 2763 while ((n = parse(&uc, s, len)) != 0) { 2764 if (n < 0) { 2765 /* Use a replaced unicode character. */ 2766 n *= -1; 2767 ret = -1; 2768 } 2769 s += n; 2770 len -= n; 2771 while ((w = unparse(p, endp - p, uc)) == 0) { 2772 /* There is not enough output buffer so 2773 * we have to expand it. */ 2774 as->length = p - as->s; 2775 if (archive_string_ensure(as, 2776 as->buffer_length + len * tm + ts) == NULL) 2777 return (-1); 2778 p = as->s + as->length; 2779 endp = as->s + as->buffer_length - ts; 2780 } 2781 p += w; 2782 } 2783 as->length = p - as->s; 2784 as->s[as->length] = '\0'; 2785 if (ts == 2) 2786 as->s[as->length+1] = '\0'; 2787 return (ret); 2788} 2789 2790/* 2791 * Following Constants for Hangul compositions this information comes from 2792 * Unicode Standard Annex #15 http://unicode.org/reports/tr15/ 2793 */ 2794#define HC_SBASE 0xAC00 2795#define HC_LBASE 0x1100 2796#define HC_VBASE 0x1161 2797#define HC_TBASE 0x11A7 2798#define HC_LCOUNT 19 2799#define HC_VCOUNT 21 2800#define HC_TCOUNT 28 2801#define HC_NCOUNT (HC_VCOUNT * HC_TCOUNT) 2802#define HC_SCOUNT (HC_LCOUNT * HC_NCOUNT) 2803 2804static uint32_t 2805get_nfc(uint32_t uc, uint32_t uc2) 2806{ 2807 int t, b; 2808 2809 t = 0; 2810 b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1; 2811 while (b >= t) { 2812 int m = (t + b) / 2; 2813 if (u_composition_table[m].cp1 < uc) 2814 t = m + 1; 2815 else if (u_composition_table[m].cp1 > uc) 2816 b = m - 1; 2817 else if (u_composition_table[m].cp2 < uc2) 2818 t = m + 1; 2819 else if (u_composition_table[m].cp2 > uc2) 2820 b = m - 1; 2821 else 2822 return (u_composition_table[m].nfc); 2823 } 2824 return (0); 2825} 2826 2827#define FDC_MAX 10 /* The maximum number of Following Decomposable 2828 * Characters. */ 2829 2830/* 2831 * Update first code point. 2832 */ 2833#define UPDATE_UC(new_uc) do { \ 2834 uc = new_uc; \ 2835 ucptr = NULL; \ 2836} while (0) 2837 2838/* 2839 * Replace first code point with second code point. 2840 */ 2841#define REPLACE_UC_WITH_UC2() do { \ 2842 uc = uc2; \ 2843 ucptr = uc2ptr; \ 2844 n = n2; \ 2845} while (0) 2846 2847#define EXPAND_BUFFER() do { \ 2848 as->length = p - as->s; \ 2849 if (archive_string_ensure(as, \ 2850 as->buffer_length + len * tm + ts) == NULL)\ 2851 return (-1); \ 2852 p = as->s + as->length; \ 2853 endp = as->s + as->buffer_length - ts; \ 2854} while (0) 2855 2856#define UNPARSE(p, endp, uc) do { \ 2857 while ((w = unparse(p, (endp) - (p), uc)) == 0) {\ 2858 EXPAND_BUFFER(); \ 2859 } \ 2860 p += w; \ 2861} while (0) 2862 2863/* 2864 * Write first code point. 2865 * If the code point has not be changed from its original code, 2866 * this just copies it from its original buffer pointer. 2867 * If not, this converts it to UTF-8 byte sequence and copies it. 2868 */ 2869#define WRITE_UC() do { \ 2870 if (ucptr) { \ 2871 if (p + n > endp) \ 2872 EXPAND_BUFFER(); \ 2873 switch (n) { \ 2874 case 4: \ 2875 *p++ = *ucptr++; \ 2876 /* FALL THROUGH */ \ 2877 case 3: \ 2878 *p++ = *ucptr++; \ 2879 /* FALL THROUGH */ \ 2880 case 2: \ 2881 *p++ = *ucptr++; \ 2882 /* FALL THROUGH */ \ 2883 case 1: \ 2884 *p++ = *ucptr; \ 2885 break; \ 2886 } \ 2887 ucptr = NULL; \ 2888 } else { \ 2889 UNPARSE(p, endp, uc); \ 2890 } \ 2891} while (0) 2892 2893/* 2894 * Collect following decomposable code points. 2895 */ 2896#define COLLECT_CPS(start) do { \ 2897 int _i; \ 2898 for (_i = start; _i < FDC_MAX ; _i++) { \ 2899 nx = parse(&ucx[_i], s, len); \ 2900 if (nx <= 0) \ 2901 break; \ 2902 cx = CCC(ucx[_i]); \ 2903 if (cl >= cx && cl != 228 && cx != 228)\ 2904 break; \ 2905 s += nx; \ 2906 len -= nx; \ 2907 cl = cx; \ 2908 ccx[_i] = cx; \ 2909 } \ 2910 if (_i >= FDC_MAX) { \ 2911 ret = -1; \ 2912 ucx_size = FDC_MAX; \ 2913 } else \ 2914 ucx_size = _i; \ 2915} while (0) 2916 2917/* 2918 * Normalize UTF-8/UTF-16BE characters to Form C and copy the result. 2919 * 2920 * TODO: Convert composition exclusions, which are never converted 2921 * from NFC,NFD,NFKC and NFKD, to Form C. 2922 */ 2923static int 2924archive_string_normalize_C(struct archive_string *as, const void *_p, 2925 size_t len, struct archive_string_conv *sc) 2926{ 2927 const char *s = (const char *)_p; 2928 char *p, *endp; 2929 uint32_t uc, uc2; 2930 size_t w; 2931 int always_replace, n, n2, ret = 0, spair, ts, tm; 2932 int (*parse)(uint32_t *, const char *, size_t); 2933 size_t (*unparse)(char *, size_t, uint32_t); 2934 2935 always_replace = 1; 2936 ts = 1;/* text size. */ 2937 if (sc->flag & SCONV_TO_UTF16BE) { 2938 unparse = unicode_to_utf16be; 2939 ts = 2; 2940 if (sc->flag & SCONV_FROM_UTF16BE) 2941 always_replace = 0; 2942 } else if (sc->flag & SCONV_TO_UTF16LE) { 2943 unparse = unicode_to_utf16le; 2944 ts = 2; 2945 if (sc->flag & SCONV_FROM_UTF16LE) 2946 always_replace = 0; 2947 } else if (sc->flag & SCONV_TO_UTF8) { 2948 unparse = unicode_to_utf8; 2949 if (sc->flag & SCONV_FROM_UTF8) 2950 always_replace = 0; 2951 } else { 2952 /* 2953 * This case is going to be converted to another 2954 * character-set through iconv. 2955 */ 2956 always_replace = 0; 2957 if (sc->flag & SCONV_FROM_UTF16BE) { 2958 unparse = unicode_to_utf16be; 2959 ts = 2; 2960 } else if (sc->flag & SCONV_FROM_UTF16LE) { 2961 unparse = unicode_to_utf16le; 2962 ts = 2; 2963 } else { 2964 unparse = unicode_to_utf8; 2965 } 2966 } 2967 2968 if (sc->flag & SCONV_FROM_UTF16BE) { 2969 parse = utf16be_to_unicode; 2970 tm = 1; 2971 spair = 4;/* surrogate pair size in UTF-16. */ 2972 } else if (sc->flag & SCONV_FROM_UTF16LE) { 2973 parse = utf16le_to_unicode; 2974 tm = 1; 2975 spair = 4;/* surrogate pair size in UTF-16. */ 2976 } else { 2977 parse = cesu8_to_unicode; 2978 tm = ts; 2979 spair = 6;/* surrogate pair size in UTF-8. */ 2980 } 2981 2982 if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) 2983 return (-1); 2984 2985 p = as->s + as->length; 2986 endp = as->s + as->buffer_length - ts; 2987 while ((n = parse(&uc, s, len)) != 0) { 2988 const char *ucptr, *uc2ptr; 2989 2990 if (n < 0) { 2991 /* Use a replaced unicode character. */ 2992 UNPARSE(p, endp, uc); 2993 s += n*-1; 2994 len -= n*-1; 2995 ret = -1; 2996 continue; 2997 } else if (n == spair || always_replace) 2998 /* uc is converted from a surrogate pair. 2999 * this should be treated as a changed code. */ 3000 ucptr = NULL; 3001 else 3002 ucptr = s; 3003 s += n; 3004 len -= n; 3005 3006 /* Read second code point. */ 3007 while ((n2 = parse(&uc2, s, len)) > 0) { 3008 uint32_t ucx[FDC_MAX]; 3009 int ccx[FDC_MAX]; 3010 int cl, cx, i, nx, ucx_size; 3011 int LIndex,SIndex; 3012 uint32_t nfc; 3013 3014 if (n2 == spair || always_replace) 3015 /* uc2 is converted from a surrogate pair. 3016 * this should be treated as a changed code. */ 3017 uc2ptr = NULL; 3018 else 3019 uc2ptr = s; 3020 s += n2; 3021 len -= n2; 3022 3023 /* 3024 * If current second code point is out of decomposable 3025 * code points, finding compositions is unneeded. 3026 */ 3027 if (!IS_DECOMPOSABLE_BLOCK(uc2)) { 3028 WRITE_UC(); 3029 REPLACE_UC_WITH_UC2(); 3030 continue; 3031 } 3032 3033 /* 3034 * Try to combine current code points. 3035 */ 3036 /* 3037 * We have to combine Hangul characters according to 3038 * http://uniicode.org/reports/tr15/#Hangul 3039 */ 3040 if (0 <= (LIndex = uc - HC_LBASE) && 3041 LIndex < HC_LCOUNT) { 3042 /* 3043 * Hangul Composition. 3044 * 1. Two current code points are L and V. 3045 */ 3046 int VIndex = uc2 - HC_VBASE; 3047 if (0 <= VIndex && VIndex < HC_VCOUNT) { 3048 /* Make syllable of form LV. */ 3049 UPDATE_UC(HC_SBASE + 3050 (LIndex * HC_VCOUNT + VIndex) * 3051 HC_TCOUNT); 3052 } else { 3053 WRITE_UC(); 3054 REPLACE_UC_WITH_UC2(); 3055 } 3056 continue; 3057 } else if (0 <= (SIndex = uc - HC_SBASE) && 3058 SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) { 3059 /* 3060 * Hangul Composition. 3061 * 2. Two current code points are LV and T. 3062 */ 3063 int TIndex = uc2 - HC_TBASE; 3064 if (0 < TIndex && TIndex < HC_TCOUNT) { 3065 /* Make syllable of form LVT. */ 3066 UPDATE_UC(uc + TIndex); 3067 } else { 3068 WRITE_UC(); 3069 REPLACE_UC_WITH_UC2(); 3070 } 3071 continue; 3072 } else if ((nfc = get_nfc(uc, uc2)) != 0) { 3073 /* A composition to current code points 3074 * is found. */ 3075 UPDATE_UC(nfc); 3076 continue; 3077 } else if ((cl = CCC(uc2)) == 0) { 3078 /* Clearly 'uc2' the second code point is not 3079 * a decomposable code. */ 3080 WRITE_UC(); 3081 REPLACE_UC_WITH_UC2(); 3082 continue; 3083 } 3084 3085 /* 3086 * Collect following decomposable code points. 3087 */ 3088 cx = 0; 3089 ucx[0] = uc2; 3090 ccx[0] = cl; 3091 COLLECT_CPS(1); 3092 3093 /* 3094 * Find a composed code in the collected code points. 3095 */ 3096 i = 1; 3097 while (i < ucx_size) { 3098 int j; 3099 3100 if ((nfc = get_nfc(uc, ucx[i])) == 0) { 3101 i++; 3102 continue; 3103 } 3104 3105 /* 3106 * nfc is composed of uc and ucx[i]. 3107 */ 3108 UPDATE_UC(nfc); 3109 3110 /* 3111 * Remove ucx[i] by shifting 3112 * following code points. 3113 */ 3114 for (j = i; j+1 < ucx_size; j++) { 3115 ucx[j] = ucx[j+1]; 3116 ccx[j] = ccx[j+1]; 3117 } 3118 ucx_size --; 3119 3120 /* 3121 * Collect following code points blocked 3122 * by ucx[i] the removed code point. 3123 */ 3124 if (ucx_size > 0 && i == ucx_size && 3125 nx > 0 && cx == cl) { 3126 cl = ccx[ucx_size-1]; 3127 COLLECT_CPS(ucx_size); 3128 } 3129 /* 3130 * Restart finding a composed code with 3131 * the updated uc from the top of the 3132 * collected code points. 3133 */ 3134 i = 0; 3135 } 3136 3137 /* 3138 * Apparently the current code points are not 3139 * decomposed characters or already composed. 3140 */ 3141 WRITE_UC(); 3142 for (i = 0; i < ucx_size; i++) 3143 UNPARSE(p, endp, ucx[i]); 3144 3145 /* 3146 * Flush out remaining canonical combining characters. 3147 */ 3148 if (nx > 0 && cx == cl && len > 0) { 3149 while ((nx = parse(&ucx[0], s, len)) 3150 > 0) { 3151 cx = CCC(ucx[0]); 3152 if (cl > cx) 3153 break; 3154 s += nx; 3155 len -= nx; 3156 cl = cx; 3157 UNPARSE(p, endp, ucx[0]); 3158 } 3159 } 3160 break; 3161 } 3162 if (n2 < 0) { 3163 WRITE_UC(); 3164 /* Use a replaced unicode character. */ 3165 UNPARSE(p, endp, uc2); 3166 s += n2*-1; 3167 len -= n2*-1; 3168 ret = -1; 3169 continue; 3170 } else if (n2 == 0) { 3171 WRITE_UC(); 3172 break; 3173 } 3174 } 3175 as->length = p - as->s; 3176 as->s[as->length] = '\0'; 3177 if (ts == 2) 3178 as->s[as->length+1] = '\0'; 3179 return (ret); 3180} 3181 3182static int 3183get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc) 3184{ 3185 int t, b; 3186 3187 /* 3188 * These are not converted to NFD on Mac OS. 3189 */ 3190 if ((uc >= 0x2000 && uc <= 0x2FFF) || 3191 (uc >= 0xF900 && uc <= 0xFAFF) || 3192 (uc >= 0x2F800 && uc <= 0x2FAFF)) 3193 return (0); 3194 /* 3195 * Those code points are not converted to NFD on Mac OS. 3196 * I do not know the reason because it is undocumented. 3197 * NFC NFD 3198 * 1109A ==> 11099 110BA 3199 * 1109C ==> 1109B 110BA 3200 * 110AB ==> 110A5 110BA 3201 */ 3202 if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB) 3203 return (0); 3204 3205 t = 0; 3206 b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1; 3207 while (b >= t) { 3208 int m = (t + b) / 2; 3209 if (u_decomposition_table[m].nfc < uc) 3210 t = m + 1; 3211 else if (u_decomposition_table[m].nfc > uc) 3212 b = m - 1; 3213 else { 3214 *cp1 = u_decomposition_table[m].cp1; 3215 *cp2 = u_decomposition_table[m].cp2; 3216 return (1); 3217 } 3218 } 3219 return (0); 3220} 3221 3222#define REPLACE_UC_WITH(cp) do { \ 3223 uc = cp; \ 3224 ucptr = NULL; \ 3225} while (0) 3226 3227/* 3228 * Normalize UTF-8 characters to Form D and copy the result. 3229 */ 3230static int 3231archive_string_normalize_D(struct archive_string *as, const void *_p, 3232 size_t len, struct archive_string_conv *sc) 3233{ 3234 const char *s = (const char *)_p; 3235 char *p, *endp; 3236 uint32_t uc, uc2; 3237 size_t w; 3238 int always_replace, n, n2, ret = 0, spair, ts, tm; 3239 int (*parse)(uint32_t *, const char *, size_t); 3240 size_t (*unparse)(char *, size_t, uint32_t); 3241 3242 always_replace = 1; 3243 ts = 1;/* text size. */ 3244 if (sc->flag & SCONV_TO_UTF16BE) { 3245 unparse = unicode_to_utf16be; 3246 ts = 2; 3247 if (sc->flag & SCONV_FROM_UTF16BE) 3248 always_replace = 0; 3249 } else if (sc->flag & SCONV_TO_UTF16LE) { 3250 unparse = unicode_to_utf16le; 3251 ts = 2; 3252 if (sc->flag & SCONV_FROM_UTF16LE) 3253 always_replace = 0; 3254 } else if (sc->flag & SCONV_TO_UTF8) { 3255 unparse = unicode_to_utf8; 3256 if (sc->flag & SCONV_FROM_UTF8) 3257 always_replace = 0; 3258 } else { 3259 /* 3260 * This case is going to be converted to another 3261 * character-set through iconv. 3262 */ 3263 always_replace = 0; 3264 if (sc->flag & SCONV_FROM_UTF16BE) { 3265 unparse = unicode_to_utf16be; 3266 ts = 2; 3267 } else if (sc->flag & SCONV_FROM_UTF16LE) { 3268 unparse = unicode_to_utf16le; 3269 ts = 2; 3270 } else { 3271 unparse = unicode_to_utf8; 3272 } 3273 } 3274 3275 if (sc->flag & SCONV_FROM_UTF16BE) { 3276 parse = utf16be_to_unicode; 3277 tm = 1; 3278 spair = 4;/* surrogate pair size in UTF-16. */ 3279 } else if (sc->flag & SCONV_FROM_UTF16LE) { 3280 parse = utf16le_to_unicode; 3281 tm = 1; 3282 spair = 4;/* surrogate pair size in UTF-16. */ 3283 } else { 3284 parse = cesu8_to_unicode; 3285 tm = ts; 3286 spair = 6;/* surrogate pair size in UTF-8. */ 3287 } 3288 3289 if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) 3290 return (-1); 3291 3292 p = as->s + as->length; 3293 endp = as->s + as->buffer_length - ts; 3294 while ((n = parse(&uc, s, len)) != 0) { 3295 const char *ucptr; 3296 uint32_t cp1, cp2; 3297 int SIndex; 3298 struct { 3299 uint32_t uc; 3300 int ccc; 3301 } fdc[FDC_MAX]; 3302 int fdi, fdj; 3303 int ccc; 3304 3305check_first_code: 3306 if (n < 0) { 3307 /* Use a replaced unicode character. */ 3308 UNPARSE(p, endp, uc); 3309 s += n*-1; 3310 len -= n*-1; 3311 ret = -1; 3312 continue; 3313 } else if (n == spair || always_replace) 3314 /* uc is converted from a surrogate pair. 3315 * this should be treated as a changed code. */ 3316 ucptr = NULL; 3317 else 3318 ucptr = s; 3319 s += n; 3320 len -= n; 3321 3322 /* Hangul Decomposition. */ 3323 if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) { 3324 int L = HC_LBASE + SIndex / HC_NCOUNT; 3325 int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT; 3326 int T = HC_TBASE + SIndex % HC_TCOUNT; 3327 3328 REPLACE_UC_WITH(L); 3329 WRITE_UC(); 3330 REPLACE_UC_WITH(V); 3331 WRITE_UC(); 3332 if (T != HC_TBASE) { 3333 REPLACE_UC_WITH(T); 3334 WRITE_UC(); 3335 } 3336 continue; 3337 } 3338 if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) { 3339 WRITE_UC(); 3340 continue; 3341 } 3342 3343 fdi = 0; 3344 while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) { 3345 int k; 3346 3347 for (k = fdi; k > 0; k--) 3348 fdc[k] = fdc[k-1]; 3349 fdc[0].ccc = CCC(cp2); 3350 fdc[0].uc = cp2; 3351 fdi++; 3352 REPLACE_UC_WITH(cp1); 3353 } 3354 3355 /* Read following code points. */ 3356 while ((n2 = parse(&uc2, s, len)) > 0 && 3357 (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) { 3358 int j, k; 3359 3360 s += n2; 3361 len -= n2; 3362 for (j = 0; j < fdi; j++) { 3363 if (fdc[j].ccc > ccc) 3364 break; 3365 } 3366 if (j < fdi) { 3367 for (k = fdi; k > j; k--) 3368 fdc[k] = fdc[k-1]; 3369 fdc[j].ccc = ccc; 3370 fdc[j].uc = uc2; 3371 } else { 3372 fdc[fdi].ccc = ccc; 3373 fdc[fdi].uc = uc2; 3374 } 3375 fdi++; 3376 } 3377 3378 WRITE_UC(); 3379 for (fdj = 0; fdj < fdi; fdj++) { 3380 REPLACE_UC_WITH(fdc[fdj].uc); 3381 WRITE_UC(); 3382 } 3383 3384 if (n2 == 0) 3385 break; 3386 REPLACE_UC_WITH(uc2); 3387 n = n2; 3388 goto check_first_code; 3389 } 3390 as->length = p - as->s; 3391 as->s[as->length] = '\0'; 3392 if (ts == 2) 3393 as->s[as->length+1] = '\0'; 3394 return (ret); 3395} 3396 3397/* 3398 * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption 3399 * that WCS is Unicode. It is true for several platforms but some are false. 3400 * And then people who did not use UTF-8 locale on the non Unicode WCS 3401 * platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those 3402 * now cannot get right filename from libarchive 3.x and later since we 3403 * fixed the wrong assumption and it is incompatible to older its versions. 3404 * So we provide special option, "compat-2x.x", for resolving it. 3405 * That option enable the string conversion of libarchive 2.x. 3406 * 3407 * Translates the wrong UTF-8 string made by libarchive 2.x into current 3408 * locale character set and appends to the archive_string. 3409 * Note: returns -1 if conversion fails. 3410 */ 3411static int 3412strncat_from_utf8_libarchive2(struct archive_string *as, 3413 const void *_p, size_t len, struct archive_string_conv *sc) 3414{ 3415 const char *s; 3416 int n; 3417 char *p; 3418 char *end; 3419 uint32_t unicode; 3420#if HAVE_WCRTOMB 3421 mbstate_t shift_state; 3422 3423 memset(&shift_state, 0, sizeof(shift_state)); 3424#else 3425 /* Clear the shift state before starting. */ 3426 wctomb(NULL, L'\0'); 3427#endif 3428 (void)sc; /* UNUSED */ 3429 /* 3430 * Allocate buffer for MBS. 3431 * We need this allocation here since it is possible that 3432 * as->s is still NULL. 3433 */ 3434 if (archive_string_ensure(as, as->length + len + 1) == NULL) 3435 return (-1); 3436 3437 s = (const char *)_p; 3438 p = as->s + as->length; 3439 end = as->s + as->buffer_length - MB_CUR_MAX -1; 3440 while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) { 3441 wchar_t wc; 3442 3443 if (p >= end) { 3444 as->length = p - as->s; 3445 /* Re-allocate buffer for MBS. */ 3446 if (archive_string_ensure(as, 3447 as->length + max(len * 2, 3448 (size_t)MB_CUR_MAX) + 1) == NULL) 3449 return (-1); 3450 p = as->s + as->length; 3451 end = as->s + as->buffer_length - MB_CUR_MAX -1; 3452 } 3453 3454 /* 3455 * As libarchive 2.x, translates the UTF-8 characters into 3456 * wide-characters in the assumption that WCS is Unicode. 3457 */ 3458 if (n < 0) { 3459 n *= -1; 3460 wc = L'?'; 3461 } else 3462 wc = (wchar_t)unicode; 3463 3464 s += n; 3465 len -= n; 3466 /* 3467 * Translates the wide-character into the current locale MBS. 3468 */ 3469#if HAVE_WCRTOMB 3470 n = (int)wcrtomb(p, wc, &shift_state); 3471#else 3472 n = (int)wctomb(p, wc); 3473#endif 3474 if (n == -1) 3475 return (-1); 3476 p += n; 3477 } 3478 as->length = p - as->s; 3479 as->s[as->length] = '\0'; 3480 return (0); 3481} 3482 3483 3484/* 3485 * Conversion functions between current locale dependent MBS and UTF-16BE. 3486 * strncat_from_utf16be() : UTF-16BE --> MBS 3487 * strncat_to_utf16be() : MBS --> UTF16BE 3488 */ 3489 3490#if defined(_WIN32) && !defined(__CYGWIN__) 3491 3492/* 3493 * Convert a UTF-16BE/LE string to current locale and copy the result. 3494 * Return -1 if conversion fails. 3495 */ 3496static int 3497win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes, 3498 struct archive_string_conv *sc, int be) 3499{ 3500 struct archive_string tmp; 3501 const char *u16; 3502 int ll; 3503 BOOL defchar; 3504 char *mbs; 3505 size_t mbs_size, b; 3506 int ret = 0; 3507 3508 bytes &= ~1; 3509 if (archive_string_ensure(as, as->length + bytes +1) == NULL) 3510 return (-1); 3511 3512 mbs = as->s + as->length; 3513 mbs_size = as->buffer_length - as->length -1; 3514 3515 if (sc->to_cp == CP_C_LOCALE) { 3516 /* 3517 * "C" locale special process. 3518 */ 3519 u16 = _p; 3520 ll = 0; 3521 for (b = 0; b < bytes; b += 2) { 3522 uint16_t val; 3523 if (be) 3524 val = archive_be16dec(u16+b); 3525 else 3526 val = archive_le16dec(u16+b); 3527 if (val > 255) { 3528 *mbs++ = '?'; 3529 ret = -1; 3530 } else 3531 *mbs++ = (char)(val&0xff); 3532 ll++; 3533 } 3534 as->length += ll; 3535 as->s[as->length] = '\0'; 3536 return (ret); 3537 } 3538 3539 archive_string_init(&tmp); 3540 if (be) { 3541 if (is_big_endian()) { 3542 u16 = _p; 3543 } else { 3544 if (archive_string_ensure(&tmp, bytes+2) == NULL) 3545 return (-1); 3546 memcpy(tmp.s, _p, bytes); 3547 for (b = 0; b < bytes; b += 2) { 3548 uint16_t val = archive_be16dec(tmp.s+b); 3549 archive_le16enc(tmp.s+b, val); 3550 } 3551 u16 = tmp.s; 3552 } 3553 } else { 3554 if (!is_big_endian()) { 3555 u16 = _p; 3556 } else { 3557 if (archive_string_ensure(&tmp, bytes+2) == NULL) 3558 return (-1); 3559 memcpy(tmp.s, _p, bytes); 3560 for (b = 0; b < bytes; b += 2) { 3561 uint16_t val = archive_le16dec(tmp.s+b); 3562 archive_be16enc(tmp.s+b, val); 3563 } 3564 u16 = tmp.s; 3565 } 3566 } 3567 3568 do { 3569 defchar = 0; 3570 ll = WideCharToMultiByte(sc->to_cp, 0, 3571 (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size, 3572 NULL, &defchar); 3573 /* Exit loop if we succeeded */ 3574 if (ll != 0 || 3575 GetLastError() != ERROR_INSUFFICIENT_BUFFER) { 3576 break; 3577 } 3578 /* Else expand buffer and loop to try again. */ 3579 ll = WideCharToMultiByte(sc->to_cp, 0, 3580 (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL); 3581 if (archive_string_ensure(as, ll +1) == NULL) 3582 return (-1); 3583 mbs = as->s + as->length; 3584 mbs_size = as->buffer_length - as->length -1; 3585 } while (1); 3586 archive_string_free(&tmp); 3587 as->length += ll; 3588 as->s[as->length] = '\0'; 3589 if (ll == 0 || defchar) 3590 ret = -1; 3591 return (ret); 3592} 3593 3594static int 3595win_strncat_from_utf16be(struct archive_string *as, const void *_p, 3596 size_t bytes, struct archive_string_conv *sc) 3597{ 3598 return (win_strncat_from_utf16(as, _p, bytes, sc, 1)); 3599} 3600 3601static int 3602win_strncat_from_utf16le(struct archive_string *as, const void *_p, 3603 size_t bytes, struct archive_string_conv *sc) 3604{ 3605 return (win_strncat_from_utf16(as, _p, bytes, sc, 0)); 3606} 3607 3608static int 3609is_big_endian(void) 3610{ 3611 uint16_t d = 1; 3612 3613 return (archive_be16dec(&d) == 1); 3614} 3615 3616/* 3617 * Convert a current locale string to UTF-16BE/LE and copy the result. 3618 * Return -1 if conversion fails. 3619 */ 3620static int 3621win_strncat_to_utf16(struct archive_string *as16, const void *_p, 3622 size_t length, struct archive_string_conv *sc, int bigendian) 3623{ 3624 const char *s = (const char *)_p; 3625 char *u16; 3626 size_t count, avail; 3627 3628 if (archive_string_ensure(as16, 3629 as16->length + (length + 1) * 2) == NULL) 3630 return (-1); 3631 3632 u16 = as16->s + as16->length; 3633 avail = as16->buffer_length - 2; 3634 if (sc->from_cp == CP_C_LOCALE) { 3635 /* 3636 * "C" locale special process. 3637 */ 3638 count = 0; 3639 while (count < length && *s) { 3640 if (bigendian) 3641 archive_be16enc(u16, *s); 3642 else 3643 archive_le16enc(u16, *s); 3644 u16 += 2; 3645 s++; 3646 count++; 3647 } 3648 as16->length += count << 1; 3649 as16->s[as16->length] = 0; 3650 as16->s[as16->length+1] = 0; 3651 return (0); 3652 } 3653 do { 3654 count = MultiByteToWideChar(sc->from_cp, 3655 MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1); 3656 /* Exit loop if we succeeded */ 3657 if (count != 0 || 3658 GetLastError() != ERROR_INSUFFICIENT_BUFFER) { 3659 break; 3660 } 3661 /* Expand buffer and try again */ 3662 count = MultiByteToWideChar(sc->from_cp, 3663 MB_PRECOMPOSED, s, (int)length, NULL, 0); 3664 if (archive_string_ensure(as16, (count +1) * 2) 3665 == NULL) 3666 return (-1); 3667 u16 = as16->s + as16->length; 3668 avail = as16->buffer_length - 2; 3669 } while (1); 3670 as16->length += count * 2; 3671 as16->s[as16->length] = 0; 3672 as16->s[as16->length+1] = 0; 3673 if (count == 0) 3674 return (-1); 3675 3676 if (is_big_endian()) { 3677 if (!bigendian) { 3678 while (count > 0) { 3679 uint16_t v = archive_be16dec(u16); 3680 archive_le16enc(u16, v); 3681 u16 += 2; 3682 count--; 3683 } 3684 } 3685 } else { 3686 if (bigendian) { 3687 while (count > 0) { 3688 uint16_t v = archive_le16dec(u16); 3689 archive_be16enc(u16, v); 3690 u16 += 2; 3691 count--; 3692 } 3693 } 3694 } 3695 return (0); 3696} 3697 3698static int 3699win_strncat_to_utf16be(struct archive_string *as16, const void *_p, 3700 size_t length, struct archive_string_conv *sc) 3701{ 3702 return (win_strncat_to_utf16(as16, _p, length, sc, 1)); 3703} 3704 3705static int 3706win_strncat_to_utf16le(struct archive_string *as16, const void *_p, 3707 size_t length, struct archive_string_conv *sc) 3708{ 3709 return (win_strncat_to_utf16(as16, _p, length, sc, 0)); 3710} 3711 3712#endif /* _WIN32 && !__CYGWIN__ */ 3713 3714/* 3715 * Do the best effort for conversions. 3716 * We cannot handle UTF-16BE character-set without such iconv, 3717 * but there is a chance if a string consists just ASCII code or 3718 * a current locale is UTF-8. 3719 */ 3720 3721/* 3722 * Convert a UTF-16BE string to current locale and copy the result. 3723 * Return -1 if conversion fails. 3724 */ 3725static int 3726best_effort_strncat_from_utf16(struct archive_string *as, const void *_p, 3727 size_t bytes, struct archive_string_conv *sc, int be) 3728{ 3729 const char *utf16 = (const char *)_p; 3730 char *mbs; 3731 uint32_t uc; 3732 int n, ret; 3733 3734 (void)sc; /* UNUSED */ 3735 /* 3736 * Other case, we should do the best effort. 3737 * If all character are ASCII(<0x7f), we can convert it. 3738 * if not , we set a alternative character and return -1. 3739 */ 3740 ret = 0; 3741 if (archive_string_ensure(as, as->length + bytes +1) == NULL) 3742 return (-1); 3743 mbs = as->s + as->length; 3744 3745 while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) { 3746 if (n < 0) { 3747 n *= -1; 3748 ret = -1; 3749 } 3750 bytes -= n; 3751 utf16 += n; 3752 3753 if (uc > 127) { 3754 /* We cannot handle it. */ 3755 *mbs++ = '?'; 3756 ret = -1; 3757 } else 3758 *mbs++ = (char)uc; 3759 } 3760 as->length = mbs - as->s; 3761 as->s[as->length] = '\0'; 3762 return (ret); 3763} 3764 3765static int 3766best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p, 3767 size_t bytes, struct archive_string_conv *sc) 3768{ 3769 return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 1)); 3770} 3771 3772static int 3773best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p, 3774 size_t bytes, struct archive_string_conv *sc) 3775{ 3776 return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 0)); 3777} 3778 3779/* 3780 * Convert a current locale string to UTF-16BE/LE and copy the result. 3781 * Return -1 if conversion fails. 3782 */ 3783static int 3784best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p, 3785 size_t length, struct archive_string_conv *sc, int bigendian) 3786{ 3787 const char *s = (const char *)_p; 3788 char *utf16; 3789 size_t remaining; 3790 int ret; 3791 3792 (void)sc; /* UNUSED */ 3793 /* 3794 * Other case, we should do the best effort. 3795 * If all character are ASCII(<0x7f), we can convert it. 3796 * if not , we set a alternative character and return -1. 3797 */ 3798 ret = 0; 3799 remaining = length; 3800 3801 if (archive_string_ensure(as16, 3802 as16->length + (length + 1) * 2) == NULL) 3803 return (-1); 3804 3805 utf16 = as16->s + as16->length; 3806 while (remaining--) { 3807 unsigned c = *s++; 3808 if (c > 127) { 3809 /* We cannot handle it. */ 3810 c = UNICODE_R_CHAR; 3811 ret = -1; 3812 } 3813 if (bigendian) 3814 archive_be16enc(utf16, c); 3815 else 3816 archive_le16enc(utf16, c); 3817 utf16 += 2; 3818 } 3819 as16->length = utf16 - as16->s; 3820 as16->s[as16->length] = 0; 3821 as16->s[as16->length+1] = 0; 3822 return (ret); 3823} 3824 3825static int 3826best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p, 3827 size_t length, struct archive_string_conv *sc) 3828{ 3829 return (best_effort_strncat_to_utf16(as16, _p, length, sc, 1)); 3830} 3831 3832static int 3833best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p, 3834 size_t length, struct archive_string_conv *sc) 3835{ 3836 return (best_effort_strncat_to_utf16(as16, _p, length, sc, 0)); 3837} 3838 3839 3840/* 3841 * Multistring operations. 3842 */ 3843 3844void 3845archive_mstring_clean(struct archive_mstring *aes) 3846{ 3847 archive_wstring_free(&(aes->aes_wcs)); 3848 archive_string_free(&(aes->aes_mbs)); 3849 archive_string_free(&(aes->aes_utf8)); 3850 archive_string_free(&(aes->aes_mbs_in_locale)); 3851 aes->aes_set = 0; 3852} 3853 3854void 3855archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src) 3856{ 3857 dest->aes_set = src->aes_set; 3858 archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs)); 3859 archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8)); 3860 archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs)); 3861} 3862 3863int 3864archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes, 3865 const char **p) 3866{ 3867 struct archive_string_conv *sc; 3868 int r; 3869 3870 /* If we already have a UTF8 form, return that immediately. */ 3871 if (aes->aes_set & AES_SET_UTF8) { 3872 *p = aes->aes_utf8.s; 3873 return (0); 3874 } 3875 3876 *p = NULL; 3877 /* Try converting WCS to MBS first if MBS does not exist yet. */ 3878 if ((aes->aes_set & AES_SET_MBS) == 0) { 3879 const char *pm; /* unused */ 3880 archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */ 3881 } 3882 if (aes->aes_set & AES_SET_MBS) { 3883 sc = archive_string_conversion_to_charset(a, "UTF-8", 1); 3884 if (sc == NULL) 3885 return (-1);/* Couldn't allocate memory for sc. */ 3886 r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s, 3887 aes->aes_mbs.length, sc); 3888 if (a == NULL) 3889 free_sconv_object(sc); 3890 if (r == 0) { 3891 aes->aes_set |= AES_SET_UTF8; 3892 *p = aes->aes_utf8.s; 3893 return (0);/* success. */ 3894 } else 3895 return (-1);/* failure. */ 3896 } 3897 return (0);/* success. */ 3898} 3899 3900int 3901archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes, 3902 const char **p) 3903{ 3904 struct archive_string_conv *sc; 3905 int r, ret = 0; 3906 3907 /* If we already have an MBS form, return that immediately. */ 3908 if (aes->aes_set & AES_SET_MBS) { 3909 *p = aes->aes_mbs.s; 3910 return (ret); 3911 } 3912 3913 *p = NULL; 3914 /* If there's a WCS form, try converting with the native locale. */ 3915 if (aes->aes_set & AES_SET_WCS) { 3916 archive_string_empty(&(aes->aes_mbs)); 3917 r = archive_string_append_from_wcs(&(aes->aes_mbs), 3918 aes->aes_wcs.s, aes->aes_wcs.length); 3919 *p = aes->aes_mbs.s; 3920 if (r == 0) { 3921 aes->aes_set |= AES_SET_MBS; 3922 return (ret); 3923 } else 3924 ret = -1; 3925 } 3926 3927 /* If there's a UTF-8 form, try converting with the native locale. */ 3928 if (aes->aes_set & AES_SET_UTF8) { 3929 archive_string_empty(&(aes->aes_mbs)); 3930 sc = archive_string_conversion_from_charset(a, "UTF-8", 1); 3931 if (sc == NULL) 3932 return (-1);/* Couldn't allocate memory for sc. */ 3933 r = archive_strncpy_l(&(aes->aes_mbs), 3934 aes->aes_utf8.s, aes->aes_utf8.length, sc); 3935 if (a == NULL) 3936 free_sconv_object(sc); 3937 *p = aes->aes_mbs.s; 3938 if (r == 0) { 3939 aes->aes_set |= AES_SET_MBS; 3940 ret = 0;/* success; overwrite previous error. */ 3941 } else 3942 ret = -1;/* failure. */ 3943 } 3944 return (ret); 3945} 3946 3947int 3948archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, 3949 const wchar_t **wp) 3950{ 3951 int r, ret = 0; 3952 3953 (void)a;/* UNUSED */ 3954 /* Return WCS form if we already have it. */ 3955 if (aes->aes_set & AES_SET_WCS) { 3956 *wp = aes->aes_wcs.s; 3957 return (ret); 3958 } 3959 3960 *wp = NULL; 3961 /* Try converting UTF8 to MBS first if MBS does not exist yet. */ 3962 if ((aes->aes_set & AES_SET_MBS) == 0) { 3963 const char *p; /* unused */ 3964 archive_mstring_get_mbs(a, aes, &p); /* ignore errors, we'll handle it later */ 3965 } 3966 /* Try converting MBS to WCS using native locale. */ 3967 if (aes->aes_set & AES_SET_MBS) { 3968 archive_wstring_empty(&(aes->aes_wcs)); 3969 r = archive_wstring_append_from_mbs(&(aes->aes_wcs), 3970 aes->aes_mbs.s, aes->aes_mbs.length); 3971 if (r == 0) { 3972 aes->aes_set |= AES_SET_WCS; 3973 *wp = aes->aes_wcs.s; 3974 } else 3975 ret = -1;/* failure. */ 3976 } 3977 return (ret); 3978} 3979 3980int 3981archive_mstring_get_mbs_l(struct archive *a, struct archive_mstring *aes, 3982 const char **p, size_t *length, struct archive_string_conv *sc) 3983{ 3984 int ret = 0; 3985#if defined(_WIN32) && !defined(__CYGWIN__) 3986 int r; 3987 3988 /* 3989 * Internationalization programming on Windows must use Wide 3990 * characters because Windows platform cannot make locale UTF-8. 3991 */ 3992 if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) { 3993 archive_string_empty(&(aes->aes_mbs_in_locale)); 3994 r = archive_string_append_from_wcs_in_codepage( 3995 &(aes->aes_mbs_in_locale), aes->aes_wcs.s, 3996 aes->aes_wcs.length, sc); 3997 if (r == 0) { 3998 *p = aes->aes_mbs_in_locale.s; 3999 if (length != NULL) 4000 *length = aes->aes_mbs_in_locale.length; 4001 return (0); 4002 } else if (errno == ENOMEM) 4003 return (-1); 4004 else 4005 ret = -1; 4006 } 4007#endif 4008 4009 /* If there is not an MBS form but there is a WCS or UTF8 form, try converting 4010 * with the native locale to be used for translating it to specified 4011 * character-set. */ 4012 if ((aes->aes_set & AES_SET_MBS) == 0) { 4013 const char *pm; /* unused */ 4014 archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */ 4015 } 4016 /* If we already have an MBS form, use it to be translated to 4017 * specified character-set. */ 4018 if (aes->aes_set & AES_SET_MBS) { 4019 if (sc == NULL) { 4020 /* Conversion is unneeded. */ 4021 *p = aes->aes_mbs.s; 4022 if (length != NULL) 4023 *length = aes->aes_mbs.length; 4024 return (0); 4025 } 4026 ret = archive_strncpy_l(&(aes->aes_mbs_in_locale), 4027 aes->aes_mbs.s, aes->aes_mbs.length, sc); 4028 *p = aes->aes_mbs_in_locale.s; 4029 if (length != NULL) 4030 *length = aes->aes_mbs_in_locale.length; 4031 } else { 4032 *p = NULL; 4033 if (length != NULL) 4034 *length = 0; 4035 } 4036 return (ret); 4037} 4038 4039int 4040archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs) 4041{ 4042 if (mbs == NULL) { 4043 aes->aes_set = 0; 4044 return (0); 4045 } 4046 return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs))); 4047} 4048 4049int 4050archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs, 4051 size_t len) 4052{ 4053 if (mbs == NULL) { 4054 aes->aes_set = 0; 4055 return (0); 4056 } 4057 aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ 4058 archive_strncpy(&(aes->aes_mbs), mbs, len); 4059 archive_string_empty(&(aes->aes_utf8)); 4060 archive_wstring_empty(&(aes->aes_wcs)); 4061 return (0); 4062} 4063 4064int 4065archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs) 4066{ 4067 return archive_mstring_copy_wcs_len(aes, wcs, 4068 wcs == NULL ? 0 : wcslen(wcs)); 4069} 4070 4071int 4072archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8) 4073{ 4074 if (utf8 == NULL) { 4075 aes->aes_set = 0; 4076 return (0); 4077 } 4078 aes->aes_set = AES_SET_UTF8; 4079 archive_string_empty(&(aes->aes_mbs)); 4080 archive_string_empty(&(aes->aes_wcs)); 4081 archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8)); 4082 return (int)strlen(utf8); 4083} 4084 4085int 4086archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs, 4087 size_t len) 4088{ 4089 if (wcs == NULL) { 4090 aes->aes_set = 0; 4091 return (0); 4092 } 4093 aes->aes_set = AES_SET_WCS; /* Only WCS form set. */ 4094 archive_string_empty(&(aes->aes_mbs)); 4095 archive_string_empty(&(aes->aes_utf8)); 4096 archive_wstrncpy(&(aes->aes_wcs), wcs, len); 4097 return (0); 4098} 4099 4100int 4101archive_mstring_copy_mbs_len_l(struct archive_mstring *aes, 4102 const char *mbs, size_t len, struct archive_string_conv *sc) 4103{ 4104 int r; 4105 4106 if (mbs == NULL) { 4107 aes->aes_set = 0; 4108 return (0); 4109 } 4110 archive_string_empty(&(aes->aes_mbs)); 4111 archive_wstring_empty(&(aes->aes_wcs)); 4112 archive_string_empty(&(aes->aes_utf8)); 4113#if defined(_WIN32) && !defined(__CYGWIN__) 4114 /* 4115 * Internationalization programming on Windows must use Wide 4116 * characters because Windows platform cannot make locale UTF-8. 4117 */ 4118 if (sc == NULL) { 4119 if (archive_string_append(&(aes->aes_mbs), 4120 mbs, mbsnbytes(mbs, len)) == NULL) { 4121 aes->aes_set = 0; 4122 r = -1; 4123 } else { 4124 aes->aes_set = AES_SET_MBS; 4125 r = 0; 4126 } 4127#if defined(HAVE_ICONV) 4128 } else if (sc != NULL && sc->cd_w != (iconv_t)-1) { 4129 /* 4130 * This case happens only when MultiByteToWideChar() cannot 4131 * handle sc->from_cp, and we have to iconv in order to 4132 * translate character-set to wchar_t,UTF-16. 4133 */ 4134 iconv_t cd = sc->cd; 4135 unsigned from_cp; 4136 int flag; 4137 4138 /* 4139 * Translate multi-bytes from some character-set to UTF-8. 4140 */ 4141 sc->cd = sc->cd_w; 4142 r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc); 4143 sc->cd = cd; 4144 if (r != 0) { 4145 aes->aes_set = 0; 4146 return (r); 4147 } 4148 aes->aes_set = AES_SET_UTF8; 4149 4150 /* 4151 * Append the UTF-8 string into wstring. 4152 */ 4153 flag = sc->flag; 4154 sc->flag &= ~(SCONV_NORMALIZATION_C 4155 | SCONV_TO_UTF16| SCONV_FROM_UTF16); 4156 from_cp = sc->from_cp; 4157 sc->from_cp = CP_UTF8; 4158 r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), 4159 aes->aes_utf8.s, aes->aes_utf8.length, sc); 4160 sc->flag = flag; 4161 sc->from_cp = from_cp; 4162 if (r == 0) 4163 aes->aes_set |= AES_SET_WCS; 4164#endif 4165 } else { 4166 r = archive_wstring_append_from_mbs_in_codepage( 4167 &(aes->aes_wcs), mbs, len, sc); 4168 if (r == 0) 4169 aes->aes_set = AES_SET_WCS; 4170 else 4171 aes->aes_set = 0; 4172 } 4173#else 4174 r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc); 4175 if (r == 0) 4176 aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ 4177 else 4178 aes->aes_set = 0; 4179#endif 4180 return (r); 4181} 4182 4183/* 4184 * The 'update' form tries to proactively update all forms of 4185 * this string (WCS and MBS) and returns an error if any of 4186 * them fail. This is used by the 'pax' handler, for instance, 4187 * to detect and report character-conversion failures early while 4188 * still allowing clients to get potentially useful values from 4189 * the more tolerant lazy conversions. (get_mbs and get_wcs will 4190 * strive to give the user something useful, so you can get hopefully 4191 * usable values even if some of the character conversions are failing.) 4192 */ 4193int 4194archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes, 4195 const char *utf8) 4196{ 4197 struct archive_string_conv *sc; 4198 int r; 4199 4200 if (utf8 == NULL) { 4201 aes->aes_set = 0; 4202 return (0); /* Succeeded in clearing everything. */ 4203 } 4204 4205 /* Save the UTF8 string. */ 4206 archive_strcpy(&(aes->aes_utf8), utf8); 4207 4208 /* Empty the mbs and wcs strings. */ 4209 archive_string_empty(&(aes->aes_mbs)); 4210 archive_wstring_empty(&(aes->aes_wcs)); 4211 4212 aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */ 4213 4214 /* Try converting UTF-8 to MBS, return false on failure. */ 4215 sc = archive_string_conversion_from_charset(a, "UTF-8", 1); 4216 if (sc == NULL) 4217 return (-1);/* Couldn't allocate memory for sc. */ 4218 r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc); 4219 4220#if defined(_WIN32) && !defined(__CYGWIN__) 4221 /* On failure, make an effort to convert UTF8 to WCS as the active code page 4222 * may not be able to represent all characters in the string */ 4223 if (r != 0) { 4224 if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), 4225 aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0) 4226 aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; 4227 } 4228#endif 4229 4230 if (a == NULL) 4231 free_sconv_object(sc); 4232 if (r != 0) 4233 return (-1); 4234 aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */ 4235 4236 /* Try converting MBS to WCS, return false on failure. */ 4237 if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s, 4238 aes->aes_mbs.length)) 4239 return (-1); 4240 aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS; 4241 4242 /* All conversions succeeded. */ 4243 return (0); 4244} 4245