1170530Ssam/*-
2178354Ssam * Copyright (c) 2003-2011 Tim Kientzle
3170530Ssam * Copyright (c) 2011-2012 Michihiro NAKAJIMA
4170530Ssam * All rights reserved.
5170530Ssam *
6170530Ssam * Redistribution and use in source and binary forms, with or without
7170530Ssam * modification, are permitted provided that the following conditions
8170530Ssam * are met:
9170530Ssam * 1. Redistributions of source code must retain the above copyright
10170530Ssam *    notice, this list of conditions and the following disclaimer.
11170530Ssam * 2. Redistributions in binary form must reproduce the above copyright
12170530Ssam *    notice, this list of conditions and the following disclaimer in the
13170530Ssam *    documentation and/or other materials provided with the distribution.
14170530Ssam *
15170530Ssam * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
16170530Ssam * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17170530Ssam * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18170530Ssam * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
19170530Ssam * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20170530Ssam * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21170530Ssam * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22170530Ssam * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23170530Ssam * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24170530Ssam * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25170530Ssam */
26170530Ssam
27170530Ssam#include "archive_platform.h"
28170530Ssam
29170530Ssam/*
30170530Ssam * Basic resizable string support, to simplify manipulating arbitrary-sized
31170530Ssam * strings while minimizing heap activity.
32170530Ssam *
33170530Ssam * In particular, the buffer used by a string object is only grown, it
34170530Ssam * never shrinks, so you can clear and reuse the same string object
35170530Ssam * without incurring additional memory allocations.
36178354Ssam */
37170530Ssam
38170530Ssam#ifdef HAVE_ERRNO_H
39170530Ssam#include <errno.h>
40170530Ssam#endif
41170530Ssam#ifdef HAVE_ICONV_H
42170530Ssam#include <iconv.h>
43170530Ssam#endif
44170530Ssam#ifdef HAVE_LANGINFO_H
45170530Ssam#include <langinfo.h>
46170530Ssam#endif
47170530Ssam#ifdef HAVE_LOCALCHARSET_H
48170530Ssam#include <localcharset.h>
49170530Ssam#endif
50195377Ssam#ifdef HAVE_STDLIB_H
51178354Ssam#include <stdlib.h>
52170530Ssam#endif
53170530Ssam#ifdef HAVE_STRING_H
54170530Ssam#include <string.h>
55170530Ssam#endif
56170530Ssam#ifdef HAVE_WCHAR_H
57219456Sbschmidt#include <wchar.h>
58219456Sbschmidt#endif
59219456Sbschmidt#if defined(_WIN32) && !defined(__CYGWIN__)
60219456Sbschmidt#include <windows.h>
61219456Sbschmidt#include <locale.h>
62219456Sbschmidt#endif
63219456Sbschmidt
64219456Sbschmidt#include "archive_endian.h"
65219456Sbschmidt#include "archive_private.h"
66219456Sbschmidt#include "archive_string.h"
67219456Sbschmidt#include "archive_string_composition.h"
68219456Sbschmidt
/*
 * Fallback for platforms without wmemcpy(): copy i wide characters from
 * b to a with memcpy(), scaling the count to bytes.
 */
#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy)
#define wmemcpy(a,b,i)  (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t))
#endif

/*
 * Fallback for platforms without wmemmove(): same as above but built on
 * memmove().
 */
#if !defined(HAVE_WMEMMOVE) && !defined(wmemmove)
#define wmemmove(a,b,i)  (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t))
#endif

/*
 * Replace any existing max() with a plain macro.  Each argument may be
 * evaluated twice, so do not pass expressions with side effects.
 */
#undef max
#define max(a, b)       ((a)>(b)?(a):(b))
79219456Sbschmidt
/*
 * State for one character-set conversion: the names and code pages of the
 * source and destination character sets, the SCONV_* behavior flags, an
 * optional pair of iconv descriptors, a scratch buffer, and the converter
 * callbacks to run.
 */
struct archive_string_conv {
	/* Pointer to another conversion object (presumably the next entry
	 * in a list of such objects -- confirm against the users). */
	struct archive_string_conv	*next;
	char				*from_charset;
	char				*to_charset;
	unsigned			 from_cp;
	unsigned			 to_cp;
	/* Set to 1 if from_charset and to_charset are the same. */
	int				 same;
	/* Bitwise OR of the SCONV_* values defined below.  Note that no
	 * flag in this list uses bit 5. */
	int				 flag;
#define SCONV_TO_CHARSET	1	/* MBS is being converted to the
					 * specified charset. */
#define SCONV_FROM_CHARSET	(1<<1)	/* MBS is being converted from the
					 * specified charset. */
#define SCONV_BEST_EFFORT 	(1<<2)	/* Copy at least the ASCII codes. */
#define SCONV_WIN_CP	 	(1<<3)	/* Use the Windows API for converting
					 * MBS. */
#define SCONV_UTF8_LIBARCHIVE_2 (1<<4)	/* Incorrect UTF-8 produced by
					 * libarchive 2.x under a wrong
					 * assumption. */
#define SCONV_NORMALIZATION_C	(1<<6)	/* Normalize to Form C before the
					 * UTF-8 characters are actually
					 * processed. */
#define SCONV_NORMALIZATION_D	(1<<7)	/* Normalize to Form D before the
					 * UTF-8 characters are actually
					 * processed.
					 * Currently this is used only for
					 * Mac OS X. */
#define SCONV_TO_UTF8		(1<<8)	/* "to charset" side is UTF-8. */
#define SCONV_FROM_UTF8		(1<<9)	/* "from charset" side is UTF-8. */
#define SCONV_TO_UTF16BE 	(1<<10)	/* "to charset" side is UTF-16BE. */
#define SCONV_FROM_UTF16BE 	(1<<11)	/* "from charset" side is UTF-16BE. */
#define SCONV_TO_UTF16LE 	(1<<12)	/* "to charset" side is UTF-16LE. */
#define SCONV_FROM_UTF16LE 	(1<<13)	/* "from charset" side is UTF-16LE. */
/* Masks matching either byte order of UTF-16 on the given side. */
#define SCONV_TO_UTF16		(SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)
#define SCONV_FROM_UTF16	(SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)

#if HAVE_ICONV
	iconv_t				 cd;
	iconv_t				 cd_w;/* Used by archive_mstring on
				 	       * Windows. */
#endif
	/* A temporary buffer for normalization. */
	struct archive_string		 utftmp;
	/* Up to two converter callbacks; nconverter presumably records how
	 * many of the slots are in use -- confirm where they are set. */
	int (*converter[2])(struct archive_string *, const void *, size_t,
	    struct archive_string_conv *);
	int				 nconverter;
};
125219456Sbschmidt
/* Code page numbers used by this file. */
#define CP_C_LOCALE	0	/* "C" locale only for this file. */
#define CP_UTF16LE	1200
#define CP_UTF16BE	1201

/*
 * Surrogate range tests.  Each macro evaluates its argument twice.
 * Note that IS_SURROGATE_PAIR_LA() is true for any single surrogate code
 * unit (high or low); it does not test for a complete pair.
 */
#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF)
#define IS_LOW_SURROGATE_LA(uc)	 ((uc) >= 0xDC00 && (uc) <= 0xDFFF)
#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF)
#define UNICODE_MAX		0x10FFFF
#define UNICODE_R_CHAR		0xFFFD	/* Replacement character. */
/* U+FFFD (replacement character) encoded as UTF-8; the array holds the
 * three bytes only and is not NUL-terminated. */
static const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd};
137170530Ssam
/*
 * Forward declarations of the file-local (static) helpers used in this
 * file.
 */
static struct archive_string_conv *find_sconv_object(struct archive *,
	const char *, const char *);
static void add_sconv_object(struct archive *, struct archive_string_conv *);
static struct archive_string_conv *create_sconv_object(const char *,
	const char *, unsigned, int);
static void free_sconv_object(struct archive_string_conv *);
static struct archive_string_conv *get_sconv_object(struct archive *,
	const char *, const char *, int);
static unsigned make_codepage_from_charset(const char *);
static unsigned get_current_codepage(void);
static unsigned get_current_oemcp(void);
static size_t mbsnbytes(const void *, size_t);
static size_t utf16nbytes(const void *, size_t);
/* Declared only for native Windows builds (not Cygwin). */
#if defined(_WIN32) && !defined(__CYGWIN__)
static int archive_wstring_append_from_mbs_in_codepage(
    struct archive_wstring *, const char *, size_t,
    struct archive_string_conv *);
static int archive_string_append_from_wcs_in_codepage(struct archive_string *,
    const wchar_t *, size_t, struct archive_string_conv *);
static int is_big_endian(void);
static int strncat_in_codepage(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int win_strncat_from_utf16be(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int win_strncat_from_utf16le(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int win_strncat_to_utf16be(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int win_strncat_to_utf16le(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
#endif
/*
 * The functions below that take (struct archive_string *, const void *,
 * size_t, struct archive_string_conv *) have the same signature as the
 * converter[] callbacks in struct archive_string_conv.
 */
static int best_effort_strncat_from_utf16be(struct archive_string *,
    const void *, size_t, struct archive_string_conv *);
static int best_effort_strncat_from_utf16le(struct archive_string *,
    const void *, size_t, struct archive_string_conv *);
static int best_effort_strncat_to_utf16be(struct archive_string *,
    const void *, size_t, struct archive_string_conv *);
static int best_effort_strncat_to_utf16le(struct archive_string *,
    const void *, size_t, struct archive_string_conv *);
/* Declared only when HAVE_ICONV is defined. */
#if defined(HAVE_ICONV)
static int iconv_strncat_in_locale(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
#endif
static int best_effort_strncat_in_locale(struct archive_string *,
    const void *, size_t, struct archive_string_conv *);
static int _utf8_to_unicode(uint32_t *, const char *, size_t);
static int utf8_to_unicode(uint32_t *, const char *, size_t);
static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t);
static int cesu8_to_unicode(uint32_t *, const char *, size_t);
static size_t unicode_to_utf8(char *, size_t, uint32_t);
static int utf16_to_unicode(uint32_t *, const char *, size_t, int);
static size_t unicode_to_utf16be(char *, size_t, uint32_t);
static size_t unicode_to_utf16le(char *, size_t, uint32_t);
static int strncat_from_utf8_libarchive2(struct archive_string *,
    const void *, size_t, struct archive_string_conv *);
static int strncat_from_utf8_to_utf8(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int archive_string_normalize_C(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int archive_string_normalize_D(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int archive_string_append_unicode(struct archive_string *,
    const void *, size_t, struct archive_string_conv *);
201195377Ssam
202195377Ssamstatic struct archive_string *
203195377Ssamarchive_string_append(struct archive_string *as, const char *p, size_t s)
204195377Ssam{
205195377Ssam	if (archive_string_ensure(as, as->length + s + 1) == NULL)
206195377Ssam		return (NULL);
207195377Ssam	if (s)
208195377Ssam		memmove(as->s + as->length, p, s);
209195377Ssam	as->length += s;
210195377Ssam	as->s[as->length] = 0;
211195377Ssam	return (as);
212195377Ssam}
213178354Ssam
214195377Ssamstatic struct archive_wstring *
215170530Ssamarchive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s)
216178354Ssam{
217178354Ssam	if (archive_wstring_ensure(as, as->length + s + 1) == NULL)
218170530Ssam		return (NULL);
219170530Ssam	if (s)
220170530Ssam		wmemmove(as->s + as->length, p, s);
221170530Ssam	as->length += s;
222170530Ssam	as->s[as->length] = 0;
223170530Ssam	return (as);
224170530Ssam}
225170530Ssam
/*
 * Public entry point for appending s raw bytes from p to as; it simply
 * forwards to archive_string_append() and returns its result (NULL when
 * the buffer could not be grown).
 */
struct archive_string *
archive_array_append(struct archive_string *as, const char *p, size_t s)
{
	struct archive_string *result;

	result = archive_string_append(as, p, s);
	return (result);
}
231191552Ssam
232191552Ssamvoid
233170530Ssamarchive_string_concat(struct archive_string *dest, struct archive_string *src)
234170530Ssam{
235170530Ssam	if (archive_string_append(dest, src->s, src->length) == NULL)
236170530Ssam		__archive_errx(1, "Out of memory");
237170530Ssam}
238195377Ssam
239170530Ssamvoid
240178354Ssamarchive_wstring_concat(struct archive_wstring *dest,
241170530Ssam    struct archive_wstring *src)
242170530Ssam{
243170530Ssam	if (archive_wstring_append(dest, src->s, src->length) == NULL)
244184280Ssam		__archive_errx(1, "Out of memory");
245191552Ssam}
246191552Ssam
247170530Ssamvoid
248173273Ssamarchive_string_free(struct archive_string *as)
249173273Ssam{
250178354Ssam	as->length = 0;
251173273Ssam	as->buffer_length = 0;
252178354Ssam	free(as->s);
253178354Ssam	as->s = NULL;
254178354Ssam}
255178354Ssam
256173273Ssamvoid
257178354Ssamarchive_wstring_free(struct archive_wstring *as)
258178354Ssam{
259178354Ssam	as->length = 0;
260178354Ssam	as->buffer_length = 0;
261178354Ssam	free(as->s);
262178354Ssam	as->s = NULL;
263178354Ssam}
264178354Ssam
/*
 * Make sure the wide string can hold at least s wide characters.
 *
 * The request is converted to bytes and handed to archive_string_ensure(),
 * so buffer_length of a wide string is kept in bytes.
 *
 * Returns as on success.  On failure the string is wiped, errno is set to
 * ENOMEM and NULL is returned, matching archive_string_ensure().
 */
struct archive_wstring *
archive_wstring_ensure(struct archive_wstring *as, size_t s)
{
	/*
	 * Reject element counts whose size in bytes does not fit in a
	 * size_t.  Without this check the multiplication below would wrap
	 * and a much smaller buffer than requested would be reported as
	 * sufficient.
	 */
	if (s > (size_t)-1 / sizeof(wchar_t)) {
		archive_wstring_free(as);
		errno = ENOMEM;
		return (NULL);
	}
	return (struct archive_wstring *)
		archive_string_ensure((struct archive_string *)as,
					s * sizeof(wchar_t));
}
272178354Ssam
273170530Ssam/* Returns NULL on any allocation failure. */
274173273Ssamstruct archive_string *
275173273Ssamarchive_string_ensure(struct archive_string *as, size_t s)
276170530Ssam{
277170530Ssam	char *p;
278193655Ssam	size_t new_length;
279193655Ssam
280193655Ssam	/* If buffer is already big enough, don't reallocate. */
281178354Ssam	if (as->s && (s <= as->buffer_length))
282193655Ssam		return (as);
283173273Ssam
284178354Ssam	/*
285193655Ssam	 * Growing the buffer at least exponentially ensures that
286178354Ssam	 * append operations are always linear in the number of
287193655Ssam	 * characters appended.  Using a smaller growth rate for
288170530Ssam	 * larger buffers reduces memory waste somewhat at the cost of
289183256Ssam	 * a larger constant factor.
290183256Ssam	 */
291193655Ssam	if (as->buffer_length < 32)
292183256Ssam		/* Start with a minimum 32-character buffer. */
293170530Ssam		new_length = 32;
294193655Ssam	else if (as->buffer_length < 8192)
295178354Ssam		/* Buffers under 8k are doubled for speed. */
296193655Ssam		new_length = as->buffer_length + as->buffer_length;
297193655Ssam	else {
298178354Ssam		/* Buffers 8k and over grow by at least 25% each time. */
299193655Ssam		new_length = as->buffer_length + as->buffer_length / 4;
300170530Ssam		/* Be safe: If size wraps, fail. */
301178354Ssam		if (new_length < as->buffer_length) {
302178354Ssam			/* On failure, wipe the string and return NULL. */
303193655Ssam			archive_string_free(as);
304170530Ssam			errno = ENOMEM;/* Make sure errno has ENOMEM. */
305170530Ssam			return (NULL);
306170530Ssam		}
307178354Ssam	}
308170530Ssam	/*
309170530Ssam	 * The computation above is a lower limit to how much we'll
310170530Ssam	 * grow the buffer.  In any case, we have to grow it enough to
311170530Ssam	 * hold the request.
312205277Srpaulo	 */
313205277Srpaulo	if (new_length < s)
314170530Ssam		new_length = s;
315170530Ssam	/* Now we can reallocate the buffer. */
316170530Ssam	p = (char *)realloc(as->s, new_length);
317205277Srpaulo	if (p == NULL) {
318172226Ssam		/* On failure, wipe the string and return NULL. */
319172226Ssam		archive_string_free(as);
320170530Ssam		errno = ENOMEM;/* Make sure errno has ENOMEM. */
321170530Ssam		return (NULL);
322205277Srpaulo	}
323205277Srpaulo
324205277Srpaulo	as->s = p;
325205277Srpaulo	as->buffer_length = new_length;
326205277Srpaulo	return (as);
327205277Srpaulo}
328205277Srpaulo
/*
 * Append at most n bytes of the string at _p, stopping early at the first
 * NUL byte.  Terminates the program through __archive_errx() when memory
 * cannot be allocated; otherwise returns as.
 *
 * TODO: The source is scanned twice (once here to measure it, once when
 * it is copied).  Check whether avoiding that actually helps; the
 * arguments are almost always short.
 */
struct archive_string *
archive_strncat(struct archive_string *as, const void *_p, size_t n)
{
	const char *src = (const char *)_p;
	size_t len;

	/* Bounded strlen(): never looks at src[n] or anything after it. */
	for (len = 0; len < n && src[len] != '\0'; len++)
		continue;

	as = archive_string_append(as, src, len);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
355205277Srpaulo
/*
 * Append at most n wide characters of the string at p, stopping early at
 * the first L'\0'.  Terminates the program through __archive_errx() when
 * memory cannot be allocated; otherwise returns as.
 */
struct archive_wstring *
archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n)
{
	size_t len;

	/* Bounded wcslen(): never looks at p[n] or anything after it. */
	for (len = 0; len < n && p[len] != L'\0'; len++)
		continue;

	as = archive_wstring_append(as, p, len);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
373170530Ssam
/*
 * Append the NUL-terminated string p to as.
 *
 * Implemented as archive_strncat() with a limit large enough to be
 * irrelevant in practice: at most 0x1000000 (16MB) bytes of the source
 * are appended.
 * TODO: Review all callers and try to replace them with
 * archive_strncat().
 */
struct archive_string *
archive_strcat(struct archive_string *as, const void *p)
{
	const size_t limit = 0x1000000;

	return (archive_strncat(as, p, limit));
}
385170530Ssam
/*
 * Append the terminated wide string p to as.  Implemented as
 * archive_wstrncat() with a limit of 0x1000000 wide characters.
 */
struct archive_wstring *
archive_wstrcat(struct archive_wstring *as, const wchar_t *p)
{
	const size_t limit = 0x1000000;

	return (archive_wstrncat(as, p, limit));
}
392170530Ssam
/*
 * Append the single byte c to as.  Terminates the program through
 * __archive_errx() when memory cannot be allocated; otherwise returns as.
 */
struct archive_string *
archive_strappend_char(struct archive_string *as, char c)
{
	as = archive_string_append(as, &c, 1);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
400170530Ssam
/*
 * Append the single wide character c to as.  Terminates the program
 * through __archive_errx() when memory cannot be allocated; otherwise
 * returns as.
 */
struct archive_wstring *
archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c)
{
	as = archive_wstring_append(as, &c, 1);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
408170530Ssam
/*
 * Pick the character-set name to hand to iconv.
 *
 * A non-empty charset supplied by the caller is returned unchanged.
 * Otherwise the name of the current locale's character set is looked up.
 * On FreeBSD the empty name "" already selects the encoding of the
 * current locale, but iconv on Mac OS 10.6 does not seem to handle that
 * correctly, so there nl_langinfo() has to be asked explicitly.  Not sure
 * about other platforms.
 *
 * NOTE: GNU libiconv does not recognize the character-set name which
 * nl_langinfo(CODESET) returns on some platforms, so locale_charset() is
 * preferred over nl_langinfo(CODESET) when it is available.
 */
static const char *
default_iconv_charset(const char *charset) {
	const char *name = charset;

	if (name == NULL || *name == '\0') {
#if HAVE_LOCALE_CHARSET && !defined(__APPLE__)
		/* locale_charset() is broken on Mac OS */
		name = locale_charset();
#elif HAVE_NL_LANGINFO
		name = nl_langinfo(CODESET);
#else
		name = "";
#endif
	}
	return (name);
}
435170530Ssam
436170530Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
437170530Ssam
/*
 * MBS ==> WCS.
 * Appends the multibyte string p (at most len bytes) to dest as wide
 * characters, using the code-page based converter without a conversion
 * object.
 * Note: returns -1 if conversion fails.
 */
int
archive_wstring_append_from_mbs(struct archive_wstring *dest,
    const char *p, size_t len)
{
	struct archive_string_conv *const no_conv = NULL;

	return (archive_wstring_append_from_mbs_in_codepage(dest, p, len,
	    no_conv));
}
448170530Ssam
449170530Ssamstatic int
450170530Ssamarchive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest,
451170530Ssam    const char *s, size_t length, struct archive_string_conv *sc)
452170530Ssam{
453170530Ssam	int count, ret = 0;
454170530Ssam	UINT from_cp;
455170530Ssam
456170530Ssam	if (sc != NULL)
457170530Ssam		from_cp = sc->from_cp;
458170530Ssam	else
459170530Ssam		from_cp = get_current_codepage();
460170530Ssam
461170530Ssam	if (from_cp == CP_C_LOCALE) {
462170530Ssam		/*
463170530Ssam		 * "C" locale special processing.
464170530Ssam		 */
465178354Ssam		wchar_t *ws;
466178354Ssam		const unsigned char *mp;
467191552Ssam
468191552Ssam		if (NULL == archive_wstring_ensure(dest,
469191552Ssam		    dest->length + length + 1))
470178354Ssam			return (-1);
471191552Ssam
472191552Ssam		ws = dest->s + dest->length;
473178354Ssam		mp = (const unsigned char *)s;
474178354Ssam		count = 0;
475178354Ssam		while (count < (int)length && *mp) {
476178354Ssam			*ws++ = (wchar_t)*mp++;
477178354Ssam			count++;
478178354Ssam		}
479178354Ssam	} else if (sc != NULL &&
480178354Ssam	    (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) {
481178354Ssam		/*
482178354Ssam		 * Normalize UTF-8 and UTF-16BE and convert it directly
483191552Ssam		 * to UTF-16 as wchar_t.
484178354Ssam		 */
485191552Ssam		struct archive_string u16;
486191552Ssam		int saved_flag = sc->flag;/* save current flag. */
487178354Ssam
488178354Ssam		if (is_big_endian())
489178354Ssam			sc->flag |= SCONV_TO_UTF16BE;
490170530Ssam		else
491170530Ssam			sc->flag |= SCONV_TO_UTF16LE;
492170530Ssam
493191552Ssam		if (sc->flag & SCONV_FROM_UTF16) {
494170530Ssam			/*
495205277Srpaulo			 *  UTF-16BE/LE NFD ===> UTF-16 NFC
496170530Ssam			 *  UTF-16BE/LE NFC ===> UTF-16 NFD
497178354Ssam			 */
498170530Ssam			count = (int)utf16nbytes(s, length);
499170530Ssam		} else {
500170530Ssam			/*
501170530Ssam			 *  UTF-8 NFD ===> UTF-16 NFC
502170530Ssam			 *  UTF-8 NFC ===> UTF-16 NFD
503183247Ssam			 */
504170530Ssam			count = (int)mbsnbytes(s, length);
505170530Ssam		}
506170530Ssam		u16.s = (char *)dest->s;
507170530Ssam		u16.length = dest->length << 1;;
508170530Ssam		u16.buffer_length = dest->buffer_length;
509183247Ssam		if (sc->flag & SCONV_NORMALIZATION_C)
510192468Ssam			ret = archive_string_normalize_C(&u16, s, count, sc);
511192468Ssam		else
512170530Ssam			ret = archive_string_normalize_D(&u16, s, count, sc);
513170530Ssam		dest->s = (wchar_t *)u16.s;
514170530Ssam		dest->length = u16.length >> 1;
515170530Ssam		dest->buffer_length = u16.buffer_length;
516170530Ssam		sc->flag = saved_flag;/* restore the saved flag. */
517170530Ssam		return (ret);
518170530Ssam	} else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) {
519170530Ssam		count = (int)utf16nbytes(s, length);
520170530Ssam		count >>= 1; /* to be WCS length */
521170530Ssam		/* Allocate memory for WCS. */
522170530Ssam		if (NULL == archive_wstring_ensure(dest,
523170530Ssam		    dest->length + count + 1))
524170530Ssam			return (-1);
525178354Ssam		wmemcpy(dest->s + dest->length, (const wchar_t *)s, count);
526170530Ssam		if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) {
527170530Ssam			uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
528170530Ssam			int b;
529170530Ssam			for (b = 0; b < count; b++) {
530170530Ssam				uint16_t val = archive_le16dec(u16+b);
531170530Ssam				archive_be16enc(u16+b, val);
532170530Ssam			}
533170530Ssam		} else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) {
534170530Ssam			uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
535170530Ssam			int b;
536170530Ssam			for (b = 0; b < count; b++) {
537170530Ssam				uint16_t val = archive_be16dec(u16+b);
538170530Ssam				archive_le16enc(u16+b, val);
539170530Ssam			}
540170530Ssam		}
541170530Ssam	} else {
542170530Ssam		DWORD mbflag;
543170530Ssam		size_t buffsize;
544170530Ssam
545170530Ssam		if (sc == NULL)
546170530Ssam			mbflag = 0;
547170530Ssam		else if (sc->flag & SCONV_FROM_CHARSET) {
548170530Ssam			/* Do not trust the length which comes from
549170530Ssam			 * an archive file. */
550170530Ssam			length = mbsnbytes(s, length);
551170530Ssam			mbflag = 0;
552170530Ssam		} else
553170530Ssam			mbflag = MB_PRECOMPOSED;
554170530Ssam
555178354Ssam		mbflag |= MB_ERR_INVALID_CHARS;
556170530Ssam
557173273Ssam		buffsize = dest->length + length + 1;
558173273Ssam		do {
559173273Ssam			/* Allocate memory for WCS. */
560173273Ssam			if (NULL == archive_wstring_ensure(dest, buffsize))
561173273Ssam				return (-1);
562178354Ssam			/* Convert MBS to WCS. */
563170530Ssam			count = MultiByteToWideChar(from_cp,
564170530Ssam			    mbflag, s, (int)length, dest->s + dest->length,
565173273Ssam			    (int)(dest->buffer_length >> 1) -1);
566170530Ssam			if (count == 0 &&
567173273Ssam			    GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
568170530Ssam				/* Expand the WCS buffer. */
569170530Ssam				buffsize = dest->buffer_length << 1;
570173273Ssam				continue;
571170530Ssam			}
572178354Ssam			if (count == 0 && length != 0)
573170530Ssam				ret = -1;
574170530Ssam			break;
575170530Ssam		} while (1);
576173273Ssam	}
577170530Ssam	dest->length += count;
578170530Ssam	dest->s[dest->length] = L'\0';
579170530Ssam	return (ret);
580170530Ssam}
581170530Ssam
582173273Ssam#else
583178354Ssam
584173273Ssam/*
585170530Ssam * Convert MBS to WCS.
586173273Ssam * Note: returns -1 if conversion fails.
587170530Ssam */
588170530Ssamint
589170530Ssamarchive_wstring_append_from_mbs(struct archive_wstring *dest,
590173273Ssam    const char *p, size_t len)
591170530Ssam{
592170530Ssam	size_t r;
593173273Ssam	int ret_val = 0;
594173273Ssam	/*
595173273Ssam	 * No single byte will be more than one wide character,
596173273Ssam	 * so this length estimate will always be big enough.
597173273Ssam	 */
598173273Ssam	// size_t wcs_length = len;
599173273Ssam	size_t mbs_length = len;
600173273Ssam	const char *mbs = p;
601178354Ssam	wchar_t *wcs;
602173273Ssam#if HAVE_MBRTOWC
603173273Ssam	mbstate_t shift_state;
604173273Ssam
605173273Ssam	memset(&shift_state, 0, sizeof(shift_state));
606173273Ssam#endif
607173273Ssam	/*
608173273Ssam	 * As we decided to have wcs_length == mbs_length == len
609173273Ssam	 * we can use len here instead of wcs_length
610173273Ssam	 */
611173273Ssam	if (NULL == archive_wstring_ensure(dest, dest->length + len + 1))
612173273Ssam		return (-1);
613173273Ssam	wcs = dest->s + dest->length;
614173273Ssam	/*
615173273Ssam	 * We cannot use mbsrtowcs/mbstowcs here because those may convert
616173273Ssam	 * extra MBS when strlen(p) > len and one wide character consists of
617173273Ssam	 * multi bytes.
618173273Ssam	 */
619173273Ssam	while (*mbs && mbs_length > 0) {
620178354Ssam		/*
621173273Ssam		 * The buffer we allocated is always big enough.
622173273Ssam		 * Keep this code path in a comment if we decide to choose
623173273Ssam		 * smaller wcs_length in the future
624173273Ssam		 */
625173273Ssam/*
626173273Ssam		if (wcs_length == 0) {
627173273Ssam			dest->length = wcs - dest->s;
628173273Ssam			dest->s[dest->length] = L'\0';
629173273Ssam			wcs_length = mbs_length;
630173273Ssam			if (NULL == archive_wstring_ensure(dest,
631173273Ssam			    dest->length + wcs_length + 1))
632173273Ssam				return (-1);
633173273Ssam			wcs = dest->s + dest->length;
634173273Ssam		}
635178354Ssam*/
636178354Ssam#if HAVE_MBRTOWC
637178354Ssam		r = mbrtowc(wcs, mbs, mbs_length, &shift_state);
638178354Ssam#else
639173273Ssam		r = mbtowc(wcs, mbs, mbs_length);
640173273Ssam#endif
641173273Ssam		if (r == (size_t)-1 || r == (size_t)-2) {
642173273Ssam			ret_val = -1;
643173273Ssam			break;
644173273Ssam		}
645173273Ssam		if (r == 0 || r > mbs_length)
646173273Ssam			break;
647173273Ssam		wcs++;
648173273Ssam		// wcs_length--;
649173273Ssam		mbs += r;
650173273Ssam		mbs_length -= r;
651173273Ssam	}
652178354Ssam	dest->length = wcs - dest->s;
653173273Ssam	dest->s[dest->length] = L'\0';
654173273Ssam	return (ret_val);
655173273Ssam}
656173273Ssam
657173273Ssam#endif
658173273Ssam
659173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
660173273Ssam
/*
 * WCS ==> MBS.
 * Appends the wide string w (at most len characters) to as, using the
 * code-page based converter without a conversion object.
 * Note: returns -1 if conversion fails.
 *
 * Win32 builds use WideCharToMultiByte from the Windows API.
 * (Maybe Cygwin should too?  WideCharToMultiByte will know a
 * lot more about local character encodings than the wcrtomb()
 * wrapper is going to know.)
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	struct archive_string_conv *const no_conv = NULL;

	return (archive_string_append_from_wcs_in_codepage(as, w, len,
	    no_conv));
}
676173273Ssam
677173273Ssamstatic int
678178354Ssamarchive_string_append_from_wcs_in_codepage(struct archive_string *as,
679170530Ssam    const wchar_t *ws, size_t len, struct archive_string_conv *sc)
680170530Ssam{
681170530Ssam	BOOL defchar_used, *dp;
682170530Ssam	int count, ret = 0;
683170530Ssam	UINT to_cp;
684170530Ssam	int wslen = (int)len;
685183247Ssam
686183247Ssam	if (sc != NULL)
687170530Ssam		to_cp = sc->to_cp;
688170530Ssam	else
689170530Ssam		to_cp = get_current_codepage();
690170530Ssam
691183247Ssam	if (to_cp == CP_C_LOCALE) {
692183247Ssam		/*
693183247Ssam		 * "C" locale special processing.
694183247Ssam		 */
695183247Ssam		const wchar_t *wp = ws;
696183247Ssam		char *p;
697183247Ssam
698193840Ssam		if (NULL == archive_string_ensure(as,
699173273Ssam		    as->length + wslen +1))
700173273Ssam			return (-1);
701173273Ssam		p = as->s + as->length;
702173273Ssam		count = 0;
703170530Ssam		defchar_used = 0;
704170530Ssam		while (count < wslen && *wp) {
705170530Ssam			if (*wp > 255) {
706170530Ssam				*p++ = '?';
707170530Ssam				wp++;
708173273Ssam				defchar_used = 1;
709170530Ssam			} else
710182827Ssam				*p++ = (char)*wp++;
711182827Ssam			count++;
712182827Ssam		}
713182827Ssam	} else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) {
714182827Ssam		uint16_t *u16;
715182827Ssam
716182827Ssam		if (NULL ==
717182827Ssam		    archive_string_ensure(as, as->length + len * 2 + 2))
718182827Ssam			return (-1);
719182827Ssam		u16 = (uint16_t *)(as->s + as->length);
720182827Ssam		count = 0;
721182827Ssam		defchar_used = 0;
722182827Ssam		if (sc->flag & SCONV_TO_UTF16BE) {
723182827Ssam			while (count < (int)len && *ws) {
724182827Ssam				archive_be16enc(u16+count, *ws);
725173273Ssam				ws++;
726173273Ssam				count++;
727170530Ssam			}
728170530Ssam		} else {
729170530Ssam			while (count < (int)len && *ws) {
730170530Ssam				archive_le16enc(u16+count, *ws);
731170530Ssam				ws++;
732170530Ssam				count++;
733170530Ssam			}
734170530Ssam		}
735170530Ssam		count <<= 1; /* to be byte size */
736170530Ssam	} else {
737170530Ssam		/* Make sure the MBS buffer has plenty to set. */
738173273Ssam		if (NULL ==
739170530Ssam		    archive_string_ensure(as, as->length + len * 2 + 1))
740170530Ssam			return (-1);
741170530Ssam		do {
742170530Ssam			defchar_used = 0;
743170530Ssam			if (to_cp == CP_UTF8 || sc == NULL)
744170530Ssam				dp = NULL;
745173273Ssam			else
746170530Ssam				dp = &defchar_used;
747170530Ssam			count = WideCharToMultiByte(to_cp, 0, ws, wslen,
748170530Ssam			    as->s + as->length,
749173273Ssam			    (int)as->buffer_length - (int)as->length - 1, NULL, dp);
750170530Ssam			if (count == 0 &&
751170530Ssam			    GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
752170530Ssam				/* Expand the MBS buffer and retry. */
753173273Ssam				if (NULL == archive_string_ensure(as,
754170530Ssam					as->buffer_length + len))
755173273Ssam					return (-1);
756205277Srpaulo				continue;
757173273Ssam			}
758173273Ssam			if (count == 0)
759173273Ssam				ret = -1;
760173273Ssam			break;
761173273Ssam		} while (1);
762173273Ssam	}
763173273Ssam	as->length += count;
764173273Ssam	as->s[as->length] = '\0';
765173273Ssam	return (defchar_used?-1:ret);
766173273Ssam}
767173273Ssam
768173273Ssam#elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB)
769170530Ssam
770173273Ssam/*
771173273Ssam * Translates a wide character string into current locale character set
772173273Ssam * and appends to the archive_string.  Note: returns -1 if conversion
773173273Ssam * fails.
774173273Ssam */
775170530Ssamint
776173273Ssamarchive_string_append_from_wcs(struct archive_string *as,
777173273Ssam    const wchar_t *w, size_t len)
778173273Ssam{
779173273Ssam	/* We cannot use the standard wcstombs() here because it
780173273Ssam	 * cannot tell us how big the output buffer should be.  So
781173273Ssam	 * I've built a loop around wcrtomb() or wctomb() that
782173273Ssam	 * converts a character at a time and resizes the string as
783173273Ssam	 * needed.  We prefer wcrtomb() when it's available because
784178354Ssam	 * it's thread-safe. */
785173273Ssam	int n, ret_val = 0;
786173273Ssam	char *p;
787173273Ssam	char *end;
788173273Ssam#if HAVE_WCRTOMB
789173273Ssam	mbstate_t shift_state;
790173273Ssam
791173273Ssam	memset(&shift_state, 0, sizeof(shift_state));
792173273Ssam#else
793173273Ssam	/* Clear the shift state before starting. */
794173273Ssam	wctomb(NULL, L'\0');
795173273Ssam#endif
796173273Ssam	/*
797173273Ssam	 * Allocate buffer for MBS.
798173273Ssam	 * We need this allocation here since it is possible that
799173273Ssam	 * as->s is still NULL.
800173273Ssam	 */
801173273Ssam	if (archive_string_ensure(as, as->length + len + 1) == NULL)
802173273Ssam		return (-1);
803178354Ssam
804173273Ssam	p = as->s + as->length;
805178354Ssam	end = as->s + as->buffer_length - MB_CUR_MAX -1;
806173273Ssam	while (*w != L'\0' && len > 0) {
807173273Ssam		if (p >= end) {
808173273Ssam			as->length = p - as->s;
809173273Ssam			as->s[as->length] = '\0';
810173273Ssam			/* Re-allocate buffer for MBS. */
811178354Ssam			if (archive_string_ensure(as,
812173273Ssam			    as->length + max(len * 2,
813173273Ssam			    (size_t)MB_CUR_MAX) + 1) == NULL)
814173273Ssam				return (-1);
815173273Ssam			p = as->s + as->length;
816173273Ssam			end = as->s + as->buffer_length - MB_CUR_MAX -1;
817173273Ssam		}
818173273Ssam#if HAVE_WCRTOMB
819173273Ssam		n = wcrtomb(p, *w++, &shift_state);
820173273Ssam#else
821205277Srpaulo		n = wctomb(p, *w++);
822173273Ssam#endif
823178354Ssam		if (n == -1) {
824173273Ssam			if (errno == EILSEQ) {
825170530Ssam				/* Skip an illegal wide char. */
826173273Ssam				*p++ = '?';
827170530Ssam				ret_val = -1;
828178354Ssam			} else {
829170530Ssam				ret_val = -1;
830173273Ssam				break;
831173273Ssam			}
832173273Ssam		} else
833173273Ssam			p += n;
834173273Ssam		len--;
835173273Ssam	}
836173273Ssam	as->length = p - as->s;
837173273Ssam	as->s[as->length] = '\0';
838173273Ssam	return (ret_val);
839173273Ssam}
840173273Ssam
841173273Ssam#else /* HAVE_WCTOMB || HAVE_WCRTOMB */
842170530Ssam
/*
 * Variant used when the build has neither wcrtomb() nor wctomb().
 * No wide-character conversion is available here, so nothing is
 * appended: the call always sets errno to ENOSYS and returns -1.
 *
 * TODO: Test if __STDC_ISO_10646__ is defined; in that case a built-in
 * UTF-8 conversion could be used here instead of failing.
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	const int failed = -1;

	/* None of the arguments can be used without a converter. */
	(void)len;/* UNUSED */
	(void)w;/* UNUSED */
	(void)as;/* UNUSED */

	errno = ENOSYS;
	return (failed);
}
859173273Ssam
860173273Ssam#endif /* HAVE_WCTOMB || HAVE_WCRTOMB */
861170530Ssam
862170530Ssam/*
863170530Ssam * Find a string conversion object by a pair of 'from' charset name
864170530Ssam * and 'to' charset name from an archive object.
865170530Ssam * Return NULL if not found.
866170530Ssam */
867173273Ssamstatic struct archive_string_conv *
868170530Ssamfind_sconv_object(struct archive *a, const char *fc, const char *tc)
869170530Ssam{
870170530Ssam	struct archive_string_conv *sc;
871170530Ssam
872178354Ssam	if (a == NULL)
873170530Ssam		return (NULL);
874170530Ssam
875170530Ssam	for (sc = a->sconv; sc != NULL; sc = sc->next) {
876170530Ssam		if (strcmp(sc->from_charset, fc) == 0 &&
877170530Ssam		    strcmp(sc->to_charset, tc) == 0)
878173273Ssam			break;
879173273Ssam	}
880178354Ssam	return (sc);
881173273Ssam}
882173273Ssam
883178354Ssam/*
884173273Ssam * Register a string object to an archive object.
885173273Ssam */
886170530Ssamstatic void
887170530Ssamadd_sconv_object(struct archive *a, struct archive_string_conv *sc)
888170530Ssam{
889170530Ssam	struct archive_string_conv **psc;
890170530Ssam
891170530Ssam	/* Add a new sconv to sconv list. */
892170530Ssam	psc = &(a->sconv);
893170530Ssam	while (*psc != NULL)
894178354Ssam		psc = &((*psc)->next);
895170530Ssam	*psc = sc;
896170530Ssam}
897178354Ssam
898170530Ssamstatic void
899170530Ssamadd_converter(struct archive_string_conv *sc, int (*converter)
900178354Ssam    (struct archive_string *, const void *, size_t,
901170530Ssam     struct archive_string_conv *))
902173273Ssam{
903173273Ssam	if (sc == NULL || sc->nconverter >= 2)
904170530Ssam		__archive_errx(1, "Programming error");
905170530Ssam	sc->converter[sc->nconverter++] = converter;
906173273Ssam}
907170530Ssam
908173273Ssamstatic void
909205277Srpaulosetup_converter(struct archive_string_conv *sc)
910170530Ssam{
911178354Ssam
912173273Ssam	/* Reset. */
913170530Ssam	sc->nconverter = 0;
914173273Ssam
915173273Ssam	/*
916178354Ssam	 * Perform special sequence for the incorrect UTF-8 filenames
917173273Ssam	 * made by libarchive2.x.
918173273Ssam	 */
919173273Ssam	if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) {
920173273Ssam		add_converter(sc, strncat_from_utf8_libarchive2);
921173273Ssam		return;
922173273Ssam	}
923173273Ssam
924173273Ssam	/*
925173273Ssam	 * Convert a string to UTF-16BE/LE.
926170530Ssam	 */
927173273Ssam	if (sc->flag & SCONV_TO_UTF16) {
928170530Ssam		/*
929173273Ssam		 * If the current locale is UTF-8, we can translate
930205277Srpaulo		 * a UTF-8 string into a UTF-16BE string.
931170530Ssam		 */
932178354Ssam		if (sc->flag & SCONV_FROM_UTF8) {
933173273Ssam			add_converter(sc, archive_string_append_unicode);
934173273Ssam			return;
935173273Ssam		}
936173273Ssam
937173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
938173273Ssam		if (sc->flag & SCONV_WIN_CP) {
939178354Ssam			if (sc->flag & SCONV_TO_UTF16BE)
940173273Ssam				add_converter(sc, win_strncat_to_utf16be);
941170530Ssam			else
942170530Ssam				add_converter(sc, win_strncat_to_utf16le);
943170530Ssam			return;
944170530Ssam		}
945170530Ssam#endif
946170530Ssam
947170530Ssam#if defined(HAVE_ICONV)
948170530Ssam		if (sc->cd != (iconv_t)-1) {
949170530Ssam			add_converter(sc, iconv_strncat_in_locale);
950183254Ssam			return;
951170530Ssam		}
952170530Ssam#endif
953170530Ssam
954170530Ssam		if (sc->flag & SCONV_BEST_EFFORT) {
955173273Ssam			if (sc->flag & SCONV_TO_UTF16BE)
956173273Ssam				add_converter(sc,
957173273Ssam					best_effort_strncat_to_utf16be);
958173273Ssam			else
959173273Ssam				add_converter(sc,
960173273Ssam					best_effort_strncat_to_utf16le);
961173273Ssam		} else
962173273Ssam			/* Make sure we have no converter. */
963170530Ssam			sc->nconverter = 0;
964170530Ssam		return;
965170530Ssam	}
966184280Ssam
967173273Ssam	/*
968170530Ssam	 * Convert a string from UTF-16BE/LE.
969173273Ssam	 */
970170530Ssam	if (sc->flag & SCONV_FROM_UTF16) {
971170530Ssam		/*
972170530Ssam		 * At least we should normalize a UTF-16BE string.
973170530Ssam		 */
974170530Ssam		if (sc->flag & SCONV_NORMALIZATION_D)
975170530Ssam			add_converter(sc,archive_string_normalize_D);
976170530Ssam		else if (sc->flag & SCONV_NORMALIZATION_C)
977170530Ssam			add_converter(sc, archive_string_normalize_C);
978170530Ssam
979191552Ssam		if (sc->flag & SCONV_TO_UTF8) {
980170530Ssam			/*
981170530Ssam			 * If the current locale is UTF-8, we can translate
982170530Ssam			 * a UTF-16BE/LE string into a UTF-8 string directly.
983170530Ssam			 */
984170530Ssam			if (!(sc->flag &
985170530Ssam			    (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
986170530Ssam				add_converter(sc,
987184280Ssam				    archive_string_append_unicode);
988184280Ssam			return;
989170530Ssam		}
990170530Ssam
991191552Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
992170530Ssam		if (sc->flag & SCONV_WIN_CP) {
993170530Ssam			if (sc->flag & SCONV_FROM_UTF16BE)
994182828Ssam				add_converter(sc, win_strncat_from_utf16be);
995170530Ssam			else
996170530Ssam				add_converter(sc, win_strncat_from_utf16le);
997178354Ssam			return;
998178354Ssam		}
999178354Ssam#endif
1000178354Ssam
1001178354Ssam#if defined(HAVE_ICONV)
1002178354Ssam		if (sc->cd != (iconv_t)-1) {
1003178354Ssam			add_converter(sc, iconv_strncat_in_locale);
1004178354Ssam			return;
1005178354Ssam		}
1006178354Ssam#endif
1007178354Ssam
1008178354Ssam		if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
1009178354Ssam		    == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
1010178354Ssam			add_converter(sc, best_effort_strncat_from_utf16be);
1011178354Ssam		else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
1012178354Ssam		    == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
1013178354Ssam			add_converter(sc, best_effort_strncat_from_utf16le);
1014178354Ssam		else
1015178354Ssam			/* Make sure we have no converter. */
1016178354Ssam			sc->nconverter = 0;
1017178354Ssam		return;
1018178354Ssam	}
1019178354Ssam
1020178354Ssam	if (sc->flag & SCONV_FROM_UTF8) {
1021178354Ssam		/*
1022178354Ssam		 * At least we should normalize a UTF-8 string.
1023178354Ssam		 */
1024178354Ssam		if (sc->flag & SCONV_NORMALIZATION_D)
1025178354Ssam			add_converter(sc,archive_string_normalize_D);
1026178354Ssam		else if (sc->flag & SCONV_NORMALIZATION_C)
1027178354Ssam			add_converter(sc, archive_string_normalize_C);
1028178354Ssam
1029178354Ssam		/*
1030178354Ssam		 * Copy UTF-8 string with a check of CESU-8.
1031178354Ssam		 * Apparently, iconv does not check surrogate pairs in UTF-8
1032178354Ssam		 * when both from-charset and to-charset are UTF-8, and then
1033178354Ssam		 * we use our UTF-8 copy code.
1034178354Ssam		 */
1035178354Ssam		if (sc->flag & SCONV_TO_UTF8) {
1036173273Ssam			/*
1037173273Ssam			 * If the current locale is UTF-8, we can translate
1038173273Ssam			 * a UTF-16BE string into a UTF-8 string directly.
1039173273Ssam			 */
1040173273Ssam			if (!(sc->flag &
1041173273Ssam			    (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
1042173273Ssam				add_converter(sc, strncat_from_utf8_to_utf8);
1043173273Ssam			return;
1044173273Ssam		}
1045173273Ssam	}
1046173273Ssam
1047173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
1048173273Ssam	/*
1049173273Ssam	 * On Windows we can use Windows API for a string conversion.
1050173273Ssam	 */
1051173273Ssam	if (sc->flag & SCONV_WIN_CP) {
1052193655Ssam		add_converter(sc, strncat_in_codepage);
1053173273Ssam		return;
1054193655Ssam	}
1055173273Ssam#endif
1056173273Ssam
1057173273Ssam#if HAVE_ICONV
1058173273Ssam	if (sc->cd != (iconv_t)-1) {
1059173273Ssam		add_converter(sc, iconv_strncat_in_locale);
1060173273Ssam		/*
1061173273Ssam		 * iconv generally does not support UTF-8-MAC and so
1062173273Ssam		 * we have to the output of iconv from NFC to NFD if
1063173273Ssam		 * need.
1064173273Ssam		 */
1065173273Ssam		if ((sc->flag & SCONV_FROM_CHARSET) &&
1066173273Ssam		    (sc->flag & SCONV_TO_UTF8)) {
1067173273Ssam			if (sc->flag & SCONV_NORMALIZATION_D)
1068173273Ssam				add_converter(sc, archive_string_normalize_D);
1069173273Ssam		}
1070173273Ssam		return;
1071173273Ssam	}
1072173273Ssam#endif
1073173273Ssam
1074173273Ssam	/*
1075173273Ssam	 * Try conversion in the best effort or no conversion.
1076173273Ssam	 */
1077173273Ssam	if ((sc->flag & SCONV_BEST_EFFORT) || sc->same)
1078173273Ssam		add_converter(sc, best_effort_strncat_in_locale);
1079173273Ssam	else
1080173273Ssam		/* Make sure we have no converter. */
1081173273Ssam		sc->nconverter = 0;
1082173273Ssam}
1083173273Ssam
/*
 * Return the canonical spelling of a charset name.  Only the names
 * referenced in create_sconv_object() are recognized: "UTF-8"/"UTF8",
 * "UTF-16BE"/"UTF16BE", "UTF-16LE"/"UTF16LE" and "CP932", matched
 * without regard to the case of ASCII letters.
 * Every other input -- NULL, the empty string, names longer than 15
 * characters, or an unknown name -- is returned unchanged (the very
 * same pointer that was passed in).
 */
static const char *
canonical_charset_name(const char *charset)
{
	static const struct {
		const char *alias;
		const char *canonical;
	} known[] = {
		{ "UTF-8",	"UTF-8" },
		{ "UTF8",	"UTF-8" },
		{ "UTF-16BE",	"UTF-16BE" },
		{ "UTF16BE",	"UTF-16BE" },
		{ "UTF-16LE",	"UTF-16LE" },
		{ "UTF16LE",	"UTF-16LE" },
		{ "CP932",	"CP932" },
	};
	char upper[16];
	size_t i, n;

	if (charset == NULL)
		return (charset);
	n = strlen(charset);
	if (n == 0 || n > 15)
		return (charset);

	/* Build an upper-case copy (ASCII letters only). */
	for (i = 0; i < n; i++) {
		char ch = charset[i];

		if (ch >= 'a' && ch <= 'z')
			ch -= 'a' - 'A';
		upper[i] = ch;
	}
	upper[n] = '\0';

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(upper, known[i].alias) == 0)
			return (known[i].canonical);
	}
	return (charset);
}
1123183256Ssam
1124173273Ssam/*
1125173273Ssam * Create a string conversion object.
1126173273Ssam */
1127173273Ssamstatic struct archive_string_conv *
1128173273Ssamcreate_sconv_object(const char *fc, const char *tc,
1129173273Ssam    unsigned current_codepage, int flag)
1130173273Ssam{
1131173273Ssam	struct archive_string_conv *sc;
1132193655Ssam
1133173273Ssam	sc = calloc(1, sizeof(*sc));
1134173273Ssam	if (sc == NULL)
1135173273Ssam		return (NULL);
1136173273Ssam	sc->next = NULL;
1137173273Ssam	sc->from_charset = strdup(fc);
1138173273Ssam	if (sc->from_charset == NULL) {
1139173273Ssam		free(sc);
1140173273Ssam		return (NULL);
1141173273Ssam	}
1142178354Ssam	sc->to_charset = strdup(tc);
1143178354Ssam	if (sc->to_charset == NULL) {
1144178354Ssam		free(sc->from_charset);
1145178354Ssam		free(sc);
1146178354Ssam		return (NULL);
1147178354Ssam	}
1148178354Ssam	archive_string_init(&sc->utftmp);
1149178354Ssam
1150183253Ssam	if (flag & SCONV_TO_CHARSET) {
1151183253Ssam		/*
1152183253Ssam		 * Convert characters from the current locale charset to
1153178354Ssam		 * a specified charset.
1154178354Ssam		 */
1155178354Ssam		sc->from_cp = current_codepage;
1156178354Ssam		sc->to_cp = make_codepage_from_charset(tc);
1157178354Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
1158178354Ssam		if (IsValidCodePage(sc->to_cp))
1159178354Ssam			flag |= SCONV_WIN_CP;
1160178354Ssam#endif
1161178354Ssam	} else if (flag & SCONV_FROM_CHARSET) {
1162193655Ssam		/*
1163178354Ssam		 * Convert characters from a specified charset to
1164178354Ssam		 * the current locale charset.
1165178354Ssam		 */
1166178354Ssam		sc->to_cp = current_codepage;
1167178354Ssam		sc->from_cp = make_codepage_from_charset(fc);
1168178354Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
1169173273Ssam		if (IsValidCodePage(sc->from_cp))
1170173273Ssam			flag |= SCONV_WIN_CP;
1171173273Ssam#endif
1172173273Ssam	}
1173173273Ssam
1174173273Ssam	/*
1175173273Ssam	 * Check if "from charset" and "to charset" are the same.
1176173273Ssam	 */
1177173273Ssam	if (strcmp(fc, tc) == 0 ||
1178173273Ssam	    (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp))
1179173273Ssam		sc->same = 1;
1180178354Ssam	else
1181178354Ssam		sc->same = 0;
1182178354Ssam
1183193655Ssam	/*
1184173273Ssam	 * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE.
1185178354Ssam	 */
1186178354Ssam	if (strcmp(tc, "UTF-8") == 0)
1187178354Ssam		flag |= SCONV_TO_UTF8;
1188173273Ssam	else if (strcmp(tc, "UTF-16BE") == 0)
1189173273Ssam		flag |= SCONV_TO_UTF16BE;
1190173273Ssam	else if (strcmp(tc, "UTF-16LE") == 0)
1191173273Ssam		flag |= SCONV_TO_UTF16LE;
1192173273Ssam	if (strcmp(fc, "UTF-8") == 0)
1193173273Ssam		flag |= SCONV_FROM_UTF8;
1194173273Ssam	else if (strcmp(fc, "UTF-16BE") == 0)
1195173273Ssam		flag |= SCONV_FROM_UTF16BE;
1196173273Ssam	else if (strcmp(fc, "UTF-16LE") == 0)
1197173273Ssam		flag |= SCONV_FROM_UTF16LE;
1198173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
1199173273Ssam	if (sc->to_cp == CP_UTF8)
1200173273Ssam		flag |= SCONV_TO_UTF8;
1201173273Ssam	else if (sc->to_cp == CP_UTF16BE)
1202173273Ssam		flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP;
1203173273Ssam	else if (sc->to_cp == CP_UTF16LE)
1204173273Ssam		flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP;
1205173273Ssam	if (sc->from_cp == CP_UTF8)
1206173273Ssam		flag |= SCONV_FROM_UTF8;
1207173273Ssam	else if (sc->from_cp == CP_UTF16BE)
1208173273Ssam		flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP;
1209173273Ssam	else if (sc->from_cp == CP_UTF16LE)
1210173273Ssam		flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP;
1211173273Ssam#endif
1212173273Ssam
1213173273Ssam	/*
1214173273Ssam	 * Set a flag for Unicode NFD. Usually iconv cannot correctly
1215173273Ssam	 * handle it. So we have to translate NFD characters to NFC ones
1216173273Ssam	 * ourselves before iconv handles. Another reason is to prevent
1217173273Ssam	 * that the same sight of two filenames, one is NFC and other
1218173273Ssam	 * is NFD, would be in its directory.
1219173273Ssam	 * On Mac OS X, although its filesystem layer automatically
1220173273Ssam	 * convert filenames to NFD, it would be useful for filename
1221173273Ssam	 * comparing to find out the same filenames that we normalize
1222173273Ssam	 * that to be NFD ourselves.
1223173273Ssam	 */
1224173273Ssam	if ((flag & SCONV_FROM_CHARSET) &&
1225173273Ssam	    (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) {
1226173273Ssam#if defined(__APPLE__)
1227173273Ssam		if (flag & SCONV_TO_UTF8)
1228173273Ssam			flag |= SCONV_NORMALIZATION_D;
1229173273Ssam		else
1230173273Ssam#endif
1231173273Ssam			flag |= SCONV_NORMALIZATION_C;
1232173273Ssam	}
1233173273Ssam#if defined(__APPLE__)
1234173273Ssam	/*
1235173273Ssam	 * In case writing an archive file, make sure that a filename
1236173273Ssam	 * going to be passed to iconv is a Unicode NFC string since
1237178354Ssam	 * a filename in HFS Plus filesystem is a Unicode NFD one and
1238178354Ssam	 * iconv cannot handle it with "UTF-8" charset. It is simpler
1239178354Ssam	 * than a use of "UTF-8-MAC" charset.
1240178354Ssam	 */
1241178354Ssam	if ((flag & SCONV_TO_CHARSET) &&
1242178354Ssam	    (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1243178354Ssam	    !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
1244178354Ssam		flag |= SCONV_NORMALIZATION_C;
1245178354Ssam	/*
1246173273Ssam	 * In case reading an archive file. make sure that a filename
1247173273Ssam	 * will be passed to users is a Unicode NFD string in order to
1248178354Ssam	 * correctly compare the filename with other one which comes
1249173273Ssam	 * from HFS Plus filesystem.
1250178354Ssam	 */
1251183246Ssam	if ((flag & SCONV_FROM_CHARSET) &&
1252178354Ssam	   !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1253178354Ssam	    (flag & SCONV_TO_UTF8))
1254178354Ssam		flag |= SCONV_NORMALIZATION_D;
1255183246Ssam#endif
1256193655Ssam
1257178354Ssam#if defined(HAVE_ICONV)
1258178354Ssam	sc->cd_w = (iconv_t)-1;
1259183246Ssam	/*
1260183246Ssam	 * Create an iconv object.
1261183246Ssam	 */
1262183246Ssam	if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) &&
1263183246Ssam	    (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) ||
1264183246Ssam	    (flag & SCONV_WIN_CP)) {
1265183246Ssam		/* This case we won't use iconv. */
1266181197Ssam		sc->cd = (iconv_t)-1;
1267178354Ssam	} else {
1268173273Ssam		sc->cd = iconv_open(tc, fc);
1269173273Ssam		if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) {
1270173273Ssam			/*
1271173273Ssam			 * Unfortunately, all of iconv implements do support
1272173273Ssam			 * "CP932" character-set, so we should use "SJIS"
1273173273Ssam			 * instead if iconv_open failed.
1274173273Ssam			 */
1275173273Ssam			if (strcmp(tc, "CP932") == 0)
1276173273Ssam				sc->cd = iconv_open("SJIS", fc);
1277173273Ssam			else if (strcmp(fc, "CP932") == 0)
1278173273Ssam				sc->cd = iconv_open(tc, "SJIS");
1279173273Ssam		}
1280173273Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
1281173273Ssam		/*
1282173273Ssam		 * archive_mstring on Windows directly convert multi-bytes
1283193655Ssam		 * into archive_wstring in order not to depend on locale
1284173273Ssam		 * so that you can do a I18N programming. This will be
1285173273Ssam		 * used only in archive_mstring_copy_mbs_len_l so far.
1286178354Ssam		 */
1287173273Ssam		if (flag & SCONV_FROM_CHARSET) {
1288173273Ssam			sc->cd_w = iconv_open("UTF-8", fc);
1289193655Ssam			if (sc->cd_w == (iconv_t)-1 &&
1290173273Ssam			    (sc->flag & SCONV_BEST_EFFORT)) {
1291173273Ssam				if (strcmp(fc, "CP932") == 0)
1292173273Ssam					sc->cd_w = iconv_open("UTF-8", "SJIS");
1293173273Ssam			}
1294170530Ssam		}
1295170530Ssam#endif /* _WIN32 && !__CYGWIN__ */
1296170530Ssam	}
1297170530Ssam#endif	/* HAVE_ICONV */
1298170530Ssam
1299170530Ssam	sc->flag = flag;
1300170530Ssam
1301170530Ssam	/*
1302170530Ssam	 * Set up converters.
1303170530Ssam	 */
1304170530Ssam	setup_converter(sc);
1305170530Ssam
1306170530Ssam	return (sc);
1307170530Ssam}
1308170530Ssam
1309170530Ssam/*
1310170530Ssam * Free a string conversion object.
1311170530Ssam */
1312170530Ssamstatic void
1313170530Ssamfree_sconv_object(struct archive_string_conv *sc)
1314170530Ssam{
1315170530Ssam	free(sc->from_charset);
1316170530Ssam	free(sc->to_charset);
1317170530Ssam	archive_string_free(&sc->utftmp);
1318170530Ssam#if HAVE_ICONV
1319170530Ssam	if (sc->cd != (iconv_t)-1)
1320170530Ssam		iconv_close(sc->cd);
1321170530Ssam	if (sc->cd_w != (iconv_t)-1)
1322183254Ssam		iconv_close(sc->cd_w);
1323183254Ssam#endif
1324183254Ssam	free(sc);
1325170530Ssam}
1326170530Ssam
1327170530Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
1328170530Ssam# if defined(WINAPI_FAMILY_PARTITION) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
1329170530Ssam#  define GetOEMCP() CP_OEMCP
1330172055Ssam# endif
1331170530Ssam
/*
 * Parse a string made only of decimal digits into an unsigned value.
 * Returns (unsigned)-1 as soon as a non-digit character is seen; an
 * empty string yields 0.  No overflow check is made (unsigned
 * arithmetic wraps).
 */
static unsigned
my_atoi(const char *p)
{
	unsigned value = 0;

	for (; *p != '\0'; p++) {
		if (*p < '0' || *p > '9')
			return (-1);
		value = value * 10 + (*p - '0');
	}
	return (value);
}
1347183254Ssam
1348183254Ssam/*
1349183254Ssam * Translate Charset name (as used by iconv) into CodePage (as used by Windows)
1350183254Ssam * Return -1 if failed.
1351183254Ssam *
1352183254Ssam * Note: This translation code may be insufficient.
1353183254Ssam */
1354183254Ssamstatic struct charset {
1355183254Ssam	const char *name;
1356183254Ssam	unsigned cp;
1357183254Ssam} charsets[] = {
1358183254Ssam	/* MUST BE SORTED! */
1359183254Ssam	{"ASCII", 1252},
1360183254Ssam	{"ASMO-708", 708},
1361183254Ssam	{"BIG5", 950},
1362183254Ssam	{"CHINESE", 936},
1363183254Ssam	{"CP367", 1252},
1364183254Ssam	{"CP819", 1252},
1365173273Ssam	{"CP1025", 21025},
1366173273Ssam	{"DOS-720", 720},
1367183254Ssam	{"DOS-862", 862},
1368173273Ssam	{"EUC-CN", 51936},
1369178354Ssam	{"EUC-JP", 51932},
1370173273Ssam	{"EUC-KR", 949},
1371173273Ssam	{"EUCCN", 51936},
1372173273Ssam	{"EUCJP", 51932},
1373173273Ssam	{"EUCKR", 949},
1374173273Ssam	{"GB18030", 54936},
1375183254Ssam	{"GB2312", 936},
1376183254Ssam	{"HEBREW", 1255},
1377173273Ssam	{"HZ-GB-2312", 52936},
1378173273Ssam	{"IBM273", 20273},
1379173273Ssam	{"IBM277", 20277},
1380183254Ssam	{"IBM278", 20278},
1381173273Ssam	{"IBM280", 20280},
1382173273Ssam	{"IBM284", 20284},
1383173273Ssam	{"IBM285", 20285},
1384183254Ssam	{"IBM290", 20290},
1385173273Ssam	{"IBM297", 20297},
1386173273Ssam	{"IBM367", 1252},
1387173273Ssam	{"IBM420", 20420},
1388173273Ssam	{"IBM423", 20423},
1389173273Ssam	{"IBM424", 20424},
1390173273Ssam	{"IBM819", 1252},
1391173273Ssam	{"IBM871", 20871},
1392173273Ssam	{"IBM880", 20880},
1393173273Ssam	{"IBM905", 20905},
1394173273Ssam	{"IBM924", 20924},
1395170530Ssam	{"ISO-8859-1", 28591},
1396170530Ssam	{"ISO-8859-13", 28603},
1397170530Ssam	{"ISO-8859-15", 28605},
1398183255Ssam	{"ISO-8859-2", 28592},
1399183255Ssam	{"ISO-8859-3", 28593},
1400183255Ssam	{"ISO-8859-4", 28594},
1401183255Ssam	{"ISO-8859-5", 28595},
1402183255Ssam	{"ISO-8859-6", 28596},
1403183255Ssam	{"ISO-8859-7", 28597},
1404183255Ssam	{"ISO-8859-8", 28598},
1405183255Ssam	{"ISO-8859-9", 28599},
1406183255Ssam	{"ISO8859-1", 28591},
1407183255Ssam	{"ISO8859-13", 28603},
1408183255Ssam	{"ISO8859-15", 28605},
1409183255Ssam	{"ISO8859-2", 28592},
1410183255Ssam	{"ISO8859-3", 28593},
1411183255Ssam	{"ISO8859-4", 28594},
1412183255Ssam	{"ISO8859-5", 28595},
1413183255Ssam	{"ISO8859-6", 28596},
1414183255Ssam	{"ISO8859-7", 28597},
1415183255Ssam	{"ISO8859-8", 28598},
1416183255Ssam	{"ISO8859-9", 28599},
1417183255Ssam	{"JOHAB", 1361},
1418183255Ssam	{"KOI8-R", 20866},
1419183255Ssam	{"KOI8-U", 21866},
1420183255Ssam	{"KS_C_5601-1987", 949},
1421183255Ssam	{"LATIN1", 1252},
1422183255Ssam	{"LATIN2", 28592},
1423183255Ssam	{"MACINTOSH", 10000},
1424183257Ssam	{"SHIFT-JIS", 932},
1425183257Ssam	{"SHIFT_JIS", 932},
1426183257Ssam	{"SJIS", 932},
1427183257Ssam	{"US", 1252},
1428183257Ssam	{"US-ASCII", 1252},
1429183257Ssam	{"UTF-16", 1200},
1430183257Ssam	{"UTF-16BE", 1201},
1431183257Ssam	{"UTF-16LE", 1200},
1432183257Ssam	{"UTF-8", CP_UTF8},
1433183257Ssam	{"X-EUROPA", 29001},
1434193655Ssam	{"X-MAC-ARABIC", 10004},
1435183257Ssam	{"X-MAC-CE", 10029},
1436183257Ssam	{"X-MAC-CHINESEIMP", 10008},
1437193655Ssam	{"X-MAC-CHINESETRAD", 10002},
1438183257Ssam	{"X-MAC-CROATIAN", 10082},
1439183257Ssam	{"X-MAC-CYRILLIC", 10007},
1440183257Ssam	{"X-MAC-GREEK", 10006},
1441183257Ssam	{"X-MAC-HEBREW", 10005},
1442183254Ssam	{"X-MAC-ICELANDIC", 10079},
1443183254Ssam	{"X-MAC-JAPANESE", 10001},
1444183254Ssam	{"X-MAC-KOREAN", 10003},
1445183254Ssam	{"X-MAC-ROMANIAN", 10010},
1446183254Ssam	{"X-MAC-THAI", 10021},
1447183254Ssam	{"X-MAC-TURKISH", 10081},
1448183254Ssam	{"X-MAC-UKRAINIAN", 10017},
1449183254Ssam};
1450183254Ssamstatic unsigned
1451183254Ssammake_codepage_from_charset(const char *charset)
1452183254Ssam{
1453183254Ssam	char cs[16];
1454183255Ssam	char *p;
1455183255Ssam	unsigned cp;
1456183257Ssam	int a, b;
1457183254Ssam
1458183254Ssam	if (charset == NULL || strlen(charset) > 15)
1459183254Ssam		return -1;
1460183254Ssam
1461183254Ssam	/* Copy name to uppercase. */
1462183254Ssam	p = cs;
1463193655Ssam	while (*charset) {
1464183254Ssam		char c = *charset++;
1465183254Ssam		if (c >= 'a' && c <= 'z')
1466183254Ssam			c -= 'a' - 'A';
1467193655Ssam		*p++ = c;
1468183254Ssam	}
1469183254Ssam	*p++ = '\0';
1470183254Ssam	cp = -1;
1471183254Ssam
1472183254Ssam	/* Look it up in the table first, so that we can easily
1473183254Ssam	 * override CP367, which we map to 1252 instead of 367. */
1474183256Ssam	a = 0;
1475183256Ssam	b = sizeof(charsets)/sizeof(charsets[0]);
1476193655Ssam	while (b > a) {
1477183256Ssam		int c = (b + a) / 2;
1478183256Ssam		int r = strcmp(charsets[c].name, cs);
1479183256Ssam		if (r < 0)
1480183254Ssam			a = c + 1;
1481183254Ssam		else if (r > 0)
1482183254Ssam			b = c;
1483183254Ssam		else
1484183254Ssam			return charsets[c].cp;
1485183254Ssam	}
1486183254Ssam
1487183254Ssam	/* If it's not in the table, try to parse it. */
1488183254Ssam	switch (*cs) {
1489183254Ssam	case 'C':
1490183254Ssam		if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') {
1491183254Ssam			cp = my_atoi(cs + 2);
1492183254Ssam		} else if (strcmp(cs, "CP_ACP") == 0)
1493183255Ssam			cp = get_current_codepage();
1494183255Ssam		else if (strcmp(cs, "CP_OEMCP") == 0)
1495183257Ssam			cp = get_current_oemcp();
1496183254Ssam		break;
1497183254Ssam	case 'I':
1498183254Ssam		if (cs[1] == 'B' && cs[2] == 'M' &&
1499193655Ssam		    cs[3] >= '0' && cs[3] <= '9') {
1500183254Ssam			cp = my_atoi(cs + 3);
1501183254Ssam		}
1502193655Ssam		break;
1503183254Ssam	case 'W':
1504183254Ssam		if (strncmp(cs, "WINDOWS-", 8) == 0) {
1505183254Ssam			cp = my_atoi(cs + 8);
1506183254Ssam			if (cp != 874 && (cp < 1250 || cp > 1258))
1507183254Ssam				cp = -1;/* This may invalid code. */
1508183254Ssam		}
1509183254Ssam		break;
1510183254Ssam	}
1511183254Ssam	return (cp);
1512170530Ssam}
1513170530Ssam
1514170530Ssam/*
1515170530Ssam * Return ANSI Code Page of current locale set by setlocale().
1516170530Ssam */
1517178354Ssamstatic unsigned
1518170530Ssamget_current_codepage(void)
1519170530Ssam{
1520170530Ssam	char *locale, *p;
1521170530Ssam	unsigned cp;
1522170530Ssam
1523170530Ssam	locale = setlocale(LC_CTYPE, NULL);
1524170530Ssam	if (locale == NULL)
1525170530Ssam		return (GetACP());
1526170530Ssam	if (locale[0] == 'C' && locale[1] == '\0')
1527170530Ssam		return (CP_C_LOCALE);
1528170530Ssam	p = strrchr(locale, '.');
1529170530Ssam	if (p == NULL)
1530170530Ssam		return (GetACP());
1531170530Ssam	if ((strcmp(p+1, "utf8") == 0) || (strcmp(p+1, "UTF-8") == 0))
1532178354Ssam		return CP_UTF8;
1533170530Ssam	cp = my_atoi(p+1);
1534170530Ssam	if ((int)cp <= 0)
1535170530Ssam		return (GetACP());
1536178354Ssam	return (cp);
1537170530Ssam}
1538170530Ssam
/*
 * Translation table between Locale Name and ACP/OEMCP.
 *
 * The table is scanned front to back and the first matching row wins,
 * so each locale name appears exactly once.  The row with acp == 0 and
 * a NULL locale terminates the table.
 */
static struct {
	unsigned acp;
	unsigned ocp;
	const char *locale;
} acp_ocp_map[] = {
	{  950,  950, "Chinese_Taiwan" },
	{  936,  936, "Chinese_People's Republic of China" },
	{ 1250,  852, "Czech_Czech Republic" },
	{ 1252,  850, "Danish_Denmark" },
	{ 1252,  850, "Dutch_Netherlands" },
	{ 1252,  850, "Dutch_Belgium" },
	{ 1252,  437, "English_United States" },
	{ 1252,  850, "English_Australia" },
	{ 1252,  850, "English_Canada" },
	{ 1252,  850, "English_New Zealand" },
	{ 1252,  850, "English_United Kingdom" },
	{ 1252,  850, "Finnish_Finland" },
	{ 1252,  850, "French_France" },
	{ 1252,  850, "French_Belgium" },
	{ 1252,  850, "French_Canada" },
	{ 1252,  850, "French_Switzerland" },
	{ 1252,  850, "German_Germany" },
	{ 1252,  850, "German_Austria" },
	{ 1252,  850, "German_Switzerland" },
	{ 1253,  737, "Greek_Greece" },
	{ 1250,  852, "Hungarian_Hungary" },
	{ 1252,  850, "Icelandic_Iceland" },
	{ 1252,  850, "Italian_Italy" },
	{ 1252,  850, "Italian_Switzerland" },
	{  932,  932, "Japanese_Japan" },
	{  949,  949, "Korean_Korea" },
	/*
	 * NOTE(review): "BokmOl" looks like a mangled non-ASCII letter;
	 * confirm against the locale name Windows actually reports.
	 */
	{ 1252,  850, "Norwegian (BokmOl)_Norway" },
	{ 1252,  850, "Norwegian-Nynorsk_Norway" },
	{ 1250,  852, "Polish_Poland" },
	{ 1252,  850, "Portuguese_Portugal" },
	{ 1252,  850, "Portuguese_Brazil" },
	{ 1251,  866, "Russian_Russia" },
	{ 1250,  852, "Slovak_Slovakia" },
	{ 1252,  850, "Spanish_Spain" },
	{ 1252,  850, "Spanish_Mexico" },
	{ 1252,  850, "Swedish_Sweden" },
	{ 1254,  857, "Turkish_Turkey" },
	{ 0, 0, NULL}
};
1590184280Ssam
1591184280Ssam/*
1592184280Ssam * Return OEM Code Page of current locale set by setlocale().
1593184280Ssam */
1594184280Ssamstatic unsigned
1595184280Ssamget_current_oemcp(void)
1596184280Ssam{
1597184280Ssam	int i;
1598184280Ssam	char *locale, *p;
1599184280Ssam	size_t len;
1600184280Ssam
1601184280Ssam	locale = setlocale(LC_CTYPE, NULL);
1602184280Ssam	if (locale == NULL)
1603184280Ssam		return (GetOEMCP());
1604184280Ssam	if (locale[0] == 'C' && locale[1] == '\0')
1605184280Ssam		return (CP_C_LOCALE);
1606184280Ssam
1607184280Ssam	p = strrchr(locale, '.');
1608170530Ssam	if (p == NULL)
1609170530Ssam		return (GetOEMCP());
1610170530Ssam	len = p - locale;
1611170530Ssam	for (i = 0; acp_ocp_map[i].acp; i++) {
1612170530Ssam		if (strncmp(acp_ocp_map[i].locale, locale, len) == 0)
1613170530Ssam			return (acp_ocp_map[i].ocp);
1614170530Ssam	}
1615170530Ssam	return (GetOEMCP());
1616170530Ssam}
1617170530Ssam#else
1618170530Ssam
1619170530Ssam/*
1620170530Ssam * POSIX platform does not use CodePage.
1621178354Ssam */
1622170530Ssam
/* No code pages here: always reports "unknown" as (unsigned)-1. */
static unsigned
get_current_codepage(void)
{
	const unsigned unknown_codepage = (unsigned)-1;

	return (unknown_codepage);
}
/* The charset name is ignored; the result is always "unknown", (unsigned)-1. */
static unsigned
make_codepage_from_charset(const char *charset)
{
	const unsigned unknown_codepage = (unsigned)-1;

	(void)charset; /* UNUSED */
	return (unknown_codepage);
}
/* No OEM code page here: always reports "unknown" as (unsigned)-1. */
static unsigned
get_current_oemcp(void)
{
	const unsigned unknown_codepage = (unsigned)-1;

	return (unknown_codepage);
}
1639170530Ssam
1640170530Ssam#endif /* defined(_WIN32) && !defined(__CYGWIN__) */
1641170530Ssam
1642170530Ssam/*
1643170530Ssam * Return a string conversion object.
1644170530Ssam */
1645170530Ssamstatic struct archive_string_conv *
1646170530Ssamget_sconv_object(struct archive *a, const char *fc, const char *tc, int flag)
1647170530Ssam{
1648170530Ssam	struct archive_string_conv *sc;
1649170530Ssam	unsigned current_codepage;
1650170530Ssam
1651170530Ssam	/* Check if we have made the sconv object. */
1652170530Ssam	sc = find_sconv_object(a, fc, tc);
1653170530Ssam	if (sc != NULL)
1654170530Ssam		return (sc);
1655170530Ssam
1656170530Ssam	if (a == NULL)
1657170530Ssam		current_codepage = get_current_codepage();
1658170530Ssam	else
1659170530Ssam		current_codepage = a->current_codepage;
1660170530Ssam
1661170530Ssam	sc = create_sconv_object(canonical_charset_name(fc),
1662170530Ssam	    canonical_charset_name(tc), current_codepage, flag);
1663170530Ssam	if (sc == NULL) {
1664170530Ssam		if (a != NULL)
1665170530Ssam			archive_set_error(a, ENOMEM,
1666170530Ssam			    "Could not allocate memory for "
1667170530Ssam			    "a string conversion object");
1668170530Ssam		return (NULL);
1669184280Ssam	}
1670170530Ssam
1671170530Ssam	/*
1672170530Ssam	 * If there is no converter for current string conversion object,
1673170530Ssam	 * we cannot handle this conversion.
1674170530Ssam	 */
1675170530Ssam	if (sc->nconverter == 0) {
1676170530Ssam		if (a != NULL) {
1677170530Ssam#if HAVE_ICONV
1678184280Ssam			archive_set_error(a, ARCHIVE_ERRNO_MISC,
1679184280Ssam			    "iconv_open failed : Cannot handle ``%s''",
1680170530Ssam			    (flag & SCONV_TO_CHARSET)?tc:fc);
1681184280Ssam#else
1682173273Ssam			archive_set_error(a, ARCHIVE_ERRNO_MISC,
1683173273Ssam			    "A character-set conversion not fully supported "
1684173273Ssam			    "on this platform");
1685170530Ssam#endif
1686170530Ssam		}
1687170530Ssam		/* Failed; free a sconv object. */
1688170530Ssam		free_sconv_object(sc);
1689170530Ssam		return (NULL);
1690170530Ssam	}
1691170530Ssam
1692170530Ssam	/*
1693170530Ssam	 * Success!
1694170530Ssam	 */
1695170530Ssam	if (a != NULL)
1696170530Ssam		add_sconv_object(a, sc);
1697170530Ssam	return (sc);
1698170530Ssam}
1699182830Ssam
1700170530Ssamstatic const char *
1701170530Ssamget_current_charset(struct archive *a)
1702170530Ssam{
1703170530Ssam	const char *cur_charset;
1704170530Ssam
1705170530Ssam	if (a == NULL)
1706170530Ssam		cur_charset = default_iconv_charset("");
1707170530Ssam	else {
1708170530Ssam		cur_charset = default_iconv_charset(a->current_code);
1709170530Ssam		if (a->current_code == NULL) {
1710170530Ssam			a->current_code = strdup(cur_charset);
1711195377Ssam			a->current_codepage = get_current_codepage();
1712195377Ssam			a->current_oemcp = get_current_oemcp();
1713195377Ssam		}
1714170530Ssam	}
1715170530Ssam	return (cur_charset);
1716170530Ssam}
1717178354Ssam
1718170530Ssam/*
1719195377Ssam * Make and Return a string conversion object.
1720195377Ssam * Return NULL if the platform does not support the specified conversion
1721205277Srpaulo * and best_effort is 0.
1722195377Ssam * If best_effort is set, A string conversion object must be returned
1723170530Ssam * unless memory allocation for the object fails, but the conversion
1724195377Ssam * might fail when non-ASCII code is found.
1725195377Ssam */
1726195377Ssamstruct archive_string_conv *
1727195377Ssamarchive_string_conversion_to_charset(struct archive *a, const char *charset,
1728170530Ssam    int best_effort)
1729195377Ssam{
1730170530Ssam	int flag = SCONV_TO_CHARSET;
1731195377Ssam
1732195377Ssam	if (best_effort)
1733195377Ssam		flag |= SCONV_BEST_EFFORT;
1734195377Ssam	return (get_sconv_object(a, get_current_charset(a), charset, flag));
1735195377Ssam}
1736195377Ssam
1737195377Ssamstruct archive_string_conv *
1738195377Ssamarchive_string_conversion_from_charset(struct archive *a, const char *charset,
1739170530Ssam    int best_effort)
1740195377Ssam{
1741170530Ssam	int flag = SCONV_FROM_CHARSET;
1742195377Ssam
1743195377Ssam	if (best_effort)
1744195377Ssam		flag |= SCONV_BEST_EFFORT;
1745195377Ssam	return (get_sconv_object(a, charset, get_current_charset(a), flag));
1746195377Ssam}
1747195377Ssam
/*
 * archive_string_default_conversion_for_read() and
 * archive_string_default_conversion_for_write() are provided for the
 * Windows platform because other archiver applications use CP_OEMCP with
 * MultiByteToWideChar() and WideCharToMultiByte() for the filenames
 * in tar or zip files, whereas mbstowcs()/wcstombs() (CRT) usually use
 * CP_ACP unless you call setlocale(LC_ALL, ".OCP") (which selects
 * CP_OEMCP). So we should make a string conversion between CP_ACP and
 * CP_OEMCP for compatibility.
 */
1757195377Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
1758195377Ssamstruct archive_string_conv *
1759195377Ssamarchive_string_default_conversion_for_read(struct archive *a)
1760195377Ssam{
1761195377Ssam	const char *cur_charset = get_current_charset(a);
1762195377Ssam	char oemcp[16];
1763195377Ssam
1764195377Ssam	/* NOTE: a check of cur_charset is unneeded but we need
1765195377Ssam	 * that get_current_charset() has been surely called at
1766195377Ssam	 * this time whatever C compiler optimized. */
1767195377Ssam	if (cur_charset != NULL &&
1768195377Ssam	    (a->current_codepage == CP_C_LOCALE ||
1769195377Ssam	     a->current_codepage == a->current_oemcp))
1770195377Ssam		return (NULL);/* no conversion. */
1771205277Srpaulo
1772195377Ssam	_snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
1773195377Ssam	/* Make sure a null termination must be set. */
1774195377Ssam	oemcp[sizeof(oemcp)-1] = '\0';
1775195377Ssam	return (get_sconv_object(a, oemcp, cur_charset,
1776170530Ssam	    SCONV_FROM_CHARSET));
1777195377Ssam}
1778195377Ssam
1779195377Ssamstruct archive_string_conv *
1780195377Ssamarchive_string_default_conversion_for_write(struct archive *a)
1781195377Ssam{
1782195377Ssam	const char *cur_charset = get_current_charset(a);
1783195377Ssam	char oemcp[16];
1784195377Ssam
1785195377Ssam	/* NOTE: a check of cur_charset is unneeded but we need
1786195377Ssam	 * that get_current_charset() has been surely called at
1787195377Ssam	 * this time whatever C compiler optimized. */
1788170530Ssam	if (cur_charset != NULL &&
1789195377Ssam	    (a->current_codepage == CP_C_LOCALE ||
1790195377Ssam	     a->current_codepage == a->current_oemcp))
1791195377Ssam		return (NULL);/* no conversion. */
1792195377Ssam
1793195377Ssam	_snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
1794195377Ssam	/* Make sure a null termination must be set. */
1795195377Ssam	oemcp[sizeof(oemcp)-1] = '\0';
1796195377Ssam	return (get_sconv_object(a, cur_charset, oemcp,
1797195377Ssam	    SCONV_TO_CHARSET));
1798195377Ssam}
1799195377Ssam#else
/* This variant never supplies a default read conversion: always NULL. */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
	struct archive_string_conv *no_conversion = NULL;

	(void)a; /* UNUSED */
	return (no_conversion);
}
1806195377Ssam
/* This variant never supplies a default write conversion: always NULL. */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
	struct archive_string_conv *no_conversion = NULL;

	(void)a; /* UNUSED */
	return (no_conversion);
}
1813195377Ssam#endif
1814195377Ssam
1815195377Ssam/*
1816195377Ssam * Dispose of all character conversion objects in the archive object.
1817195377Ssam */
1818195377Ssamvoid
1819195377Ssamarchive_string_conversion_free(struct archive *a)
1820195377Ssam{
1821195377Ssam	struct archive_string_conv *sc;
1822195377Ssam	struct archive_string_conv *sc_next;
1823195377Ssam
1824195377Ssam	for (sc = a->sconv; sc != NULL; sc = sc_next) {
1825195377Ssam		sc_next = sc->next;
1826195377Ssam		free_sconv_object(sc);
1827195377Ssam	}
1828195377Ssam	a->sconv = NULL;
1829195377Ssam	free(a->current_code);
1830182829Ssam	a->current_code = NULL;
1831195377Ssam}
1832195377Ssam
1833195377Ssam/*
1834195377Ssam * Return a conversion charset name.
1835195377Ssam */
1836195377Ssamconst char *
1837195377Ssamarchive_string_conversion_charset_name(struct archive_string_conv *sc)
1838195377Ssam{
1839195377Ssam	if (sc->flag & SCONV_TO_CHARSET)
1840195377Ssam		return (sc->to_charset);
1841195377Ssam	else
1842182829Ssam		return (sc->from_charset);
1843195377Ssam}
1844195377Ssam
1845195377Ssam/*
1846195377Ssam * Change the behavior of a string conversion.
1847195377Ssam */
1848195377Ssamvoid
1849195377Ssamarchive_string_conversion_set_opt(struct archive_string_conv *sc, int opt)
1850195377Ssam{
1851170530Ssam	switch (opt) {
1852195377Ssam	/*
1853195377Ssam	 * A filename in UTF-8 was made with libarchive 2.x in a wrong
1854195377Ssam	 * assumption that wchar_t was Unicode.
1855195377Ssam	 * This option enables simulating the assumption in order to read
1856195377Ssam	 * that filename correctly.
1857195377Ssam	 */
1858195377Ssam	case SCONV_SET_OPT_UTF8_LIBARCHIVE2X:
1859195377Ssam#if (defined(_WIN32) && !defined(__CYGWIN__)) \
1860195377Ssam	 || defined(__STDC_ISO_10646__) || defined(__APPLE__)
1861195377Ssam		/*
1862170530Ssam		 * Nothing to do for it since wchar_t on these platforms
1863195377Ssam		 * is really Unicode.
1864195377Ssam		 */
1865170530Ssam		(void)sc; /* UNUSED */
1866195377Ssam#else
1867170530Ssam		if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) {
1868195377Ssam			sc->flag |= SCONV_UTF8_LIBARCHIVE_2;
1869195377Ssam			/* Set up string converters. */
1870195377Ssam			setup_converter(sc);
1871195377Ssam		}
1872195377Ssam#endif
1873195377Ssam		break;
1874195377Ssam	case SCONV_SET_OPT_NORMALIZATION_C:
1875195377Ssam		if ((sc->flag & SCONV_NORMALIZATION_C) == 0) {
1876195377Ssam			sc->flag |= SCONV_NORMALIZATION_C;
1877195377Ssam			sc->flag &= ~SCONV_NORMALIZATION_D;
1878195377Ssam			/* Set up string converters. */
1879195377Ssam			setup_converter(sc);
1880170530Ssam		}
1881195377Ssam		break;
1882170530Ssam	case SCONV_SET_OPT_NORMALIZATION_D:
1883170530Ssam#if defined(HAVE_ICONV)
1884195377Ssam		/*
1885195377Ssam		 * If iconv will take the string, do not change the
1886195377Ssam		 * setting of the normalization.
1887170530Ssam		 */
1888170530Ssam		if (!(sc->flag & SCONV_WIN_CP) &&
1889170530Ssam		     (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1890170530Ssam		    !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
1891195377Ssam			break;
1892195377Ssam#endif
1893195377Ssam		if ((sc->flag & SCONV_NORMALIZATION_D) == 0) {
1894195377Ssam			sc->flag |= SCONV_NORMALIZATION_D;
1895195377Ssam			sc->flag &= ~SCONV_NORMALIZATION_C;
1896195377Ssam			/* Set up string converters. */
1897195377Ssam			setup_converter(sc);
1898195377Ssam		}
1899170530Ssam		break;
1900195377Ssam	default:
1901170530Ssam		break;
1902170530Ssam	}
1903195377Ssam}
1904195377Ssam
1905195377Ssam/*
1906195377Ssam *
1907195377Ssam * Copy one archive_string to another in locale conversion.
1908195377Ssam *
1909195377Ssam *	archive_strncat_l();
1910195377Ssam *	archive_strncpy_l();
1911195377Ssam *
1912195377Ssam */
1913195377Ssam
/*
 * Length in bytes of the string at _p, like strlen(), but never
 * counting more than n bytes. A NULL pointer counts as empty.
 */
static size_t
mbsnbytes(const void *_p, size_t n)
{
	const char *str = (const char *)_p;
	size_t len;

	if (str == NULL)
		return (0);

	/* Stop at the first NUL or after n bytes, whichever is first. */
	for (len = 0; len < n && str[len] != '\0'; len++)
		continue;
	return (len);
}
1933170530Ssam
/*
 * Length in bytes of the string of 2-byte units at _p, up to but not
 * including the first all-zero unit, and never counting more than
 * n / 2 units. The result is always even. A NULL pointer counts as
 * empty.
 */
static size_t
utf16nbytes(const void *_p, size_t n)
{
	const char *str = (const char *)_p;
	const size_t max_units = n >> 1;
	size_t units;

	if (str == NULL)
		return (0);

	for (units = 0; units < max_units; units++) {
		const char *unit = str + (units << 1);

		/* A unit whose two bytes are both zero ends the string. */
		if (unit[0] == 0 && unit[1] == 0)
			break;
	}
	return (units << 1);
}
1954178354Ssam
1955178354Ssamint
1956178354Ssamarchive_strncpy_l(struct archive_string *as, const void *_p, size_t n,
1957178354Ssam    struct archive_string_conv *sc)
1958178354Ssam{
1959178354Ssam	as->length = 0;
1960183249Ssam	return (archive_strncat_l(as, _p, n, sc));
1961183249Ssam}
1962178354Ssam
1963178354Ssamint
1964178354Ssamarchive_strncat_l(struct archive_string *as, const void *_p, size_t n,
1965178354Ssam    struct archive_string_conv *sc)
1966170530Ssam{
1967170530Ssam	const void *s;
1968170530Ssam	size_t length = 0;
1969170530Ssam	int i, r = 0, r2;
1970170530Ssam
1971170530Ssam	if (_p != NULL && n > 0) {
1972170530Ssam		if (sc != NULL && (sc->flag & SCONV_FROM_UTF16))
1973170530Ssam			length = utf16nbytes(_p, n);
1974170530Ssam		else
1975205277Srpaulo			length = mbsnbytes(_p, n);
1976170530Ssam	}
1977170530Ssam
1978170530Ssam	/* We must allocate memory even if there is no data for conversion
1979170530Ssam	 * or copy. This simulates archive_string_append behavior. */
1980170530Ssam	if (length == 0) {
1981170530Ssam		int tn = 1;
1982184280Ssam		if (sc != NULL && (sc->flag & SCONV_TO_UTF16))
1983170530Ssam			tn = 2;
1984173273Ssam		if (archive_string_ensure(as, as->length + tn) == NULL)
1985173273Ssam			return (-1);
1986173273Ssam		as->s[as->length] = 0;
1987170530Ssam		if (tn == 2)
1988183245Ssam			as->s[as->length+1] = 0;
1989183245Ssam		return (0);
1990170530Ssam	}
1991170530Ssam
1992205277Srpaulo	/*
1993205277Srpaulo	 * If sc is NULL, we just make a copy.
1994170530Ssam	 */
1995170530Ssam	if (sc == NULL) {
1996170530Ssam		if (archive_string_append(as, _p, length) == NULL)
1997205277Srpaulo			return (-1);/* No memory */
1998170530Ssam		return (0);
1999205277Srpaulo	}
2000170530Ssam
2001178354Ssam	s = _p;
2002173273Ssam	i = 0;
2003173273Ssam	if (sc->nconverter > 1) {
2004173273Ssam		sc->utftmp.length = 0;
2005178354Ssam		r2 = sc->converter[0](&(sc->utftmp), s, length, sc);
2006178354Ssam		if (r2 != 0 && errno == ENOMEM)
2007178354Ssam			return (r2);
2008178354Ssam		if (r > r2)
2009170530Ssam			r = r2;
2010170530Ssam		s = sc->utftmp.s;
2011170530Ssam		length = sc->utftmp.length;
2012183245Ssam		++i;
2013205277Srpaulo	}
2014178953Ssam	r2 = sc->converter[i](as, s, length, sc);
2015178953Ssam	if (r > r2)
2016170530Ssam		r = r2;
2017170530Ssam	return (r);
2018170530Ssam}
2019170530Ssam
2020170530Ssam#if HAVE_ICONV
2021173273Ssam
2022173273Ssam/*
2023173273Ssam * Return -1 if conversion fails.
2024173273Ssam */
2025183250Ssamstatic int
2026183250Ssamiconv_strncat_in_locale(struct archive_string *as, const void *_p,
2027173273Ssam    size_t length, struct archive_string_conv *sc)
2028173273Ssam{
2029178354Ssam	ICONV_CONST char *itp;
2030173273Ssam	size_t remaining;
2031173273Ssam	iconv_t cd;
2032173273Ssam	char *outp;
2033184280Ssam	size_t avail, bs;
2034173273Ssam	int return_value = 0; /* success */
2035178354Ssam	int to_size, from_size;
2036183250Ssam
2037183250Ssam	if (sc->flag & SCONV_TO_UTF16)
2038178354Ssam		to_size = 2;
2039173273Ssam	else
2040173273Ssam		to_size = 1;
2041173273Ssam	if (sc->flag & SCONV_FROM_UTF16)
2042173273Ssam		from_size = 2;
2043183250Ssam	else
2044195377Ssam		from_size = 1;
2045173273Ssam
2046173273Ssam	if (archive_string_ensure(as, as->length + length*2+to_size) == NULL)
2047178354Ssam		return (-1);
2048183250Ssam
2049183250Ssam	cd = sc->cd;
2050178354Ssam	itp = (char *)(uintptr_t)_p;
2051173273Ssam	remaining = length;
2052173273Ssam	outp = as->s + as->length;
2053173273Ssam	avail = as->buffer_length - as->length - to_size;
2054184280Ssam	while (remaining >= (size_t)from_size) {
2055184280Ssam		size_t result = iconv(cd, &itp, &remaining, &outp, &avail);
2056184280Ssam
2057184280Ssam		if (result != (size_t)-1)
2058184280Ssam			break; /* Conversion completed. */
2059184280Ssam
2060184280Ssam		if (errno == EILSEQ || errno == EINVAL) {
2061184280Ssam			/*
2062184280Ssam		 	 * If an output charset is UTF-8 or UTF-16BE/LE,
2063184280Ssam			 * unknown character should be U+FFFD
2064184280Ssam			 * (replacement character).
2065184280Ssam			 */
2066184280Ssam			if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {
2067184280Ssam				size_t rbytes;
2068184280Ssam				if (sc->flag & SCONV_TO_UTF8)
2069184280Ssam					rbytes = sizeof(utf8_replacement_char);
2070184280Ssam				else
2071184280Ssam					rbytes = 2;
2072184280Ssam
2073184280Ssam				if (avail < rbytes) {
2074184280Ssam					as->length = outp - as->s;
2075184280Ssam					bs = as->buffer_length +
2076184280Ssam					    (remaining * to_size) + rbytes;
2077184280Ssam					if (NULL ==
2078184280Ssam					    archive_string_ensure(as, bs))
2079184280Ssam						return (-1);
2080184280Ssam					outp = as->s + as->length;
2081184280Ssam					avail = as->buffer_length
2082184280Ssam					    - as->length - to_size;
2083184280Ssam				}
2084184280Ssam				if (sc->flag & SCONV_TO_UTF8)
2085184280Ssam					memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char));
2086184280Ssam				else if (sc->flag & SCONV_TO_UTF16BE)
2087184280Ssam					archive_be16enc(outp, UNICODE_R_CHAR);
2088184280Ssam				else
2089184280Ssam					archive_le16enc(outp, UNICODE_R_CHAR);
2090184280Ssam				outp += rbytes;
2091184280Ssam				avail -= rbytes;
2092184280Ssam			} else {
2093184280Ssam				/* Skip the illegal input bytes. */
2094184280Ssam				*outp++ = '?';
2095184280Ssam				avail--;
2096184280Ssam			}
2097184280Ssam			itp += from_size;
2098184280Ssam			remaining -= from_size;
2099184280Ssam			return_value = -1; /* failure */
2100184280Ssam		} else {
2101184280Ssam			/* E2BIG no output buffer,
2102184280Ssam			 * Increase an output buffer.  */
2103184280Ssam			as->length = outp - as->s;
2104184280Ssam			bs = as->buffer_length + remaining * 2;
2105184280Ssam			if (NULL == archive_string_ensure(as, bs))
2106184280Ssam				return (-1);
2107184280Ssam			outp = as->s + as->length;
2108184280Ssam			avail = as->buffer_length - as->length - to_size;
2109184280Ssam		}
2110184280Ssam	}
2111184280Ssam	as->length = outp - as->s;
2112184280Ssam	as->s[as->length] = 0;
2113184280Ssam	if (to_size == 2)
2114184280Ssam		as->s[as->length+1] = 0;
2115184280Ssam	return (return_value);
2116184280Ssam}
2117184280Ssam
2118184280Ssam#endif /* HAVE_ICONV */
2119184280Ssam
2120184280Ssam
2121184280Ssam#if defined(_WIN32) && !defined(__CYGWIN__)
2122184280Ssam
2123184280Ssam/*
2124184280Ssam * Translate a string from a some CodePage to an another CodePage by
2125184280Ssam * Windows APIs, and copy the result. Return -1 if conversion fails.
2126184280Ssam */
2127184280Ssamstatic int
2128184280Ssamstrncat_in_codepage(struct archive_string *as,
2129184280Ssam    const void *_p, size_t length, struct archive_string_conv *sc)
2130173273Ssam{
2131170530Ssam	const char *s = (const char *)_p;
2132170530Ssam	struct archive_wstring aws;
2133170530Ssam	size_t l;
2134184280Ssam	int r, saved_flag;
2135184280Ssam
2136170530Ssam	archive_string_init(&aws);
2137170530Ssam	saved_flag = sc->flag;
2138170530Ssam	sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C);
2139184280Ssam	r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc);
2140170530Ssam	sc->flag = saved_flag;
2141178354Ssam	if (r != 0) {
2142178354Ssam		archive_wstring_free(&aws);
2143170530Ssam		if (errno != ENOMEM)
2144184280Ssam			archive_string_append(as, s, length);
2145170530Ssam		return (-1);
2146184280Ssam	}
2147170530Ssam
2148170530Ssam	l = as->length;
2149170530Ssam	r = archive_string_append_from_wcs_in_codepage(
2150184280Ssam	    as, aws.s, aws.length, sc);
2151184280Ssam	if (r != 0 && errno != ENOMEM && l == as->length)
2152184280Ssam		archive_string_append(as, s, length);
2153184280Ssam	archive_wstring_free(&aws);
2154184280Ssam	return (r);
2155184280Ssam}
2156184280Ssam
2157184280Ssam/*
2158170530Ssam * Test whether MBS ==> WCS is okay.
2159170530Ssam */
2160184280Ssamstatic int
2161170530Ssaminvalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
2162170530Ssam{
2163170530Ssam	const char *p = (const char *)_p;
2164184280Ssam	unsigned codepage;
2165184280Ssam	DWORD mbflag = MB_ERR_INVALID_CHARS;
2166184280Ssam
2167184280Ssam	if (sc->flag & SCONV_FROM_CHARSET)
2168184280Ssam		codepage = sc->to_cp;
2169184280Ssam	else
2170184280Ssam		codepage = sc->from_cp;
2171184280Ssam
2172170530Ssam	if (codepage == CP_C_LOCALE)
2173184280Ssam		return (0);
2174184280Ssam	if (codepage != CP_UTF8)
2175184280Ssam		mbflag |= MB_PRECOMPOSED;
2176170530Ssam
2177170530Ssam	if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0)
2178170530Ssam		return (-1); /* Invalid */
2179184280Ssam	return (0); /* Okay */
2180184280Ssam}
2181184280Ssam
2182170530Ssam#else
2183184280Ssam
/*
 * Test whether MBS ==> WCS is okay.
 *
 * Walks the n bytes at _p one multibyte character at a time, stopping
 * early at a NUL character. Return 0 if every character converts, -1
 * if one is invalid or incomplete. sc is not used.
 */
static int
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
{
	const char *cursor = (const char *)_p;
	size_t used;

#if HAVE_MBRTOWC
	mbstate_t shift_state;

	memset(&shift_state, 0, sizeof(shift_state));
#else
	/* Clear the shift state before starting. */
	mbtowc(NULL, NULL, 0);
#endif
	(void)sc; /* UNUSED */

	for (; n != 0; cursor += used, n -= used) {
		wchar_t wc;

#if HAVE_MBRTOWC
		used = mbrtowc(&wc, cursor, n, &shift_state);
#else
		used = mbtowc(&wc, cursor, n);
#endif
		if (used == (size_t)-1 || used == (size_t)-2)
			return (-1);/* Invalid. */
		if (used == 0)
			break;	/* Reached a NUL character. */
	}
	return (0); /* All okay. */
}
2219170530Ssam
2220170530Ssam#endif /* defined(_WIN32) && !defined(__CYGWIN__) */
2221170530Ssam
2222195377Ssam/*
2223195377Ssam * Basically returns -1 because we cannot make a conversion of charset
2224195377Ssam * without iconv but in some cases this would return 0.
2225195377Ssam * Returns 0 if all copied characters are ASCII.
2226195377Ssam * Returns 0 if both from-locale and to-locale are the same and those
2227195377Ssam * can be WCS with no error.
2228195377Ssam */
2229195377Ssamstatic int
2230195377Ssambest_effort_strncat_in_locale(struct archive_string *as, const void *_p,
2231195377Ssam    size_t length, struct archive_string_conv *sc)
2232195377Ssam{
2233195377Ssam	size_t remaining;
2234195377Ssam	const uint8_t *itp;
2235195377Ssam	int return_value = 0; /* success */
2236195377Ssam
2237195377Ssam	/*
2238195377Ssam	 * If both from-locale and to-locale is the same, this makes a copy.
2239195377Ssam	 * And then this checks all copied MBS can be WCS if so returns 0.
2240195377Ssam	 */
2241195377Ssam	if (sc->same) {
2242195377Ssam		if (archive_string_append(as, _p, length) == NULL)
2243170530Ssam			return (-1);/* No memory */
2244170530Ssam		return (invalid_mbs(_p, length, sc));
2245170530Ssam	}
2246170530Ssam
2247170530Ssam	/*
2248195377Ssam	 * If a character is ASCII, this just copies it. If not, this
2249195377Ssam	 * assigns '?' character instead but in UTF-8 locale this assigns
2250195377Ssam	 * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD,
2251170530Ssam	 * a Replacement Character in Unicode.
2252178354Ssam	 */
2253170530Ssam
2254195377Ssam	remaining = length;
2255170530Ssam	itp = (const uint8_t *)_p;
2256170530Ssam	while (*itp && remaining > 0) {
2257195377Ssam		if (*itp > 127) {
2258195377Ssam			// Non-ASCII: Substitute with suitable replacement
2259205277Srpaulo			if (sc->flag & SCONV_TO_UTF8) {
2260195377Ssam				if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) {
2261195377Ssam					__archive_errx(1, "Out of memory");
2262195377Ssam				}
2263205277Srpaulo			} else {
2264205277Srpaulo				archive_strappend_char(as, '?');
2265195377Ssam			}
2266195377Ssam			return_value = -1;
2267195377Ssam		} else {
2268195377Ssam			archive_strappend_char(as, *itp);
2269195377Ssam		}
2270195377Ssam		++itp;
2271195377Ssam	}
2272195377Ssam	return (return_value);
2273195377Ssam}
2274195377Ssam
2275195377Ssam
2276195377Ssam/*
2277195377Ssam * Unicode conversion functions.
2278195377Ssam *   - UTF-8 <===> UTF-8 in removing surrogate pairs.
2279195377Ssam *   - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs.
2280195377Ssam *   - UTF-8 made by libarchive 2.x ===> UTF-8.
2281205277Srpaulo *   - UTF-16BE <===> UTF-8.
2282205277Srpaulo *
2283205277Srpaulo */
2284205277Srpaulo
2285195377Ssam/*
2286205277Srpaulo * Utility to convert a single UTF-8 sequence.
2287195377Ssam *
2288195377Ssam * Usually return used bytes, return used byte in negative value when
2289195377Ssam * a unicode character is replaced with U+FFFD.
2290195377Ssam * See also http://unicode.org/review/pr-121.html Public Review Issue #121
2291195377Ssam * Recommended Practice for Replacement Characters.
2292195377Ssam */
2293195377Ssamstatic int
2294195377Ssam_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
2295195377Ssam{
2296195377Ssam	static const char utf8_count[256] = {
2297195377Ssam		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */
2298195377Ssam		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */
2299195377Ssam		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */
2300195377Ssam		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */
2301195377Ssam		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */
2302195377Ssam		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */
2303195377Ssam		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */
2304170530Ssam		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */
2305195377Ssam		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */
2306170530Ssam		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */
2307195377Ssam		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */
2308195377Ssam		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */
2309195377Ssam		 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */
2310195377Ssam		 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */
2311195377Ssam		 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */
2312195377Ssam		 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */
2313170530Ssam	};
2314178354Ssam	int ch, i;
2315195377Ssam	int cnt;
2316195377Ssam	uint32_t wc;
2317170530Ssam
2318170530Ssam	/* Sanity check. */
2319170530Ssam	if (n == 0)
2320195377Ssam		return (0);
2321195377Ssam	/*
2322195377Ssam	 * Decode 1-4 bytes depending on the value of the first byte.
2323195377Ssam	 */
2324170530Ssam	ch = (unsigned char)*s;
2325195377Ssam	if (ch == 0)
2326195377Ssam		return (0); /* Standard:  return 0 for end-of-string. */
2327195377Ssam	cnt = utf8_count[ch];
2328195377Ssam
2329195377Ssam	/* Invalid sequence or there are not plenty bytes. */
2330195377Ssam	if ((int)n < cnt) {
2331195377Ssam		cnt = (int)n;
2332195377Ssam		for (i = 1; i < cnt; i++) {
2333195377Ssam			if ((s[i] & 0xc0) != 0x80) {
2334195377Ssam				cnt = i;
2335195377Ssam				break;
2336195377Ssam			}
2337195377Ssam		}
2338170530Ssam		goto invalid_sequence;
2339195377Ssam	}
2340195377Ssam
2341195377Ssam	/* Make a Unicode code point from a single UTF-8 sequence. */
2342195377Ssam	switch (cnt) {
2343195377Ssam	case 1:	/* 1 byte sequence. */
2344195377Ssam		*pwc = ch & 0x7f;
2345195377Ssam		return (cnt);
2346195377Ssam	case 2:	/* 2 bytes sequence. */
2347170530Ssam		if ((s[1] & 0xc0) != 0x80) {
2348195377Ssam			cnt = 1;
2349195377Ssam			goto invalid_sequence;
2350195377Ssam		}
2351170530Ssam		*pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f);
2352195377Ssam		return (cnt);
2353195377Ssam	case 3:	/* 3 bytes sequence. */
2354195377Ssam		if ((s[1] & 0xc0) != 0x80) {
2355195377Ssam			cnt = 1;
2356170530Ssam			goto invalid_sequence;
2357195377Ssam		}
2358195377Ssam		if ((s[2] & 0xc0) != 0x80) {
2359195377Ssam			cnt = 2;
2360195377Ssam			goto invalid_sequence;
2361195377Ssam		}
2362195377Ssam		wc = ((ch & 0x0f) << 12)
2363195377Ssam		    | ((s[1] & 0x3f) << 6)
2364195377Ssam		    | (s[2] & 0x3f);
2365195377Ssam		if (wc < 0x800)
2366195377Ssam			goto invalid_sequence;/* Overlong sequence. */
2367195377Ssam		break;
2368195377Ssam	case 4:	/* 4 bytes sequence. */
2369195377Ssam		if ((s[1] & 0xc0) != 0x80) {
2370195377Ssam			cnt = 1;
2371195377Ssam			goto invalid_sequence;
2372195377Ssam		}
2373195377Ssam		if ((s[2] & 0xc0) != 0x80) {
2374195377Ssam			cnt = 2;
2375170530Ssam			goto invalid_sequence;
2376195377Ssam		}
2377170530Ssam		if ((s[3] & 0xc0) != 0x80) {
2378170530Ssam			cnt = 3;
2379170530Ssam			goto invalid_sequence;
2380170530Ssam		}
2381170530Ssam		wc = ((ch & 0x07) << 18)
2382170530Ssam		    | ((s[1] & 0x3f) << 12)
2383170530Ssam		    | ((s[2] & 0x3f) << 6)
2384170530Ssam		    | (s[3] & 0x3f);
2385170530Ssam		if (wc < 0x10000)
2386170530Ssam			goto invalid_sequence;/* Overlong sequence. */
2387170530Ssam		break;
2388170530Ssam	default: /* Others are all invalid sequence. */
2389170530Ssam		if (ch == 0xc0 || ch == 0xc1)
2390170530Ssam			cnt = 2;
2391170530Ssam		else if (ch >= 0xf5 && ch <= 0xf7)
2392170530Ssam			cnt = 4;
2393170530Ssam		else if (ch >= 0xf8 && ch <= 0xfb)
2394170530Ssam			cnt = 5;
2395170530Ssam		else if (ch == 0xfc || ch == 0xfd)
2396170530Ssam			cnt = 6;
2397170530Ssam		else
2398170530Ssam			cnt = 1;
2399170530Ssam		if ((int)n < cnt)
2400170530Ssam			cnt = (int)n;
2401170530Ssam		for (i = 1; i < cnt; i++) {
2402170530Ssam			if ((s[i] & 0xc0) != 0x80) {
2403170530Ssam				cnt = i;
2404170530Ssam				break;
2405170530Ssam			}
2406170530Ssam		}
2407170530Ssam		goto invalid_sequence;
2408178354Ssam	}
2409205513Srpaulo
2410173865Ssam	/* The code point larger than 0x10FFFF is not legal
2411170530Ssam	 * Unicode values. */
2412170530Ssam	if (wc > UNICODE_MAX)
2413178354Ssam		goto invalid_sequence;
2414173273Ssam	/* Correctly gets a Unicode, returns used bytes. */
2415173273Ssam	*pwc = wc;
2416173273Ssam	return (cnt);
2417173273Ssaminvalid_sequence:
2418173273Ssam	*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
2419173273Ssam	return (cnt * -1);
2420173273Ssam}
2421173273Ssam
/*
 * Strict UTF-8 decoder for a single sequence.
 *
 * Decodes with _utf8_to_unicode() and additionally rejects surrogate
 * code points.  The result is whatever _utf8_to_unicode() returned,
 * except that a three-byte sequence which decodes to a surrogate
 * yields -3; *pwc then still holds the decoded surrogate value.
 */
static int
utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int used = _utf8_to_unicode(pwc, s, n);

	if (used != 3)
		return (used);
	/* Surrogate values are not legal Unicode scalar values. */
	return (IS_SURROGATE_PAIR_LA(*pwc) ? -3 : used);
}
2433178354Ssam
/*
 * Combine a surrogate pair into a single code point.
 * uc is the high (leading) surrogate, uc2 the low (trailing) one.
 * No range checking is done here.
 */
static inline uint32_t
combine_surrogate_pair(uint32_t uc, uint32_t uc2)
{
	const uint32_t high_bits = (uc - 0xD800) * 0x400;
	const uint32_t low_bits = uc2 - 0xDC00;

	return (high_bits + low_bits + 0x10000);
}
2443193655Ssam
2444170530Ssam/*
2445170530Ssam * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in
2446170530Ssam * removing surrogate pairs.
2447170530Ssam *
2448170530Ssam * CESU-8: The Compatibility Encoding Scheme for UTF-16.
2449173865Ssam *
2450173865Ssam * Usually return used bytes, return used byte in negative value when
2451173273Ssam * a unicode character is replaced with U+FFFD.
2452170530Ssam */
2453170530Ssamstatic int
2454170530Ssamcesu8_to_unicode(uint32_t *pwc, const char *s, size_t n)
2455170530Ssam{
2456170530Ssam	uint32_t wc = 0;
2457170530Ssam	int cnt;
2458170530Ssam
2459173273Ssam	cnt = _utf8_to_unicode(&wc, s, n);
2460173273Ssam	if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) {
2461173273Ssam		uint32_t wc2 = 0;
2462173273Ssam		if (n - 3 < 3) {
2463173273Ssam			/* Invalid byte sequence. */
2464173273Ssam			goto invalid_sequence;
2465173273Ssam		}
2466170530Ssam		cnt = _utf8_to_unicode(&wc2, s+3, n-3);
2467205513Srpaulo		if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) {
2468205513Srpaulo			/* Invalid byte sequence. */
2469205513Srpaulo			goto invalid_sequence;
2470205513Srpaulo		}
2471205513Srpaulo		wc = combine_surrogate_pair(wc, wc2);
2472205513Srpaulo		cnt = 6;
2473205513Srpaulo	} else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) {
2474205513Srpaulo		/* Invalid byte sequence. */
2475170530Ssam		goto invalid_sequence;
2476205513Srpaulo	}
2477205513Srpaulo	*pwc = wc;
2478170530Ssam	return (cnt);
2479170530Ssaminvalid_sequence:
2480170530Ssam	*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
2481170530Ssam	if (cnt > 0)
2482170530Ssam		cnt *= -1;
2483170530Ssam	return (cnt);
2484170530Ssam}
2485170530Ssam
2486170530Ssam/*
2487170530Ssam * Convert a Unicode code point to a single UTF-8 sequence.
2488170530Ssam *
2489170530Ssam * NOTE:This function does not check if the Unicode is legal or not.
2490170530Ssam * Please you definitely check it before calling this.
2491170530Ssam */
2492170530Ssamstatic size_t
2493170530Ssamunicode_to_utf8(char *p, size_t remaining, uint32_t uc)
2494170530Ssam{
2495170530Ssam	char *_p = p;
2496170530Ssam
2497170530Ssam	/* Invalid Unicode char maps to Replacement character */
2498170530Ssam	if (uc > UNICODE_MAX)
2499170530Ssam		uc = UNICODE_R_CHAR;
2500170530Ssam	/* Translate code point to UTF8 */
2501170530Ssam	if (uc <= 0x7f) {
2502170530Ssam		if (remaining == 0)
2503170530Ssam			return (0);
2504170530Ssam		*p++ = (char)uc;
2505170530Ssam	} else if (uc <= 0x7ff) {
2506170530Ssam		if (remaining < 2)
2507170530Ssam			return (0);
2508170530Ssam		*p++ = 0xc0 | ((uc >> 6) & 0x1f);
2509170530Ssam		*p++ = 0x80 | (uc & 0x3f);
2510170530Ssam	} else if (uc <= 0xffff) {
2511170530Ssam		if (remaining < 3)
2512170530Ssam			return (0);
2513170530Ssam		*p++ = 0xe0 | ((uc >> 12) & 0x0f);
2514170530Ssam		*p++ = 0x80 | ((uc >> 6) & 0x3f);
2515170530Ssam		*p++ = 0x80 | (uc & 0x3f);
2516170530Ssam	} else {
2517170530Ssam		if (remaining < 4)
2518170530Ssam			return (0);
2519170530Ssam		*p++ = 0xf0 | ((uc >> 18) & 0x07);
2520170530Ssam		*p++ = 0x80 | ((uc >> 12) & 0x3f);
2521170530Ssam		*p++ = 0x80 | ((uc >> 6) & 0x3f);
2522170530Ssam		*p++ = 0x80 | (uc & 0x3f);
2523170530Ssam	}
2524170530Ssam	return (p - _p);
2525170530Ssam}
2526170530Ssam
/* Decode one code point from big-endian UTF-16; see utf16_to_unicode(). */
static int
utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 1;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
2532178354Ssam
/* Decode one code point from little-endian UTF-16; see utf16_to_unicode(). */
static int
utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 0;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
2538172211Ssam
2539172211Ssamstatic int
2540172211Ssamutf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be)
2541172211Ssam{
2542178354Ssam	const char *utf16 = s;
2543193655Ssam	unsigned uc;
2544183256Ssam
2545183256Ssam	if (n == 0)
2546183256Ssam		return (0);
2547178354Ssam	if (n == 1) {
2548172211Ssam		/* set the Replacement Character instead. */
2549178354Ssam		*pwc = UNICODE_R_CHAR;
2550172211Ssam		return (-1);
2551172211Ssam	}
2552172211Ssam
2553178354Ssam	if (be)
2554172211Ssam		uc = archive_be16dec(utf16);
2555172211Ssam	else
2556172211Ssam		uc = archive_le16dec(utf16);
2557172211Ssam	utf16 += 2;
2558172211Ssam
2559172211Ssam	/* If this is a surrogate pair, assemble the full code point.*/
2560172211Ssam	if (IS_HIGH_SURROGATE_LA(uc)) {
2561172211Ssam		unsigned uc2;
2562172211Ssam
2563172211Ssam		if (n >= 4) {
2564170530Ssam			if (be)
2565173273Ssam				uc2 = archive_be16dec(utf16);
2566173273Ssam			else
2567173273Ssam				uc2 = archive_le16dec(utf16);
2568173273Ssam		} else
2569170530Ssam			uc2 = 0;
2570170530Ssam		if (IS_LOW_SURROGATE_LA(uc2)) {
2571170530Ssam			uc = combine_surrogate_pair(uc, uc2);
2572170530Ssam			utf16 += 2;
2573183256Ssam		} else {
2574170530Ssam	 		/* Undescribed code point should be U+FFFD
2575170530Ssam		 	* (replacement character). */
2576170530Ssam			*pwc = UNICODE_R_CHAR;
2577170530Ssam			return (-2);
2578170530Ssam		}
2579170530Ssam	}
2580178354Ssam
2581170530Ssam	/*
2582193655Ssam	 * Surrogate pair values(0xd800 through 0xdfff) are only
2583183256Ssam	 * used by UTF-16, so, after above calculation, the code
2584183256Ssam	 * must not be surrogate values, and Unicode has no codes
2585183256Ssam	 * larger than 0x10ffff. Thus, those are not legal Unicode
2586178354Ssam	 * values.
2587170530Ssam	 */
2588178354Ssam	if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) {
2589170530Ssam	 	/* Undescribed code point should be U+FFFD
2590170530Ssam	 	* (replacement character). */
2591170530Ssam		*pwc = UNICODE_R_CHAR;
2592178354Ssam		return (((int)(utf16 - s)) * -1);
2593170530Ssam	}
2594170530Ssam	*pwc = uc;
2595172211Ssam	return ((int)(utf16 - s));
2596170530Ssam}
2597170530Ssam
/*
 * Encode one code point as big-endian UTF-16.
 * Returns the number of bytes written (2 or 4), or 0 without writing
 * anything when `remaining` is too small.
 */
static size_t
unicode_to_utf16be(char *p, size_t remaining, uint32_t uc)
{
	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		if (remaining < 2)
			return (0);
		archive_be16enc(p, uc);
		return (2);
	}

	/* Too large for one 16-bit unit; emit a surrogate pair. */
	if (remaining < 4)
		return (0);
	uc -= 0x10000;
	archive_be16enc(p, ((uc >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (uc & 0x3ff) + 0xDC00);
	return (4);
}
2619170530Ssam
/*
 * Encode one code point as little-endian UTF-16.
 * Returns the number of bytes written (2 or 4), or 0 without writing
 * anything when `remaining` is too small.
 */
static size_t
unicode_to_utf16le(char *p, size_t remaining, uint32_t uc)
{
	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		if (remaining < 2)
			return (0);
		archive_le16enc(p, uc);
		return (2);
	}

	/* Too large for one 16-bit unit; emit a surrogate pair. */
	if (remaining < 4)
		return (0);
	uc -= 0x10000;
	archive_le16enc(p, ((uc >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (uc & 0x3ff) + 0xDC00);
	return (4);
}
2641
2642/*
2643 * Append new UTF-8 string to existing UTF-8 string.
2644 * Existing string is assumed to already be in proper form;
2645 * the new string will have invalid sequences replaced and
2646 * surrogate pairs canonicalized.
2647 */
2648static int
2649strncat_from_utf8_to_utf8(struct archive_string *as, const void *_src,
2650    size_t len, struct archive_string_conv *sc)
2651{
2652	int ret = 0;
2653	const char *src = _src;
2654	(void)sc; /* UNUSED */
2655
2656	/* Pre-extend the destination */
2657	if (archive_string_ensure(as, as->length + len + 1) == NULL)
2658		return (-1);
2659
2660	/* Invariant: src points to the first UTF8 byte that hasn't
2661	 * been copied to the destination `as`. */
2662	for (;;) {
2663		int n;
2664		uint32_t uc;
2665		const char *e = src;
2666
2667		/* Skip UTF-8 sequences until we reach end-of-string or
2668		 * a code point that needs conversion. */
2669		while ((n = utf8_to_unicode(&uc, e, len)) > 0) {
2670			e += n;
2671			len -= n;
2672		}
2673		/* Copy the part that doesn't need conversion */
2674		if (e > src) {
2675			if (archive_string_append(as, src, e - src) == NULL)
2676				return (-1);
2677			src = e;
2678		}
2679
2680		if (n == 0) {
2681			/* We reached end-of-string */
2682			return (ret);
2683		} else {
2684			/* Next code point needs conversion */
2685			char t[4];
2686			size_t w;
2687
2688			/* Try decoding a surrogate pair */
2689			if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) {
2690				n = cesu8_to_unicode(&uc, src, len);
2691			}
2692			/* Not a (valid) surrogate, so use a replacement char */
2693			if (n < 0) {
2694				ret = -1; /* Return -1 if we used any replacement */
2695				n *= -1;
2696			}
2697			/* Consume converted code point */
2698			src += n;
2699			len -= n;
2700			/* Convert and append new UTF-8 sequence. */
2701			w = unicode_to_utf8(t, sizeof(t), uc);
2702			if (archive_string_append(as, t, w) == NULL)
2703				return (-1);
2704		}
2705	}
2706}
2707
2708static int
2709archive_string_append_unicode(struct archive_string *as, const void *_p,
2710    size_t len, struct archive_string_conv *sc)
2711{
2712	const char *s;
2713	char *p, *endp;
2714	uint32_t uc;
2715	size_t w;
2716	int n, ret = 0, ts, tm;
2717	int (*parse)(uint32_t *, const char *, size_t);
2718	size_t (*unparse)(char *, size_t, uint32_t);
2719
2720	if (sc->flag & SCONV_TO_UTF16BE) {
2721		unparse = unicode_to_utf16be;
2722		ts = 2;
2723	} else if (sc->flag & SCONV_TO_UTF16LE) {
2724		unparse = unicode_to_utf16le;
2725		ts = 2;
2726	} else if (sc->flag & SCONV_TO_UTF8) {
2727		unparse = unicode_to_utf8;
2728		ts = 1;
2729	} else {
2730		/*
2731		 * This case is going to be converted to another
2732		 * character-set through iconv.
2733		 */
2734		if (sc->flag & SCONV_FROM_UTF16BE) {
2735			unparse = unicode_to_utf16be;
2736			ts = 2;
2737		} else if (sc->flag & SCONV_FROM_UTF16LE) {
2738			unparse = unicode_to_utf16le;
2739			ts = 2;
2740		} else {
2741			unparse = unicode_to_utf8;
2742			ts = 1;
2743		}
2744	}
2745
2746	if (sc->flag & SCONV_FROM_UTF16BE) {
2747		parse = utf16be_to_unicode;
2748		tm = 1;
2749	} else if (sc->flag & SCONV_FROM_UTF16LE) {
2750		parse = utf16le_to_unicode;
2751		tm = 1;
2752	} else {
2753		parse = cesu8_to_unicode;
2754		tm = ts;
2755	}
2756
2757	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
2758		return (-1);
2759
2760	s = (const char *)_p;
2761	p = as->s + as->length;
2762	endp = as->s + as->buffer_length - ts;
2763	while ((n = parse(&uc, s, len)) != 0) {
2764		if (n < 0) {
2765			/* Use a replaced unicode character. */
2766			n *= -1;
2767			ret = -1;
2768		}
2769		s += n;
2770		len -= n;
2771		while ((w = unparse(p, endp - p, uc)) == 0) {
2772			/* There is not enough output buffer so
2773			 * we have to expand it. */
2774			as->length = p - as->s;
2775			if (archive_string_ensure(as,
2776			    as->buffer_length + len * tm + ts) == NULL)
2777				return (-1);
2778			p = as->s + as->length;
2779			endp = as->s + as->buffer_length - ts;
2780		}
2781		p += w;
2782	}
2783	as->length = p - as->s;
2784	as->s[as->length] = '\0';
2785	if (ts == 2)
2786		as->s[as->length+1] = '\0';
2787	return (ret);
2788}
2789
/*
 * Constants for Hangul composition and decomposition.  The values come
 * from Unicode Standard Annex #15  http://unicode.org/reports/tr15/
 */
#define HC_SBASE	0xAC00	/* First precomposed syllable. */
#define HC_LBASE	0x1100	/* First leading consonant (L). */
#define HC_VBASE	0x1161	/* First vowel (V). */
#define HC_TBASE	0x11A7	/* Base for trailing consonants (T). */
#define HC_LCOUNT	19	/* Number of L code points. */
#define HC_VCOUNT	21	/* Number of V code points. */
#define HC_TCOUNT	28	/* Number of T values, counting "no T". */
#define HC_NCOUNT	(HC_VCOUNT * HC_TCOUNT)	/* Syllables per L. */
#define HC_SCOUNT	(HC_LCOUNT * HC_NCOUNT)	/* All syllables. */
2803
2804static uint32_t
2805get_nfc(uint32_t uc, uint32_t uc2)
2806{
2807	int t, b;
2808
2809	t = 0;
2810	b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1;
2811	while (b >= t) {
2812		int m = (t + b) / 2;
2813		if (u_composition_table[m].cp1 < uc)
2814			t = m + 1;
2815		else if (u_composition_table[m].cp1 > uc)
2816			b = m - 1;
2817		else if (u_composition_table[m].cp2 < uc2)
2818			t = m + 1;
2819		else if (u_composition_table[m].cp2 > uc2)
2820			b = m - 1;
2821		else
2822			return (u_composition_table[m].nfc);
2823	}
2824	return (0);
2825}
2826
/*
 * The maximum number of Following Decomposable Characters.
 * This sizes the fixed arrays in which the normalizers collect the
 * code points that follow a starting code point.
 */
#define FDC_MAX 10
2829
/*
 * Update first code point.
 * Expects `uc` and `ucptr` in the enclosing scope.  Clearing `ucptr`
 * makes WRITE_UC() re-encode the value instead of copying source bytes.
 */
#define UPDATE_UC(new_uc)	do {		\
	uc = new_uc;				\
	ucptr = NULL;				\
} while (0)
2837
/*
 * Replace first code point with second code point, carrying over its
 * source pointer and byte length as well.
 * Expects `uc`, `uc2`, `ucptr`, `uc2ptr`, `n` and `n2` in the enclosing
 * scope.
 */
#define REPLACE_UC_WITH_UC2() do {		\
	uc = uc2;				\
	ucptr = uc2ptr;				\
	n = n2;					\
} while (0)
2846
/*
 * Grow the output buffer of `as`.
 * Records the bytes written so far in as->length, asks for
 * len * tm + ts more bytes than the current buffer size, and then
 * recomputes `p` and `endp` for the new buffer.
 * NOTE: on allocation failure this returns -1 from the enclosing
 * function.  Expects `as`, `p`, `endp`, `len`, `tm` and `ts` in scope.
 */
#define EXPAND_BUFFER() do {			\
	as->length = p - as->s;			\
	if (archive_string_ensure(as,		\
	    as->buffer_length + len * tm + ts) == NULL)\
		return (-1);			\
	p = as->s + as->length;			\
	endp = as->s + as->buffer_length - ts;	\
} while (0)
2855
/*
 * Encode `uc` at `p` with the enclosing function's `unparse` callback
 * and advance `p` past the bytes written.  While the callback returns 0
 * the buffer is expanded and the call retried.
 * Expects `w` and `unparse` in scope; `p` must be an lvalue.
 */
#define UNPARSE(p, endp, uc)	do {		\
	while ((w = unparse(p, (endp) - (p), uc)) == 0) {\
		EXPAND_BUFFER();		\
	}					\
	p += w;					\
} while (0)
2862
/*
 * Write first code point.
 * If the code point has not been changed from its original code
 * (`ucptr` is non-NULL), this just copies its `n` bytes from the
 * original buffer pointer and then clears `ucptr`.
 * If not, this encodes `uc` with the `unparse` callback (see UNPARSE)
 * and writes the result.
 */
#define WRITE_UC()	do {			\
	if (ucptr) {				\
		if (p + n > endp)		\
			EXPAND_BUFFER();	\
		switch (n) {			\
		case 4:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 3:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 2:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 1:				\
			*p++ = *ucptr;		\
			break;			\
		}				\
		ucptr = NULL;			\
	} else {				\
		UNPARSE(p, endp, uc);		\
	}					\
} while (0)
2892
/*
 * Collect following decomposable code points.
 *
 * Starting at slot `start`, parses code points from `s` into ucx[] and
 * records their combining classes in ccx[].  Collection stops at a
 * parse result <= 0, at a code point whose class `cx` does not exceed
 * the previous class `cl` (unless either class is 228), or once FDC_MAX
 * slots are in use.  A stopping code point is not consumed.
 * Sets `ucx_size` to the number of slots in use; when the FDC_MAX limit
 * is reached `ret` is set to -1.
 * Expects `parse`, `s`, `len`, `nx`, `cx`, `cl`, `ucx`, `ccx`,
 * `ucx_size` and `ret` in the enclosing scope.
 */
#define COLLECT_CPS(start)	do {		\
	int _i;					\
	for (_i = start; _i < FDC_MAX ; _i++) {	\
		nx = parse(&ucx[_i], s, len);	\
		if (nx <= 0)			\
			break;			\
		cx = CCC(ucx[_i]);		\
		if (cl >= cx && cl != 228 && cx != 228)\
			break;			\
		s += nx;			\
		len -= nx;			\
		cl = cx;			\
		ccx[_i] = cx;			\
	}					\
	if (_i >= FDC_MAX) {			\
		ret = -1;			\
		ucx_size = FDC_MAX;		\
	} else					\
		ucx_size = _i;			\
} while (0)
2916
/*
 * Normalize UTF-8/UTF-16BE/UTF-16LE characters to Form C and append the
 * result to `as`.
 *
 * The input encoding is chosen by the SCONV_FROM_* bits of sc->flag
 * (UTF-8/CESU-8 when neither UTF-16 bit is set) and the output encoding
 * by the SCONV_TO_* bits; when no SCONV_TO_* bit is set the output
 * keeps the input's encoding.
 *
 * Returns 0 on success.  Returns -1 when an input sequence had to be
 * replaced, when more than FDC_MAX following code points were collected,
 * or when the buffer could not be grown.
 *
 * TODO: Convert composition exclusions, which are never converted
 * from NFC,NFD,NFKC and NFKD, to Form C.
 */
static int
archive_string_normalize_C(struct archive_string *as, const void *_p,
    size_t len, struct archive_string_conv *sc)
{
	const char *s = (const char *)_p;
	char *p, *endp;
	uint32_t uc, uc2;
	size_t w;
	int always_replace, n, n2, ret = 0, spair, ts, tm;
	int (*parse)(uint32_t *, const char *, size_t);
	size_t (*unparse)(char *, size_t, uint32_t);

	/*
	 * always_replace forces every code point to be re-encoded with
	 * unparse().  It is cleared only when the bytes of an unchanged
	 * code point can be copied straight from the input, i.e. when the
	 * output encoding is the same as the input encoding.
	 */
	always_replace = 1;
	ts = 1;/* text size. */
	if (sc->flag & SCONV_TO_UTF16BE) {
		unparse = unicode_to_utf16be;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16BE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF16LE) {
		unparse = unicode_to_utf16le;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16LE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF8) {
		unparse = unicode_to_utf8;
		if (sc->flag & SCONV_FROM_UTF8)
			always_replace = 0;
	} else {
		/*
		 * This case is going to be converted to another
		 * character-set through iconv.
		 */
		always_replace = 0;
		if (sc->flag & SCONV_FROM_UTF16BE) {
			unparse = unicode_to_utf16be;
			ts = 2;
		} else if (sc->flag & SCONV_FROM_UTF16LE) {
			unparse = unicode_to_utf16le;
			ts = 2;
		} else {
			unparse = unicode_to_utf8;
		}
	}

	/*
	 * Select the decoder.  `spair` is the byte count the decoder
	 * reports for a code point that was written as a surrogate pair.
	 */
	if (sc->flag & SCONV_FROM_UTF16BE) {
		parse = utf16be_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else if (sc->flag & SCONV_FROM_UTF16LE) {
		parse = utf16le_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else {
		parse = cesu8_to_unicode;
		tm = ts;
		spair = 6;/* surrogate pair size in UTF-8. */
	}

	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
		return (-1);

	/* endp leaves room for the terminating NUL code unit. */
	p = as->s + as->length;
	endp = as->s + as->buffer_length - ts;
	while ((n = parse(&uc, s, len)) != 0) {
		/* Source positions of uc and uc2; NULL when the value no
		 * longer matches the source bytes. */
		const char *ucptr, *uc2ptr;

		if (n < 0) {
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc);
			s += n*-1;
			len -= n*-1;
			ret = -1;
			continue;
		} else if (n == spair || always_replace)
			/* uc is converted from a surrogate pair.
			 * this should be treated as a changed code. */
			ucptr = NULL;
		else
			ucptr = s;
		s += n;
		len -= n;

		/* Read second code point. */
		while ((n2 = parse(&uc2, s, len)) > 0) {
			uint32_t ucx[FDC_MAX];
			int ccx[FDC_MAX];
			int cl, cx, i, nx, ucx_size;
			int LIndex,SIndex;
			uint32_t nfc;

			if (n2 == spair || always_replace)
				/* uc2 is converted from a surrogate pair.
				 * this should be treated as a changed code. */
				uc2ptr = NULL;
			else
				uc2ptr = s;
			s += n2;
			len -= n2;

			/*
			 * If current second code point is out of decomposable
			 * code points, finding compositions is unneeded.
			 */
			if (!IS_DECOMPOSABLE_BLOCK(uc2)) {
				WRITE_UC();
				REPLACE_UC_WITH_UC2();
				continue;
			}

			/*
			 * Try to combine current code points.
			 */
			/*
			 * We have to combine Hangul characters according to
			 * http://unicode.org/reports/tr15/#Hangul
			 */
			if (0 <= (LIndex = uc - HC_LBASE) &&
			    LIndex < HC_LCOUNT) {
				/*
				 * Hangul Composition.
				 * 1. Two current code points are L and V.
				 */
				int VIndex = uc2 - HC_VBASE;
				if (0 <= VIndex && VIndex < HC_VCOUNT) {
					/* Make syllable of form LV. */
					UPDATE_UC(HC_SBASE +
					    (LIndex * HC_VCOUNT + VIndex) *
					     HC_TCOUNT);
				} else {
					WRITE_UC();
					REPLACE_UC_WITH_UC2();
				}
				continue;
			} else if (0 <= (SIndex = uc - HC_SBASE) &&
			    SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) {
				/*
				 * Hangul Composition.
				 * 2. Two current code points are LV and T.
				 */
				int TIndex = uc2 - HC_TBASE;
				if (0 < TIndex && TIndex < HC_TCOUNT) {
					/* Make syllable of form LVT. */
					UPDATE_UC(uc + TIndex);
				} else {
					WRITE_UC();
					REPLACE_UC_WITH_UC2();
				}
				continue;
			} else if ((nfc = get_nfc(uc, uc2)) != 0) {
				/* A composition to current code points
				 * is found. */
				UPDATE_UC(nfc);
				continue;
			} else if ((cl = CCC(uc2)) == 0) {
				/* Clearly 'uc2' the second code point is not
				 * a decomposable code. */
				WRITE_UC();
				REPLACE_UC_WITH_UC2();
				continue;
			}

			/*
			 * Collect following decomposable code points.
			 * Slot 0 holds uc2, which is already known not to
			 * compose with uc.
			 */
			cx = 0;
			ucx[0] = uc2;
			ccx[0] = cl;
			COLLECT_CPS(1);

			/*
			 * Find a composed code in the collected code points.
			 */
			i = 1;
			while (i < ucx_size) {
				int j;

				if ((nfc = get_nfc(uc, ucx[i])) == 0) {
					i++;
					continue;
				}

				/*
				 * nfc is composed of uc and ucx[i].
				 */
				UPDATE_UC(nfc);

				/*
				 * Remove ucx[i] by shifting
				 * following code points.
				 */
				for (j = i; j+1 < ucx_size; j++) {
					ucx[j] = ucx[j+1];
					ccx[j] = ccx[j+1];
				}
				ucx_size --;

				/*
				 * Collect following code points blocked
				 * by ucx[i] the removed code point.
				 */
				if (ucx_size > 0 && i == ucx_size &&
				    nx > 0 && cx == cl) {
					cl =  ccx[ucx_size-1];
					COLLECT_CPS(ucx_size);
				}
				/*
				 * Restart finding a composed code with
				 * the updated uc from the top of the
				 * collected code points.
				 */
				i = 0;
			}

			/*
			 * Apparently the current code points are not
			 * decomposed characters or already composed.
			 */
			WRITE_UC();
			for (i = 0; i < ucx_size; i++)
				UNPARSE(p, endp, ucx[i]);

			/*
			 * Flush out remaining canonical combining characters.
			 */
			if (nx > 0 && cx == cl && len > 0) {
				while ((nx = parse(&ucx[0], s, len))
				    > 0) {
					cx = CCC(ucx[0]);
					if (cl > cx)
						break;
					s += nx;
					len -= nx;
					cl = cx;
					UNPARSE(p, endp, ucx[0]);
				}
			}
			/* uc and everything collected has been written;
			 * go back to the outer loop for a new first code
			 * point. */
			break;
		}
		if (n2 < 0) {
			WRITE_UC();
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc2);
			s += n2*-1;
			len -= n2*-1;
			ret = -1;
			continue;
		} else if (n2 == 0) {
			/* End of input: flush the pending first code
			 * point. */
			WRITE_UC();
			break;
		}
	}
	/* Terminate with one NUL code unit of the output width. */
	as->length = p - as->s;
	as->s[as->length] = '\0';
	if (ts == 2)
		as->s[as->length+1] = '\0';
	return (ret);
}
3181
3182static int
3183get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc)
3184{
3185	int t, b;
3186
3187	/*
3188	 * These are not converted to NFD on Mac OS.
3189	 */
3190	if ((uc >= 0x2000 && uc <= 0x2FFF) ||
3191	    (uc >= 0xF900 && uc <= 0xFAFF) ||
3192	    (uc >= 0x2F800 && uc <= 0x2FAFF))
3193		return (0);
3194	/*
3195	 * Those code points are not converted to NFD on Mac OS.
3196	 * I do not know the reason because it is undocumented.
3197	 *   NFC        NFD
3198	 *   1109A  ==> 11099 110BA
3199	 *   1109C  ==> 1109B 110BA
3200	 *   110AB  ==> 110A5 110BA
3201	 */
3202	if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB)
3203		return (0);
3204
3205	t = 0;
3206	b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1;
3207	while (b >= t) {
3208		int m = (t + b) / 2;
3209		if (u_decomposition_table[m].nfc < uc)
3210			t = m + 1;
3211		else if (u_decomposition_table[m].nfc > uc)
3212			b = m - 1;
3213		else {
3214			*cp1 = u_decomposition_table[m].cp1;
3215			*cp2 = u_decomposition_table[m].cp2;
3216			return (1);
3217		}
3218	}
3219	return (0);
3220}
3221
/*
 * Replace the first code point with `cp`.
 * Expects `uc` and `ucptr` in the enclosing scope.  Clearing `ucptr`
 * makes WRITE_UC() encode the new value instead of copying source bytes.
 */
#define REPLACE_UC_WITH(cp) do {		\
	uc = cp;				\
	ucptr = NULL;				\
} while (0)
3226
/*
 * Normalize UTF-8/UTF-16BE/UTF-16LE characters to Form D and append the
 * result to `as`.
 *
 * The input encoding is chosen by the SCONV_FROM_* bits of sc->flag
 * (UTF-8/CESU-8 when neither UTF-16 bit is set) and the output encoding
 * by the SCONV_TO_* bits; when no SCONV_TO_* bit is set the output
 * keeps the input's encoding.
 *
 * Returns 0 on success.  Returns -1 when an input sequence had to be
 * replaced or when the buffer could not be grown.
 */
static int
archive_string_normalize_D(struct archive_string *as, const void *_p,
    size_t len, struct archive_string_conv *sc)
{
	const char *s = (const char *)_p;
	char *p, *endp;
	uint32_t uc, uc2;
	size_t w;
	int always_replace, n, n2, ret = 0, spair, ts, tm;
	int (*parse)(uint32_t *, const char *, size_t);
	size_t (*unparse)(char *, size_t, uint32_t);

	/*
	 * always_replace forces every code point to be re-encoded with
	 * unparse().  It is cleared only when the bytes of an unchanged
	 * code point can be copied straight from the input, i.e. when the
	 * output encoding is the same as the input encoding.
	 */
	always_replace = 1;
	ts = 1;/* text size. */
	if (sc->flag & SCONV_TO_UTF16BE) {
		unparse = unicode_to_utf16be;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16BE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF16LE) {
		unparse = unicode_to_utf16le;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16LE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF8) {
		unparse = unicode_to_utf8;
		if (sc->flag & SCONV_FROM_UTF8)
			always_replace = 0;
	} else {
		/*
		 * This case is going to be converted to another
		 * character-set through iconv.
		 */
		always_replace = 0;
		if (sc->flag & SCONV_FROM_UTF16BE) {
			unparse = unicode_to_utf16be;
			ts = 2;
		} else if (sc->flag & SCONV_FROM_UTF16LE) {
			unparse = unicode_to_utf16le;
			ts = 2;
		} else {
			unparse = unicode_to_utf8;
		}
	}

	/*
	 * Select the decoder.  `spair` is the byte count the decoder
	 * reports for a code point that was written as a surrogate pair.
	 */
	if (sc->flag & SCONV_FROM_UTF16BE) {
		parse = utf16be_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else if (sc->flag & SCONV_FROM_UTF16LE) {
		parse = utf16le_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else {
		parse = cesu8_to_unicode;
		tm = ts;
		spair = 6;/* surrogate pair size in UTF-8. */
	}

	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
		return (-1);

	/* endp leaves room for the terminating NUL code unit. */
	p = as->s + as->length;
	endp = as->s + as->buffer_length - ts;
	while ((n = parse(&uc, s, len)) != 0) {
		/* Source position of uc; NULL when the value no longer
		 * matches the source bytes. */
		const char *ucptr;
		uint32_t cp1, cp2;
		int SIndex;
		/* Code points that follow uc in the output, kept together
		 * with their combining classes. */
		struct {
			uint32_t uc;
			int ccc;
		} fdc[FDC_MAX];
		int fdi, fdj;
		int ccc;

		/*
		 * Also entered by goto from the bottom of the loop, with uc
		 * and n describing a code point that has been parsed but
		 * not yet consumed from s.
		 */
check_first_code:
		if (n < 0) {
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc);
			s += n*-1;
			len -= n*-1;
			ret = -1;
			continue;
		} else if (n == spair || always_replace)
			/* uc is converted from a surrogate pair.
			 * this should be treated as a changed code. */
			ucptr = NULL;
		else
			ucptr = s;
		s += n;
		len -= n;

		/* Hangul Decomposition. */
		if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) {
			int L = HC_LBASE + SIndex / HC_NCOUNT;
			int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT;
			int T = HC_TBASE + SIndex % HC_TCOUNT;

			REPLACE_UC_WITH(L);
			WRITE_UC();
			REPLACE_UC_WITH(V);
			WRITE_UC();
			/* T equal to HC_TBASE means there is no trailing
			 * consonant. */
			if (T != HC_TBASE) {
				REPLACE_UC_WITH(T);
				WRITE_UC();
			}
			continue;
		}
		/* A code point with a non-zero combining class is written
		 * as it is. */
		if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) {
			WRITE_UC();
			continue;
		}

		/*
		 * Decompose uc repeatedly.  Each step keeps the first part
		 * in uc and inserts the second part at the front of fdc[].
		 */
		fdi = 0;
		while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) {
			int k;

			for (k = fdi; k > 0; k--)
				fdc[k] = fdc[k-1];
			fdc[0].ccc = CCC(cp2);
			fdc[0].uc = cp2;
			fdi++;
			REPLACE_UC_WITH(cp1);
		}

		/* Read following code points. */
		while ((n2 = parse(&uc2, s, len)) > 0 &&
		    (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) {
			int j, k;

			s += n2;
			len -= n2;
			/* Insert uc2 in front of the first entry with a
			 * larger combining class, or append it. */
			for (j = 0; j < fdi; j++) {
				if (fdc[j].ccc > ccc)
					break;
			}
			if (j < fdi) {
				for (k = fdi; k > j; k--)
					fdc[k] = fdc[k-1];
				fdc[j].ccc = ccc;
				fdc[j].uc = uc2;
			} else {
				fdc[fdi].ccc = ccc;
				fdc[fdi].uc = uc2;
			}
			fdi++;
		}

		WRITE_UC();
		for (fdj = 0; fdj < fdi; fdj++) {
			REPLACE_UC_WITH(fdc[fdj].uc);
			WRITE_UC();
		}

		if (n2 == 0)
			break;
		/* uc2 has been parsed but not consumed; make it the next
		 * first code point. */
		REPLACE_UC_WITH(uc2);
		n = n2;
		goto check_first_code;
	}
	/* Terminate with one NUL code unit of the output width. */
	as->length = p - as->s;
	as->s[as->length] = '\0';
	if (ts == 2)
		as->s[as->length+1] = '\0';
	return (ret);
}
3396
3397/*
3398 * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption
3399 * that WCS is Unicode. It is true for several platforms but some are false.
3400 * And then people who did not use UTF-8 locale on the non Unicode WCS
3401 * platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those
3402 * now cannot get right filename from libarchive 3.x and later since we
3403 * fixed the wrong assumption and it is incompatible to older its versions.
3404 * So we provide special option, "compat-2x.x", for resolving it.
3405 * That option enable the string conversion of libarchive 2.x.
3406 *
3407 * Translates the wrong UTF-8 string made by libarchive 2.x into current
3408 * locale character set and appends to the archive_string.
3409 * Note: returns -1 if conversion fails.
3410 */
3411static int
3412strncat_from_utf8_libarchive2(struct archive_string *as,
3413    const void *_p, size_t len, struct archive_string_conv *sc)
3414{
3415	const char *s;
3416	int n;
3417	char *p;
3418	char *end;
3419	uint32_t unicode;
3420#if HAVE_WCRTOMB
3421	mbstate_t shift_state;
3422
3423	memset(&shift_state, 0, sizeof(shift_state));
3424#else
3425	/* Clear the shift state before starting. */
3426	wctomb(NULL, L'\0');
3427#endif
3428	(void)sc; /* UNUSED */
3429	/*
3430	 * Allocate buffer for MBS.
3431	 * We need this allocation here since it is possible that
3432	 * as->s is still NULL.
3433	 */
3434	if (archive_string_ensure(as, as->length + len + 1) == NULL)
3435		return (-1);
3436
3437	s = (const char *)_p;
3438	p = as->s + as->length;
3439	end = as->s + as->buffer_length - MB_CUR_MAX -1;
3440	while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) {
3441		wchar_t wc;
3442
3443		if (p >= end) {
3444			as->length = p - as->s;
3445			/* Re-allocate buffer for MBS. */
3446			if (archive_string_ensure(as,
3447			    as->length + max(len * 2,
3448			    (size_t)MB_CUR_MAX) + 1) == NULL)
3449				return (-1);
3450			p = as->s + as->length;
3451			end = as->s + as->buffer_length - MB_CUR_MAX -1;
3452		}
3453
3454		/*
3455		 * As libarchive 2.x, translates the UTF-8 characters into
3456		 * wide-characters in the assumption that WCS is Unicode.
3457		 */
3458		if (n < 0) {
3459			n *= -1;
3460			wc = L'?';
3461		} else
3462			wc = (wchar_t)unicode;
3463
3464		s += n;
3465		len -= n;
3466		/*
3467		 * Translates the wide-character into the current locale MBS.
3468		 */
3469#if HAVE_WCRTOMB
3470		n = (int)wcrtomb(p, wc, &shift_state);
3471#else
3472		n = (int)wctomb(p, wc);
3473#endif
3474		if (n == -1)
3475			return (-1);
3476		p += n;
3477	}
3478	as->length = p - as->s;
3479	as->s[as->length] = '\0';
3480	return (0);
3481}
3482
3483
3484/*
3485 * Conversion functions between current locale dependent MBS and UTF-16BE.
3486 *   strncat_from_utf16be() : UTF-16BE --> MBS
3487 *   strncat_to_utf16be()   : MBS --> UTF16BE
3488 */
3489
3490#if defined(_WIN32) && !defined(__CYGWIN__)
3491
3492/*
3493 * Convert a UTF-16BE/LE string to current locale and copy the result.
3494 * Return -1 if conversion fails.
3495 */
3496static int
3497win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes,
3498    struct archive_string_conv *sc, int be)
3499{
3500	struct archive_string tmp;
3501	const char *u16;
3502	int ll;
3503	BOOL defchar;
3504	char *mbs;
3505	size_t mbs_size, b;
3506	int ret = 0;
3507
3508	bytes &= ~1;
3509	if (archive_string_ensure(as, as->length + bytes +1) == NULL)
3510		return (-1);
3511
3512	mbs = as->s + as->length;
3513	mbs_size = as->buffer_length - as->length -1;
3514
3515	if (sc->to_cp == CP_C_LOCALE) {
3516		/*
3517		 * "C" locale special process.
3518		 */
3519		u16 = _p;
3520		ll = 0;
3521		for (b = 0; b < bytes; b += 2) {
3522			uint16_t val;
3523			if (be)
3524				val = archive_be16dec(u16+b);
3525			else
3526				val = archive_le16dec(u16+b);
3527			if (val > 255) {
3528				*mbs++ = '?';
3529				ret = -1;
3530			} else
3531				*mbs++ = (char)(val&0xff);
3532			ll++;
3533		}
3534		as->length += ll;
3535		as->s[as->length] = '\0';
3536		return (ret);
3537	}
3538
3539	archive_string_init(&tmp);
3540	if (be) {
3541		if (is_big_endian()) {
3542			u16 = _p;
3543		} else {
3544			if (archive_string_ensure(&tmp, bytes+2) == NULL)
3545				return (-1);
3546			memcpy(tmp.s, _p, bytes);
3547			for (b = 0; b < bytes; b += 2) {
3548				uint16_t val = archive_be16dec(tmp.s+b);
3549				archive_le16enc(tmp.s+b, val);
3550			}
3551			u16 = tmp.s;
3552		}
3553	} else {
3554		if (!is_big_endian()) {
3555			u16 = _p;
3556		} else {
3557			if (archive_string_ensure(&tmp, bytes+2) == NULL)
3558				return (-1);
3559			memcpy(tmp.s, _p, bytes);
3560			for (b = 0; b < bytes; b += 2) {
3561				uint16_t val = archive_le16dec(tmp.s+b);
3562				archive_be16enc(tmp.s+b, val);
3563			}
3564			u16 = tmp.s;
3565		}
3566	}
3567
3568	do {
3569		defchar = 0;
3570		ll = WideCharToMultiByte(sc->to_cp, 0,
3571		    (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size,
3572			NULL, &defchar);
3573		/* Exit loop if we succeeded */
3574		if (ll != 0 ||
3575		    GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
3576			break;
3577		}
3578		/* Else expand buffer and loop to try again. */
3579		ll = WideCharToMultiByte(sc->to_cp, 0,
3580		    (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL);
3581		if (archive_string_ensure(as, ll +1) == NULL)
3582			return (-1);
3583		mbs = as->s + as->length;
3584		mbs_size = as->buffer_length - as->length -1;
3585	} while (1);
3586	archive_string_free(&tmp);
3587	as->length += ll;
3588	as->s[as->length] = '\0';
3589	if (ll == 0 || defchar)
3590		ret = -1;
3591	return (ret);
3592}
3593
/* Append the UTF-16BE string `_p' (`bytes' bytes long) to `as'. */
static int
win_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 1;

	return (win_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3600
/* Append the UTF-16LE string `_p' (`bytes' bytes long) to `as'. */
static int
win_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 0;

	return (win_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3607
/*
 * Return 1 when the in-memory bytes of the 16-bit value 1 decode to 1
 * as a big-endian number (i.e. the host is big-endian), 0 otherwise.
 */
static int
is_big_endian(void)
{
	uint16_t probe = 1;

	if (archive_be16dec(&probe) == 1)
		return (1);
	return (0);
}
3615
3616/*
3617 * Convert a current locale string to UTF-16BE/LE and copy the result.
3618 * Return -1 if conversion fails.
3619 */
3620static int
3621win_strncat_to_utf16(struct archive_string *as16, const void *_p,
3622    size_t length, struct archive_string_conv *sc, int bigendian)
3623{
3624	const char *s = (const char *)_p;
3625	char *u16;
3626	size_t count, avail;
3627
3628	if (archive_string_ensure(as16,
3629	    as16->length + (length + 1) * 2) == NULL)
3630		return (-1);
3631
3632	u16 = as16->s + as16->length;
3633	avail = as16->buffer_length - 2;
3634	if (sc->from_cp == CP_C_LOCALE) {
3635		/*
3636		 * "C" locale special process.
3637		 */
3638		count = 0;
3639		while (count < length && *s) {
3640			if (bigendian)
3641				archive_be16enc(u16, *s);
3642			else
3643				archive_le16enc(u16, *s);
3644			u16 += 2;
3645			s++;
3646			count++;
3647		}
3648		as16->length += count << 1;
3649		as16->s[as16->length] = 0;
3650		as16->s[as16->length+1] = 0;
3651		return (0);
3652	}
3653	do {
3654		count = MultiByteToWideChar(sc->from_cp,
3655		    MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1);
3656		/* Exit loop if we succeeded */
3657		if (count != 0 ||
3658		    GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
3659			break;
3660		}
3661		/* Expand buffer and try again */
3662		count = MultiByteToWideChar(sc->from_cp,
3663		    MB_PRECOMPOSED, s, (int)length, NULL, 0);
3664		if (archive_string_ensure(as16, (count +1) * 2)
3665		    == NULL)
3666			return (-1);
3667		u16 = as16->s + as16->length;
3668		avail = as16->buffer_length - 2;
3669	} while (1);
3670	as16->length += count * 2;
3671	as16->s[as16->length] = 0;
3672	as16->s[as16->length+1] = 0;
3673	if (count == 0)
3674		return (-1);
3675
3676	if (is_big_endian()) {
3677		if (!bigendian) {
3678			while (count > 0) {
3679				uint16_t v = archive_be16dec(u16);
3680				archive_le16enc(u16, v);
3681				u16 += 2;
3682				count--;
3683			}
3684		}
3685	} else {
3686		if (bigendian) {
3687			while (count > 0) {
3688				uint16_t v = archive_le16dec(u16);
3689				archive_be16enc(u16, v);
3690				u16 += 2;
3691				count--;
3692			}
3693		}
3694	}
3695	return (0);
3696}
3697
/* Append `_p' (`length' bytes) to `as16', converted to UTF-16BE. */
static int
win_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int big_endian = 1;

	return (win_strncat_to_utf16(as16, _p, length, sc, big_endian));
}
3704
/* Append `_p' (`length' bytes) to `as16', converted to UTF-16LE. */
static int
win_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int big_endian = 0;

	return (win_strncat_to_utf16(as16, _p, length, sc, big_endian));
}
3711
3712#endif /* _WIN32 && !__CYGWIN__ */
3713
/*
 * Make a best effort at the conversion.
 * We cannot handle the UTF-16BE character set without iconv,
 * but the conversion may still work if the string consists only of
 * ASCII characters or if the current locale is UTF-8.
 */
3720
3721/*
3722 * Convert a UTF-16BE string to current locale and copy the result.
3723 * Return -1 if conversion fails.
3724 */
3725static int
3726best_effort_strncat_from_utf16(struct archive_string *as, const void *_p,
3727    size_t bytes, struct archive_string_conv *sc, int be)
3728{
3729	const char *utf16 = (const char *)_p;
3730	char *mbs;
3731	uint32_t uc;
3732	int n, ret;
3733
3734	(void)sc; /* UNUSED */
3735	/*
3736	 * Other case, we should do the best effort.
3737	 * If all character are ASCII(<0x7f), we can convert it.
3738	 * if not , we set a alternative character and return -1.
3739	 */
3740	ret = 0;
3741	if (archive_string_ensure(as, as->length + bytes +1) == NULL)
3742		return (-1);
3743	mbs = as->s + as->length;
3744
3745	while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) {
3746		if (n < 0) {
3747			n *= -1;
3748			ret =  -1;
3749		}
3750		bytes -= n;
3751		utf16 += n;
3752
3753		if (uc > 127) {
3754			/* We cannot handle it. */
3755			*mbs++ = '?';
3756			ret =  -1;
3757		} else
3758			*mbs++ = (char)uc;
3759	}
3760	as->length = mbs - as->s;
3761	as->s[as->length] = '\0';
3762	return (ret);
3763}
3764
/* Best-effort append of the UTF-16BE string `_p' (`bytes' bytes) to `as'. */
static int
best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 1;

	return (best_effort_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3771
/* Best-effort append of the UTF-16LE string `_p' (`bytes' bytes) to `as'. */
static int
best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 0;

	return (best_effort_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3778
3779/*
3780 * Convert a current locale string to UTF-16BE/LE and copy the result.
3781 * Return -1 if conversion fails.
3782 */
3783static int
3784best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p,
3785    size_t length, struct archive_string_conv *sc, int bigendian)
3786{
3787	const char *s = (const char *)_p;
3788	char *utf16;
3789	size_t remaining;
3790	int ret;
3791
3792	(void)sc; /* UNUSED */
3793	/*
3794	 * Other case, we should do the best effort.
3795	 * If all character are ASCII(<0x7f), we can convert it.
3796	 * if not , we set a alternative character and return -1.
3797	 */
3798	ret = 0;
3799	remaining = length;
3800
3801	if (archive_string_ensure(as16,
3802	    as16->length + (length + 1) * 2) == NULL)
3803		return (-1);
3804
3805	utf16 = as16->s + as16->length;
3806	while (remaining--) {
3807		unsigned c = *s++;
3808		if (c > 127) {
3809			/* We cannot handle it. */
3810			c = UNICODE_R_CHAR;
3811			ret = -1;
3812		}
3813		if (bigendian)
3814			archive_be16enc(utf16, c);
3815		else
3816			archive_le16enc(utf16, c);
3817		utf16 += 2;
3818	}
3819	as16->length = utf16 - as16->s;
3820	as16->s[as16->length] = 0;
3821	as16->s[as16->length+1] = 0;
3822	return (ret);
3823}
3824
/* Best-effort append of `_p' (`length' bytes) to `as16' as UTF-16BE. */
static int
best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int big_endian = 1;

	return (best_effort_strncat_to_utf16(as16, _p, length, sc, big_endian));
}
3831
/* Best-effort append of `_p' (`length' bytes) to `as16' as UTF-16LE. */
static int
best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int big_endian = 0;

	return (best_effort_strncat_to_utf16(as16, _p, length, sc, big_endian));
}
3838
3839
3840/*
3841 * Multistring operations.
3842 */
3843
3844void
3845archive_mstring_clean(struct archive_mstring *aes)
3846{
3847	archive_wstring_free(&(aes->aes_wcs));
3848	archive_string_free(&(aes->aes_mbs));
3849	archive_string_free(&(aes->aes_utf8));
3850	archive_string_free(&(aes->aes_mbs_in_locale));
3851	aes->aes_set = 0;
3852}
3853
3854void
3855archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src)
3856{
3857	dest->aes_set = src->aes_set;
3858	archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs));
3859	archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8));
3860	archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs));
3861}
3862
3863int
3864archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,
3865  const char **p)
3866{
3867	struct archive_string_conv *sc;
3868	int r;
3869
3870	/* If we already have a UTF8 form, return that immediately. */
3871	if (aes->aes_set & AES_SET_UTF8) {
3872		*p = aes->aes_utf8.s;
3873		return (0);
3874	}
3875
3876	*p = NULL;
3877	/* Try converting WCS to MBS first if MBS does not exist yet. */
3878	if ((aes->aes_set & AES_SET_MBS) == 0) {
3879		const char *pm; /* unused */
3880		archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */
3881	}
3882	if (aes->aes_set & AES_SET_MBS) {
3883		sc = archive_string_conversion_to_charset(a, "UTF-8", 1);
3884		if (sc == NULL)
3885			return (-1);/* Couldn't allocate memory for sc. */
3886		r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s,
3887		    aes->aes_mbs.length, sc);
3888		if (a == NULL)
3889			free_sconv_object(sc);
3890		if (r == 0) {
3891			aes->aes_set |= AES_SET_UTF8;
3892			*p = aes->aes_utf8.s;
3893			return (0);/* success. */
3894		} else
3895			return (-1);/* failure. */
3896	}
3897	return (0);/* success. */
3898}
3899
3900int
3901archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes,
3902    const char **p)
3903{
3904	struct archive_string_conv *sc;
3905	int r, ret = 0;
3906
3907	/* If we already have an MBS form, return that immediately. */
3908	if (aes->aes_set & AES_SET_MBS) {
3909		*p = aes->aes_mbs.s;
3910		return (ret);
3911	}
3912
3913	*p = NULL;
3914	/* If there's a WCS form, try converting with the native locale. */
3915	if (aes->aes_set & AES_SET_WCS) {
3916		archive_string_empty(&(aes->aes_mbs));
3917		r = archive_string_append_from_wcs(&(aes->aes_mbs),
3918		    aes->aes_wcs.s, aes->aes_wcs.length);
3919		*p = aes->aes_mbs.s;
3920		if (r == 0) {
3921			aes->aes_set |= AES_SET_MBS;
3922			return (ret);
3923		} else
3924			ret = -1;
3925	}
3926
3927	/* If there's a UTF-8 form, try converting with the native locale. */
3928	if (aes->aes_set & AES_SET_UTF8) {
3929		archive_string_empty(&(aes->aes_mbs));
3930		sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
3931		if (sc == NULL)
3932			return (-1);/* Couldn't allocate memory for sc. */
3933		r = archive_strncpy_l(&(aes->aes_mbs),
3934			aes->aes_utf8.s, aes->aes_utf8.length, sc);
3935		if (a == NULL)
3936			free_sconv_object(sc);
3937		*p = aes->aes_mbs.s;
3938		if (r == 0) {
3939			aes->aes_set |= AES_SET_MBS;
3940			ret = 0;/* success; overwrite previous error. */
3941		} else
3942			ret = -1;/* failure. */
3943	}
3944	return (ret);
3945}
3946
3947int
3948archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,
3949    const wchar_t **wp)
3950{
3951	int r, ret = 0;
3952
3953	(void)a;/* UNUSED */
3954	/* Return WCS form if we already have it. */
3955	if (aes->aes_set & AES_SET_WCS) {
3956		*wp = aes->aes_wcs.s;
3957		return (ret);
3958	}
3959
3960	*wp = NULL;
3961	/* Try converting UTF8 to MBS first if MBS does not exist yet. */
3962	if ((aes->aes_set & AES_SET_MBS) == 0) {
3963		const char *p; /* unused */
3964		archive_mstring_get_mbs(a, aes, &p); /* ignore errors, we'll handle it later */
3965	}
3966	/* Try converting MBS to WCS using native locale. */
3967	if (aes->aes_set & AES_SET_MBS) {
3968		archive_wstring_empty(&(aes->aes_wcs));
3969		r = archive_wstring_append_from_mbs(&(aes->aes_wcs),
3970		    aes->aes_mbs.s, aes->aes_mbs.length);
3971		if (r == 0) {
3972			aes->aes_set |= AES_SET_WCS;
3973			*wp = aes->aes_wcs.s;
3974		} else
3975			ret = -1;/* failure. */
3976	}
3977	return (ret);
3978}
3979
3980int
3981archive_mstring_get_mbs_l(struct archive *a, struct archive_mstring *aes,
3982    const char **p, size_t *length, struct archive_string_conv *sc)
3983{
3984	int ret = 0;
3985#if defined(_WIN32) && !defined(__CYGWIN__)
3986	int r;
3987
3988	/*
3989	 * Internationalization programming on Windows must use Wide
3990	 * characters because Windows platform cannot make locale UTF-8.
3991	 */
3992	if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) {
3993		archive_string_empty(&(aes->aes_mbs_in_locale));
3994		r = archive_string_append_from_wcs_in_codepage(
3995		    &(aes->aes_mbs_in_locale), aes->aes_wcs.s,
3996		    aes->aes_wcs.length, sc);
3997		if (r == 0) {
3998			*p = aes->aes_mbs_in_locale.s;
3999			if (length != NULL)
4000				*length = aes->aes_mbs_in_locale.length;
4001			return (0);
4002		} else if (errno == ENOMEM)
4003			return (-1);
4004		else
4005			ret = -1;
4006	}
4007#endif
4008
4009	/* If there is not an MBS form but there is a WCS or UTF8 form, try converting
4010	 * with the native locale to be used for translating it to specified
4011	 * character-set. */
4012	if ((aes->aes_set & AES_SET_MBS) == 0) {
4013		const char *pm; /* unused */
4014		archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */
4015	}
4016	/* If we already have an MBS form, use it to be translated to
4017	 * specified character-set. */
4018	if (aes->aes_set & AES_SET_MBS) {
4019		if (sc == NULL) {
4020			/* Conversion is unneeded. */
4021			*p = aes->aes_mbs.s;
4022			if (length != NULL)
4023				*length = aes->aes_mbs.length;
4024			return (0);
4025		}
4026		ret = archive_strncpy_l(&(aes->aes_mbs_in_locale),
4027		    aes->aes_mbs.s, aes->aes_mbs.length, sc);
4028		*p = aes->aes_mbs_in_locale.s;
4029		if (length != NULL)
4030			*length = aes->aes_mbs_in_locale.length;
4031	} else {
4032		*p = NULL;
4033		if (length != NULL)
4034			*length = 0;
4035	}
4036	return (ret);
4037}
4038
4039int
4040archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs)
4041{
4042	if (mbs == NULL) {
4043		aes->aes_set = 0;
4044		return (0);
4045	}
4046	return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs)));
4047}
4048
4049int
4050archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs,
4051    size_t len)
4052{
4053	if (mbs == NULL) {
4054		aes->aes_set = 0;
4055		return (0);
4056	}
4057	aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
4058	archive_strncpy(&(aes->aes_mbs), mbs, len);
4059	archive_string_empty(&(aes->aes_utf8));
4060	archive_wstring_empty(&(aes->aes_wcs));
4061	return (0);
4062}
4063
/*
 * Store the NUL-terminated wide string `wcs' in `aes'.  A NULL `wcs'
 * is passed on with a length of 0.
 */
int
archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs)
{
	size_t len = 0;

	if (wcs != NULL)
		len = wcslen(wcs);
	return (archive_mstring_copy_wcs_len(aes, wcs, len));
}
4070
4071int
4072archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8)
4073{
4074  if (utf8 == NULL) {
4075    aes->aes_set = 0;
4076    return (0);
4077  }
4078  aes->aes_set = AES_SET_UTF8;
4079  archive_string_empty(&(aes->aes_mbs));
4080  archive_string_empty(&(aes->aes_wcs));
4081  archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8));
4082  return (int)strlen(utf8);
4083}
4084
4085int
4086archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs,
4087    size_t len)
4088{
4089	if (wcs == NULL) {
4090		aes->aes_set = 0;
4091		return (0);
4092	}
4093	aes->aes_set = AES_SET_WCS; /* Only WCS form set. */
4094	archive_string_empty(&(aes->aes_mbs));
4095	archive_string_empty(&(aes->aes_utf8));
4096	archive_wstrncpy(&(aes->aes_wcs), wcs, len);
4097	return (0);
4098}
4099
/*
 * Replace the contents of `aes' with the string `mbs' of `len' bytes,
 * handled through the conversion object `sc'.
 *
 * A NULL `mbs' marks the multistring as unset and returns 0.
 * Otherwise the MBS, WCS and UTF-8 forms are emptied first and
 * aes_set is updated to tell which form(s) ended up valid.
 *
 * Returns the result of the conversion that was used: 0 on success,
 * non-zero on failure.
 */
int
archive_mstring_copy_mbs_len_l(struct archive_mstring *aes,
    const char *mbs, size_t len, struct archive_string_conv *sc)
{
	int r;

	if (mbs == NULL) {
		aes->aes_set = 0;
		return (0);
	}
	/* Drop every cached form before storing the new value. */
	archive_string_empty(&(aes->aes_mbs));
	archive_wstring_empty(&(aes->aes_wcs));
	archive_string_empty(&(aes->aes_utf8));
#if defined(_WIN32) && !defined(__CYGWIN__)
	/*
	 * Internationalization programming on Windows must use Wide
	 * characters because Windows platform cannot make locale UTF-8.
	 */
	if (sc == NULL) {
		/* No conversion object: keep the bytes as the MBS form. */
		if (archive_string_append(&(aes->aes_mbs),
			mbs, mbsnbytes(mbs, len)) == NULL) {
			aes->aes_set = 0;
			r = -1;
		} else {
			aes->aes_set = AES_SET_MBS;
			r = 0;
		}
#if defined(HAVE_ICONV)
	} else if (sc != NULL && sc->cd_w != (iconv_t)-1) {
		/*
		 * This case happens only when MultiByteToWideChar() cannot
		 * handle sc->from_cp, and we have to iconv in order to
		 * translate character-set to wchar_t,UTF-16.
		 */
		iconv_t cd = sc->cd;
		unsigned from_cp;
		int flag;

		/*
		 * Translate multi-bytes from some character-set to UTF-8.
		 * sc->cd is swapped for sc->cd_w only for this call and
		 * restored right after it.
		 */
		sc->cd = sc->cd_w;
		r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc);
		sc->cd = cd;
		if (r != 0) {
			aes->aes_set = 0;
			return (r);
		}
		aes->aes_set = AES_SET_UTF8;

		/*
		 * Append the UTF-8 string into wstring.
		 * sc->flag and sc->from_cp are overridden only for this
		 * call and restored right after it.
		 */
		flag = sc->flag;
		sc->flag &= ~(SCONV_NORMALIZATION_C
				| SCONV_TO_UTF16| SCONV_FROM_UTF16);
		from_cp = sc->from_cp;
		sc->from_cp = CP_UTF8;
		r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
			aes->aes_utf8.s, aes->aes_utf8.length, sc);
		sc->flag = flag;
		sc->from_cp = from_cp;
		/* On failure only the UTF-8 form stays marked as set. */
		if (r == 0)
			aes->aes_set |= AES_SET_WCS;
#endif
	} else {
		/* Convert straight to the WCS form using sc's code page. */
		r = archive_wstring_append_from_mbs_in_codepage(
		    &(aes->aes_wcs), mbs, len, sc);
		if (r == 0)
			aes->aes_set = AES_SET_WCS;
		else
			aes->aes_set = 0;
	}
#else
	/* Convert with sc and keep the result as the MBS form. */
	r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc);
	if (r == 0)
		aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
	else
		aes->aes_set = 0;
#endif
	return (r);
}
4182
4183/*
4184 * The 'update' form tries to proactively update all forms of
4185 * this string (WCS and MBS) and returns an error if any of
4186 * them fail.  This is used by the 'pax' handler, for instance,
4187 * to detect and report character-conversion failures early while
4188 * still allowing clients to get potentially useful values from
4189 * the more tolerant lazy conversions.  (get_mbs and get_wcs will
4190 * strive to give the user something useful, so you can get hopefully
4191 * usable values even if some of the character conversions are failing.)
4192 */
4193int
4194archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
4195    const char *utf8)
4196{
4197	struct archive_string_conv *sc;
4198	int r;
4199
4200	if (utf8 == NULL) {
4201		aes->aes_set = 0;
4202		return (0); /* Succeeded in clearing everything. */
4203	}
4204
4205	/* Save the UTF8 string. */
4206	archive_strcpy(&(aes->aes_utf8), utf8);
4207
4208	/* Empty the mbs and wcs strings. */
4209	archive_string_empty(&(aes->aes_mbs));
4210	archive_wstring_empty(&(aes->aes_wcs));
4211
4212	aes->aes_set = AES_SET_UTF8;	/* Only UTF8 is set now. */
4213
4214	/* Try converting UTF-8 to MBS, return false on failure. */
4215	sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
4216	if (sc == NULL)
4217		return (-1);/* Couldn't allocate memory for sc. */
4218	r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
4219
4220#if defined(_WIN32) && !defined(__CYGWIN__)
4221	/* On failure, make an effort to convert UTF8 to WCS as the active code page
4222	 * may not be able to represent all characters in the string */
4223	if (r != 0) {
4224		if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
4225			aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0)
4226			aes->aes_set = AES_SET_UTF8 | AES_SET_WCS;
4227	}
4228#endif
4229
4230	if (a == NULL)
4231		free_sconv_object(sc);
4232	if (r != 0)
4233		return (-1);
4234	aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */
4235
4236	/* Try converting MBS to WCS, return false on failure. */
4237	if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,
4238	    aes->aes_mbs.length))
4239		return (-1);
4240	aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
4241
4242	/* All conversions succeeded. */
4243	return (0);
4244}
4245