1/*-
2 * Copyright (c) 2011 Michihiro NAKAJIMA
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25#include "test.h"
26
27#include <locale.h>
28
29DEFINE_TEST(test_zip_filename_encoding_UTF8)
30{
31  	struct archive *a;
32  	struct archive_entry *entry;
33	char buff[4096];
34	size_t used;
35
36	if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
37		skipping("en_US.UTF-8 locale not available on this system.");
38		return;
39	}
40
41	/*
42	 * Verify that UTF-8 filenames are correctly stored with
43	 * hdrcharset=UTF-8 option.
44	 */
45	a = archive_write_new();
46	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
47	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
48		skipping("This system cannot convert character-set"
49		    " for UTF-8.");
50		archive_write_free(a);
51		return;
52	}
53	assertEqualInt(ARCHIVE_OK,
54	    archive_write_open_memory(a, buff, sizeof(buff), &used));
55
56	entry = archive_entry_new2(a);
57	/* Set a UTF-8 filename. */
58	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
59	archive_entry_set_filetype(entry, AE_IFREG);
60	archive_entry_set_size(entry, 0);
61	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
62	archive_entry_free(entry);
63	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
64
65	/* A bit 11 of general purpose flag should be 0x08,
66	 * which indicates the filename charset is UTF-8. */
67	assertEqualInt(0x08, buff[7]);
68	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
69
70	/*
71	 * Verify that UTF-8 filenames are correctly stored without
72	 * hdrcharset=UTF-8 option.
73	 * Skip on Windows where we default to OEMCP
74	 */
75#if !defined(_WIN32) || defined(__CYGWIN__)
76	a = archive_write_new();
77	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
78	assertEqualInt(ARCHIVE_OK,
79	    archive_write_open_memory(a, buff, sizeof(buff), &used));
80
81	entry = archive_entry_new2(a);
82	/* Set a UTF-8 filename. */
83	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
84	archive_entry_set_filetype(entry, AE_IFREG);
85	archive_entry_set_size(entry, 0);
86	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
87	archive_entry_free(entry);
88	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
89
90	/* A bit 11 of general purpose flag should be 0x08,
91	 * which indicates the filename charset is UTF-8. */
92	assertEqualInt(0x08, buff[7]);
93	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
94#endif
95
96	/*
97	 * Verify that A bit 11 of general purpose flag is not set
98	 * when ASCII filenames are stored.
99	 */
100	a = archive_write_new();
101	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
102	assertEqualInt(ARCHIVE_OK,
103	    archive_write_open_memory(a, buff, sizeof(buff), &used));
104
105	entry = archive_entry_new2(a);
106	/* Set an ASCII filename. */
107	archive_entry_set_pathname(entry, "abcABC");
108	archive_entry_set_filetype(entry, AE_IFREG);
109	archive_entry_set_size(entry, 0);
110	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
111	archive_entry_free(entry);
112	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
113
114	/* A bit 11 of general purpose flag should be 0,
115	 * which indicates the filename charset is unknown. */
116	assertEqualInt(0, buff[7]);
117	assertEqualMem(buff + 30, "abcABC", 6);
118}
119
120DEFINE_TEST(test_zip_filename_encoding_KOI8R)
121{
122  	struct archive *a;
123  	struct archive_entry *entry;
124	char buff[4096];
125	size_t used;
126
127	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
128		skipping("KOI8-R locale not available on this system.");
129		return;
130	}
131
132	/*
133	 * Verify that KOI8-R filenames are correctly translated to UTF-8.
134	 */
135	a = archive_write_new();
136	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
137	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
138		skipping("This system cannot convert character-set"
139		    " from KOI8-R to UTF-8.");
140		archive_write_free(a);
141		return;
142	}
143	assertEqualInt(ARCHIVE_OK,
144	    archive_write_open_memory(a, buff, sizeof(buff), &used));
145
146	entry = archive_entry_new2(a);
147	/* Set a KOI8-R filename. */
148	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
149	archive_entry_set_filetype(entry, AE_IFREG);
150	archive_entry_set_size(entry, 0);
151	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
152	archive_entry_free(entry);
153	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
154
155	/* A bit 11 of general purpose flag should be 0x08,
156	 * which indicates the filename charset is UTF-8. */
157	assertEqualInt(0x08, buff[7]);
158	/* Above three characters in KOI8-R should translate to the following
159	 * three characters (two bytes each) in UTF-8. */
160	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
161
162	/*
163	 * Verify that KOI8-R filenames are not translated to UTF-8.
164	 */
165	a = archive_write_new();
166	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
167	assertEqualInt(ARCHIVE_OK,
168	    archive_write_open_memory(a, buff, sizeof(buff), &used));
169
170	entry = archive_entry_new2(a);
171	/* Set a KOI8-R filename. */
172	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
173	archive_entry_set_filetype(entry, AE_IFREG);
174	archive_entry_set_size(entry, 0);
175	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
176	archive_entry_free(entry);
177	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
178
179	/* A bit 11 of general purpose flag should be 0,
180	 * which indicates the filename charset is unknown. */
181	assertEqualInt(0, buff[7]);
182	/* Above three characters in KOI8-R should not translate to
183	 * any character-set. */
184	assertEqualMem(buff + 30, "\xD0\xD2\xC9", 3);
185
186	/*
187	 * Verify that A bit 11 of general purpose flag is not set
188	 * when ASCII filenames are stored even if hdrcharset=UTF-8
189	 * is specified.
190	 */
191	a = archive_write_new();
192	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
193	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
194		skipping("This system cannot convert character-set"
195		    " from KOI8-R to UTF-8.");
196		archive_write_free(a);
197		return;
198	}
199	assertEqualInt(ARCHIVE_OK,
200	    archive_write_open_memory(a, buff, sizeof(buff), &used));
201
202	entry = archive_entry_new2(a);
203	/* Set an ASCII filename. */
204	archive_entry_set_pathname(entry, "abcABC");
205	archive_entry_set_filetype(entry, AE_IFREG);
206	archive_entry_set_size(entry, 0);
207	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
208	archive_entry_free(entry);
209	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
210
211	/* A bit 11 of general purpose flag should be 0,
212	 * which indicates the filename charset is unknown. */
213	assertEqualInt(0, buff[7]);
214	assertEqualMem(buff + 30, "abcABC", 6);
215}
216
/*
 * Do not translate CP1251 into CP866 on non-Windows platforms.
 */
220DEFINE_TEST(test_zip_filename_encoding_ru_RU_CP1251)
221{
222  	struct archive *a;
223  	struct archive_entry *entry;
224	char buff[4096];
225	size_t used;
226
227	if (NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
228		skipping("Russian_Russia locale not available on this system.");
229		return;
230	}
231
232	/*
233	 * Verify that CP1251 filenames are not translated into any
234	 * other character-set, in particular, CP866.
235	 */
236	a = archive_write_new();
237	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
238	assertEqualInt(ARCHIVE_OK,
239	    archive_write_open_memory(a, buff, sizeof(buff), &used));
240
241	entry = archive_entry_new2(a);
242	/* Set a CP1251 filename. */
243	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
244	archive_entry_set_filetype(entry, AE_IFREG);
245	archive_entry_set_size(entry, 0);
246	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
247	archive_entry_free(entry);
248	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
249
250	/* A bit 11 of general purpose flag should be 0,
251	 * which indicates the filename charset is unknown. */
252	assertEqualInt(0, buff[7]);
253	/* Above three characters in CP1251 should not translate into
254	 * any other character-set. */
255	assertEqualMem(buff + 30, "\xEF\xF0\xE8", 3);
256}
257
/*
 * Other archiver applications on Windows translate CP1251 filenames
 * into CP866 filenames and store them in the zip file.
 * Verify that the same translation happens here.
 */
263DEFINE_TEST(test_zip_filename_encoding_Russian_Russia)
264{
265  	struct archive *a;
266  	struct archive_entry *entry;
267	char buff[4096];
268	size_t used;
269
270	if (NULL == setlocale(LC_ALL, "Russian_Russia")) {
271		skipping("Russian_Russia locale not available on this system.");
272		return;
273	}
274
275	/*
276	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
277	 * to UTF-8.
278	 */
279	a = archive_write_new();
280	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
281	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
282		skipping("This system cannot convert character-set"
283		    " from Russian_Russia.CP1251 to UTF-8.");
284		archive_write_free(a);
285		return;
286	}
287	assertEqualInt(ARCHIVE_OK,
288	    archive_write_open_memory(a, buff, sizeof(buff), &used));
289
290	entry = archive_entry_new2(a);
291	/* Set a CP1251 filename. */
292	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
293	archive_entry_set_filetype(entry, AE_IFREG);
294	archive_entry_set_size(entry, 0);
295	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
296	archive_entry_free(entry);
297	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
298
299	/* A bit 11 of general purpose flag should be 0x08,
300	 * which indicates the filename charset is UTF-8. */
301	assertEqualInt(0x08, buff[7]);
302	/* Above three characters in CP1251 should translate to the following
303	 * three characters (two bytes each) in UTF-8. */
304	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
305
306	/*
307	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
308	 * to CP866.
309	 */
310	a = archive_write_new();
311	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
312	assertEqualInt(ARCHIVE_OK,
313	    archive_write_open_memory(a, buff, sizeof(buff), &used));
314
315	entry = archive_entry_new2(a);
316	/* Set a CP1251 filename. */
317	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
318	archive_entry_set_filetype(entry, AE_IFREG);
319	archive_entry_set_size(entry, 0);
320	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
321	archive_entry_free(entry);
322	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
323
324	/* A bit 11 of general purpose flag should be 0,
325	 * which indicates the filename charset is unknown. */
326	assertEqualInt(0, buff[7]);
327	/* Above three characters in CP1251 should translate to the following
328	 * three characters in CP866. */
329	assertEqualMem(buff + 30, "\xAF\xE0\xA8", 3);
330}
331
332DEFINE_TEST(test_zip_filename_encoding_EUCJP)
333{
334  	struct archive *a;
335  	struct archive_entry *entry;
336	char buff[4096];
337	size_t used;
338
339	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
340		skipping("eucJP locale not available on this system.");
341		return;
342	}
343
344	/*
345	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
346	 */
347	a = archive_write_new();
348	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
349	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
350		skipping("This system cannot convert character-set"
351		    " from eucJP to UTF-8.");
352		archive_write_free(a);
353		return;
354	}
355	assertEqualInt(ARCHIVE_OK,
356	    archive_write_open_memory(a, buff, sizeof(buff), &used));
357
358	entry = archive_entry_new2(a);
359	/* Set an EUC-JP filename. */
360	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
361	/* Check the Unicode version. */
362	archive_entry_set_filetype(entry, AE_IFREG);
363	archive_entry_set_size(entry, 0);
364	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
365	archive_entry_free(entry);
366	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
367
368	/* A bit 11 of general purpose flag should be 0x08,
369	 * which indicates the filename charset is UTF-8. */
370	assertEqualInt(0x08, buff[7]);
371	/* Check UTF-8 version. */
372	assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
373
374	/*
375	 * Verify that EUC-JP filenames are not translated to UTF-8.
376	 */
377	a = archive_write_new();
378	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
379	assertEqualInt(ARCHIVE_OK,
380	    archive_write_open_memory(a, buff, sizeof(buff), &used));
381
382	entry = archive_entry_new2(a);
383	/* Set an EUC-JP filename. */
384	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
385	/* Check the Unicode version. */
386	archive_entry_set_filetype(entry, AE_IFREG);
387	archive_entry_set_size(entry, 0);
388	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
389	archive_entry_free(entry);
390	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
391
392	/* A bit 11 of general purpose flag should be 0,
393	 * which indicates the filename charset is unknown. */
394	assertEqualInt(0, buff[7]);
395	/* Above three characters in EUC-JP should not translate to
396	 * any character-set. */
397	assertEqualMem(buff + 30, "\xC9\xBD.txt", 6);
398
399	/*
400	 * Verify that A bit 11 of general purpose flag is not set
401	 * when ASCII filenames are stored even if hdrcharset=UTF-8
402	 * is specified.
403	 */
404	a = archive_write_new();
405	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
406	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
407		skipping("This system cannot convert character-set"
408		    " from eucJP to UTF-8.");
409		archive_write_free(a);
410		return;
411	}
412	assertEqualInt(ARCHIVE_OK,
413	    archive_write_open_memory(a, buff, sizeof(buff), &used));
414
415	entry = archive_entry_new2(a);
416	/* Set an ASCII filename. */
417	archive_entry_set_pathname(entry, "abcABC");
418	/* Check the Unicode version. */
419	archive_entry_set_filetype(entry, AE_IFREG);
420	archive_entry_set_size(entry, 0);
421	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
422	archive_entry_free(entry);
423	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
424
425	/* A bit 11 of general purpose flag should be 0,
426	 * which indicates the filename charset is unknown. */
427	assertEqualInt(0, buff[7]);
428	assertEqualMem(buff + 30, "abcABC", 6);
429}
430
431DEFINE_TEST(test_zip_filename_encoding_CP932)
432{
433  	struct archive *a;
434  	struct archive_entry *entry;
435	char buff[4096];
436	size_t used;
437
438	if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
439	    NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
440		skipping("CP932/SJIS locale not available on this system.");
441		return;
442	}
443
444	/*
445	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
446	 */
447	a = archive_write_new();
448	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
449	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
450		skipping("This system cannot convert character-set"
451		    " from CP932/SJIS to UTF-8.");
452		archive_write_free(a);
453		return;
454	}
455	assertEqualInt(ARCHIVE_OK,
456	    archive_write_open_memory(a, buff, sizeof(buff), &used));
457
458	entry = archive_entry_new2(a);
459	/* Set a CP932/SJIS filename. */
460	archive_entry_set_pathname(entry, "\x95\x5C.txt");
461	/* Check the Unicode version. */
462	archive_entry_set_filetype(entry, AE_IFREG);
463	archive_entry_set_size(entry, 0);
464	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
465	archive_entry_free(entry);
466	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
467
468	/* A bit 11 of general purpose flag should be 0x08,
469	 * which indicates the filename charset is UTF-8. */
470	assertEqualInt(0x08, buff[7]);
471	/* Check UTF-8 version. */
472	assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
473
474	/*
475	 * Verify that CP932/SJIS filenames are not translated to UTF-8.
476	 */
477	a = archive_write_new();
478	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
479	assertEqualInt(ARCHIVE_OK,
480	    archive_write_open_memory(a, buff, sizeof(buff), &used));
481
482	entry = archive_entry_new2(a);
483	/* Set a CP932/SJIS filename. */
484	archive_entry_set_pathname(entry, "\x95\x5C.txt");
485	/* Check the Unicode version. */
486	archive_entry_set_filetype(entry, AE_IFREG);
487	archive_entry_set_size(entry, 0);
488	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
489	archive_entry_free(entry);
490	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
491
492	/* A bit 11 of general purpose flag should be 0,
493	 * which indicates the filename charset is unknown. */
494	assertEqualInt(0, buff[7]);
495	/* Above three characters in CP932/SJIS should not translate to
496	 * any character-set. */
497	assertEqualMem(buff + 30, "\x95\x5C.txt", 6);
498
499	/*
500	 * Verify that A bit 11 of general purpose flag is not set
501	 * when ASCII filenames are stored even if hdrcharset=UTF-8
502	 * is specified.
503	 */
504	a = archive_write_new();
505	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
506	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
507		skipping("This system cannot convert character-set"
508		    " from CP932/SJIS to UTF-8.");
509		archive_write_free(a);
510		return;
511	}
512	assertEqualInt(ARCHIVE_OK,
513	    archive_write_open_memory(a, buff, sizeof(buff), &used));
514
515	entry = archive_entry_new2(a);
516	/* Set an ASCII filename. */
517	archive_entry_set_pathname(entry, "abcABC");
518	/* Check the Unicode version. */
519	archive_entry_set_filetype(entry, AE_IFREG);
520	archive_entry_set_size(entry, 0);
521	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
522	archive_entry_free(entry);
523	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
524
525	/* A bit 11 of general purpose flag should be 0,
526	 * which indicates the filename charset is unknown. */
527	assertEqualInt(0, buff[7]);
528	assertEqualMem(buff + 30, "abcABC", 6);
529}
530