1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2003, 2005 Ryuichiro Imura
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/param.h>
33#include <sys/kernel.h>
34#include <sys/systm.h>
35#include <sys/malloc.h>
36#include <sys/iconv.h>
37
38#include "iconv_converter_if.h"
39
40/*
41 * "UCS" converter
42 */
43
/*
 * Bit flags stored in iconv_ucs.convtype.  The FROM_ and TO_ flags are
 * looked up in unicode_family[] by encoding name when a converter is opened.
 */
#define	KICONV_UCS_COMBINE	0x1	/* open was given a second cspair (cspf) */
#define	KICONV_UCS_FROM_UTF8	0x2	/* source encoding is "UTF-8" */
#define	KICONV_UCS_TO_UTF8	0x4	/* destination encoding is "UTF-8" */
#define	KICONV_UCS_FROM_LE	0x8	/* source is "UCS-2LE" or "UTF-16LE" */
#define	KICONV_UCS_TO_LE	0x10	/* destination is "UCS-2LE" or "UTF-16LE" */
#define	KICONV_UCS_FROM_UTF16	0x20	/* source is "UTF-16BE" or "UTF-16LE" */
#define	KICONV_UCS_TO_UTF16	0x40	/* destination is "UTF-16BE" or "UTF-16LE" */
#define	KICONV_UCS_UCS4		0x80	/* set when ENCODING_UNICODE is ENCODING_UTF16;
					   allows 4-byte surrogate-pair units */

/* Encoding names compared against / passed along with ENCODING_UNICODE. */
#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"
55
/*
 * Encodings this converter handles natively.  For each encoding name the
 * table gives the convtype bits to set when that name is the source
 * (from_flag) or the destination (to_flag).  The list ends with a NULL name.
 */
static struct {
	const char *name;
	int from_flag, to_flag;
} unicode_family[] = {
	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
	/* UTF-16LE is UTF-16 plus little-endian byte order: both bits are set */
	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
	{ NULL,		0,	0 }
};
67
/*
 * Forward declarations of the UTF-8 and surrogate-pair helpers that are
 * defined after the converter methods.
 */
static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
static uint32_t encode_surrogate(uint32_t code);
static uint32_t decode_surrogate(const u_char *ucs);
72
73#ifdef MODULE_DEPEND
74MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
75#endif
76
77/*
78 * UCS converter instance
79 */
struct iconv_ucs {
	KOBJ_FIELDS;
	int			convtype;	/* KICONV_UCS_* flags chosen at open */
	struct iconv_cspair *	d_csp;		/* csp passed to open */
	struct iconv_cspair *	d_cspf;		/* cspf passed to open; stored only
						   when a FROM_UTF8/FROM_LE flag is set */
	void *			f_ctp;		/* handle from iconv_open(ENCODING_UNICODE,
						   from), used for the first half */
	void *			t_ctp;		/* handle from iconv_open(to,
						   ENCODING_UNICODE), used for the
						   second half */
	void *			ctype;		/* handle from iconv_open(KICONV_WCTYPE_NAME,
						   ENCODING_UTF8), passed to
						   towlower()/towupper() */
};
89
90static int
91iconv_ucs_open(struct iconv_converter_class *dcp,
92	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
93{
94	struct iconv_ucs *dp;
95	int i;
96	const char *from, *to;
97
98	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
99	to = csp->cp_to;
100	from = cspf ? cspf->cp_from : csp->cp_from;
101
102	dp->convtype = 0;
103
104	if (cspf)
105		dp->convtype |= KICONV_UCS_COMBINE;
106	for (i = 0; unicode_family[i].name; i++) {
107		if (strcasecmp(from, unicode_family[i].name) == 0)
108			dp->convtype |= unicode_family[i].from_flag;
109		if (strcasecmp(to, unicode_family[i].name) == 0)
110			dp->convtype |= unicode_family[i].to_flag;
111	}
112	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
113		dp->convtype |= KICONV_UCS_UCS4;
114	else
115		dp->convtype &= ~KICONV_UCS_UCS4;
116
117	dp->f_ctp = dp->t_ctp = NULL;
118	if (dp->convtype & KICONV_UCS_COMBINE) {
119		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
120		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
121			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
122		}
123		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
124		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
125			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
126		}
127	}
128
129	dp->ctype = NULL;
130	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
131		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
132
133	dp->d_csp = csp;
134	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
135		if (cspf) {
136			dp->d_cspf = cspf;
137			cspf->cp_refcount++;
138		} else
139			csp->cp_refcount++;
140	}
141	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
142		csp->cp_refcount++;
143	*dpp = (void*)dp;
144	return 0;
145}
146
147static int
148iconv_ucs_close(void *data)
149{
150	struct iconv_ucs *dp = data;
151
152	if (dp->f_ctp)
153		iconv_close(dp->f_ctp);
154	if (dp->t_ctp)
155		iconv_close(dp->t_ctp);
156	if (dp->ctype)
157		iconv_close(dp->ctype);
158	if (dp->d_cspf)
159		dp->d_cspf->cp_refcount--;
160	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
161		dp->d_csp->cp_refcount--;
162	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
163		dp->d_csp->cp_refcount--;
164	kobj_delete((struct kobj*)data, M_ICONV);
165	return 0;
166}
167
168static int
169iconv_ucs_conv(void *d2p, const char **inbuf,
170	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
171	int convchar, int casetype)
172{
173	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
174	int ret = 0, i;
175	size_t in, on, ir, or, inlen, outlen, ucslen;
176	const char *src, *p;
177	char *dst;
178	u_char ucs[4], *q;
179	uint32_t code;
180
181	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
182		return 0;
183	ir = in = *inbytesleft;
184	or = on = *outbytesleft;
185	src = *inbuf;
186	dst = *outbuf;
187
188	while (ir > 0 && or > 0) {
189
190		/*
191		 * The first half of conversion.
192		 * (convert any code into ENCODING_UNICODE)
193		 */
194		code = 0;
195		p = src;
196		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
197			/* convert UTF-8 to ENCODING_UNICODE */
198			inlen = 0;
199			code = utf8_to_ucs4(p, &inlen, ir);
200			if (code == 0) {
201				ret = -1;
202				break;
203			}
204
205			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
206				code = towlower(code, dp->ctype);
207			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
208				code = towupper(code, dp->ctype);
209			}
210
211			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
212				/* reserved for utf-16 surrogate pair */
213				/* invalid unicode */
214				ret = -1;
215				break;
216			}
217
218			if (inlen == 4) {
219				if (dp->convtype & KICONV_UCS_UCS4) {
220					ucslen = 4;
221					code = encode_surrogate(code);
222				} else {
223					/* can't handle with ucs-2 */
224					ret = -1;
225					break;
226				}
227			} else {
228				ucslen = 2;
229			}
230
231			/* save UCS-4 into ucs[] */
232			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
233				*q++ = (code >> (i << 3)) & 0xff;
234
235		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
236			/* convert local code to ENCODING_UNICODE */
237			ucslen = 4;
238			inlen = ir;
239			q = ucs;
240			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
241			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
242			if (ret)
243				break;
244			inlen = ir - inlen;
245			ucslen = 4 - ucslen;
246
247		} else {
248			/* src code is a proper subset of ENCODING_UNICODE */
249			q = ucs;
250			if (dp->convtype & KICONV_UCS_FROM_LE) {
251				*q = *(p + 1);
252				*(q + 1) = *p;
253				p += 2;
254			} else {
255				*q = *p++;
256				*(q + 1) = *p++;
257			}
258			if ((*q & 0xfc) == 0xd8) {
259				if (dp->convtype & KICONV_UCS_UCS4 &&
260				    dp->convtype & KICONV_UCS_FROM_UTF16) {
261					inlen = ucslen = 4;
262				} else {
263					/* invalid unicode */
264					ret = -1;
265					break;
266				}
267			} else {
268				inlen = ucslen = 2;
269			}
270			if (ir < inlen) {
271				ret = -1;
272				break;
273			}
274			if (ucslen == 4) {
275				q += 2;
276				if (dp->convtype & KICONV_UCS_FROM_LE) {
277					*q = *(p + 1);
278					*(q + 1) = *p;
279				} else {
280					*q = *p++;
281					*(q + 1) = *p;
282				}
283				if ((*q & 0xfc) != 0xdc) {
284					/* invalid unicode */
285					ret = -1;
286					break;
287				}
288			}
289		}
290
291		/*
292		 * The second half of conversion.
293		 * (convert ENCODING_UNICODE into any code)
294		 */
295		p = ucs;
296		if (dp->convtype & KICONV_UCS_TO_UTF8) {
297			q = (u_char *)dst;
298			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
299				/* decode surrogate pair */
300				code = decode_surrogate(p);
301			} else {
302				code = (ucs[0] << 8) | ucs[1];
303			}
304
305			if (casetype == KICONV_LOWER && dp->ctype) {
306				code = towlower(code, dp->ctype);
307			} else if (casetype == KICONV_UPPER && dp->ctype) {
308				code = towupper(code, dp->ctype);
309			}
310
311			outlen = 0;
312			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
313				ret = -1;
314				break;
315			}
316
317			src += inlen;
318			ir -= inlen;
319			dst += outlen;
320			or -= outlen;
321
322		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
323			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
324			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
325			if (ret)
326				break;
327
328			src += inlen;
329			ir -= inlen;
330
331		} else {
332			/* dst code is a proper subset of ENCODING_UNICODE */
333			if (or < ucslen) {
334				ret = -1;
335				break;
336			}
337			src += inlen;
338			ir -= inlen;
339			or -= ucslen;
340			if (dp->convtype & KICONV_UCS_TO_LE) {
341				*dst++ = *(p + 1);
342				*dst++ = *p;
343				p += 2;
344			} else {
345				*dst++ = *p++;
346				*dst++ = *p++;
347			}
348			if (ucslen == 4) {
349				if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
350				    (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
351					ret = -1;
352					break;
353				}
354				if (dp->convtype & KICONV_UCS_TO_LE) {
355					*dst++ = *(p + 1);
356					*dst++ = *p;
357				} else {
358					*dst++ = *p++;
359					*dst++ = *p;
360				}
361			}
362		}
363
364		if (convchar == 1)
365			break;
366	}
367
368	*inbuf += in - ir;
369	*outbuf += on - or;
370	*inbytesleft -= in - ir;
371	*outbytesleft -= on - or;
372	return (ret);
373}
374
375static int
376iconv_ucs_init(struct iconv_converter_class *dcp)
377{
378	int error;
379
380	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
381	if (error)
382		return (error);
383	error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
384	if (error)
385		return (error);
386	return (0);
387}
388
/*
 * Class teardown hook.  There is nothing to undo here; always returns 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;	/* unused */

	return (0);
}
394
395static const char *
396iconv_ucs_name(struct iconv_converter_class *dcp)
397{
398	return (ENCODING_UNICODE);
399}
400
/*
 * Method table binding the iconv_converter interface to the UCS
 * implementations in this file; terminated by a zero entry.
 */
static kobj_method_t iconv_ucs_methods[] = {
	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
	{0, 0}
};

/* Declare the "ucs" converter class; instances are struct iconv_ucs sized. */
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
412
/*
 * Decode the single UTF-8 sequence starting at src.
 *
 * src       - start of the sequence.
 * utf8width - on success receives the sequence length in bytes (1..4);
 *             left untouched on failure.
 * srclen    - number of bytes available at src.
 *
 * Returns the decoded code point, or 0 when the input is empty, truncated,
 * has an illegal lead or continuation byte, or is an overlong (non-shortest)
 * encoding.  An encoded NUL also yields 0 (with *utf8width set to 1), so
 * callers cannot tell it apart from an error by the return value alone.
 * Surrogate and upper-range checks are left to the caller.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	size_t i, w = 0;
	uint32_t ucs4 = 0;
	uint32_t min = 0;	/* smallest code point that needs w bytes */

	/* never look at *src when there is nothing to read */
	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	if ((*src & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		min = 0;
		/* get trailing 7 bits */
		ucs4 = *src & 0x7f;
	} else if ((*src & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		min = 0x80;
		/* get trailing 5 bits */
		ucs4 = *src & 0x1f;
	} else if ((*src & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		min = 0x800;
		/* get trailing 4 bits */
		ucs4 = *src & 0x0f;
	} else if ((*src & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		min = 0x10000;
		/* get trailing 3 bits */
		ucs4 = *src & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1 ; i < w ; i++) {
		if ((*(src + i) & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 <<= 6;
		ucs4 |= *(src + i) & 0x3f;
	}

	/*
	 * Reject overlong forms: a value that fits in fewer bytes must not
	 * be accepted in a longer encoding.  Besides being invalid UTF-8,
	 * a 4-byte form below 0x10000 would underflow in encode_surrogate().
	 */
	if (ucs4 < min)
		return (0);

	*utf8width = w;
	return (ucs4);
}
482
/*
 * Encode one code point as UTF-8 into dst.
 *
 * ucs4      - value to encode; must be below 0x200000.
 * dst       - output buffer.
 * utf8width - on success receives the number of bytes written (1..4).
 * dstlen    - room available at dst.
 *
 * Returns dst (as u_char *) on success, or NULL when the value is too
 * large or does not fit in dstlen bytes; nothing is written in that case.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char *out = (u_char *)dst;
	u_char prefix;
	size_t width, pos;

	/* choose the sequence length and the marker bits of the first byte */
	if (ucs4 >= 0x200000)
		return (NULL);
	if (ucs4 >= 0x10000) {
		width = 4;
		prefix = 0xf0;
	} else if (ucs4 >= 0x800) {
		width = 3;
		prefix = 0xe0;
	} else if (ucs4 >= 0x80) {
		width = 2;
		prefix = 0xc0;
	} else {
		width = 1;
		prefix = 0;
	}

	if (width > dstlen)
		return (NULL);

	/* fill continuation bytes from the last one back to index 1 */
	pos = width;
	while (--pos > 0) {
		out[pos] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	/* whatever bits remain belong in the lead byte */
	out[0] = ucs4 | prefix;

	*utf8width = width;

	return (out);
}
526
/*
 * Pack a supplementary-plane code point into a UTF-16 surrogate pair,
 * returned as one 32-bit value: high surrogate in the upper 16 bits, low
 * surrogate in the lower 16 bits.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	/* upper 10 bits of the offset, moved into bits 16..25 */
	const uint32_t high_bits = (offset << 6) & 0x3ff0000;
	/* lower 10 bits of the offset stay in bits 0..9 */
	const uint32_t low_bits = offset & 0x3ff;

	return (high_bits | low_bits | 0xd800dc00);
}
533
/*
 * Rebuild a code point from a surrogate pair stored as four big-endian
 * bytes: ucs[0..1] is the high surrogate, ucs[2..3] the low surrogate.
 * Only the low 10 bits of each 16-bit unit are used.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	const uint32_t high10 = ((ucs[0] & 0x3) << 8) | ucs[1];
	const uint32_t low10 = ((ucs[2] & 0x3) << 8) | ucs[3];

	return (((high10 << 10) | low10) + 0x10000);
}
540
541