1227650Skevlo/*-
2227650Skevlo * Copyright (c) 2003, 2005 Ryuichiro Imura
3227650Skevlo * All rights reserved.
4227650Skevlo *
5227650Skevlo * Redistribution and use in source and binary forms, with or without
6227650Skevlo * modification, are permitted provided that the following conditions
7227650Skevlo * are met:
8227650Skevlo * 1. Redistributions of source code must retain the above copyright
9227650Skevlo *    notice, this list of conditions and the following disclaimer.
10227650Skevlo * 2. Redistributions in binary form must reproduce the above copyright
11227650Skevlo *    notice, this list of conditions and the following disclaimer in the
12227650Skevlo *    documentation and/or other materials provided with the distribution.
13227650Skevlo *
14227650Skevlo * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15227650Skevlo * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16227650Skevlo * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17227650Skevlo * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18227650Skevlo * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19227650Skevlo * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20227650Skevlo * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21227650Skevlo * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22227650Skevlo * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23227650Skevlo * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24227650Skevlo * SUCH DAMAGE.
25227650Skevlo */
26227650Skevlo
27227650Skevlo#include <sys/cdefs.h>
28227650Skevlo__FBSDID("$FreeBSD$");
29227650Skevlo
30227650Skevlo#include <sys/param.h>
31227650Skevlo#include <sys/kernel.h>
32227650Skevlo#include <sys/systm.h>
33227650Skevlo#include <sys/malloc.h>
34227650Skevlo#include <sys/iconv.h>
35227650Skevlo
36227650Skevlo#include "iconv_converter_if.h"
37227650Skevlo
/*
 * "UCS" converter
 *
 * The KICONV_UCS_* values are single-bit flags OR-ed together into
 * iconv_ucs.convtype by iconv_ucs_open().
 */

#define	KICONV_UCS_COMBINE	(1 << 0)	/* a second cspair (cspf) was given */
#define	KICONV_UCS_FROM_UTF8	(1 << 1)	/* source charset is UTF-8 */
#define	KICONV_UCS_TO_UTF8	(1 << 2)	/* target charset is UTF-8 */
#define	KICONV_UCS_FROM_LE	(1 << 3)	/* source is little-endian UCS */
#define	KICONV_UCS_TO_LE	(1 << 4)	/* target is little-endian UCS */
#define	KICONV_UCS_FROM_UTF16	(1 << 5)	/* source is UTF-16 */
#define	KICONV_UCS_TO_UTF16	(1 << 6)	/* target is UTF-16 */
#define	KICONV_UCS_UCS4		(1 << 7)	/* surrogate pairs are handled */

/* Charset names compared against and passed to iconv_open(). */
#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"
53227650Skevlo
54227650Skevlostatic struct {
55227650Skevlo	const char *name;
56227650Skevlo	int from_flag, to_flag;
57227650Skevlo} unicode_family[] = {
58227650Skevlo	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
59227650Skevlo	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
60227650Skevlo	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
61227650Skevlo	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
62227650Skevlo	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
63227650Skevlo	{ NULL,		0,	0 }
64227650Skevlo};
65227650Skevlo
66227650Skevlostatic uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
67227650Skevlostatic u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
68227650Skevlostatic uint32_t encode_surrogate(uint32_t code);
69227650Skevlostatic uint32_t decode_surrogate(const u_char *ucs);
70227650Skevlo
71227650Skevlo#ifdef MODULE_DEPEND
72227650SkevloMODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
73227650Skevlo#endif
74227650Skevlo
75227650Skevlo/*
76227650Skevlo * UCS converter instance
77227650Skevlo */
78227650Skevlostruct iconv_ucs {
79227650Skevlo	KOBJ_FIELDS;
80227650Skevlo	int			convtype;
81227650Skevlo	struct iconv_cspair *	d_csp;
82227650Skevlo	struct iconv_cspair *	d_cspf;
83227650Skevlo	void *			f_ctp;
84227650Skevlo	void *			t_ctp;
85227650Skevlo	void *			ctype;
86227650Skevlo};
87227650Skevlo
88227650Skevlostatic int
89227650Skevloiconv_ucs_open(struct iconv_converter_class *dcp,
90227650Skevlo	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
91227650Skevlo{
92227650Skevlo	struct iconv_ucs *dp;
93227650Skevlo	int i;
94227650Skevlo	const char *from, *to;
95227650Skevlo
96227650Skevlo	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
97227650Skevlo	to = csp->cp_to;
98227650Skevlo	from = cspf ? cspf->cp_from : csp->cp_from;
99227650Skevlo
100227650Skevlo	dp->convtype = 0;
101227650Skevlo
102227650Skevlo	if (cspf)
103227650Skevlo		dp->convtype |= KICONV_UCS_COMBINE;
104227650Skevlo	for (i = 0; unicode_family[i].name; i++) {
105267980Sjhb		if (strcasecmp(from, unicode_family[i].name) == 0)
106227650Skevlo			dp->convtype |= unicode_family[i].from_flag;
107267980Sjhb		if (strcasecmp(to, unicode_family[i].name) == 0)
108227650Skevlo			dp->convtype |= unicode_family[i].to_flag;
109227650Skevlo	}
110235713Skevlo	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
111227650Skevlo		dp->convtype |= KICONV_UCS_UCS4;
112227650Skevlo	else
113227650Skevlo		dp->convtype &= ~KICONV_UCS_UCS4;
114227650Skevlo
115227650Skevlo	dp->f_ctp = dp->t_ctp = NULL;
116227650Skevlo	if (dp->convtype & KICONV_UCS_COMBINE) {
117227650Skevlo		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
118227650Skevlo		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
119227650Skevlo			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
120227650Skevlo		}
121227650Skevlo		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
122227650Skevlo		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
123227650Skevlo			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
124227650Skevlo		}
125227650Skevlo	}
126227650Skevlo
127227650Skevlo	dp->ctype = NULL;
128227650Skevlo	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
129227650Skevlo		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
130227650Skevlo
131227650Skevlo	dp->d_csp = csp;
132227650Skevlo	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
133227650Skevlo		if (cspf) {
134227650Skevlo			dp->d_cspf = cspf;
135227650Skevlo			cspf->cp_refcount++;
136227650Skevlo		} else
137227650Skevlo			csp->cp_refcount++;
138227650Skevlo	}
139227650Skevlo	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
140227650Skevlo		csp->cp_refcount++;
141227650Skevlo	*dpp = (void*)dp;
142227650Skevlo	return 0;
143227650Skevlo}
144227650Skevlo
145227650Skevlostatic int
146227650Skevloiconv_ucs_close(void *data)
147227650Skevlo{
148227650Skevlo	struct iconv_ucs *dp = data;
149227650Skevlo
150227650Skevlo	if (dp->f_ctp)
151227650Skevlo		iconv_close(dp->f_ctp);
152227650Skevlo	if (dp->t_ctp)
153227650Skevlo		iconv_close(dp->t_ctp);
154227650Skevlo	if (dp->ctype)
155227650Skevlo		iconv_close(dp->ctype);
156227650Skevlo	if (dp->d_cspf)
157227650Skevlo		dp->d_cspf->cp_refcount--;
158227650Skevlo	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
159227650Skevlo		dp->d_csp->cp_refcount--;
160227650Skevlo	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
161227650Skevlo		dp->d_csp->cp_refcount--;
162227650Skevlo	kobj_delete((struct kobj*)data, M_ICONV);
163227650Skevlo	return 0;
164227650Skevlo}
165227650Skevlo
166227650Skevlostatic int
167227650Skevloiconv_ucs_conv(void *d2p, const char **inbuf,
168227650Skevlo	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
169227650Skevlo	int convchar, int casetype)
170227650Skevlo{
171227650Skevlo	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
172227650Skevlo	int ret = 0, i;
173227650Skevlo	size_t in, on, ir, or, inlen, outlen, ucslen;
174227650Skevlo	const char *src, *p;
175227650Skevlo	char *dst;
176227650Skevlo	u_char ucs[4], *q;
177227650Skevlo	uint32_t code;
178227650Skevlo
179227650Skevlo	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
180227650Skevlo		return 0;
181227650Skevlo	ir = in = *inbytesleft;
182227650Skevlo	or = on = *outbytesleft;
183227650Skevlo	src = *inbuf;
184227650Skevlo	dst = *outbuf;
185227650Skevlo
186227650Skevlo	while (ir > 0 && or > 0) {
187227650Skevlo
188227650Skevlo		/*
189227650Skevlo		 * The first half of conversion.
190227650Skevlo		 * (convert any code into ENCODING_UNICODE)
191227650Skevlo		 */
192227650Skevlo		code = 0;
193227650Skevlo		p = src;
194227650Skevlo		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
195227650Skevlo			/* convert UTF-8 to ENCODING_UNICODE */
196227650Skevlo			inlen = 0;
197227650Skevlo			code = utf8_to_ucs4(p, &inlen, ir);
198227650Skevlo			if (code == 0) {
199227650Skevlo				ret = -1;
200227650Skevlo				break;
201227650Skevlo			}
202227650Skevlo
203227650Skevlo			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
204227650Skevlo				code = towlower(code, dp->ctype);
205227650Skevlo			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
206227650Skevlo				code = towupper(code, dp->ctype);
207227650Skevlo			}
208227650Skevlo
209227650Skevlo			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
210227650Skevlo				/* reserved for utf-16 surrogate pair */
211227650Skevlo				/* invalid unicode */
212227650Skevlo				ret = -1;
213227650Skevlo				break;
214227650Skevlo			}
215227650Skevlo
216227650Skevlo			if (inlen == 4) {
217227650Skevlo				if (dp->convtype & KICONV_UCS_UCS4) {
218227650Skevlo					ucslen = 4;
219227650Skevlo					code = encode_surrogate(code);
220227650Skevlo				} else {
221227650Skevlo					/* can't handle with ucs-2 */
222227650Skevlo					ret = -1;
223227650Skevlo					break;
224227650Skevlo				}
225227650Skevlo			} else {
226227650Skevlo				ucslen = 2;
227227650Skevlo			}
228227650Skevlo
229227650Skevlo			/* save UCS-4 into ucs[] */
230227650Skevlo			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
231227650Skevlo				*q++ = (code >> (i << 3)) & 0xff;
232227650Skevlo
233227650Skevlo		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
234227650Skevlo			/* convert local code to ENCODING_UNICODE */
235227650Skevlo			ucslen = 4;
236227650Skevlo			inlen = ir;
237227650Skevlo			q = ucs;
238227650Skevlo			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
239227650Skevlo			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
240227650Skevlo			if (ret)
241227650Skevlo				break;
242227650Skevlo			inlen = ir - inlen;
243227650Skevlo			ucslen = 4 - ucslen;
244227650Skevlo
245227650Skevlo		} else {
246227650Skevlo			/* src code is a proper subset of ENCODING_UNICODE */
247227650Skevlo			q = ucs;
248227650Skevlo			if (dp->convtype & KICONV_UCS_FROM_LE) {
249227650Skevlo				*q = *(p + 1);
250227650Skevlo				*(q + 1) = *p;
251227650Skevlo				p += 2;
252227650Skevlo			} else {
253227650Skevlo				*q = *p++;
254227650Skevlo				*(q + 1) = *p++;
255227650Skevlo			}
256227650Skevlo			if ((*q & 0xfc) == 0xd8) {
257227650Skevlo				if (dp->convtype & KICONV_UCS_UCS4 &&
258227650Skevlo				    dp->convtype & KICONV_UCS_FROM_UTF16) {
259227650Skevlo					inlen = ucslen = 4;
260227650Skevlo				} else {
261227650Skevlo					/* invalid unicode */
262227650Skevlo					ret = -1;
263227650Skevlo					break;
264227650Skevlo				}
265227650Skevlo			} else {
266227650Skevlo				inlen = ucslen = 2;
267227650Skevlo			}
268227650Skevlo			if (ir < inlen) {
269227650Skevlo				ret = -1;
270227650Skevlo				break;
271227650Skevlo			}
272227650Skevlo			if (ucslen == 4) {
273227650Skevlo				q += 2;
274227650Skevlo				if (dp->convtype & KICONV_UCS_FROM_LE) {
275227650Skevlo					*q = *(p + 1);
276227650Skevlo					*(q + 1) = *p;
277227650Skevlo				} else {
278227650Skevlo					*q = *p++;
279227650Skevlo					*(q + 1) = *p;
280227650Skevlo				}
281227650Skevlo				if ((*q & 0xfc) != 0xdc) {
282227650Skevlo					/* invalid unicode */
283227650Skevlo					ret = -1;
284227650Skevlo					break;
285227650Skevlo				}
286227650Skevlo			}
287227650Skevlo		}
288227650Skevlo
289227650Skevlo		/*
290227650Skevlo		 * The second half of conversion.
291227650Skevlo		 * (convert ENCODING_UNICODE into any code)
292227650Skevlo		 */
293227650Skevlo		p = ucs;
294227650Skevlo		if (dp->convtype & KICONV_UCS_TO_UTF8) {
295227650Skevlo			q = (u_char *)dst;
296227650Skevlo			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
297227650Skevlo				/* decode surrogate pair */
298227650Skevlo				code = decode_surrogate(p);
299227650Skevlo			} else {
300227650Skevlo				code = (ucs[0] << 8) | ucs[1];
301227650Skevlo			}
302227650Skevlo
303227650Skevlo			if (casetype == KICONV_LOWER && dp->ctype) {
304227650Skevlo				code = towlower(code, dp->ctype);
305227650Skevlo			} else if (casetype == KICONV_UPPER && dp->ctype) {
306227650Skevlo				code = towupper(code, dp->ctype);
307227650Skevlo			}
308227650Skevlo
309227650Skevlo			outlen = 0;
310227650Skevlo			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
311227650Skevlo				ret = -1;
312227650Skevlo				break;
313227650Skevlo			}
314227650Skevlo
315227650Skevlo			src += inlen;
316227650Skevlo			ir -= inlen;
317227650Skevlo			dst += outlen;
318227650Skevlo			or -= outlen;
319227650Skevlo
320227650Skevlo		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
321227650Skevlo			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
322227650Skevlo			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
323227650Skevlo			if (ret)
324227650Skevlo				break;
325227650Skevlo
326227650Skevlo			src += inlen;
327227650Skevlo			ir -= inlen;
328227650Skevlo
329227650Skevlo		} else {
330227650Skevlo			/* dst code is a proper subset of ENCODING_UNICODE */
331227650Skevlo			if (or < ucslen) {
332227650Skevlo				ret = -1;
333227650Skevlo				break;
334227650Skevlo			}
335227650Skevlo			src += inlen;
336227650Skevlo			ir -= inlen;
337227650Skevlo			or -= ucslen;
338227650Skevlo			if (dp->convtype & KICONV_UCS_TO_LE) {
339227650Skevlo				*dst++ = *(p + 1);
340227650Skevlo				*dst++ = *p;
341227650Skevlo				p += 2;
342227650Skevlo			} else {
343227650Skevlo				*dst++ = *p++;
344227650Skevlo				*dst++ = *p++;
345227650Skevlo			}
346227650Skevlo			if (ucslen == 4) {
347227650Skevlo				if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
348227650Skevlo				    (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
349227650Skevlo					ret = -1;
350227650Skevlo					break;
351227650Skevlo				}
352227650Skevlo				if (dp->convtype & KICONV_UCS_TO_LE) {
353227650Skevlo					*dst++ = *(p + 1);
354227650Skevlo					*dst++ = *p;
355227650Skevlo				} else {
356227650Skevlo					*dst++ = *p++;
357227650Skevlo					*dst++ = *p;
358227650Skevlo				}
359227650Skevlo			}
360227650Skevlo		}
361227650Skevlo
362227650Skevlo		if (convchar == 1)
363227650Skevlo			break;
364227650Skevlo	}
365227650Skevlo
366227650Skevlo	*inbuf += in - ir;
367227650Skevlo	*outbuf += on - or;
368227650Skevlo	*inbytesleft -= in - ir;
369227650Skevlo	*outbytesleft -= on - or;
370227650Skevlo	return (ret);
371227650Skevlo}
372227650Skevlo
373227650Skevlostatic int
374227650Skevloiconv_ucs_init(struct iconv_converter_class *dcp)
375227650Skevlo{
376227650Skevlo	int error;
377227650Skevlo
378227650Skevlo	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
379227650Skevlo	if (error)
380227650Skevlo		return (error);
381227650Skevlo	error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
382227650Skevlo	if (error)
383227650Skevlo		return (error);
384227650Skevlo	return (0);
385227650Skevlo}
386227650Skevlo
/*
 * Class teardown hook.  Does nothing and always reports success.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	const int status = 0;

	return (status);
}
392227650Skevlo
393227650Skevlostatic const char *
394227650Skevloiconv_ucs_name(struct iconv_converter_class *dcp)
395227650Skevlo{
396227650Skevlo	return (ENCODING_UNICODE);
397227650Skevlo}
398227650Skevlo
399227650Skevlostatic kobj_method_t iconv_ucs_methods[] = {
400227650Skevlo	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
401227650Skevlo	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
402227650Skevlo	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
403227650Skevlo	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
404227650Skevlo	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
405227650Skevlo	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
406227650Skevlo	{0, 0}
407227650Skevlo};
408227650Skevlo
409227650SkevloKICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
410227650Skevlo
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes.
 *
 * src		start of the sequence
 * utf8width	set to the number of bytes consumed, on success only
 * srclen	number of bytes available at src
 *
 * Returns the decoded code point, or 0 on failure: empty input, illegal
 * lead byte, truncated sequence, bad continuation byte, or an overlong
 * (non-shortest) encoding.  Because 0 is the failure value, an encoded
 * NUL is reported as a failure as well.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	const unsigned char *s = (const unsigned char *)src;
	size_t i, w;
	uint32_t ucs4, min;

	/* nothing to look at */
	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8; "min" is the smallest code point
	 * that legitimately needs a sequence of this width
	 */
	if ((s[0] & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		min = 0;
		ucs4 = s[0] & 0x7f;
	} else if ((s[0] & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		min = 0x80;
		ucs4 = s[0] & 0x1f;
	} else if ((s[0] & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		min = 0x800;
		ucs4 = s[0] & 0x0f;
	} else if ((s[0] & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		min = 0x10000;
		ucs4 = s[0] & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1 ; i < w ; i++) {
		if ((s[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 = (ucs4 << 6) | (s[i] & 0x3f);
	}

	/* overlong form: the value would have fit in a shorter sequence */
	if (ucs4 < min)
		return (0);

	*utf8width = w;
	return (ucs4);
}
480227650Skevlo
/*
 * Encode one code point as UTF-8.
 *
 * ucs4		code point to encode
 * dst		output buffer
 * utf8width	set to the number of bytes written, on success only
 * dstlen	space available at dst
 *
 * Returns dst (as u_char *) on success, or NULL when ucs4 is above
 * 0x10ffff or the sequence does not fit in dstlen bytes.  Values in the
 * surrogate range are not filtered here.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "110" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "1110" */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* "11110" */
	} else {
		/* beyond the last Unicode code point */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8, filling continuation bytes from the last one
	 * backwards
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		/* get trailing 6 bits and put them after the "10" marker */
		*(p + i) = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	/* whatever bits remain go into the lead byte */
	*p = ucs4 | lead;

	*utf8width = w;

	return (p);
}
524227650Skevlo
/*
 * Pack a code point into a UTF-16 surrogate pair held in one 32-bit
 * value: high surrogate in the upper 16 bits, low surrogate in the lower
 * 16 bits.  The caller passes values of 0x10000 and above.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	uint32_t offset = code - 0x10000;
	uint32_t high10 = (offset >> 10) & 0x3ff;
	uint32_t low10 = offset & 0x3ff;

	return ((high10 << 16) | low10 | 0xd800dc00);
}
531227650Skevlo
/*
 * Rebuild a code point from a big-endian surrogate pair stored in
 * ucs[0..3]: the low two bits of ucs[0] with ucs[1] give the upper ten
 * bits, the low two bits of ucs[2] with ucs[3] give the lower ten bits,
 * and 0x10000 is added to the result.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t upper = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	uint32_t lower = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((upper << 10) | lower) + 0x10000);
}
538227650Skevlo
539