unicode.h revision 272322
1/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
2
3/*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Dieter Baron.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 * $FreeBSD: stable/10/sys/dev/hyperv/utilities/unicode.h 272322 2014-09-30 17:54:57Z delphij $
32 */
33
34#include <sys/types.h>
35
/*
 * Flag bits accepted by the conversion routines in their `flags' argument.
 *
 * UNICODE_DECOMPOSE and UNICODE_PRECOMPOSE are not consulted by either
 * conversion routine in this file.
 *
 * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK shares its bits with the two
 * flags above and utf8_to_utf16() tests it with a plain bit-and, so passing
 * either of them also enables the fallback.  The value is left unchanged so
 * that existing callers keep their current behaviour.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *dst, size_t dst_len,
		     const char *src, size_t src_len,
		     int flags, int *errp);
size_t utf16_to_utf8(char *dst, size_t dst_len,
		     const uint16_t *src, size_t src_len,
		     int flags, int *errp);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst/dst_len	output buffer and its capacity in 16-bit units.  dst may be
 *		NULL; nothing is stored then.  Units that do not fit are
 *		counted but not stored.
 * flags	if any bit of UNICODE_UTF8_LATIN1_FALLBACK is set, a byte
 *		>= 0xa0 that is not followed by a continuation byte is taken
 *		as an ISO 8859-1 character instead of being rejected.
 * errp		if non-NULL, receives the number of malformed sequences that
 *		were skipped.
 *
 * Returns the number of UTF-16 units the complete conversion needs, which
 * may exceed dst_len.  The output is not NUL-terminated.
 *
 * Rejected (counted in *errp and skipped): stray continuation bytes, lead
 * bytes 0xf5 and above, truncated sequences, overlong encodings, encoded
 * surrogates and code points above 0x10ffff.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
    const unsigned char *s;
    size_t spos, dpos, left;
    int error;
    uint16_t c;

#define IS_CONT(c)	(((c) & 0xc0) == 0x80)

    error = 0;
    s = (const unsigned char *)src;
    spos = dpos = 0;
    while (spos < src_len) {
	/*
	 * Bytes still available, including the current one.  Comparing
	 * against this avoids the size_t underflow of `src_len - k'.
	 */
	left = src_len - spos;

	if (s[spos] < 0x80)
	    c = s[spos++];
	else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
		 && (left < 2 || !IS_CONT(s[spos+1]))
		 && s[spos] >= 0xa0) {
	    /* not valid UTF-8, assume ISO 8859-1 */
	    c = s[spos++];
	}
	else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
	    /* continuation byte without lead byte
	       or lead byte for codepoint above 0x10ffff */
	    error++;
	    spos++;
	    continue;
	}
	else if (s[spos] < 0xe0) {
	    /* two-byte sequence */
	    if (left < 2 || !IS_CONT(s[spos+1])) {
		spos++;
		error++;
		continue;
	    }
	    c = (uint16_t)(((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f));
	    spos += 2;
	    if (c < 0x80) {
		/* overlong encoding */
		error++;
		continue;
	    }
	}
	else if (s[spos] < 0xf0) {
	    /* three-byte sequence */
	    if (left < 3
		|| !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
		spos++;
		error++;
		continue;
	    }
	    c = (uint16_t)(((s[spos] & 0x0f) << 12)
		| ((s[spos+1] & 0x3f) << 6)
		| (s[spos+2] & 0x3f));
	    spos += 3;
	    /* 0xf800 selects the whole surrogate range 0xd800-0xdfff */
	    if (c < 0x800 || (c & 0xf800) == 0xd800) {
		/* overlong encoding or encoded surrogate */
		error++;
		continue;
	    }
	}
	else {
	    uint32_t cc;
	    /* four-byte sequence, becomes a UTF-16 surrogate pair */

	    if (left < 4 || !IS_CONT(s[spos+1])
		|| !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
		spos++;
		error++;
		continue;
	    }
	    /* the lead byte 11110xxx carries three payload bits */
	    cc = ((uint32_t)(s[spos] & 0x07) << 18)
		 | ((uint32_t)(s[spos+1] & 0x3f) << 12)
		 | ((uint32_t)(s[spos+2] & 0x3f) << 6)
		 | (uint32_t)(s[spos+3] & 0x3f);
	    spos += 4;
	    if (cc < 0x10000 || cc > 0x10ffff) {
		/* overlong encoding or beyond the Unicode range */
		error++;
		continue;
	    }
	    cc -= 0x10000;
	    /* high surrogate: upper ten bits */
	    if (dst && dpos < dst_len)
		dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
	    dpos++;
	    /* low surrogate: lower ten bits, stored below */
	    c = (uint16_t)(0xdc00 | (cc & 0x3ff));
	}

	if (dst && dpos < dst_len)
	    dst[dpos] = c;
	dpos++;
    }

    if (errp)
	*errp = error;

    return dpos;

#undef IS_CONT
}
141
142
/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst/dst_len	output buffer and its capacity in bytes.  dst may be NULL;
 *		nothing is stored then.  As soon as one encoded character
 *		does not fit completely, storing stops for the rest of the
 *		call, but counting continues.
 * flags	accepted for symmetry with utf8_to_utf16(); not used here.
 * errp		if non-NULL, receives the number of unpaired surrogates
 *		that were skipped.
 *
 * Returns the number of bytes the complete conversion needs, which may
 * exceed dst_len.  The output is not NUL-terminated.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
    /* size_t indices: 16-bit ones wrap once src_len exceeds 65535 */
    size_t spos, dpos;
    int error;

    /* stop storing once the next l bytes no longer fit; no underflow */
#define CHECK_LENGTH(l)	\
    do { if (dst_len < (l) || dpos > dst_len - (l)) dst = NULL; } while (0)
    /* store one byte if still storing, and always count it */
#define ADD_BYTE(b)	\
    do { if (dst) dst[dpos] = (char)(b); dpos++; } while (0)

    (void)flags;

    error = 0;
    dpos = 0;
    for (spos = 0; spos < src_len; spos++) {
	uint32_t c = src[spos];

	if (c < 0x80) {
	    CHECK_LENGTH(1);
	    ADD_BYTE(c);
	}
	else if (c < 0x800) {
	    CHECK_LENGTH(2);
	    ADD_BYTE(0xc0 | (c >> 6));
	    ADD_BYTE(0x80 | (c & 0x3f));
	}
	else if ((c & 0xfc00) == 0xd800) {
	    /* first (high) surrogate, 0xd800-0xdbff */
	    if (src_len - spos < 2
		|| (src[spos+1] & 0xfc00) != 0xdc00) {
		/* no second surrogate present */
		error++;
		continue;
	    }
	    /* combine both halves before stepping over the second one */
	    c = (((c & 0x3ff) << 10) | (src[spos+1] & 0x3ffu)) + 0x10000;
	    spos++;
	    CHECK_LENGTH(4);
	    ADD_BYTE(0xf0 | (c >> 18));
	    ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
	    ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
	    ADD_BYTE(0x80 | (c & 0x3f));
	}
	else if ((c & 0xfc00) == 0xdc00) {
	    /* second surrogate without preceding first surrogate */
	    error++;
	}
	else {
	    CHECK_LENGTH(3);
	    ADD_BYTE(0xe0 | (c >> 12));
	    ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
	    ADD_BYTE(0x80 | (c & 0x3f));
	}
    }

    if (errp)
	*errp = error;

    return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
202