1254225Speter/*-
2254225Speter * Copyright (c) 2011, 2012
3254225Speter *	Zhihao Yuan.  All rights reserved.
4254225Speter *
5254225Speter * See the LICENSE file for redistribution information.
6254225Speter */
7254225Speter
8254225Speter#ifndef lint
9254225Speterstatic const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
10254225Speter#endif /* not lint */
11254225Speter
12254225Speter#include <sys/types.h>
13254225Speter
14254225Speterint looks_utf8 __P((const char *, size_t));
15254225Speterint looks_utf16 __P((const char *, size_t));
16254225Speterint decode_utf8 __P((const char *));
17254225Speterint decode_utf16 __P((const char *, int));
18254225Speter
/*
 * Byte classes stored in text_chars[].  The macros stay defined after the
 * table so that code below can compare table entries against them.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *  Classification of every possible byte value; index with the byte
 *  (0x00-0xff) to obtain one of F, T, I or X.  The table is read-only,
 *  hence const, and unsigned so entries compare cleanly with the macros.
 */
static const unsigned char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
45254225Speter
46254225Speter/*
47254225Speter * looks_utf8 --
48254225Speter *  Decide whether some text looks like UTF-8. Returns:
49254225Speter *
50254225Speter *     -1: invalid UTF-8
51254225Speter *      0: uses odd control characters, so doesn't look like text
52254225Speter *      1: 7-bit text
53254225Speter *      2: definitely UTF-8 text (valid high-bit set bytes)
54254225Speter *
55254225Speter *  Based on RFC 3629. UTF-8 with BOM is not accepted.
56254225Speter *
57254225Speter * PUBLIC: int looks_utf8 __P((const char *, size_t));
58254225Speter */
59254225Speterint
60254225Speterlooks_utf8(const char *ibuf, size_t nbytes)
61254225Speter{
62254225Speter	const u_char *buf = (u_char *)ibuf;
63254225Speter	size_t i;
64254225Speter	int n;
65254225Speter	int gotone = 0, ctrl = 0;
66254225Speter
67254225Speter	for (i = 0; i < nbytes; i++) {
68254225Speter		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
69254225Speter			/*
70254225Speter			 * Even if the whole file is valid UTF-8 sequences,
71254225Speter			 * still reject it if it uses weird control characters.
72254225Speter			 */
73254225Speter
74254225Speter			if (text_chars[buf[i]] != T)
75254225Speter				ctrl = 1;
76254225Speter		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
77254225Speter			return -1;
78254225Speter		} else {			   /* 11xxxxxx begins UTF-8 */
79254225Speter			int following;
80254225Speter
81254225Speter			if ((buf[i] & 0x20) == 0)	/* 110xxxxx */
82254225Speter				if (buf[i] > 0xC1)	/* C0, C1 */
83254225Speter					following = 1;
84254225Speter				else return -1;
85254225Speter			else if ((buf[i] & 0x10) == 0)	/* 1110xxxx */
86254225Speter				following = 2;
87254225Speter			else if ((buf[i] & 0x08) == 0)	/* 11110xxx */
88254225Speter				if (buf[i] < 0xF5)
89254225Speter					following = 3;
90254225Speter				else return -1;		/* F5, F6, F7 */
91254225Speter			else
92254225Speter				return -1;		/* F8~FF */
93254225Speter
94254225Speter			for (n = 0; n < following; n++) {
95254225Speter				i++;
96254225Speter				if (i >= nbytes)
97254225Speter					goto done;
98254225Speter
99254225Speter				if (buf[i] & 0x40)	/* 10xxxxxx */
100254225Speter					return -1;
101254225Speter			}
102254225Speter
103254225Speter			gotone = 1;
104254225Speter		}
105254225Speter	}
106254225Speterdone:
107254225Speter	return ctrl ? 0 : (gotone ? 2 : 1);
108254225Speter}
109254225Speter
110254225Speter/*
111254225Speter * looks_utf16 --
112254225Speter *  Decide whether some text looks like UTF-16. Returns:
113254225Speter *
114254225Speter *      0: invalid UTF-16
115254225Speter *      1: Little-endian UTF-16
116254225Speter *      2: Big-endian UTF-16
117254225Speter *
118254225Speter * PUBLIC: int looks_utf16 __P((const char *, size_t));
119254225Speter */
120254225Speterint
121254225Speterlooks_utf16(const char *ibuf, size_t nbytes)
122254225Speter{
123254225Speter	const u_char *buf = (u_char *)ibuf;
124254225Speter	int bigend;
125254225Speter	size_t i;
126254225Speter	unsigned int c;
127254225Speter	int bom;
128254225Speter	int following = 0;
129254225Speter
130254225Speter	if (nbytes < 2)
131254225Speter		return 0;
132254225Speter
133254225Speter	bom = buf[0] << 8 ^ buf[1];
134254225Speter	if (bom == 0xFFFE)
135254225Speter		bigend = 0;
136254225Speter	else if (bom == 0xFEFF)
137254225Speter		bigend = 1;
138254225Speter	else
139254225Speter		return 0;
140254225Speter
141254225Speter	for (i = 2; i + 1 < nbytes; i += 2) {
142254225Speter		if (bigend)
143254225Speter			c = buf[i] << 8 ^ buf[i + 1];
144254225Speter		else
145254225Speter			c = buf[i] ^ buf[i + 1] << 8;
146254225Speter
147254225Speter		if (!following)
148254225Speter			if (c < 0xD800 || c > 0xDFFF)
149254225Speter				if (c < 128 && text_chars[c] != T)
150254225Speter					return 0;
151254225Speter				else
152254225Speter					following = 0;
153254225Speter			else if (c > 0xDBFF)
154254225Speter				return 0;
155254225Speter			else {
156254225Speter				following = 1;
157254225Speter				continue;
158254225Speter			}
159254225Speter		else if (c < 0xDC00 || c > 0xDFFF)
160254225Speter			return 0;
161254225Speter	}
162254225Speter
163254225Speter	return 1 + bigend;
164254225Speter}
165254225Speter
166254225Speter#undef F
167254225Speter#undef T
168254225Speter#undef I
169254225Speter#undef X
170254225Speter
/*
 * decode_utf8 --
 *  Decode one UTF-8 encoded character, starting at ibuf, into its
 *  Unicode code point.  Returns -1 if the first byte is not a UTF-8
 *  leader (10xxxxxx or 11111xxx).
 *
 *  Based on RFC 3629, but without error detection: the trailing bytes
 *  implied by the leader are read and combined without being validated,
 *  so the caller must make sure they are present.
 *
 * PUBLIC: int decode_utf8 __P((const char *));
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const int lead = p[0];

	/* 0xxxxxxx: plain ASCII, the byte is the code point. */
	if (lead < 0x80)
		return lead;

	/* 10xxxxxx: a trailing byte cannot start a character. */
	if (lead < 0xC0)
		return -1;

	/* 110xxxxx: two-byte sequence. */
	if (lead < 0xE0)
		return ((lead ^ 0xC0) << 6) ^ (p[1] ^ 0x80);

	/* 1110xxxx: three-byte sequence. */
	if (lead < 0xF0)
		return ((lead ^ 0xE0) << 12) ^ ((p[1] ^ 0x80) << 6) ^
		    (p[2] ^ 0x80);

	/* 11110xxx: four-byte sequence. */
	if (lead < 0xF8)
		return ((lead ^ 0xF0) << 18) ^ ((p[1] ^ 0x80) << 12) ^
		    ((p[2] ^ 0x80) << 6) ^ (p[3] ^ 0x80);

	/* 11111xxx: not a leader. */
	return -1;
}
199254225Speter
/*
 * decode_utf16 --
 *  Decode one UTF-16 encoded character, starting at ibuf, into its
 *  Unicode code point.  bigend selects the byte order: non-zero for
 *  big-endian, zero for little-endian.  Returns -1 if the first 16-bit
 *  unit is a low surrogate (DC00-DFFF).
 *
 *  When the first unit is a high surrogate the following unit is read
 *  and combined with it without being validated, so the caller must make
 *  sure it is present.
 *
 * PUBLIC: int decode_utf16 __P((const char *, int));
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	/* Offsets, within a 2-byte unit, of the high and low order bytes. */
	const int hi = bigend ? 0 : 1;
	const int lo = 1 - hi;
	unsigned int first, second;

	first = ((unsigned int)p[hi] << 8) | p[lo];

	/* Outside the surrogate range: the unit is the code point. */
	if (first < 0xD800 || first > 0xDFFF)
		return (int)first;

	/* A low surrogate cannot start a character. */
	if (first > 0xDBFF)
		return -1;

	/* High surrogate: merge with the next unit. */
	second = ((unsigned int)p[2 + hi] << 8) | p[2 + lo];
	return (int)((((first ^ 0xD800) << 10) ^ (second ^ 0xDC00)) + 0x10000);
}
231