1/*
2 * Copyright (c) 2004 Kungliga Tekniska Högskolan
3 * (Royal Institute of Technology, Stockholm, Sweden).
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the Institute nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34#ifdef HAVE_CONFIG_H
35#include <config.h>
36#endif
37#include "windlocl.h"
38
39#include <assert.h>
40#include <stdlib.h>
41#include <errno.h>
42#include <stdio.h>
43
44#include "roken.h"
45
46#include "normalize_table.h"
47
48static int
49translation_cmp(const void *key, const void *data)
50{
51    const struct translation *t1 = (const struct translation *)key;
52    const struct translation *t2 = (const struct translation *)data;
53
54    return t1->key - t2->key;
55}
56
57enum { s_base  = 0xAC00};
58enum { s_count = 11172};
59enum { l_base  = 0x1100};
60enum { l_count = 19};
61enum { v_base  = 0x1161};
62enum { v_count = 21};
63enum { t_base  = 0x11A7};
64enum { t_count = 28};
65enum { n_count = v_count * t_count};
66
67static int
68hangul_decomp(const uint32_t *in, size_t in_len,
69	      uint32_t *out, size_t *out_len)
70{
71    uint32_t u = *in;
72    unsigned s_index;
73    unsigned l, v, t;
74    unsigned o;
75
76    if (u < s_base || u >= s_base + s_count)
77	return 0;
78    s_index = u - s_base;
79    l = l_base + s_index / n_count;
80    v = v_base + (s_index % n_count) / t_count;
81    t = t_base + s_index % t_count;
82    o = 2;
83    if (t != t_base)
84	++o;
85    if (*out_len < o)
86	return WIND_ERR_OVERRUN;
87    out[0] = l;
88    out[1] = v;
89    if (t != t_base)
90	out[2] = t;
91    *out_len = o;
92    return 1;
93}
94
95static uint32_t
96hangul_composition(const uint32_t *in, size_t in_len)
97{
98    if (in_len < 2)
99	return 0;
100    if (in[0] >= l_base && in[0] < l_base + l_count) {
101	unsigned l_index = in[0] - l_base;
102	unsigned v_index;
103
104	if (in[1] < v_base || in[1] >= v_base + v_count)
105	    return 0;
106	v_index = in[1] - v_base;
107	return (l_index * v_count + v_index) * t_count + s_base;
108    } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109	unsigned s_index = in[0] - s_base;
110	unsigned t_index;
111
112	if (s_index % t_count != 0)
113	    return 0;
114	if (in[1] < t_base || in[1] >= t_base + t_count)
115	    return 0;
116	t_index = in[1] - t_base;
117	return in[0] + t_index;
118    }
119    return 0;
120}
121
122static int
123compat_decomp(const uint32_t *in, size_t in_len,
124	      uint32_t *out, size_t *out_len)
125{
126    unsigned i;
127    unsigned o = 0;
128
129    for (i = 0; i < in_len; ++i) {
130	struct translation ts = {in[i]};
131	size_t sub_len = *out_len - o;
132	int ret;
133
134	ret = hangul_decomp(in + i, in_len - i,
135			    out + o, &sub_len);
136	if (ret) {
137	    if (ret == WIND_ERR_OVERRUN)
138		return ret;
139	    o += sub_len;
140	} else {
141	    void *s = bsearch(&ts,
142			      _wind_normalize_table,
143			      _wind_normalize_table_size,
144			      sizeof(_wind_normalize_table[0]),
145			      translation_cmp);
146	    if (s != NULL) {
147		const struct translation *t = (const struct translation *)s;
148
149		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150				    t->val_len,
151				    out + o, &sub_len);
152		if (ret)
153		    return ret;
154		o += sub_len;
155	    } else {
156		if (o >= *out_len)
157		    return WIND_ERR_OVERRUN;
158		out[o++] = in[i];
159
160	    }
161	}
162    }
163    *out_len = o;
164    return 0;
165}
166
167static void
168swap_char(uint32_t * a, uint32_t * b)
169{
170    uint32_t t;
171    t = *a;
172    *a = *b;
173    *b = t;
174}
175
176/* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177 * that all have Canonical_Combining_Class > 0 */
178static void
179canonical_reorder_sequence(uint32_t * a, size_t len)
180{
181    size_t i, j;
182
183    if (len <= 1)
184	return;
185
186    for (i = 1; i < len; i++) {
187	for (j = i;
188	     j > 0 &&
189		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
190	     j--)
191	    swap_char(&a[j], &a[j-1]);
192    }
193}
194
195static void
196canonical_reorder(uint32_t *tmp, size_t tmp_len)
197{
198    size_t i;
199
200    for (i = 0; i < tmp_len; ++i) {
201	int cc = _wind_combining_class(tmp[i]);
202	if (cc) {
203	    size_t j;
204	    for (j = i + 1;
205		 j < tmp_len && _wind_combining_class(tmp[j]);
206		 ++j)
207		;
208	    canonical_reorder_sequence(&tmp[i], j - i);
209	    i = j;
210	}
211    }
212}
213
214static uint32_t
215find_composition(const uint32_t *in, unsigned in_len)
216{
217    unsigned short canon_index = 0;
218    uint32_t cur;
219    unsigned n = 0;
220
221    cur = hangul_composition(in, in_len);
222    if (cur)
223	return cur;
224
225    do {
226	const struct canon_node *c = &_wind_canon_table[canon_index];
227	unsigned i;
228
229	if (n % 5 == 0) {
230	    cur = *in++;
231	    if (in_len-- == 0)
232		return c->val;
233	}
234
235	i = cur >> 16;
236	if (i < c->next_start || i >= c->next_end)
237	    canon_index = 0;
238	else
239	    canon_index =
240		_wind_canon_next_table[c->next_offset + i - c->next_start];
241	if (canon_index != 0) {
242	    cur = (cur << 4) & 0xFFFFF;
243	    ++n;
244	}
245    } while (canon_index != 0);
246    return 0;
247}
248
249static int
250combine(const uint32_t *in, size_t in_len,
251	uint32_t *out, size_t *out_len)
252{
253    unsigned i;
254    int ostarter;
255    unsigned o = 0;
256    int old_cc;
257
258    for (i = 0; i < in_len;) {
259	while (i < in_len && _wind_combining_class(in[i]) != 0) {
260	    out[o++] = in[i++];
261	}
262	if (i < in_len) {
263	    if (o >= *out_len)
264		return WIND_ERR_OVERRUN;
265	    ostarter = o;
266	    out[o++] = in[i++];
267	    old_cc   = -1;
268
269	    while (i < in_len) {
270		uint32_t comb;
271		uint32_t v[2];
272		int cc;
273
274		v[0] = out[ostarter];
275		v[1] = in[i];
276
277		cc = _wind_combining_class(in[i]);
278		if (old_cc != cc && (comb = find_composition(v, 2))) {
279		    out[ostarter] = comb;
280		} else if (cc == 0) {
281		    break;
282		} else {
283		    if (o >= *out_len)
284			return WIND_ERR_OVERRUN;
285		    out[o++] = in[i];
286		    old_cc   = cc;
287		}
288		++i;
289	    }
290	}
291    }
292    *out_len = o;
293    return 0;
294}
295
296int
297_wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298			   uint32_t *out, size_t *out_len)
299{
300    size_t tmp_len;
301    uint32_t *tmp;
302    int ret;
303
304    if (in_len == 0) {
305	*out_len = 0;
306	return 0;
307    }
308
309    tmp_len = in_len * 4;
310    if (tmp_len < MAX_LENGTH_CANON)
311	tmp_len = MAX_LENGTH_CANON;
312    tmp = malloc(tmp_len * sizeof(uint32_t));
313    if (tmp == NULL)
314	return ENOMEM;
315
316    ret = compat_decomp(in, in_len, tmp, &tmp_len);
317    if (ret) {
318	free(tmp);
319	return ret;
320    }
321    canonical_reorder(tmp, tmp_len);
322    ret = combine(tmp, tmp_len, out, out_len);
323    free(tmp);
324    return ret;
325}
326