1/*
2 * Copyright (C) 1999-2001, 2005 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
4 *
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18 * Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20
21/*
22 * CP950
23 */
24
25/*
26 * Microsoft CP950 is a slightly extended and slightly modified version of
27 * BIG5. The differences between the EASTASIA/OTHER/BIG5.TXT and
28 * VENDORS/MICSFT/WINDOWS/CP950.TXT tables found on ftp.unicode.org are
29 * as follows:
30 *
31 * 1. Some characters in the BIG5 range are defined differently:
32 *
33 *     code   BIG5.TXT                       CP950.TXT
34 *    0xA145  0x2022 # BULLET                0x2027 # HYPHENATION POINT
35 *    0xA14E  0xFF64 # HALFWIDTH IDEOGRAPHIC COMMA
36 *                                           0xFE51 # SMALL IDEOGRAPHIC COMMA
37 *    0xA15A    ---                          0x2574 # BOX DRAWINGS LIGHT LEFT
38 *    0xA1C2  0x203E # OVERLINE              0x00AF # MACRON
39 *    0xA1C3    ---                          0xFFE3 # FULLWIDTH MACRON
40 *    0xA1C5    ---                          0x02CD # MODIFIER LETTER LOW MACRON
41 *    0xA1E3  0x223C # TILDE OPERATOR        0xFF5E # FULLWIDTH TILDE
42 *    0xA1F2  0x2641 # EARTH                 0x2295 # CIRCLED PLUS
43 *    0xA1F3  0x2609 # SUN                   0x2299 # CIRCLED DOT OPERATOR
44 *    0xA1FE    ---                          0xFF0F # FULLWIDTH SOLIDUS
45 *    0xA240    ---                          0xFF3C # FULLWIDTH REVERSE SOLIDUS
46 *    0xA241  0xFF0F # FULLWIDTH SOLIDUS     0x2215 # DIVISION SLASH
47 *    0xA242  0xFF3C # FULLWIDTH REVERSE SOLIDUS
48 *                                           0xFE68 # SMALL REVERSE SOLIDUS
49 *    0xA244  0x00A5 # YEN SIGN              0xFFE5 # FULLWIDTH YEN SIGN
50 *    0xA246  0x00A2 # CENT SIGN             0xFFE0 # FULLWIDTH CENT SIGN
51 *    0xA247  0x00A3 # POUND SIGN            0xFFE1 # FULLWIDTH POUND SIGN
52 *    0xA2CC    ---                          0x5341
53 *    0xA2CE    ---                          0x5345
54 *
55 * 2. A small new row. See cp950ext.h.
56 *
57 * 3. CP950.TXT is lacking the range 0xC6A1..0xC7FC (Hiragana, Katakana,
58 *    Cyrillic, circled digits, parenthesized digits).
59 *
60 *    We implement this omission, because said range is marked "uncertain"
61 *    in the unicode.org BIG5 table.
62 *
63 * The table found on Microsoft's website furthermore adds:
64 *
65 * 4. A single character:
66 *
67 *     code   CP950.TXT
68 *    0xA3E1  0x20AC # EURO SIGN
69 *
70 * Many variants of BIG5 or CP950 (in JDK, Solaris, OSF/1, Windows-2000, ICU,
71 * as well as our BIG5-2003 converter) also add:
72 *
73 * 5. Private area mappings:
74 *
75 *              code                 Unicode
76 *    0x{81..8D}{40..7E,A1..FE}  U+EEB8..U+F6B0
77 *    0x{8E..A0}{40..7E,A1..FE}  U+E311..U+EEB7
78 *    0x{FA..FE}{40..7E,A1..FE}  U+E000..U+E310
79 *
80 * We add them too because, although there are backward compatibility problems
81 * when a character from a private area is moved to an official Unicode code
82 * point, they are useful for some people in practice.
83 */
84
/*
 * Unicode values for the CP950 lead bytes 0xA1 and 0xA2, which override the
 * plain BIG5 assignments.  Two rows of 157 entries each; within a row the
 * entries follow the trail bytes 0x40..0x7E and then 0xA1..0xFE, i.e. the
 * column is (trail - 0x40) for the low range and (trail - 0x62) for the high
 * range.  The comment at the end of each line gives the trail bytes covered.
 */
static const unsigned short cp950_2uni_pagea1[2 * 157] = {
  /* lead byte 0xa1 */
  0x3000, 0xff0c, 0x3001, 0x3002, 0xff0e, 0x2027, 0xff1b, 0xff1a, /* 40..47 */
  0xff1f, 0xff01, 0xfe30, 0x2026, 0x2025, 0xfe50, 0xfe51, 0xfe52, /* 48..4f */
  0x00b7, 0xfe54, 0xfe55, 0xfe56, 0xfe57, 0xff5c, 0x2013, 0xfe31, /* 50..57 */
  0x2014, 0xfe33, 0x2574, 0xfe34, 0xfe4f, 0xff08, 0xff09, 0xfe35, /* 58..5f */
  0xfe36, 0xff5b, 0xff5d, 0xfe37, 0xfe38, 0x3014, 0x3015, 0xfe39, /* 60..67 */
  0xfe3a, 0x3010, 0x3011, 0xfe3b, 0xfe3c, 0x300a, 0x300b, 0xfe3d, /* 68..6f */
  0xfe3e, 0x3008, 0x3009, 0xfe3f, 0xfe40, 0x300c, 0x300d, 0xfe41, /* 70..77 */
  0xfe42, 0x300e, 0x300f, 0xfe43, 0xfe44, 0xfe59, 0xfe5a, 0xfe5b, /* 78..7e, a1 */
  0xfe5c, 0xfe5d, 0xfe5e, 0x2018, 0x2019, 0x201c, 0x201d, 0x301d, /* a2..a9 */
  0x301e, 0x2035, 0x2032, 0xff03, 0xff06, 0xff0a, 0x203b, 0x00a7, /* aa..b1 */
  0x3003, 0x25cb, 0x25cf, 0x25b3, 0x25b2, 0x25ce, 0x2606, 0x2605, /* b2..b9 */
  0x25c7, 0x25c6, 0x25a1, 0x25a0, 0x25bd, 0x25bc, 0x32a3, 0x2105, /* ba..c1 */
  0x00af, 0xffe3, 0xff3f, 0x02cd, 0xfe49, 0xfe4a, 0xfe4d, 0xfe4e, /* c2..c9 */
  0xfe4b, 0xfe4c, 0xfe5f, 0xfe60, 0xfe61, 0xff0b, 0xff0d, 0x00d7, /* ca..d1 */
  0x00f7, 0x00b1, 0x221a, 0xff1c, 0xff1e, 0xff1d, 0x2266, 0x2267, /* d2..d9 */
  0x2260, 0x221e, 0x2252, 0x2261, 0xfe62, 0xfe63, 0xfe64, 0xfe65, /* da..e1 */
  0xfe66, 0xff5e, 0x2229, 0x222a, 0x22a5, 0x2220, 0x221f, 0x22bf, /* e2..e9 */
  0x33d2, 0x33d1, 0x222b, 0x222e, 0x2235, 0x2234, 0x2640, 0x2642, /* ea..f1 */
  0x2295, 0x2299, 0x2191, 0x2193, 0x2190, 0x2192, 0x2196, 0x2197, /* f2..f9 */
  0x2199, 0x2198, 0x2225, 0x2223, 0xff0f,                         /* fa..fe */
  /* lead byte 0xa2 */
  0xff3c, 0x2215, 0xfe68, 0xff04, 0xffe5, 0x3012, 0xffe0, 0xffe1, /* 40..47 */
  0xff05, 0xff20, 0x2103, 0x2109, 0xfe69, 0xfe6a, 0xfe6b, 0x33d5, /* 48..4f */
  0x339c, 0x339d, 0x339e, 0x33ce, 0x33a1, 0x338e, 0x338f, 0x33c4, /* 50..57 */
  0x00b0, 0x5159, 0x515b, 0x515e, 0x515d, 0x5161, 0x5163, 0x55e7, /* 58..5f */
  0x74e9, 0x7cce, 0x2581, 0x2582, 0x2583, 0x2584, 0x2585, 0x2586, /* 60..67 */
  0x2587, 0x2588, 0x258f, 0x258e, 0x258d, 0x258c, 0x258b, 0x258a, /* 68..6f */
  0x2589, 0x253c, 0x2534, 0x252c, 0x2524, 0x251c, 0x2594, 0x2500, /* 70..77 */
  0x2502, 0x2595, 0x250c, 0x2510, 0x2514, 0x2518, 0x256d, 0x256e, /* 78..7e, a1 */
  0x2570, 0x256f, 0x2550, 0x255e, 0x256a, 0x2561, 0x25e2, 0x25e3, /* a2..a9 */
  0x25e5, 0x25e4, 0x2571, 0x2572, 0x2573, 0xff10, 0xff11, 0xff12, /* aa..b1 */
  0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0x2160, /* b2..b9 */
  0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, /* ba..c1 */
  0x2169, 0x3021, 0x3022, 0x3023, 0x3024, 0x3025, 0x3026, 0x3027, /* c2..c9 */
  0x3028, 0x3029, 0x5341, 0x5344, 0x5345, 0xff21, 0xff22, 0xff23, /* ca..d1 */
  0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, 0xff2a, 0xff2b, /* d2..d9 */
  0xff2c, 0xff2d, 0xff2e, 0xff2f, 0xff30, 0xff31, 0xff32, 0xff33, /* da..e1 */
  0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, 0xff3a, 0xff41, /* e2..e9 */
  0xff42, 0xff43, 0xff44, 0xff45, 0xff46, 0xff47, 0xff48, 0xff49, /* ea..f1 */
  0xff4a, 0xff4b, 0xff4c, 0xff4d, 0xff4e, 0xff4f, 0xff50, 0xff51, /* f2..f9 */
  0xff52, 0xff53, 0xff54, 0xff55, 0xff56,                         /* fa..fe */
};
129
130#include "cp950ext.h"
131
132static int
133cp950_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
134{
135  unsigned char c = *s;
136  /* Code set 0 (ASCII) */
137  if (c < 0x80)
138    return ascii_mbtowc(conv,pwc,s,n);
139  /* Code set 1 (BIG5 extended) */
140  if (c >= 0x81 && c < 0xff) {
141    if (n < 2)
142      return RET_TOOFEW(0);
143    {
144      unsigned char c2 = s[1];
145      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff)) {
146        if (c >= 0xa1) {
147          if (c < 0xa3) {
148            unsigned int i = 157 * (c - 0xa1) + (c2 - (c2 >= 0xa1 ? 0x62 : 0x40));
149            unsigned short wc = cp950_2uni_pagea1[i];
150            if (wc != 0xfffd) {
151              *pwc = (ucs4_t) wc;
152              return 2;
153            }
154          }
155          if (!((c == 0xc6 && c2 >= 0xa1) || c == 0xc7)) {
156            int ret = big5_mbtowc(conv,pwc,s,2);
157            if (ret != RET_ILSEQ)
158              return ret;
159          }
160          if (c == 0xa3 && c2 == 0xe1) {
161            *pwc = 0x20ac;
162            return 2;
163          }
164          if (c >= 0xfa) {
165            /* User-defined characters */
166            *pwc = 0xe000 + 157 * (c - 0xfa) + (c2 - (c2 >= 0xa1 ? 0x62 : 0x40));
167            return 2;
168          }
169        } else {
170          /* 0x81 <= c < 0xa1. */
171          /* User-defined characters */
172          *pwc = (c >= 0x8e ? 0xdb18 : 0xeeb8) + 157 * (c - 0x81)
173                 + (c2 - (c2 >= 0xa1 ? 0x62 : 0x40));
174          return 2;
175        }
176      }
177    }
178    if (c == 0xf9) {
179      int ret = cp950ext_mbtowc(conv,pwc,s,2);
180      if (ret != RET_ILSEQ)
181        return ret;
182    }
183  }
184  return RET_ILSEQ;
185}
186
187static int
188cp950_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
189{
190  unsigned char buf[2];
191  int ret;
192
193  /* Code set 0 (ASCII) */
194  ret = ascii_wctomb(conv,r,wc,n);
195  if (ret != RET_ILUNI)
196    return ret;
197
198  /* Code set 1 (BIG5 extended) */
199  switch (wc >> 8) {
200    case 0x00:
201      if (wc == 0x00af) { buf[0] = 0xa1; buf[1] = 0xc2; ret = 2; break; }
202      if (wc == 0x00a2 || wc == 0x00a3 || wc == 0x00a4)
203        return RET_ILUNI;
204      break;
205    case 0x02:
206      if (wc == 0x02cd) { buf[0] = 0xa1; buf[1] = 0xc5; ret = 2; break; }
207      break;
208    case 0x20:
209      if (wc == 0x2027) { buf[0] = 0xa1; buf[1] = 0x45; ret = 2; break; }
210      if (wc == 0x20ac) { buf[0] = 0xa3; buf[1] = 0xe1; ret = 2; break; }
211      if (wc == 0x2022 || wc == 0x203e)
212        return RET_ILUNI;
213      break;
214    case 0x22:
215      if (wc == 0x2215) { buf[0] = 0xa2; buf[1] = 0x41; ret = 2; break; }
216      if (wc == 0x2295) { buf[0] = 0xa1; buf[1] = 0xf2; ret = 2; break; }
217      if (wc == 0x2299) { buf[0] = 0xa1; buf[1] = 0xf3; ret = 2; break; }
218      if (wc == 0x223c)
219        return RET_ILUNI;
220      break;
221    case 0x25:
222      if (wc == 0x2574) { buf[0] = 0xa1; buf[1] = 0x5a; ret = 2; break; }
223      break;
224    case 0x26:
225      if (wc == 0x2609 || wc == 0x2641)
226        return RET_ILUNI;
227      break;
228    case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5:
229    case 0xe6: case 0xe7: case 0xe8: case 0xe9: case 0xea: case 0xeb:
230    case 0xec: case 0xed: case 0xee: case 0xef: case 0xf0: case 0xf1:
231    case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6:
232      {
233        /* User-defined characters */
234        unsigned int i = wc - 0xe000;
235        if (i < 5809) {
236          unsigned int c1 = i / 157;
237          unsigned int c2 = i % 157;
238          buf[0] = c1 + (c1 < 5 ? 0xfa : c1 < 24 ? 0x89 : 0x69);
239          buf[1] = c2 + (c2 < 0x3f ? 0x40 : 0x62);
240          ret = 2;
241          break;
242        }
243      }
244      break;
245    case 0xfe:
246      if (wc == 0xfe51) { buf[0] = 0xa1; buf[1] = 0x4e; ret = 2; break; }
247      if (wc == 0xfe68) { buf[0] = 0xa2; buf[1] = 0x42; ret = 2; break; }
248      break;
249    case 0xff:
250      if (wc == 0xff0f) { buf[0] = 0xa1; buf[1] = 0xfe; ret = 2; break; }
251      if (wc == 0xff3c) { buf[0] = 0xa2; buf[1] = 0x40; ret = 2; break; }
252      if (wc == 0xff5e) { buf[0] = 0xa1; buf[1] = 0xe3; ret = 2; break; }
253      if (wc == 0xffe0) { buf[0] = 0xa2; buf[1] = 0x46; ret = 2; break; }
254      if (wc == 0xffe1) { buf[0] = 0xa2; buf[1] = 0x47; ret = 2; break; }
255      if (wc == 0xffe3) { buf[0] = 0xa1; buf[1] = 0xc3; ret = 2; break; }
256      if (wc == 0xffe5) { buf[0] = 0xa2; buf[1] = 0x44; ret = 2; break; }
257      if (wc == 0xff64)
258        return RET_ILUNI;
259      break;
260  }
261  if (ret == RET_ILUNI)
262    ret = big5_wctomb(conv,buf,wc,2);
263  if (ret != RET_ILUNI) {
264    if (ret != 2) abort();
265    if (!((buf[0] == 0xc6 && buf[1] >= 0xa1) || buf[0] == 0xc7)) {
266      if (n < 2)
267        return RET_TOOSMALL;
268      r[0] = buf[0];
269      r[1] = buf[1];
270      return 2;
271    }
272  }
273  ret = cp950ext_wctomb(conv,buf,wc,2);
274  if (ret != RET_ILUNI) {
275    if (ret != 2) abort();
276    if (n < 2)
277      return RET_TOOSMALL;
278    r[0] = buf[0];
279    r[1] = buf[1];
280    return 2;
281  }
282
283  return RET_ILUNI;
284}
285