1/* $NetBSD: citrus_utf1632.c,v 1.11 2010/03/20 18:15:32 tnozaki Exp $ */ 2 3/*- 4 * Copyright (c)2003 Citrus Project, 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29#include <sys/cdefs.h> 30#if defined(LIBC_SCCS) && !defined(lint) 31__RCSID("$NetBSD: citrus_utf1632.c,v 1.11 2010/03/20 18:15:32 tnozaki Exp $"); 32#endif /* LIBC_SCCS and not lint */ 33 34#include <assert.h> 35#include <errno.h> 36#include <string.h> 37#include <stdio.h> 38#include <stdlib.h> 39#include <stddef.h> 40#include <limits.h> 41#include <wchar.h> 42#include <sys/types.h> 43#include <machine/endian.h> 44 45#include "citrus_namespace.h" 46#include "citrus_types.h" 47#include "citrus_module.h" 48#include "citrus_stdenc.h" 49#include "citrus_bcs.h" 50 51#include "citrus_utf1632.h" 52 53 54/* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58typedef struct { 59 u_int8_t ch[4]; 60 int chlen; 61 int current_endian; 62} _UTF1632State; 63 64typedef struct { 65 int preffered_endian; 66 unsigned int cur_max; 67#define _ENDIAN_UNKNOWN 0 68#define _ENDIAN_BIG 1 69#define _ENDIAN_LITTLE 2 70 u_int32_t mode; 71#define _MODE_UTF32 0x00000001U 72#define _MODE_FORCE_ENDIAN 0x00000002U 73} _UTF1632EncodingInfo; 74 75#define _FUNCNAME(m) _citrus_UTF1632_##m 76#define _ENCODING_INFO _UTF1632EncodingInfo 77#define _ENCODING_STATE _UTF1632State 78#define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 79#define _ENCODING_IS_STATE_DEPENDENT 0 80#define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 81 82 83static __inline void 84/*ARGSUSED*/ 85_citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s) 86{ 87 memset(s, 0, sizeof(*s)); 88} 89 90static int 91_citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 92 const char **s, size_t n, _UTF1632State *psenc, 93 size_t *nresult) 94{ 95 int chlenbak, endian, needlen; 96 wchar_t wc; 97 size_t result; 98 const char *s0; 99 100 _DIAGASSERT(nresult != 0); 101 _DIAGASSERT(ei != NULL); 102 _DIAGASSERT(s != NULL); 103 _DIAGASSERT(psenc != NULL); 104 105 s0 = *s; 106 107 if (s0 == NULL) { 108 _citrus_UTF1632_init_state(ei, psenc); 109 *nresult = 0; /* state independent */ 110 return (0); 111 } 112 113 result = 0; 114 chlenbak = psenc->chlen; 115 116refetch: 117 if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2) 118 needlen = 4; 119 else 120 needlen = 2; 121 122 while (chlenbak < needlen) { 123 if (n==0) 124 goto restart; 125 psenc->ch[chlenbak++] = *s0++; 126 n--; 127 result++; 128 } 129 130 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 131 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 132 /* judge endian marker */ 133 if ((ei->mode & _MODE_UTF32) == 0) { 134 /* UTF16 */ 135 if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) { 136 psenc->current_endian = _ENDIAN_BIG; 137 chlenbak = 0; 138 goto refetch; 139 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) { 140 psenc->current_endian = _ENDIAN_LITTLE; 141 chlenbak = 0; 142 goto refetch; 143 } 144 } else { 145 /* UTF32 */ 146 if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 && 147 psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) { 148 psenc->current_endian = _ENDIAN_BIG; 149 chlenbak = 0; 150 goto refetch; 151 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE && 152 psenc->ch[2]==0x00 && psenc->ch[3]==0x00) { 153 psenc->current_endian = _ENDIAN_LITTLE; 154 chlenbak = 0; 155 goto refetch; 156 } 157 } 158 } 159 psenc->current_endian = ei->preffered_endian; 160 } 161 endian = psenc->current_endian; 162 163 /* get wc */ 164 if ((ei->mode & _MODE_UTF32) == 0) { 165 /* UTF16 */ 166 if (needlen==2) { 167 switch (endian) { 168 case _ENDIAN_LITTLE: 169 wc = (psenc->ch[0] | 170 ((wchar_t)psenc->ch[1] << 8)); 171 break; 172 case _ENDIAN_BIG: 173 wc = (psenc->ch[1] | 174 ((wchar_t)psenc->ch[0] << 8)); 175 break; 176 default: 177 goto ilseq; 178 } 179 if (wc >= 0xD800 && wc <= 0xDBFF) { 180 /* surrogate high */ 181 needlen=4; 182 goto refetch; 183 } 184 } else { 185 /* surrogate low */ 186 wc -= 0xD800; /* wc : surrogate high (see above) */ 187 wc <<= 10; 188 switch (endian) { 189 case _ENDIAN_LITTLE: 190 if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF) 191 goto ilseq; 192 wc |= psenc->ch[2]; 193 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 194 break; 195 case _ENDIAN_BIG: 196 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 197 goto ilseq; 198 wc |= psenc->ch[3]; 199 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 200 break; 201 default: 202 goto ilseq; 203 } 204 wc += 0x10000; 205 } 206 } else { 207 /* UTF32 */ 208 switch (endian) { 209 case _ENDIAN_LITTLE: 210 wc = (psenc->ch[0] | 211 ((wchar_t)psenc->ch[1] << 8) | 212 ((wchar_t)psenc->ch[2] << 16) | 213 ((wchar_t)psenc->ch[3] << 24)); 214 break; 215 case _ENDIAN_BIG: 216 wc = (psenc->ch[3] | 217 ((wchar_t)psenc->ch[2] << 8) | 218 ((wchar_t)psenc->ch[1] << 16) | 219 ((wchar_t)psenc->ch[0] << 24)); 220 break; 221 default: 222 goto ilseq; 223 } 224 if (wc >= 0xD800 && wc <= 0xDFFF) 225 goto ilseq; 226 } 227 228 229 *pwc = wc; 230 psenc->chlen = 0; 231 *nresult = result; 232 *s = s0; 233 234 return (0); 235 236ilseq: 237 *nresult = (size_t)-1; 238 psenc->chlen = 0; 239 return (EILSEQ); 240 241restart: 242 *nresult = (size_t)-2; 243 psenc->chlen = chlenbak; 244 *s = s0; 245 return (0); 246} 247 248static int 249_citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 250 wchar_t wc, _UTF1632State *psenc, 251 size_t *nresult) 252{ 253 wchar_t wc2; 254 static const char _bom[4] = { 255#if BYTE_ORDER == BIG_ENDIAN 256 0x00, 0x00, 0xFE, 0xFF, 257#else 258 0xFF, 0xFE, 0x00, 0x00, 259#endif 260 }; 261 const char *bom = &_bom[0]; 262 size_t cnt; 263 264 _DIAGASSERT(ei != NULL); 265 _DIAGASSERT(nresult != 0); 266 _DIAGASSERT(s != NULL); 267 268 cnt = (size_t)0; 269 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 270 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 271 if (ei->mode & _MODE_UTF32) { 272 cnt = 4; 273 } else { 274 cnt = 2; 275#if BYTE_ORDER == BIG_ENDIAN 276 bom += 2; 277#endif 278 } 279 if (n < cnt) 280 goto e2big; 281 memcpy(s, bom, cnt); 282 s += cnt, n -= cnt; 283 } 284 psenc->current_endian = ei->preffered_endian; 285 } 286 287 wc2 = 0; 288 if ((ei->mode & _MODE_UTF32)==0) { 289 /* UTF16 */ 290 if (wc>0xFFFF) { 291 /* surrogate */ 292 if (wc>0x10FFFF) 293 goto ilseq; 294 if (n < 4) 295 goto e2big; 296 cnt += 4; 297 wc -= 0x10000; 298 wc2 = (wc & 0x3FF) | 0xDC00; 299 wc = (wc>>10) | 0xD800; 300 } else { 301 if (n < 2) 302 goto e2big; 303 cnt += 2; 304 } 305 306surrogate: 307 switch (psenc->current_endian) { 308 case _ENDIAN_BIG: 309 s[1] = wc; 310 s[0] = (wc >>= 8); 311 break; 312 case _ENDIAN_LITTLE: 313 s[0] = wc; 314 s[1] = (wc >>= 8); 315 break; 316 } 317 if (wc2!=0) { 318 wc = wc2; 319 wc2 = 0; 320 s += 2; 321 goto surrogate; 322 } 323 } else { 324 /* UTF32 */ 325 if (wc >= 0xD800 && wc <= 0xDFFF) 326 goto ilseq; 327 if (n < 4) 328 goto e2big; 329 cnt += 4; 330 switch (psenc->current_endian) { 331 case _ENDIAN_BIG: 332 s[3] = wc; 333 s[2] = (wc >>= 8); 334 s[1] = (wc >>= 8); 335 s[0] = (wc >>= 8); 336 break; 337 case _ENDIAN_LITTLE: 338 s[0] = wc; 339 s[1] = (wc >>= 8); 340 s[2] = (wc >>= 8); 341 s[3] = (wc >>= 8); 342 break; 343 } 344 } 345 *nresult = cnt; 346 347 return 0; 348 349ilseq: 350 *nresult = (size_t)-1; 351 return EILSEQ; 352e2big: 353 *nresult = (size_t)-1; 354 return E2BIG; 355} 356 357static void 358parse_variable(_UTF1632EncodingInfo * __restrict ei, 359 const void * __restrict var, size_t lenvar) 360{ 361#define MATCH(x, act) \ 362do { \ 363 if (lenvar >= (sizeof(#x)-1) && \ 364 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \ 365 act; \ 366 lenvar -= sizeof(#x)-1; \ 367 p += sizeof(#x)-1; \ 368 } \ 369} while (/*CONSTCOND*/0) 370 const char *p; 371 p = var; 372 while (lenvar>0) { 373 switch (*p) { 374 case 'B': 375 case 'b': 376 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 377 break; 378 case 'L': 379 case 'l': 380 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 381 break; 382 case 'F': 383 case 'f': 384 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 385 break; 386 case 'U': 387 case 'u': 388 MATCH(utf32, ei->mode |= _MODE_UTF32); 389 break; 390 } 391 p++; 392 lenvar--; 393 } 394} 395 396static int 397/*ARGSUSED*/ 398_citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 399 const void * __restrict var, 400 size_t lenvar) 401{ 402 _DIAGASSERT(ei != NULL); 403 404 memset((void *)ei, 0, sizeof(*ei)); 405 406 parse_variable(ei, var, lenvar); 407 408 if ((ei->mode&_MODE_UTF32)==0) 409 ei->cur_max = 6; /* endian + surrogate */ 410 else 411 ei->cur_max = 8; /* endian + normal */ 412 413 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 414#if BYTE_ORDER == BIG_ENDIAN 415 ei->preffered_endian = _ENDIAN_BIG; 416#else 417 ei->preffered_endian = _ENDIAN_LITTLE; 418#endif 419 } 420 421 return (0); 422} 423 424static void 425/*ARGSUSED*/ 426_citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei) 427{ 428} 429 430static __inline int 431/*ARGSUSED*/ 432_citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei, 433 _csid_t * __restrict csid, 434 _index_t * __restrict idx, 435 _wc_t wc) 436{ 437 438 _DIAGASSERT(csid != NULL && idx != NULL); 439 440 *csid = 0; 441 *idx = (_index_t)wc; 442 443 return (0); 444} 445 446static __inline int 447/*ARGSUSED*/ 448_citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei, 449 _wc_t * __restrict wc, 450 _csid_t csid, _index_t idx) 451{ 452 453 _DIAGASSERT(wc != NULL); 454 455 if (csid != 0) 456 return (EILSEQ); 457 458 *wc = (_wc_t)idx; 459 460 return (0); 461} 462 463static __inline int 464/*ARGSUSED*/ 465_citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei, 466 _UTF1632State * __restrict psenc, 467 int * __restrict rstate) 468{ 469 470 if (psenc->chlen == 0) 471 *rstate = _STDENC_SDGEN_INITIAL; 472 else 473 *rstate = _STDENC_SDGEN_INCOMPLETE_CHAR; 474 475 return 0; 476} 477 478/* ---------------------------------------------------------------------- 479 * public interface for stdenc 480 */ 481 482_CITRUS_STDENC_DECLS(UTF1632); 483_CITRUS_STDENC_DEF_OPS(UTF1632); 484 485#include "citrus_stdenc_template.h" 486