/* citrus_utf1632.c revision 330897 */
1/* $FreeBSD: stable/11/lib/libiconv_modules/UTF1632/citrus_utf1632.c 330897 2018-03-14 03:19:51Z eadler $ */ 2/* $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $ */ 3 4/*- 5 * SPDX-License-Identifier: BSD-2-Clause 6 * 7 * Copyright (c)2003 Citrus Project, 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 */ 31 32#include <sys/cdefs.h> 33#include <sys/endian.h> 34#include <sys/types.h> 35 36#include <assert.h> 37#include <errno.h> 38#include <limits.h> 39#include <stddef.h> 40#include <stdio.h> 41#include <stdlib.h> 42#include <string.h> 43#include <wchar.h> 44 45#include "citrus_namespace.h" 46#include "citrus_types.h" 47#include "citrus_module.h" 48#include "citrus_stdenc.h" 49#include "citrus_bcs.h" 50 51#include "citrus_utf1632.h" 52 53 54/* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58typedef struct { 59 int chlen; 60 int current_endian; 61 uint8_t ch[4]; 62} _UTF1632State; 63 64#define _ENDIAN_UNKNOWN 0 65#define _ENDIAN_BIG 1 66#define _ENDIAN_LITTLE 2 67#if BYTE_ORDER == BIG_ENDIAN 68#define _ENDIAN_INTERNAL _ENDIAN_BIG 69#define _ENDIAN_SWAPPED _ENDIAN_LITTLE 70#else 71#define _ENDIAN_INTERNAL _ENDIAN_LITTLE 72#define _ENDIAN_SWAPPED _ENDIAN_BIG 73#endif 74#define _MODE_UTF32 0x00000001U 75#define _MODE_FORCE_ENDIAN 0x00000002U 76 77typedef struct { 78 int preffered_endian; 79 unsigned int cur_max; 80 uint32_t mode; 81} _UTF1632EncodingInfo; 82 83#define _FUNCNAME(m) _citrus_UTF1632_##m 84#define _ENCODING_INFO _UTF1632EncodingInfo 85#define _ENCODING_STATE _UTF1632State 86#define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 87#define _ENCODING_IS_STATE_DEPENDENT 0 88#define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 89 90 91static __inline void 92/*ARGSUSED*/ 93_citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused, 94 _UTF1632State *s) 95{ 96 97 memset(s, 0, sizeof(*s)); 98} 99 100static int 101_citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 102 char **s, size_t n, _UTF1632State *psenc, size_t *nresult) 103{ 104 char *s0; 105 size_t result; 106 wchar_t wc = L'\0'; 107 int chlenbak, endian, needlen; 108 109 s0 = *s; 110 111 if (s0 == NULL) { 112 _citrus_UTF1632_init_state(ei, psenc); 113 *nresult = 0; /* state independent */ 114 return (0); 115 } 116 117 result 
= 0; 118 chlenbak = psenc->chlen; 119 120refetch: 121 needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2; 122 123 while (chlenbak < needlen) { 124 if (n == 0) 125 goto restart; 126 psenc->ch[chlenbak++] = *s0++; 127 n--; 128 result++; 129 } 130 131 /* judge endian marker */ 132 if ((ei->mode & _MODE_UTF32) == 0) { 133 /* UTF16 */ 134 if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) { 135 psenc->current_endian = _ENDIAN_BIG; 136 chlenbak = 0; 137 goto refetch; 138 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) { 139 psenc->current_endian = _ENDIAN_LITTLE; 140 chlenbak = 0; 141 goto refetch; 142 } 143 } else { 144 /* UTF32 */ 145 if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 && 146 psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) { 147 psenc->current_endian = _ENDIAN_BIG; 148 chlenbak = 0; 149 goto refetch; 150 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE && 151 psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) { 152 psenc->current_endian = _ENDIAN_LITTLE; 153 chlenbak = 0; 154 goto refetch; 155 } 156 } 157 endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 158 psenc->current_endian == _ENDIAN_UNKNOWN) ? 
ei->preffered_endian : 159 psenc->current_endian; 160 161 /* get wc */ 162 if ((ei->mode & _MODE_UTF32) == 0) { 163 /* UTF16 */ 164 if (needlen == 2) { 165 switch (endian) { 166 case _ENDIAN_LITTLE: 167 wc = (psenc->ch[0] | 168 ((wchar_t)psenc->ch[1] << 8)); 169 break; 170 case _ENDIAN_BIG: 171 wc = (psenc->ch[1] | 172 ((wchar_t)psenc->ch[0] << 8)); 173 break; 174 default: 175 goto ilseq; 176 } 177 if (wc >= 0xD800 && wc <= 0xDBFF) { 178 /* surrogate high */ 179 needlen = 4; 180 goto refetch; 181 } 182 } else { 183 /* surrogate low */ 184 wc -= 0xD800; /* wc : surrogate high (see above) */ 185 wc <<= 10; 186 switch (endian) { 187 case _ENDIAN_LITTLE: 188 if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF) 189 goto ilseq; 190 wc |= psenc->ch[2]; 191 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 192 break; 193 case _ENDIAN_BIG: 194 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 195 goto ilseq; 196 wc |= psenc->ch[3]; 197 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 198 break; 199 default: 200 goto ilseq; 201 } 202 wc += 0x10000; 203 } 204 } else { 205 /* UTF32 */ 206 switch (endian) { 207 case _ENDIAN_LITTLE: 208 wc = (psenc->ch[0] | 209 ((wchar_t)psenc->ch[1] << 8) | 210 ((wchar_t)psenc->ch[2] << 16) | 211 ((wchar_t)psenc->ch[3] << 24)); 212 break; 213 case _ENDIAN_BIG: 214 wc = (psenc->ch[3] | 215 ((wchar_t)psenc->ch[2] << 8) | 216 ((wchar_t)psenc->ch[1] << 16) | 217 ((wchar_t)psenc->ch[0] << 24)); 218 break; 219 default: 220 goto ilseq; 221 } 222 if (wc >= 0xD800 && wc <= 0xDFFF) 223 goto ilseq; 224 } 225 226 227 *pwc = wc; 228 psenc->chlen = 0; 229 *nresult = result; 230 *s = s0; 231 232 return (0); 233 234ilseq: 235 *nresult = (size_t)-1; 236 psenc->chlen = 0; 237 return (EILSEQ); 238 239restart: 240 *nresult = (size_t)-2; 241 psenc->chlen = chlenbak; 242 *s = s0; 243 return (0); 244} 245 246static int 247_citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 248 wchar_t wc, _UTF1632State *psenc, size_t *nresult) 249{ 250 wchar_t wc2; 251 static const char 
_bom[4] = { 252 0x00, 0x00, 0xFE, 0xFF, 253 }; 254 const char *bom = &_bom[0]; 255 size_t cnt; 256 257 cnt = (size_t)0; 258 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 259 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 260 if (ei->mode & _MODE_UTF32) 261 cnt = 4; 262 else { 263 cnt = 2; 264 bom += 2; 265 } 266 if (n < cnt) 267 goto e2big; 268 memcpy(s, bom, cnt); 269 s += cnt, n -= cnt; 270 } 271 psenc->current_endian = ei->preffered_endian; 272 } 273 274 wc2 = 0; 275 if ((ei->mode & _MODE_UTF32)==0) { 276 /* UTF16 */ 277 if (wc > 0xFFFF) { 278 /* surrogate */ 279 if (wc > 0x10FFFF) 280 goto ilseq; 281 if (n < 4) 282 goto e2big; 283 cnt += 4; 284 wc -= 0x10000; 285 wc2 = (wc & 0x3FF) | 0xDC00; 286 wc = (wc>>10) | 0xD800; 287 } else { 288 if (n < 2) 289 goto e2big; 290 cnt += 2; 291 } 292 293surrogate: 294 switch (psenc->current_endian) { 295 case _ENDIAN_BIG: 296 s[1] = wc; 297 s[0] = (wc >>= 8); 298 break; 299 case _ENDIAN_LITTLE: 300 s[0] = wc; 301 s[1] = (wc >>= 8); 302 break; 303 } 304 if (wc2 != 0) { 305 wc = wc2; 306 wc2 = 0; 307 s += 2; 308 goto surrogate; 309 } 310 } else { 311 /* UTF32 */ 312 if (wc >= 0xD800 && wc <= 0xDFFF) 313 goto ilseq; 314 if (n < 4) 315 goto e2big; 316 cnt += 4; 317 switch (psenc->current_endian) { 318 case _ENDIAN_BIG: 319 s[3] = wc; 320 s[2] = (wc >>= 8); 321 s[1] = (wc >>= 8); 322 s[0] = (wc >>= 8); 323 break; 324 case _ENDIAN_LITTLE: 325 s[0] = wc; 326 s[1] = (wc >>= 8); 327 s[2] = (wc >>= 8); 328 s[3] = (wc >>= 8); 329 break; 330 } 331 } 332 *nresult = cnt; 333 334 return (0); 335 336ilseq: 337 *nresult = (size_t)-1; 338 return (EILSEQ); 339e2big: 340 *nresult = (size_t)-1; 341 return (E2BIG); 342} 343 344static void 345parse_variable(_UTF1632EncodingInfo * __restrict ei, 346 const void * __restrict var, size_t lenvar) 347{ 348 const char *p; 349 350 p = var; 351 while (lenvar > 0) { 352 switch (*p) { 353 case 'B': 354 case 'b': 355 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 356 break; 357 case 'L': 358 case 'l': 359 
MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 360 break; 361 case 'i': 362 case 'I': 363 MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL); 364 break; 365 case 's': 366 case 'S': 367 MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED); 368 break; 369 case 'F': 370 case 'f': 371 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 372 break; 373 case 'U': 374 case 'u': 375 MATCH(utf32, ei->mode |= _MODE_UTF32); 376 break; 377 } 378 p++; 379 lenvar--; 380 } 381} 382 383static int 384/*ARGSUSED*/ 385_citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 386 const void * __restrict var, size_t lenvar) 387{ 388 389 memset((void *)ei, 0, sizeof(*ei)); 390 391 parse_variable(ei, var, lenvar); 392 393 ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8; 394 /* 6: endian + surrogate */ 395 /* 8: endian + normal */ 396 397 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 398 ei->preffered_endian = _ENDIAN_BIG; 399 } 400 401 return (0); 402} 403 404static void 405/*ARGSUSED*/ 406_citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused) 407{ 408 409} 410 411static __inline int 412/*ARGSUSED*/ 413_citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused, 414 _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc) 415{ 416 417 *csid = 0; 418 *idx = (_index_t)wc; 419 420 return (0); 421} 422 423static __inline int 424/*ARGSUSED*/ 425_citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused, 426 _wc_t * __restrict wc, _csid_t csid, _index_t idx) 427{ 428 429 if (csid != 0) 430 return (EILSEQ); 431 432 *wc = (_wc_t)idx; 433 434 return (0); 435} 436 437static __inline int 438/*ARGSUSED*/ 439_citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused, 440 _UTF1632State * __restrict psenc, int * __restrict rstate) 441{ 442 443 *rstate = (psenc->chlen == 0) ? 
_STDENC_SDGEN_INITIAL : 444 _STDENC_SDGEN_INCOMPLETE_CHAR; 445 return (0); 446} 447 448/* ---------------------------------------------------------------------- 449 * public interface for stdenc 450 */ 451 452_CITRUS_STDENC_DECLS(UTF1632); 453_CITRUS_STDENC_DEF_OPS(UTF1632); 454 455#include "citrus_stdenc_template.h" 456