1227650Skevlo/*- 2227650Skevlo * Copyright (c) 2003, 2005 Ryuichiro Imura 3227650Skevlo * All rights reserved. 4227650Skevlo * 5227650Skevlo * Redistribution and use in source and binary forms, with or without 6227650Skevlo * modification, are permitted provided that the following conditions 7227650Skevlo * are met: 8227650Skevlo * 1. Redistributions of source code must retain the above copyright 9227650Skevlo * notice, this list of conditions and the following disclaimer. 10227650Skevlo * 2. Redistributions in binary form must reproduce the above copyright 11227650Skevlo * notice, this list of conditions and the following disclaimer in the 12227650Skevlo * documentation and/or other materials provided with the distribution. 13227650Skevlo * 14227650Skevlo * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15227650Skevlo * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16227650Skevlo * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17227650Skevlo * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18227650Skevlo * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19227650Skevlo * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20227650Skevlo * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21227650Skevlo * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22227650Skevlo * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23227650Skevlo * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24227650Skevlo * SUCH DAMAGE. 
25227650Skevlo */ 26227650Skevlo 27227650Skevlo#include <sys/cdefs.h> 28227650Skevlo__FBSDID("$FreeBSD$"); 29227650Skevlo 30227650Skevlo#include <sys/param.h> 31227650Skevlo#include <sys/kernel.h> 32227650Skevlo#include <sys/systm.h> 33227650Skevlo#include <sys/malloc.h> 34227650Skevlo#include <sys/iconv.h> 35227650Skevlo 36227650Skevlo#include "iconv_converter_if.h" 37227650Skevlo 38227650Skevlo/* 39227650Skevlo * "UCS" converter 40227650Skevlo */ 41227650Skevlo 42227650Skevlo#define KICONV_UCS_COMBINE 0x1 43227650Skevlo#define KICONV_UCS_FROM_UTF8 0x2 44227650Skevlo#define KICONV_UCS_TO_UTF8 0x4 45227650Skevlo#define KICONV_UCS_FROM_LE 0x8 46227650Skevlo#define KICONV_UCS_TO_LE 0x10 47227650Skevlo#define KICONV_UCS_FROM_UTF16 0x20 48227650Skevlo#define KICONV_UCS_TO_UTF16 0x40 49227650Skevlo#define KICONV_UCS_UCS4 0x80 50227650Skevlo 51227650Skevlo#define ENCODING_UTF16 "UTF-16BE" 52227650Skevlo#define ENCODING_UTF8 "UTF-8" 53227650Skevlo 54227650Skevlostatic struct { 55227650Skevlo const char *name; 56227650Skevlo int from_flag, to_flag; 57227650Skevlo} unicode_family[] = { 58227650Skevlo { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 }, 59227650Skevlo { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE }, 60227650Skevlo { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 }, 61227650Skevlo { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE, 62227650Skevlo KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE }, 63227650Skevlo { NULL, 0, 0 } 64227650Skevlo}; 65227650Skevlo 66227650Skevlostatic uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen); 67227650Skevlostatic u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen); 68227650Skevlostatic uint32_t encode_surrogate(uint32_t code); 69227650Skevlostatic uint32_t decode_surrogate(const u_char *ucs); 70227650Skevlo 71227650Skevlo#ifdef MODULE_DEPEND 72227650SkevloMODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2); 73227650Skevlo#endif 74227650Skevlo 75227650Skevlo/* 
76227650Skevlo * UCS converter instance 77227650Skevlo */ 78227650Skevlostruct iconv_ucs { 79227650Skevlo KOBJ_FIELDS; 80227650Skevlo int convtype; 81227650Skevlo struct iconv_cspair * d_csp; 82227650Skevlo struct iconv_cspair * d_cspf; 83227650Skevlo void * f_ctp; 84227650Skevlo void * t_ctp; 85227650Skevlo void * ctype; 86227650Skevlo}; 87227650Skevlo 88227650Skevlostatic int 89227650Skevloiconv_ucs_open(struct iconv_converter_class *dcp, 90227650Skevlo struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) 91227650Skevlo{ 92227650Skevlo struct iconv_ucs *dp; 93227650Skevlo int i; 94227650Skevlo const char *from, *to; 95227650Skevlo 96227650Skevlo dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); 97227650Skevlo to = csp->cp_to; 98227650Skevlo from = cspf ? cspf->cp_from : csp->cp_from; 99227650Skevlo 100227650Skevlo dp->convtype = 0; 101227650Skevlo 102227650Skevlo if (cspf) 103227650Skevlo dp->convtype |= KICONV_UCS_COMBINE; 104227650Skevlo for (i = 0; unicode_family[i].name; i++) { 105267980Sjhb if (strcasecmp(from, unicode_family[i].name) == 0) 106227650Skevlo dp->convtype |= unicode_family[i].from_flag; 107267980Sjhb if (strcasecmp(to, unicode_family[i].name) == 0) 108227650Skevlo dp->convtype |= unicode_family[i].to_flag; 109227650Skevlo } 110235713Skevlo if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0) 111227650Skevlo dp->convtype |= KICONV_UCS_UCS4; 112227650Skevlo else 113227650Skevlo dp->convtype &= ~KICONV_UCS_UCS4; 114227650Skevlo 115227650Skevlo dp->f_ctp = dp->t_ctp = NULL; 116227650Skevlo if (dp->convtype & KICONV_UCS_COMBINE) { 117227650Skevlo if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 && 118227650Skevlo (dp->convtype & KICONV_UCS_FROM_LE) == 0) { 119227650Skevlo iconv_open(ENCODING_UNICODE, from, &dp->f_ctp); 120227650Skevlo } 121227650Skevlo if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 && 122227650Skevlo (dp->convtype & KICONV_UCS_TO_LE) == 0) { 123227650Skevlo iconv_open(to, ENCODING_UNICODE, 
&dp->t_ctp); 124227650Skevlo } 125227650Skevlo } 126227650Skevlo 127227650Skevlo dp->ctype = NULL; 128227650Skevlo if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8)) 129227650Skevlo iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype); 130227650Skevlo 131227650Skevlo dp->d_csp = csp; 132227650Skevlo if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) { 133227650Skevlo if (cspf) { 134227650Skevlo dp->d_cspf = cspf; 135227650Skevlo cspf->cp_refcount++; 136227650Skevlo } else 137227650Skevlo csp->cp_refcount++; 138227650Skevlo } 139227650Skevlo if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 140227650Skevlo csp->cp_refcount++; 141227650Skevlo *dpp = (void*)dp; 142227650Skevlo return 0; 143227650Skevlo} 144227650Skevlo 145227650Skevlostatic int 146227650Skevloiconv_ucs_close(void *data) 147227650Skevlo{ 148227650Skevlo struct iconv_ucs *dp = data; 149227650Skevlo 150227650Skevlo if (dp->f_ctp) 151227650Skevlo iconv_close(dp->f_ctp); 152227650Skevlo if (dp->t_ctp) 153227650Skevlo iconv_close(dp->t_ctp); 154227650Skevlo if (dp->ctype) 155227650Skevlo iconv_close(dp->ctype); 156227650Skevlo if (dp->d_cspf) 157227650Skevlo dp->d_cspf->cp_refcount--; 158227650Skevlo else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) 159227650Skevlo dp->d_csp->cp_refcount--; 160227650Skevlo if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 161227650Skevlo dp->d_csp->cp_refcount--; 162227650Skevlo kobj_delete((struct kobj*)data, M_ICONV); 163227650Skevlo return 0; 164227650Skevlo} 165227650Skevlo 166227650Skevlostatic int 167227650Skevloiconv_ucs_conv(void *d2p, const char **inbuf, 168227650Skevlo size_t *inbytesleft, char **outbuf, size_t *outbytesleft, 169227650Skevlo int convchar, int casetype) 170227650Skevlo{ 171227650Skevlo struct iconv_ucs *dp = (struct iconv_ucs*)d2p; 172227650Skevlo int ret = 0, i; 173227650Skevlo size_t in, on, ir, or, inlen, outlen, ucslen; 174227650Skevlo const char *src, *p; 175227650Skevlo 
char *dst; 176227650Skevlo u_char ucs[4], *q; 177227650Skevlo uint32_t code; 178227650Skevlo 179227650Skevlo if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) 180227650Skevlo return 0; 181227650Skevlo ir = in = *inbytesleft; 182227650Skevlo or = on = *outbytesleft; 183227650Skevlo src = *inbuf; 184227650Skevlo dst = *outbuf; 185227650Skevlo 186227650Skevlo while (ir > 0 && or > 0) { 187227650Skevlo 188227650Skevlo /* 189227650Skevlo * The first half of conversion. 190227650Skevlo * (convert any code into ENCODING_UNICODE) 191227650Skevlo */ 192227650Skevlo code = 0; 193227650Skevlo p = src; 194227650Skevlo if (dp->convtype & KICONV_UCS_FROM_UTF8) { 195227650Skevlo /* convert UTF-8 to ENCODING_UNICODE */ 196227650Skevlo inlen = 0; 197227650Skevlo code = utf8_to_ucs4(p, &inlen, ir); 198227650Skevlo if (code == 0) { 199227650Skevlo ret = -1; 200227650Skevlo break; 201227650Skevlo } 202227650Skevlo 203227650Skevlo if (casetype == KICONV_FROM_LOWER && dp->ctype) { 204227650Skevlo code = towlower(code, dp->ctype); 205227650Skevlo } else if (casetype == KICONV_FROM_UPPER && dp->ctype) { 206227650Skevlo code = towupper(code, dp->ctype); 207227650Skevlo } 208227650Skevlo 209227650Skevlo if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) { 210227650Skevlo /* reserved for utf-16 surrogate pair */ 211227650Skevlo /* invalid unicode */ 212227650Skevlo ret = -1; 213227650Skevlo break; 214227650Skevlo } 215227650Skevlo 216227650Skevlo if (inlen == 4) { 217227650Skevlo if (dp->convtype & KICONV_UCS_UCS4) { 218227650Skevlo ucslen = 4; 219227650Skevlo code = encode_surrogate(code); 220227650Skevlo } else { 221227650Skevlo /* can't handle with ucs-2 */ 222227650Skevlo ret = -1; 223227650Skevlo break; 224227650Skevlo } 225227650Skevlo } else { 226227650Skevlo ucslen = 2; 227227650Skevlo } 228227650Skevlo 229227650Skevlo /* save UCS-4 into ucs[] */ 230227650Skevlo for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--) 231227650Skevlo *q++ = (code >> (i << 3)) & 
0xff; 232227650Skevlo 233227650Skevlo } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) { 234227650Skevlo /* convert local code to ENCODING_UNICODE */ 235227650Skevlo ucslen = 4; 236227650Skevlo inlen = ir; 237227650Skevlo q = ucs; 238227650Skevlo ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q, 239227650Skevlo &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER)); 240227650Skevlo if (ret) 241227650Skevlo break; 242227650Skevlo inlen = ir - inlen; 243227650Skevlo ucslen = 4 - ucslen; 244227650Skevlo 245227650Skevlo } else { 246227650Skevlo /* src code is a proper subset of ENCODING_UNICODE */ 247227650Skevlo q = ucs; 248227650Skevlo if (dp->convtype & KICONV_UCS_FROM_LE) { 249227650Skevlo *q = *(p + 1); 250227650Skevlo *(q + 1) = *p; 251227650Skevlo p += 2; 252227650Skevlo } else { 253227650Skevlo *q = *p++; 254227650Skevlo *(q + 1) = *p++; 255227650Skevlo } 256227650Skevlo if ((*q & 0xfc) == 0xd8) { 257227650Skevlo if (dp->convtype & KICONV_UCS_UCS4 && 258227650Skevlo dp->convtype & KICONV_UCS_FROM_UTF16) { 259227650Skevlo inlen = ucslen = 4; 260227650Skevlo } else { 261227650Skevlo /* invalid unicode */ 262227650Skevlo ret = -1; 263227650Skevlo break; 264227650Skevlo } 265227650Skevlo } else { 266227650Skevlo inlen = ucslen = 2; 267227650Skevlo } 268227650Skevlo if (ir < inlen) { 269227650Skevlo ret = -1; 270227650Skevlo break; 271227650Skevlo } 272227650Skevlo if (ucslen == 4) { 273227650Skevlo q += 2; 274227650Skevlo if (dp->convtype & KICONV_UCS_FROM_LE) { 275227650Skevlo *q = *(p + 1); 276227650Skevlo *(q + 1) = *p; 277227650Skevlo } else { 278227650Skevlo *q = *p++; 279227650Skevlo *(q + 1) = *p; 280227650Skevlo } 281227650Skevlo if ((*q & 0xfc) != 0xdc) { 282227650Skevlo /* invalid unicode */ 283227650Skevlo ret = -1; 284227650Skevlo break; 285227650Skevlo } 286227650Skevlo } 287227650Skevlo } 288227650Skevlo 289227650Skevlo /* 290227650Skevlo * The second half of conversion. 
291227650Skevlo * (convert ENCODING_UNICODE into any code) 292227650Skevlo */ 293227650Skevlo p = ucs; 294227650Skevlo if (dp->convtype & KICONV_UCS_TO_UTF8) { 295227650Skevlo q = (u_char *)dst; 296227650Skevlo if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) { 297227650Skevlo /* decode surrogate pair */ 298227650Skevlo code = decode_surrogate(p); 299227650Skevlo } else { 300227650Skevlo code = (ucs[0] << 8) | ucs[1]; 301227650Skevlo } 302227650Skevlo 303227650Skevlo if (casetype == KICONV_LOWER && dp->ctype) { 304227650Skevlo code = towlower(code, dp->ctype); 305227650Skevlo } else if (casetype == KICONV_UPPER && dp->ctype) { 306227650Skevlo code = towupper(code, dp->ctype); 307227650Skevlo } 308227650Skevlo 309227650Skevlo outlen = 0; 310227650Skevlo if (ucs4_to_utf8(code, q, &outlen, or) == NULL) { 311227650Skevlo ret = -1; 312227650Skevlo break; 313227650Skevlo } 314227650Skevlo 315227650Skevlo src += inlen; 316227650Skevlo ir -= inlen; 317227650Skevlo dst += outlen; 318227650Skevlo or -= outlen; 319227650Skevlo 320227650Skevlo } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) { 321227650Skevlo ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst, 322227650Skevlo &or, casetype & (KICONV_LOWER | KICONV_UPPER)); 323227650Skevlo if (ret) 324227650Skevlo break; 325227650Skevlo 326227650Skevlo src += inlen; 327227650Skevlo ir -= inlen; 328227650Skevlo 329227650Skevlo } else { 330227650Skevlo /* dst code is a proper subset of ENCODING_UNICODE */ 331227650Skevlo if (or < ucslen) { 332227650Skevlo ret = -1; 333227650Skevlo break; 334227650Skevlo } 335227650Skevlo src += inlen; 336227650Skevlo ir -= inlen; 337227650Skevlo or -= ucslen; 338227650Skevlo if (dp->convtype & KICONV_UCS_TO_LE) { 339227650Skevlo *dst++ = *(p + 1); 340227650Skevlo *dst++ = *p; 341227650Skevlo p += 2; 342227650Skevlo } else { 343227650Skevlo *dst++ = *p++; 344227650Skevlo *dst++ = *p++; 345227650Skevlo } 346227650Skevlo if (ucslen == 4) { 347227650Skevlo if ((dp->convtype & 
KICONV_UCS_UCS4) == 0 || 348227650Skevlo (dp->convtype & KICONV_UCS_TO_UTF16) == 0) { 349227650Skevlo ret = -1; 350227650Skevlo break; 351227650Skevlo } 352227650Skevlo if (dp->convtype & KICONV_UCS_TO_LE) { 353227650Skevlo *dst++ = *(p + 1); 354227650Skevlo *dst++ = *p; 355227650Skevlo } else { 356227650Skevlo *dst++ = *p++; 357227650Skevlo *dst++ = *p; 358227650Skevlo } 359227650Skevlo } 360227650Skevlo } 361227650Skevlo 362227650Skevlo if (convchar == 1) 363227650Skevlo break; 364227650Skevlo } 365227650Skevlo 366227650Skevlo *inbuf += in - ir; 367227650Skevlo *outbuf += on - or; 368227650Skevlo *inbytesleft -= in - ir; 369227650Skevlo *outbytesleft -= on - or; 370227650Skevlo return (ret); 371227650Skevlo} 372227650Skevlo 373227650Skevlostatic int 374227650Skevloiconv_ucs_init(struct iconv_converter_class *dcp) 375227650Skevlo{ 376227650Skevlo int error; 377227650Skevlo 378227650Skevlo error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8); 379227650Skevlo if (error) 380227650Skevlo return (error); 381227650Skevlo error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE); 382227650Skevlo if (error) 383227650Skevlo return (error); 384227650Skevlo return (0); 385227650Skevlo} 386227650Skevlo 387227650Skevlostatic int 388227650Skevloiconv_ucs_done(struct iconv_converter_class *dcp) 389227650Skevlo{ 390227650Skevlo return (0); 391227650Skevlo} 392227650Skevlo 393227650Skevlostatic const char * 394227650Skevloiconv_ucs_name(struct iconv_converter_class *dcp) 395227650Skevlo{ 396227650Skevlo return (ENCODING_UNICODE); 397227650Skevlo} 398227650Skevlo 399227650Skevlostatic kobj_method_t iconv_ucs_methods[] = { 400227650Skevlo KOBJMETHOD(iconv_converter_open, iconv_ucs_open), 401227650Skevlo KOBJMETHOD(iconv_converter_close, iconv_ucs_close), 402227650Skevlo KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv), 403227650Skevlo KOBJMETHOD(iconv_converter_init, iconv_ucs_init), 404227650Skevlo KOBJMETHOD(iconv_converter_done, iconv_ucs_done), 
405227650Skevlo KOBJMETHOD(iconv_converter_name, iconv_ucs_name), 406227650Skevlo {0, 0} 407227650Skevlo}; 408227650Skevlo 409227650SkevloKICONV_CONVERTER(ucs, sizeof(struct iconv_ucs)); 410227650Skevlo 411227650Skevlostatic uint32_t 412227650Skevloutf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen) 413227650Skevlo{ 414227650Skevlo size_t i, w = 0; 415227650Skevlo uint32_t ucs4 = 0; 416227650Skevlo 417227650Skevlo /* 418227650Skevlo * get leading 1 byte from utf-8 419227650Skevlo */ 420227650Skevlo if ((*src & 0x80) == 0) { 421227650Skevlo /* 422227650Skevlo * leading 1 bit is "0" 423227650Skevlo * utf-8: 0xxxxxxx 424227650Skevlo * ucs-4: 00000000 00000000 00000000 0xxxxxxx 425227650Skevlo */ 426227650Skevlo w = 1; 427227650Skevlo /* get trailing 7 bits */ 428227650Skevlo ucs4 = *src & 0x7f; 429227650Skevlo } else if ((*src & 0xe0) == 0xc0) { 430227650Skevlo /* 431227650Skevlo * leading 3 bits are "110" 432227650Skevlo * utf-8: 110xxxxx 10yyyyyy 433227650Skevlo * ucs-4: 00000000 00000000 00000xxx xxyyyyyy 434227650Skevlo */ 435227650Skevlo w = 2; 436227650Skevlo /* get trailing 5 bits */ 437227650Skevlo ucs4 = *src & 0x1f; 438227650Skevlo } else if ((*src & 0xf0) == 0xe0) { 439227650Skevlo /* 440227650Skevlo * leading 4 bits are "1110" 441227650Skevlo * utf-8: 1110xxxx 10yyyyyy 10zzzzzz 442227650Skevlo * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz 443227650Skevlo */ 444227650Skevlo w = 3; 445227650Skevlo /* get trailing 4 bits */ 446227650Skevlo ucs4 = *src & 0x0f; 447227650Skevlo } else if ((*src & 0xf8) == 0xf0) { 448227650Skevlo /* 449227650Skevlo * leading 5 bits are "11110" 450227650Skevlo * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz 451227650Skevlo * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz 452227650Skevlo */ 453227650Skevlo w = 4; 454227650Skevlo /* get trailing 3 bits */ 455227650Skevlo ucs4 = *src & 0x07; 456227650Skevlo } else { 457227650Skevlo /* out of utf-16 range or having illegal bits */ 458227650Skevlo return (0); 459227650Skevlo } 
460227650Skevlo 461227650Skevlo if (srclen < w) 462227650Skevlo return (0); 463227650Skevlo 464227650Skevlo /* 465227650Skevlo * get left parts from utf-8 466227650Skevlo */ 467227650Skevlo for (i = 1 ; i < w ; i++) { 468227650Skevlo if ((*(src + i) & 0xc0) != 0x80) { 469227650Skevlo /* invalid: leading 2 bits are not "10" */ 470227650Skevlo return (0); 471227650Skevlo } 472227650Skevlo /* concatenate trailing 6 bits into ucs4 */ 473227650Skevlo ucs4 <<= 6; 474227650Skevlo ucs4 |= *(src + i) & 0x3f; 475227650Skevlo } 476227650Skevlo 477227650Skevlo *utf8width = w; 478227650Skevlo return (ucs4); 479227650Skevlo} 480227650Skevlo 481227650Skevlostatic u_char * 482227650Skevloucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen) 483227650Skevlo{ 484227650Skevlo u_char lead, *p; 485227650Skevlo size_t i, w; 486227650Skevlo 487227650Skevlo /* 488227650Skevlo * determine utf-8 width and leading bits 489227650Skevlo */ 490227650Skevlo if (ucs4 < 0x80) { 491227650Skevlo w = 1; 492227650Skevlo lead = 0; /* "0" */ 493227650Skevlo } else if (ucs4 < 0x800) { 494227650Skevlo w = 2; 495227650Skevlo lead = 0xc0; /* "11" */ 496227650Skevlo } else if (ucs4 < 0x10000) { 497227650Skevlo w = 3; 498227650Skevlo lead = 0xe0; /* "111" */ 499227650Skevlo } else if (ucs4 < 0x200000) { 500227650Skevlo w = 4; 501227650Skevlo lead = 0xf0; /* "1111" */ 502227650Skevlo } else { 503227650Skevlo return (NULL); 504227650Skevlo } 505227650Skevlo 506227650Skevlo if (dstlen < w) 507227650Skevlo return (NULL); 508227650Skevlo 509227650Skevlo /* 510227650Skevlo * construct utf-8 511227650Skevlo */ 512227650Skevlo p = dst; 513227650Skevlo for (i = w - 1 ; i >= 1 ; i--) { 514227650Skevlo /* get trailing 6 bits and put it with leading bit as "1" */ 515227650Skevlo *(p + i) = (ucs4 & 0x3f) | 0x80; 516227650Skevlo ucs4 >>= 6; 517227650Skevlo } 518227650Skevlo *p = ucs4 | lead; 519227650Skevlo 520227650Skevlo *utf8width = w; 521227650Skevlo 522227650Skevlo return (p); 523227650Skevlo} 
/*
 * Pack a supplementary-plane code point into a UTF-16 surrogate pair.
 *
 * The result holds the high surrogate in its upper 16 bits and the low
 * surrogate in its lower 16 bits.  Only meaningful for
 * 0x10000 <= code <= 0x10ffff; no range check is performed here.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	/* 20-bit offset into the supplementary planes */
	uint32_t offset = code - 0x10000;

	return (((offset << 6) & 0x3ff0000) |	/* top 10 bits -> bits 16..25 */
	    (offset & 0x3ff) |			/* low 10 bits -> bits 0..9 */
	    0xd800dc00);			/* surrogate tags */
}

/*
 * Rebuild a code point from a surrogate pair stored as four bytes:
 * ucs[0..1] is the big-endian high surrogate, ucs[2..3] the big-endian
 * low surrogate.  The surrogate tag bits are masked off; the caller is
 * expected to have validated them.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) |
	    ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000);
}