1/* 2 * Copyright 2010-2011, Oliver Tappe, zooey@hirschkaefer.de. 3 * Distributed under the terms of the MIT License. 4 */ 5 6 7#include "ICUCtypeData.h" 8 9#include <langinfo.h> 10#include <stdlib.h> 11#include <string.h> 12 13#include <algorithm> 14 15#include <unicode/uchar.h> 16#include <unicode/uvernum.h> 17 18#include <Debug.h> 19 20 21//#define TRACE_CTYPE 22#undef TRACE 23#ifdef TRACE_CTYPE 24# include <OS.h> 25# define TRACE(x) debug_printf x 26#else 27# define TRACE(x) ; 28#endif 29 30 31U_NAMESPACE_USE 32 33 34namespace BPrivate { 35namespace Libroot { 36 37 38ICUCtypeData::ICUCtypeData(pthread_key_t tlsKey) 39 : 40 inherited(tlsKey), 41 fDataBridge(NULL) 42{ 43} 44 45 46ICUCtypeData::~ICUCtypeData() 47{ 48} 49 50 51void 52ICUCtypeData::Initialize(LocaleCtypeDataBridge* dataBridge) 53{ 54 *dataBridge->addrOfClassInfoTable = &fClassInfo[128]; 55 *dataBridge->addrOfToLowerTable = &fToLowerMap[128]; 56 *dataBridge->addrOfToUpperTable = &fToUpperMap[128]; 57 fDataBridge = dataBridge; 58} 59 60 61status_t 62ICUCtypeData::SetTo(const Locale& locale, const char* posixLocaleName) 63{ 64 status_t result = inherited::SetTo(locale, posixLocaleName); 65 if (result != B_OK) 66 return result; 67 68 UErrorCode icuStatus = U_ZERO_ERROR; 69 70 UConverter* converter; 71 result = _GetConverter(converter); 72 if (result != B_OK) 73 return result; 74 75 ucnv_reset(converter); 76 77 fDataBridge->setMbCurMax(ucnv_getMaxCharSize(converter)); 78 79 char buffer[] = { 0, 0 }; 80 for (int i = 0; i < 256; ++i) { 81 const char* source = buffer; 82 buffer[0] = (char)i; 83 buffer[1] = '\0'; 84 icuStatus = U_ZERO_ERROR; 85 UChar32 unicodeChar 86 = ucnv_getNextUChar(converter, &source, source + 1, &icuStatus); 87 88 unsigned short classInfo = 0; 89 unsigned int toLower = i; 90 unsigned int toUpper = i; 91 if (U_SUCCESS(icuStatus)) { 92 if (u_isblank(unicodeChar)) 93 classInfo |= _ISblank; 94 if (u_charType(unicodeChar) == U_CONTROL_CHAR) 95 classInfo |= _IScntrl; 96 if (u_ispunct(unicodeChar)) 97 classInfo |= _ISpunct; 98 if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_ALNUM)) 99 classInfo |= _ISalnum; 100 if (u_isUUppercase(unicodeChar)) 101 classInfo |= _ISupper; 102 if (u_isULowercase(unicodeChar)) 103 classInfo |= _ISlower; 104 if (u_isUAlphabetic(unicodeChar)) 105 classInfo |= _ISalpha; 106 if (u_isdigit(unicodeChar)) 107 classInfo |= _ISdigit; 108 if (u_isxdigit(unicodeChar)) 109 classInfo |= _ISxdigit; 110 if (u_isUWhiteSpace(unicodeChar)) 111 classInfo |= _ISspace; 112 if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_PRINT)) 113 classInfo |= _ISprint; 114 if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_GRAPH)) 115 classInfo |= _ISgraph; 116 117 UChar lowerChar = u_tolower(unicodeChar); 118 icuStatus = U_ZERO_ERROR; 119 ucnv_fromUChars(converter, buffer, 1, &lowerChar, 1, &icuStatus); 120 if (U_SUCCESS(icuStatus)) 121 toLower = (unsigned char)buffer[0]; 122 123 UChar upperChar = u_toupper(unicodeChar); 124 icuStatus = U_ZERO_ERROR; 125 ucnv_fromUChars(converter, buffer, 1, &upperChar, 1, &icuStatus); 126 if (U_SUCCESS(icuStatus)) 127 toUpper = (unsigned char)buffer[0]; 128 } 129 fClassInfo[i + 128] = classInfo; 130 fToLowerMap[i + 128] = toLower; 131 fToUpperMap[i + 128] = toUpper; 132 if (i >= 128 && i < 255) { 133 // mirror upper half at negative indices (except for -1 [=EOF]) 134 fClassInfo[i - 128] = classInfo; 135 fToLowerMap[i - 128] = toLower; 136 fToUpperMap[i - 128] = toUpper; 137 } 138 } 139 140 return B_OK; 141} 142 143 144status_t 145ICUCtypeData::SetToPosix() 146{ 147 status_t result = inherited::SetToPosix(); 148 149 if (result == B_OK) { 150 memcpy(fClassInfo, fDataBridge->posixClassInfo, sizeof(fClassInfo)); 151 memcpy(fToLowerMap, fDataBridge->posixToLowerMap, sizeof(fToLowerMap)); 152 memcpy(fToUpperMap, fDataBridge->posixToUpperMap, sizeof(fToUpperMap)); 153 154 fDataBridge->setMbCurMax(1); 155 } 156 157 return result; 158} 159 160 161int 162ICUCtypeData::IsWCType(wint_t wc, wctype_t charClass) 163{ 164 if (wc == WEOF) 165 return 0; 166 167 switch (charClass) { 168 case _ISalnum: 169 return u_hasBinaryProperty(wc, UCHAR_POSIX_ALNUM); 170 case _ISalpha: 171 return u_isUAlphabetic(wc); 172 case _ISblank: 173 return u_isblank(wc); 174 case _IScntrl: 175 return u_charType(wc) == U_CONTROL_CHAR; 176 case _ISdigit: 177 return u_isdigit(wc); 178 case _ISgraph: 179 return u_hasBinaryProperty(wc, UCHAR_POSIX_GRAPH); 180 case _ISlower: 181 return u_isULowercase(wc); 182 case _ISprint: 183 return u_hasBinaryProperty(wc, UCHAR_POSIX_PRINT); 184 case _ISpunct: 185 return u_ispunct(wc); 186 case _ISspace: 187 return u_isUWhiteSpace(wc); 188 case _ISupper: 189 return u_isUUppercase(wc); 190 case _ISxdigit: 191 return u_isxdigit(wc); 192 default: 193 return 0; 194 } 195} 196 197 198status_t 199ICUCtypeData::ToWCTrans(wint_t wc, wctrans_t transition, wint_t& result) 200{ 201 switch (transition) { 202 case _ISlower: 203 result = u_tolower(wc); 204 return B_OK; 205 case _ISupper: 206 result = u_toupper(wc); 207 return B_OK; 208 default: 209 return B_BAD_VALUE; 210 } 211} 212 213 214status_t 215ICUCtypeData::MultibyteToWchar(wchar_t* wcOut, const char* mb, size_t mbLen, 216 mbstate_t* mbState, size_t& lengthOut) 217{ 218 UConverter* converter = NULL; 219 status_t result = _GetConverterForMbState(mbState, converter); 220 if (result != B_OK) { 221 TRACE(("MultibyteToWchar(): couldn't get converter for mbstate %p - " 222 "%" B_PRIx32 "\n", mbState, result)); 223 return result; 224 } 225 226 // do the conversion 227 UErrorCode icuStatus = U_ZERO_ERROR; 228 229 const char* buffer = mb; 230 UChar targetBuffer[3]; 231 UChar* target = targetBuffer; 232 ucnv_toUnicode(converter, &target, target + 1, &buffer, buffer + mbLen, 233 NULL, FALSE, &icuStatus); 234 size_t sourceLengthUsed = buffer - mb; 235 size_t targetLengthUsed = (size_t)(target - targetBuffer); 236 237 if (U16_IS_LEAD(targetBuffer[0])) { 238 // we have a surrogate pair, so re-read with enough space for a pair 239 // of characters instead 240 TRACE(("MultibyteToWchar(): have a surrogate pair\n")); 241 ucnv_resetToUnicode(converter); 242 buffer = mb; 243 target = targetBuffer; 244 ucnv_toUnicode(converter, &target, target + 2, &buffer, buffer + mbLen, 245 NULL, FALSE, &icuStatus); 246 sourceLengthUsed = buffer - mb; 247 targetLengthUsed = (size_t)(target - targetBuffer); 248 } 249 250 if (icuStatus == U_BUFFER_OVERFLOW_ERROR && targetLengthUsed > 0) { 251 // we've got one character, which is all that we wanted 252 icuStatus = U_ZERO_ERROR; 253 } 254 255 if (!U_SUCCESS(icuStatus)) { 256 // conversion failed because of illegal character sequence 257 TRACE(("MultibyteToWchar(): illegal character sequence\n")); 258 ucnv_resetToUnicode(converter); 259 result = B_BAD_DATA; 260 } else if (targetLengthUsed == 0) { 261 TRACE(("MultibyteToWchar(): incomplete character (len=%lu)\n", mbLen)); 262 for (size_t i = 0; i < mbLen; ++i) 263 TRACE(("\tbyte %lu: %x\n", i, mb[i])); 264 mbState->count = sourceLengthUsed; 265 result = B_BAD_INDEX; 266 } else { 267 UChar32 unicodeChar = 0xBADBEEF; 268 U16_GET(targetBuffer, 0, 0, targetLengthUsed, unicodeChar); 269 270 if (unicodeChar == 0) { 271 // reset to initial state 272 _DropConverterFromMbState(mbState); 273 memset(mbState, 0, sizeof(mbstate_t)); 274 lengthOut = 0; 275 } else { 276 mbState->count = 0; 277 lengthOut = sourceLengthUsed; 278 } 279 280 if (wcOut != NULL) 281 *wcOut = unicodeChar; 282 283 result = B_OK; 284 } 285 286 return result; 287} 288 289 290status_t 291ICUCtypeData::MultibyteStringToWchar(wchar_t* wcDest, size_t wcDestLength, 292 const char** mbSource, size_t mbSourceLength, mbstate_t* mbState, 293 size_t& lengthOut) 294{ 295 UConverter* converter = NULL; 296 status_t result = _GetConverterForMbState(mbState, converter); 297 if (result != B_OK) { 298 TRACE(("MultibyteStringToWchar(): couldn't get converter for mbstate %p" 299 " - %" B_PRIx32 "\n", mbState, result)); 300 return result; 301 } 302 303 bool wcsIsTerminated = false; 304 const char* source = *mbSource; 305 const char* sourceEnd = source + mbSourceLength; 306 if (sourceEnd < source) { 307 // overflow, clamp to highest possible address 308 sourceEnd = (const char*)-1; 309 } 310 311 if (wcDest == NULL) { 312 // if there's no destination buffer, there's no length limit either 313 wcDestLength = (size_t)-1; 314 } 315 316 UErrorCode icuStatus = U_ZERO_ERROR; 317 size_t sourceLengthUsed = 0; 318 for (lengthOut = 0; lengthOut < wcDestLength; ++lengthOut) { 319 if (sourceLengthUsed >= mbSourceLength) 320 break; 321 UChar32 unicodeChar = ucnv_getNextUChar(converter, &source, 322 std::min(source + MB_CUR_MAX, sourceEnd), &icuStatus); 323 TRACE(("MultibyteStringToWchar() l:%lu wl:%lu s:%p se:%p sl:%lu slu:%lu" 324 " uchar:%x st:%x\n", lengthOut, wcDestLength, source, sourceEnd, 325 mbSourceLength, sourceLengthUsed, unicodeChar, icuStatus)); 326 if (!U_SUCCESS(icuStatus)) 327 break; 328 sourceLengthUsed = source - *mbSource; 329 if (wcDest != NULL) 330 *wcDest++ = unicodeChar; 331 if (unicodeChar == L'\0') { 332 wcsIsTerminated = true; 333 break; 334 } 335 icuStatus = U_ZERO_ERROR; 336 } 337 338 if (!U_SUCCESS(icuStatus)) { 339 // conversion failed because of illegal character sequence 340 TRACE(("MultibyteStringToWchar(): illegal character sequence\n")); 341 ucnv_resetToUnicode(converter); 342 result = B_BAD_DATA; 343 if (wcDest != NULL) 344 *mbSource = *mbSource + sourceLengthUsed; 345 } else if (wcsIsTerminated) { 346 // reset to initial state 347 _DropConverterFromMbState(mbState); 348 memset(mbState, 0, sizeof(mbstate_t)); 349 if (wcDest != NULL) 350 *mbSource = NULL; 351 } else { 352 mbState->count = 0; 353 if (wcDest != NULL) 354 *mbSource = source; 355 } 356 357 return result; 358} 359 360 361status_t 362ICUCtypeData::WcharToMultibyte(char* mbOut, wchar_t wc, mbstate_t* mbState, 363 size_t& lengthOut) 364{ 365 UConverter* converter = NULL; 366 status_t result = _GetConverterForMbState(mbState, converter); 367 if (result != B_OK) { 368 TRACE(("WcharToMultibyte(): couldn't get converter for mbstate %p - " 369 "%" B_PRIx32 "\n", mbState, result)); 370 return result; 371 } 372 373 // convert input from UTF-32 to UTF-16 374 UChar ucharBuffer[2]; 375 size_t ucharLength; 376 if (U_IS_BMP(wc)) { 377 ucharBuffer[0] = wc; 378 ucharLength = 1; 379 } else { 380 ucharBuffer[0] = U16_LEAD(wc); 381 ucharBuffer[1] = U16_TRAIL(wc); 382 ucharLength = 2; 383 } 384 385 // do the actual conversion 386 UErrorCode icuStatus = U_ZERO_ERROR; 387 size_t mbLength = mbOut == NULL ? 0 : MB_CUR_MAX; 388 lengthOut = ucnv_fromUChars(converter, mbOut, mbLength, ucharBuffer, 389 ucharLength, &icuStatus); 390 TRACE(("WcharToMultibyte() l:%lu mb:%p ml:%lu uchar:%x st:%x\n", lengthOut, 391 mbOut, mbLength, wc, icuStatus)); 392 393 if (icuStatus == U_BUFFER_OVERFLOW_ERROR && mbOut == NULL) { 394 // we have no output buffer, so we ignore buffer overflows 395 icuStatus = U_ZERO_ERROR; 396 } 397 398 if (!U_SUCCESS(icuStatus)) { 399 if (icuStatus == U_ILLEGAL_ARGUMENT_ERROR) { 400 // bad converter (shouldn't really happen) 401 TRACE(("WcharToMultibyte(): bad converter\n")); 402 return B_BAD_VALUE; 403 } 404 405 // conversion failed because of illegal/unmappable character 406 TRACE(("WcharToMultibyte(): illegal character sequence\n")); 407 ucnv_resetFromUnicode(converter); 408 return B_BAD_DATA; 409 } 410 411 if (wc == 0) { 412 // reset to initial state 413 _DropConverterFromMbState(mbState); 414 memset(mbState, 0, sizeof(mbstate_t)); 415 } 416 417 return B_OK; 418} 419 420 421status_t 422ICUCtypeData::WcharStringToMultibyte(char* mbDest, size_t mbDestLength, 423 const wchar_t** wcSource, size_t wcSourceLength, mbstate_t* mbState, 424 size_t& lengthOut) 425{ 426 UConverter* converter = NULL; 427 status_t result = _GetConverterForMbState(mbState, converter); 428 if (result != B_OK) { 429 TRACE(("WcharStringToMultibyte(): couldn't get converter for mbstate %p" 430 " - %" B_PRIx32 "\n", mbState, result)); 431 return result; 432 } 433 434 bool mbsIsTerminated = false; 435 const UChar32* source = (UChar32*)*wcSource; 436 437 UErrorCode icuStatus = U_ZERO_ERROR; 438 lengthOut = 0; 439 size_t sourceLengthUsed = 0; 440 for (; sourceLengthUsed < wcSourceLength; ++sourceLengthUsed, ++source) { 441 if (mbDest != NULL && lengthOut >= mbDestLength) 442 break; 443 444 // convert input from UTF-32 to UTF-16 445 UChar ucharBuffer[2]; 446 size_t ucharLength; 447 if (U_IS_BMP(*source)) { 448 ucharBuffer[0] = *source; 449 ucharLength = 1; 450 } else { 451 ucharBuffer[0] = U16_LEAD(*source); 452 ucharBuffer[1] = U16_TRAIL(*source); 453 ucharLength = 2; 454 } 455 456 // do the actual conversion 457 size_t destLength = mbDest == NULL ? 0 : mbDestLength - lengthOut; 458 char buffer[MB_CUR_MAX]; 459 size_t mbLength = ucnv_fromUChars(converter, 460 mbDest == NULL ? NULL : buffer, destLength, ucharBuffer, 461 ucharLength, &icuStatus); 462 TRACE(("WcharStringToMultibyte() l:%lu mb:%p ml:%lu s:%p ul:%lu slu:%lu" 463 " uchar:%x st:%x\n", mbLength, mbDest, destLength, source, 464 ucharLength, sourceLengthUsed, *source, icuStatus)); 465 466 if (icuStatus == U_BUFFER_OVERFLOW_ERROR) { 467 // ignore buffer overflows ... 468 icuStatus = U_ZERO_ERROR; 469 // ... but stop if the output buffer has been exceeded 470 if (destLength > 0) 471 break; 472 } else if (mbDest != NULL) 473 memcpy(mbDest, buffer, mbLength); 474 475 if (!U_SUCCESS(icuStatus)) 476 break; 477 if (mbDest != NULL) 478 mbDest += mbLength; 479 if (*source == L'\0') { 480 mbsIsTerminated = true; 481 break; 482 } 483 lengthOut += mbLength; 484 icuStatus = U_ZERO_ERROR; 485 } 486 487 if (!U_SUCCESS(icuStatus)) { 488 // conversion failed because of illegal character sequence 489 TRACE(("WcharStringToMultibyte(): illegal character sequence\n")); 490 ucnv_resetFromUnicode(converter); 491 result = B_BAD_DATA; 492 if (mbDest != NULL) 493 *wcSource = *wcSource + sourceLengthUsed; 494 } else if (mbsIsTerminated) { 495 // reset to initial state 496 _DropConverterFromMbState(mbState); 497 memset(mbState, 0, sizeof(mbstate_t)); 498 if (mbDest != NULL) 499 *wcSource = NULL; 500 } else { 501 mbState->count = 0; 502 if (mbDest != NULL) 503 *wcSource = (wchar_t*)source; 504 } 505 506 return result; 507} 508 509 510const char* 511ICUCtypeData::GetLanginfo(int index) 512{ 513 switch(index) { 514 case CODESET: 515 return fGivenCharset; 516 default: 517 return ""; 518 } 519} 520 521 522status_t 523ICUCtypeData::_GetConverterForMbState(mbstate_t* mbState, 524 UConverter*& converterOut) 525{ 526 if (strcmp(mbState->charset, fGivenCharset) == 0 527 && (char*)mbState->converter >= mbState->data 528 && (char*)mbState->converter < mbState->data + 8) { 529 // charset matches and converter actually lives in *this* mbState, 530 // so we can use it (if the converter points to the outside, it means 531 // that the mbstate_t has been copied) 532 converterOut = (UConverter*)mbState->converter; 533 return B_OK; 534 } 535 536 // charset no longer matches the converter, we need to dump it and 537 // create a new one 538 _DropConverterFromMbState(mbState); 539 540 // create a new converter for the current charset ... 541 UConverter* icuConverter; 542 status_t result = _GetConverter(icuConverter); 543 if (result != B_OK) 544 return result; 545 546 // ... and clone it into the mbstate 547 UErrorCode icuStatus = U_ZERO_ERROR; 548 int32_t bufferSize = sizeof(mbState->data); 549 UConverter* clone 550 = ucnv_safeClone(icuConverter, mbState->data, &bufferSize, &icuStatus); 551 552 if (clone == NULL || !U_SUCCESS(icuStatus)) 553 return B_ERROR; 554 555 if ((char*)clone < mbState->data || (char*)clone >= mbState->data + 8) { 556 // buffer is too small (shouldn't happen according to ICU docs) 557 return B_NO_MEMORY; 558 } 559 560 strlcpy(mbState->charset, fGivenCharset, sizeof(mbState->charset)); 561 mbState->converter = clone; 562 563 converterOut = clone; 564 565 return B_OK; 566} 567 568 569status_t 570ICUCtypeData::_DropConverterFromMbState(mbstate_t* mbState) 571{ 572 if (mbState->converter != NULL && (char*)mbState->converter >= mbState->data 573 && (char*)mbState->converter < mbState->data + 8) { 574 // check that the converter actually lives in *this* mbState, 575 // otherwise we risk freeing a converter that doesn't belong to us; 576 // this parallels the check in _GetConverterForMbState() 577 ucnv_close((UConverter*)mbState->converter); 578 } 579 memset(mbState, 0, sizeof(mbstate_t)); 580 581 return B_OK; 582} 583 584 585} // namespace Libroot 586} // namespace BPrivate 587