1/*
2 * Copyright 2010-2011, Oliver Tappe, zooey@hirschkaefer.de.
3 * Distributed under the terms of the MIT License.
4 */
5
6
7#include "ICUCtypeData.h"
8
9#include <langinfo.h>
10#include <stdlib.h>
11#include <string.h>
12
13#include <algorithm>
14
15#include <unicode/uchar.h>
16#include <unicode/uvernum.h>
17
18#include <Debug.h>
19
20
21//#define TRACE_CTYPE
22#undef TRACE
23#ifdef TRACE_CTYPE
24#	include <OS.h>
25#	define TRACE(x) debug_printf x
26#else
27#	define TRACE(x) ;
28#endif
29
30
31U_NAMESPACE_USE
32
33
34namespace BPrivate {
35namespace Libroot {
36
37
38ICUCtypeData::ICUCtypeData(pthread_key_t tlsKey)
39	:
40	inherited(tlsKey),
41	fDataBridge(NULL)
42{
43}
44
45
46ICUCtypeData::~ICUCtypeData()
47{
48}
49
50
51void
52ICUCtypeData::Initialize(LocaleCtypeDataBridge* dataBridge)
53{
54	*dataBridge->addrOfClassInfoTable = &fClassInfo[128];
55	*dataBridge->addrOfToLowerTable = &fToLowerMap[128];
56	*dataBridge->addrOfToUpperTable = &fToUpperMap[128];
57	fDataBridge = dataBridge;
58}
59
60
61status_t
62ICUCtypeData::SetTo(const Locale& locale, const char* posixLocaleName)
63{
64	status_t result = inherited::SetTo(locale, posixLocaleName);
65	if (result != B_OK)
66		return result;
67
68	UErrorCode icuStatus = U_ZERO_ERROR;
69
70	UConverter* converter;
71	result = _GetConverter(converter);
72	if (result != B_OK)
73		return result;
74
75	ucnv_reset(converter);
76
77	fDataBridge->setMbCurMax(ucnv_getMaxCharSize(converter));
78
79	char buffer[] = { 0, 0 };
80	for (int i = 0; i < 256; ++i) {
81		const char* source = buffer;
82		buffer[0] = (char)i;
83		buffer[1] = '\0';
84		icuStatus = U_ZERO_ERROR;
85		UChar32 unicodeChar
86			= ucnv_getNextUChar(converter, &source, source + 1, &icuStatus);
87
88		unsigned short classInfo = 0;
89		unsigned int toLower = i;
90		unsigned int toUpper = i;
91		if (U_SUCCESS(icuStatus)) {
92			if (u_isblank(unicodeChar))
93				classInfo |= _ISblank;
94			if (u_charType(unicodeChar) == U_CONTROL_CHAR)
95				classInfo |= _IScntrl;
96			if (u_ispunct(unicodeChar))
97				classInfo |= _ISpunct;
98			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_ALNUM))
99				classInfo |= _ISalnum;
100			if (u_isUUppercase(unicodeChar))
101				classInfo |= _ISupper;
102			if (u_isULowercase(unicodeChar))
103				classInfo |= _ISlower;
104			if (u_isUAlphabetic(unicodeChar))
105				classInfo |= _ISalpha;
106			if (u_isdigit(unicodeChar))
107				classInfo |= _ISdigit;
108			if (u_isxdigit(unicodeChar))
109				classInfo |= _ISxdigit;
110			if (u_isUWhiteSpace(unicodeChar))
111				classInfo |= _ISspace;
112			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_PRINT))
113				classInfo |= _ISprint;
114			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_GRAPH))
115				classInfo |= _ISgraph;
116
117			UChar lowerChar = u_tolower(unicodeChar);
118			icuStatus = U_ZERO_ERROR;
119			ucnv_fromUChars(converter, buffer, 1, &lowerChar, 1, &icuStatus);
120			if (U_SUCCESS(icuStatus))
121				toLower = (unsigned char)buffer[0];
122
123			UChar upperChar = u_toupper(unicodeChar);
124			icuStatus = U_ZERO_ERROR;
125			ucnv_fromUChars(converter, buffer, 1, &upperChar, 1, &icuStatus);
126			if (U_SUCCESS(icuStatus))
127				toUpper = (unsigned char)buffer[0];
128		}
129		fClassInfo[i + 128] = classInfo;
130		fToLowerMap[i + 128] = toLower;
131		fToUpperMap[i + 128] = toUpper;
132		if (i >= 128 && i < 255) {
133			// mirror upper half at negative indices (except for -1 [=EOF])
134			fClassInfo[i - 128] = classInfo;
135			fToLowerMap[i - 128] = toLower;
136			fToUpperMap[i - 128] = toUpper;
137		}
138	}
139
140	return B_OK;
141}
142
143
144status_t
145ICUCtypeData::SetToPosix()
146{
147	status_t result = inherited::SetToPosix();
148
149	if (result == B_OK) {
150		memcpy(fClassInfo, fDataBridge->posixClassInfo, sizeof(fClassInfo));
151		memcpy(fToLowerMap, fDataBridge->posixToLowerMap, sizeof(fToLowerMap));
152		memcpy(fToUpperMap, fDataBridge->posixToUpperMap, sizeof(fToUpperMap));
153
154		fDataBridge->setMbCurMax(1);
155	}
156
157	return result;
158}
159
160
161int
162ICUCtypeData::IsWCType(wint_t wc, wctype_t charClass)
163{
164	if (wc == WEOF)
165		return 0;
166
167	switch (charClass) {
168		case _ISalnum:
169			return u_hasBinaryProperty(wc, UCHAR_POSIX_ALNUM);
170		case _ISalpha:
171			return u_isUAlphabetic(wc);
172		case _ISblank:
173			return u_isblank(wc);
174		case _IScntrl:
175			return u_charType(wc) == U_CONTROL_CHAR;
176		case _ISdigit:
177			return u_isdigit(wc);
178		case _ISgraph:
179			return u_hasBinaryProperty(wc, UCHAR_POSIX_GRAPH);
180		case _ISlower:
181			return u_isULowercase(wc);
182		case _ISprint:
183			return u_hasBinaryProperty(wc, UCHAR_POSIX_PRINT);
184		case _ISpunct:
185			return u_ispunct(wc);
186		case _ISspace:
187			return u_isUWhiteSpace(wc);
188		case _ISupper:
189			return u_isUUppercase(wc);
190		case _ISxdigit:
191			return u_isxdigit(wc);
192		default:
193			return 0;
194	}
195}
196
197
198status_t
199ICUCtypeData::ToWCTrans(wint_t wc, wctrans_t transition, wint_t& result)
200{
201	switch (transition) {
202		case _ISlower:
203			result = u_tolower(wc);
204			return B_OK;
205		case _ISupper:
206			result = u_toupper(wc);
207			return B_OK;
208		default:
209			return B_BAD_VALUE;
210	}
211}
212
213
214status_t
215ICUCtypeData::MultibyteToWchar(wchar_t* wcOut, const char* mb, size_t mbLen,
216	mbstate_t* mbState, size_t& lengthOut)
217{
218	UConverter* converter = NULL;
219	status_t result = _GetConverterForMbState(mbState, converter);
220	if (result != B_OK) {
221		TRACE(("MultibyteToWchar(): couldn't get converter for mbstate %p - "
222				"%" B_PRIx32 "\n", mbState, result));
223		return result;
224	}
225
226	// do the conversion
227	UErrorCode icuStatus = U_ZERO_ERROR;
228
229	const char* buffer = mb;
230	UChar targetBuffer[3];
231	UChar* target = targetBuffer;
232	ucnv_toUnicode(converter, &target, target + 1, &buffer, buffer + mbLen,
233		NULL, FALSE, &icuStatus);
234	size_t sourceLengthUsed = buffer - mb;
235	size_t targetLengthUsed = (size_t)(target - targetBuffer);
236
237	if (U16_IS_LEAD(targetBuffer[0])) {
238		// we have a surrogate pair, so re-read with enough space for a pair
239		// of characters instead
240		TRACE(("MultibyteToWchar(): have a surrogate pair\n"));
241		ucnv_resetToUnicode(converter);
242		buffer = mb;
243		target = targetBuffer;
244		ucnv_toUnicode(converter, &target, target + 2, &buffer, buffer + mbLen,
245			NULL, FALSE, &icuStatus);
246		sourceLengthUsed = buffer - mb;
247		targetLengthUsed = (size_t)(target - targetBuffer);
248	}
249
250	if (icuStatus == U_BUFFER_OVERFLOW_ERROR && targetLengthUsed > 0) {
251		// we've got one character, which is all that we wanted
252		icuStatus = U_ZERO_ERROR;
253	}
254
255	if (!U_SUCCESS(icuStatus)) {
256		// conversion failed because of illegal character sequence
257		TRACE(("MultibyteToWchar(): illegal character sequence\n"));
258		ucnv_resetToUnicode(converter);
259		result = B_BAD_DATA;
260	} else 	if (targetLengthUsed == 0) {
261		TRACE(("MultibyteToWchar(): incomplete character (len=%lu)\n", mbLen));
262		for (size_t i = 0; i < mbLen; ++i)
263			TRACE(("\tbyte %lu: %x\n", i, mb[i]));
264		mbState->count = sourceLengthUsed;
265		result = B_BAD_INDEX;
266	} else {
267		UChar32 unicodeChar = 0xBADBEEF;
268		U16_GET(targetBuffer, 0, 0, targetLengthUsed, unicodeChar);
269
270		if (unicodeChar == 0) {
271			// reset to initial state
272			_DropConverterFromMbState(mbState);
273			memset(mbState, 0, sizeof(mbstate_t));
274			lengthOut = 0;
275		} else {
276			mbState->count = 0;
277			lengthOut = sourceLengthUsed;
278		}
279
280		if (wcOut != NULL)
281			*wcOut = unicodeChar;
282
283		result = B_OK;
284	}
285
286	return result;
287}
288
289
290status_t
291ICUCtypeData::MultibyteStringToWchar(wchar_t* wcDest, size_t wcDestLength,
292	const char** mbSource, size_t mbSourceLength, mbstate_t* mbState,
293	size_t& lengthOut)
294{
295	UConverter* converter = NULL;
296	status_t result = _GetConverterForMbState(mbState, converter);
297	if (result != B_OK) {
298		TRACE(("MultibyteStringToWchar(): couldn't get converter for mbstate %p"
299				" - %" B_PRIx32 "\n", mbState, result));
300		return result;
301	}
302
303	bool wcsIsTerminated = false;
304	const char* source = *mbSource;
305	const char* sourceEnd = source + mbSourceLength;
306	if (sourceEnd < source) {
307		// overflow, clamp to highest possible address
308		sourceEnd = (const char*)-1;
309	}
310
311	if (wcDest == NULL) {
312		// if there's no destination buffer, there's no length limit either
313		wcDestLength = (size_t)-1;
314	}
315
316	UErrorCode icuStatus = U_ZERO_ERROR;
317	size_t sourceLengthUsed = 0;
318	for (lengthOut = 0; lengthOut < wcDestLength; ++lengthOut) {
319		if (sourceLengthUsed >= mbSourceLength)
320			break;
321		UChar32 unicodeChar = ucnv_getNextUChar(converter, &source,
322			std::min(source + MB_CUR_MAX, sourceEnd), &icuStatus);
323		TRACE(("MultibyteStringToWchar() l:%lu wl:%lu s:%p se:%p sl:%lu slu:%lu"
324				" uchar:%x st:%x\n", lengthOut, wcDestLength, source, sourceEnd,
325			mbSourceLength, sourceLengthUsed, unicodeChar, icuStatus));
326		if (!U_SUCCESS(icuStatus))
327			break;
328		sourceLengthUsed = source - *mbSource;
329		if (wcDest != NULL)
330			*wcDest++ = unicodeChar;
331		if (unicodeChar == L'\0') {
332			wcsIsTerminated = true;
333			break;
334		}
335		icuStatus = U_ZERO_ERROR;
336	}
337
338	if (!U_SUCCESS(icuStatus)) {
339		// conversion failed because of illegal character sequence
340		TRACE(("MultibyteStringToWchar(): illegal character sequence\n"));
341		ucnv_resetToUnicode(converter);
342		result = B_BAD_DATA;
343		if (wcDest != NULL)
344			*mbSource = *mbSource + sourceLengthUsed;
345	} else if (wcsIsTerminated) {
346		// reset to initial state
347		_DropConverterFromMbState(mbState);
348		memset(mbState, 0, sizeof(mbstate_t));
349		if (wcDest != NULL)
350			*mbSource = NULL;
351	} else {
352		mbState->count = 0;
353		if (wcDest != NULL)
354			*mbSource = source;
355	}
356
357	return result;
358}
359
360
361status_t
362ICUCtypeData::WcharToMultibyte(char* mbOut, wchar_t wc, mbstate_t* mbState,
363	size_t& lengthOut)
364{
365	UConverter* converter = NULL;
366	status_t result = _GetConverterForMbState(mbState, converter);
367	if (result != B_OK) {
368		TRACE(("WcharToMultibyte(): couldn't get converter for mbstate %p - "
369				"%" B_PRIx32 "\n", mbState, result));
370		return result;
371	}
372
373	// convert input from UTF-32 to UTF-16
374	UChar ucharBuffer[2];
375	size_t ucharLength;
376	if (U_IS_BMP(wc)) {
377		ucharBuffer[0] = wc;
378		ucharLength = 1;
379	} else {
380		ucharBuffer[0] = U16_LEAD(wc);
381		ucharBuffer[1] = U16_TRAIL(wc);
382		ucharLength = 2;
383	}
384
385	// do the actual conversion
386	UErrorCode icuStatus = U_ZERO_ERROR;
387	size_t mbLength = mbOut == NULL ? 0 : MB_CUR_MAX;
388	lengthOut = ucnv_fromUChars(converter, mbOut, mbLength, ucharBuffer,
389		ucharLength, &icuStatus);
390	TRACE(("WcharToMultibyte() l:%lu mb:%p ml:%lu uchar:%x st:%x\n", lengthOut,
391		mbOut, mbLength, wc, icuStatus));
392
393	if (icuStatus == U_BUFFER_OVERFLOW_ERROR && mbOut == NULL) {
394		// we have no output buffer, so we ignore buffer overflows
395		icuStatus = U_ZERO_ERROR;
396	}
397
398	if (!U_SUCCESS(icuStatus)) {
399		if (icuStatus == U_ILLEGAL_ARGUMENT_ERROR) {
400			// bad converter (shouldn't really happen)
401			TRACE(("WcharToMultibyte(): bad converter\n"));
402			return B_BAD_VALUE;
403		}
404
405		// conversion failed because of illegal/unmappable character
406		TRACE(("WcharToMultibyte(): illegal character sequence\n"));
407		ucnv_resetFromUnicode(converter);
408		return B_BAD_DATA;
409	}
410
411	if (wc == 0) {
412		// reset to initial state
413		_DropConverterFromMbState(mbState);
414		memset(mbState, 0, sizeof(mbstate_t));
415	}
416
417	return B_OK;
418}
419
420
421status_t
422ICUCtypeData::WcharStringToMultibyte(char* mbDest, size_t mbDestLength,
423	const wchar_t** wcSource, size_t wcSourceLength, mbstate_t* mbState,
424	size_t& lengthOut)
425{
426	UConverter* converter = NULL;
427	status_t result = _GetConverterForMbState(mbState, converter);
428	if (result != B_OK) {
429		TRACE(("WcharStringToMultibyte(): couldn't get converter for mbstate %p"
430			" - %" B_PRIx32 "\n", mbState, result));
431		return result;
432	}
433
434	bool mbsIsTerminated = false;
435	const UChar32* source = (UChar32*)*wcSource;
436
437	UErrorCode icuStatus = U_ZERO_ERROR;
438	lengthOut = 0;
439	size_t sourceLengthUsed = 0;
440	for (; sourceLengthUsed < wcSourceLength; ++sourceLengthUsed, ++source) {
441		if (mbDest != NULL && lengthOut >= mbDestLength)
442			break;
443
444		// convert input from UTF-32 to UTF-16
445		UChar ucharBuffer[2];
446		size_t ucharLength;
447		if (U_IS_BMP(*source)) {
448			ucharBuffer[0] = *source;
449			ucharLength = 1;
450		} else {
451			ucharBuffer[0] = U16_LEAD(*source);
452			ucharBuffer[1] = U16_TRAIL(*source);
453			ucharLength = 2;
454		}
455
456		// do the actual conversion
457		size_t destLength = mbDest == NULL ? 0 : mbDestLength - lengthOut;
458		char buffer[MB_CUR_MAX];
459		size_t mbLength = ucnv_fromUChars(converter,
460			mbDest == NULL ? NULL : buffer, destLength, ucharBuffer,
461			ucharLength, &icuStatus);
462		TRACE(("WcharStringToMultibyte() l:%lu mb:%p ml:%lu s:%p ul:%lu slu:%lu"
463				" uchar:%x st:%x\n", mbLength, mbDest, destLength, source,
464			ucharLength, sourceLengthUsed, *source, icuStatus));
465
466		if (icuStatus == U_BUFFER_OVERFLOW_ERROR) {
467			// ignore buffer overflows ...
468 			icuStatus = U_ZERO_ERROR;
469 			// ... but stop if the output buffer has been exceeded
470 			if (destLength > 0)
471 				break;
472		} else if (mbDest != NULL)
473			memcpy(mbDest, buffer, mbLength);
474
475		if (!U_SUCCESS(icuStatus))
476			break;
477		if (mbDest != NULL)
478			mbDest += mbLength;
479		if (*source == L'\0') {
480			mbsIsTerminated = true;
481			break;
482		}
483		lengthOut += mbLength;
484		icuStatus = U_ZERO_ERROR;
485	}
486
487	if (!U_SUCCESS(icuStatus)) {
488		// conversion failed because of illegal character sequence
489		TRACE(("WcharStringToMultibyte(): illegal character sequence\n"));
490		ucnv_resetFromUnicode(converter);
491		result = B_BAD_DATA;
492		if (mbDest != NULL)
493			*wcSource = *wcSource + sourceLengthUsed;
494	} else if (mbsIsTerminated) {
495		// reset to initial state
496		_DropConverterFromMbState(mbState);
497		memset(mbState, 0, sizeof(mbstate_t));
498		if (mbDest != NULL)
499			*wcSource = NULL;
500	} else {
501		mbState->count = 0;
502		if (mbDest != NULL)
503			*wcSource = (wchar_t*)source;
504	}
505
506	return result;
507}
508
509
510const char*
511ICUCtypeData::GetLanginfo(int index)
512{
513	switch(index) {
514		case CODESET:
515			return fGivenCharset;
516		default:
517			return "";
518	}
519}
520
521
522status_t
523ICUCtypeData::_GetConverterForMbState(mbstate_t* mbState,
524	UConverter*& converterOut)
525{
526	if (strcmp(mbState->charset, fGivenCharset) == 0
527			&& (char*)mbState->converter >= mbState->data
528			&& (char*)mbState->converter < mbState->data + 8) {
529		// charset matches and converter actually lives in *this* mbState,
530		// so we can use it (if the converter points to the outside, it means
531		// that the mbstate_t has been copied)
532		converterOut = (UConverter*)mbState->converter;
533		return B_OK;
534	}
535
536	// charset no longer matches the converter, we need to dump it and
537	// create a new one
538	_DropConverterFromMbState(mbState);
539
540	// create a new converter for the current charset ...
541	UConverter* icuConverter;
542	status_t result = _GetConverter(icuConverter);
543	if (result != B_OK)
544		return result;
545
546	// ... and clone it into the mbstate
547	UErrorCode icuStatus = U_ZERO_ERROR;
548	int32_t bufferSize = sizeof(mbState->data);
549	UConverter* clone
550		= ucnv_safeClone(icuConverter, mbState->data, &bufferSize, &icuStatus);
551
552	if (clone == NULL || !U_SUCCESS(icuStatus))
553		return B_ERROR;
554
555	if ((char*)clone < mbState->data || (char*)clone >= mbState->data + 8) {
556		// buffer is too small (shouldn't happen according to ICU docs)
557		return B_NO_MEMORY;
558	}
559
560	strlcpy(mbState->charset, fGivenCharset, sizeof(mbState->charset));
561	mbState->converter = clone;
562
563	converterOut = clone;
564
565	return B_OK;
566}
567
568
569status_t
570ICUCtypeData::_DropConverterFromMbState(mbstate_t* mbState)
571{
572	if (mbState->converter != NULL && (char*)mbState->converter >= mbState->data
573			&& (char*)mbState->converter < mbState->data + 8) {
574		// check that the converter actually lives in *this* mbState,
575		// otherwise we risk freeing a converter that doesn't belong to us;
576		// this parallels the check in _GetConverterForMbState()
577		ucnv_close((UConverter*)mbState->converter);
578	}
579	memset(mbState, 0, sizeof(mbstate_t));
580
581	return B_OK;
582}
583
584
585}	// namespace Libroot
586}	// namespace BPrivate
587