1/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2 *
3 *                     The LLVM Compiler Infrastructure
4 *
5 * This file is distributed under the University of Illinois Open Source
6 * License. See LICENSE.TXT for details.
7 *
8 *===------------------------------------------------------------------------=*/
9/*
10 * Copyright 2001-2004 Unicode, Inc.
11 *
12 * Disclaimer
13 *
14 * This source code is provided as is by Unicode, Inc. No claims are
15 * made as to fitness for any particular purpose. No warranties of any
16 * kind are expressed or implied. The recipient agrees to determine
17 * applicability of information provided. If this file has been
18 * purchased on magnetic or optical media from Unicode, Inc., the
19 * sole remedy for any claim will be exchange of defective media
20 * within 90 days of receipt.
21 *
22 * Limitations on Rights to Redistribute This Code
23 *
24 * Unicode, Inc. hereby grants the right to freely use the information
25 * supplied in this file in the creation of products supporting the
26 * Unicode Standard, and to make copies of this file in any form
27 * for internal or external distribution as long as this notice
28 * remains attached.
29 */
30
31/* ---------------------------------------------------------------------
32
33    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
34    Author: Mark E. Davis, 1994.
35    Rev History: Rick McGowan, fixes & updates May 2001.
36    Sept 2001: fixed const & error conditions per
37        mods suggested by S. Parent & A. Lillich.
38    June 2002: Tim Dodd added detection and handling of incomplete
39        source sequences, enhanced error detection, added casts
40        to eliminate compiler warnings.
41    July 2003: slight mods to back out aggressive FFFE detection.
42    Jan 2004: updated switches in from-UTF8 conversions.
43    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
44
45    See the header file "ConvertUTF.h" for complete documentation.
46
47------------------------------------------------------------------------ */
48
49
50#include "llvm/Support/ConvertUTF.h"
51#ifdef CVTUTF_DEBUG
52#include <stdio.h>
53#endif
54
55static const int halfShift  = 10; /* used for shifting by 10 bits */
56
57static const UTF32 halfBase = 0x0010000UL;
58static const UTF32 halfMask = 0x3FFUL;
59
60#define UNI_SUR_HIGH_START  (UTF32)0xD800
61#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
62#define UNI_SUR_LOW_START   (UTF32)0xDC00
63#define UNI_SUR_LOW_END     (UTF32)0xDFFF
64#define false      0
65#define true        1
66
67/* --------------------------------------------------------------------- */
68
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Bytes 0x80..0xBF map to 0 here although they cannot start a sequence;
 * isLegalUTF8 rejects them separately.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. The
 * entries for 0xF8..0xFF are left as-is for anyone who may want to do such
 * conversion, which was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
86
/*
 * Magic values subtracted from the accumulated value during UTF-8 decoding,
 * indexed by the number of trailing bytes in the sequence (one entry per
 * possible count, 0..5).
 * The decoders add up the raw bytes, shifting left by 6 between them, so the
 * marker bits of the lead byte and the 0x80 of every continuation byte are
 * still mixed into the sum. Each entry is exactly that surplus, e.g.
 * 0x00003080 == (0xC0 << 6) + 0x80 for a two-byte sequence.
 */
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
94
/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes the sequence has in
 * total. The table is indexed by that total byte count, so entry 0 is
 * unused padding and there is one entry per UTF-8 sequence type
 * (i.e., one-byte sequence, two-byte... etc.). Remember that sequences
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
103
104/* --------------------------------------------------------------------- */
105
106/* The interface converts a whole buffer to avoid function-call overhead.
107 * Constants have been gathered. Loops & conditionals have been removed as
108 * much as possible for efficiency, in favor of drop-through switches.
109 * (See "Note A" at the bottom of the file for equivalent code.)
110 * If your compiler supports it, the "isLegalUTF8" call can be turned
111 * into an inline function.
112 */
113
114
115/* --------------------------------------------------------------------- */
116
117ConversionResult ConvertUTF32toUTF16 (
118        const UTF32** sourceStart, const UTF32* sourceEnd,
119        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
120    ConversionResult result = conversionOK;
121    const UTF32* source = *sourceStart;
122    UTF16* target = *targetStart;
123    while (source < sourceEnd) {
124        UTF32 ch;
125        if (target >= targetEnd) {
126            result = targetExhausted; break;
127        }
128        ch = *source++;
129        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
130            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
131            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
132                if (flags == strictConversion) {
133                    --source; /* return to the illegal value itself */
134                    result = sourceIllegal;
135                    break;
136                } else {
137                    *target++ = UNI_REPLACEMENT_CHAR;
138                }
139            } else {
140                *target++ = (UTF16)ch; /* normal case */
141            }
142        } else if (ch > UNI_MAX_LEGAL_UTF32) {
143            if (flags == strictConversion) {
144                result = sourceIllegal;
145            } else {
146                *target++ = UNI_REPLACEMENT_CHAR;
147            }
148        } else {
149            /* target is a character in range 0xFFFF - 0x10FFFF. */
150            if (target + 1 >= targetEnd) {
151                --source; /* Back up source pointer! */
152                result = targetExhausted; break;
153            }
154            ch -= halfBase;
155            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
156            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
157        }
158    }
159    *sourceStart = source;
160    *targetStart = target;
161    return result;
162}
163
164/* --------------------------------------------------------------------- */
165
166ConversionResult ConvertUTF16toUTF32 (
167        const UTF16** sourceStart, const UTF16* sourceEnd,
168        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
169    ConversionResult result = conversionOK;
170    const UTF16* source = *sourceStart;
171    UTF32* target = *targetStart;
172    UTF32 ch, ch2;
173    while (source < sourceEnd) {
174        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
175        ch = *source++;
176        /* If we have a surrogate pair, convert to UTF32 first. */
177        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
178            /* If the 16 bits following the high surrogate are in the source buffer... */
179            if (source < sourceEnd) {
180                ch2 = *source;
181                /* If it's a low surrogate, convert to UTF32. */
182                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
183                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
184                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
185                    ++source;
186                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
187                    --source; /* return to the illegal value itself */
188                    result = sourceIllegal;
189                    break;
190                }
191            } else { /* We don't have the 16 bits following the high surrogate. */
192                --source; /* return to the high surrogate */
193                result = sourceExhausted;
194                break;
195            }
196        } else if (flags == strictConversion) {
197            /* UTF-16 surrogate values are illegal in UTF-32 */
198            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
199                --source; /* return to the illegal value itself */
200                result = sourceIllegal;
201                break;
202            }
203        }
204        if (target >= targetEnd) {
205            source = oldSource; /* Back up source pointer! */
206            result = targetExhausted; break;
207        }
208        *target++ = ch;
209    }
210    *sourceStart = source;
211    *targetStart = target;
212#ifdef CVTUTF_DEBUG
213if (result == sourceIllegal) {
214    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
215    fflush(stderr);
216}
217#endif
218    return result;
219}
220ConversionResult ConvertUTF16toUTF8 (
221        const UTF16** sourceStart, const UTF16* sourceEnd,
222        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
223    ConversionResult result = conversionOK;
224    const UTF16* source = *sourceStart;
225    UTF8* target = *targetStart;
226    while (source < sourceEnd) {
227        UTF32 ch;
228        unsigned short bytesToWrite = 0;
229        const UTF32 byteMask = 0xBF;
230        const UTF32 byteMark = 0x80;
231        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
232        ch = *source++;
233        /* If we have a surrogate pair, convert to UTF32 first. */
234        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
235            /* If the 16 bits following the high surrogate are in the source buffer... */
236            if (source < sourceEnd) {
237                UTF32 ch2 = *source;
238                /* If it's a low surrogate, convert to UTF32. */
239                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
240                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
241                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
242                    ++source;
243                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
244                    --source; /* return to the illegal value itself */
245                    result = sourceIllegal;
246                    break;
247                }
248            } else { /* We don't have the 16 bits following the high surrogate. */
249                --source; /* return to the high surrogate */
250                result = sourceExhausted;
251                break;
252            }
253        } else if (flags == strictConversion) {
254            /* UTF-16 surrogate values are illegal in UTF-32 */
255            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
256                --source; /* return to the illegal value itself */
257                result = sourceIllegal;
258                break;
259            }
260        }
261        /* Figure out how many bytes the result will require */
262        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
263        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
264        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
265        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
266        } else {                            bytesToWrite = 3;
267                                            ch = UNI_REPLACEMENT_CHAR;
268        }
269
270        target += bytesToWrite;
271        if (target > targetEnd) {
272            source = oldSource; /* Back up source pointer! */
273            target -= bytesToWrite; result = targetExhausted; break;
274        }
275        switch (bytesToWrite) { /* note: everything falls through. */
276            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
277            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
278            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
279            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
280        }
281        target += bytesToWrite;
282    }
283    *sourceStart = source;
284    *targetStart = target;
285    return result;
286}
287
288/* --------------------------------------------------------------------- */
289
290ConversionResult ConvertUTF32toUTF8 (
291        const UTF32** sourceStart, const UTF32* sourceEnd,
292        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
293    ConversionResult result = conversionOK;
294    const UTF32* source = *sourceStart;
295    UTF8* target = *targetStart;
296    while (source < sourceEnd) {
297        UTF32 ch;
298        unsigned short bytesToWrite = 0;
299        const UTF32 byteMask = 0xBF;
300        const UTF32 byteMark = 0x80;
301        ch = *source++;
302        if (flags == strictConversion ) {
303            /* UTF-16 surrogate values are illegal in UTF-32 */
304            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
305                --source; /* return to the illegal value itself */
306                result = sourceIllegal;
307                break;
308            }
309        }
310        /*
311         * Figure out how many bytes the result will require. Turn any
312         * illegally large UTF32 things (> Plane 17) into replacement chars.
313         */
314        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
315        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
316        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
317        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
318        } else {                            bytesToWrite = 3;
319                                            ch = UNI_REPLACEMENT_CHAR;
320                                            result = sourceIllegal;
321        }
322
323        target += bytesToWrite;
324        if (target > targetEnd) {
325            --source; /* Back up source pointer! */
326            target -= bytesToWrite; result = targetExhausted; break;
327        }
328        switch (bytesToWrite) { /* note: everything falls through. */
329            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
330            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
331            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
332            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
333        }
334        target += bytesToWrite;
335    }
336    *sourceStart = source;
337    *targetStart = target;
338    return result;
339}
340
341/* --------------------------------------------------------------------- */
342
343/*
344 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
345 * This must be called with the length pre-determined by the first byte.
346 * If not calling this from ConvertUTF8to*, then the length can be set by:
347 *  length = trailingBytesForUTF8[*source]+1;
348 * and the sequence is illegal right away if there aren't that many bytes
349 * available.
350 * If presented with a length > 4, this returns false.  The Unicode
351 * definition of UTF-8 goes up to 4-byte sequences.
352 */
353
354static Boolean isLegalUTF8(const UTF8 *source, int length) {
355    UTF8 a;
356    const UTF8 *srcptr = source+length;
357    switch (length) {
358    default: return false;
359        /* Everything else falls through when "true"... */
360    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
361    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
362    case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
363
364        switch (*source) {
365            /* no fall-through in this inner switch */
366            case 0xE0: if (a < 0xA0) return false; break;
367            case 0xED: if (a > 0x9F) return false; break;
368            case 0xF0: if (a < 0x90) return false; break;
369            case 0xF4: if (a > 0x8F) return false; break;
370            default:   if (a < 0x80) return false;
371        }
372
373    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
374    }
375    if (*source > 0xF4) return false;
376    return true;
377}
378
379/* --------------------------------------------------------------------- */
380
381/*
382 * Exported function to return whether a UTF-8 sequence is legal or not.
383 * This is not used here; it's just exported.
384 */
385Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
386    int length = trailingBytesForUTF8[*source]+1;
387    if (length > sourceEnd - source) {
388        return false;
389    }
390    return isLegalUTF8(source, length);
391}
392
393/* --------------------------------------------------------------------- */
394
395/*
396 * Exported function to return the total number of bytes in a codepoint
397 * represented in UTF-8, given the value of the first byte.
398 */
399unsigned getNumBytesForUTF8(UTF8 first) {
400  return trailingBytesForUTF8[first] + 1;
401}
402
403/* --------------------------------------------------------------------- */
404
405/*
406 * Exported function to return whether a UTF-8 string is legal or not.
407 * This is not used here; it's just exported.
408 */
409Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
410    while (*source != sourceEnd) {
411        int length = trailingBytesForUTF8[**source] + 1;
412        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
413            return false;
414        *source += length;
415    }
416    return true;
417}
418
419/* --------------------------------------------------------------------- */
420
421ConversionResult ConvertUTF8toUTF16 (
422        const UTF8** sourceStart, const UTF8* sourceEnd,
423        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
424    ConversionResult result = conversionOK;
425    const UTF8* source = *sourceStart;
426    UTF16* target = *targetStart;
427    while (source < sourceEnd) {
428        UTF32 ch = 0;
429        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
430        if (extraBytesToRead >= sourceEnd - source) {
431            result = sourceExhausted; break;
432        }
433        /* Do this check whether lenient or strict */
434        if (!isLegalUTF8(source, extraBytesToRead+1)) {
435            result = sourceIllegal;
436            break;
437        }
438        /*
439         * The cases all fall through. See "Note A" below.
440         */
441        switch (extraBytesToRead) {
442            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
443            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
444            case 3: ch += *source++; ch <<= 6;
445            case 2: ch += *source++; ch <<= 6;
446            case 1: ch += *source++; ch <<= 6;
447            case 0: ch += *source++;
448        }
449        ch -= offsetsFromUTF8[extraBytesToRead];
450
451        if (target >= targetEnd) {
452            source -= (extraBytesToRead+1); /* Back up source pointer! */
453            result = targetExhausted; break;
454        }
455        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
456            /* UTF-16 surrogate values are illegal in UTF-32 */
457            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
458                if (flags == strictConversion) {
459                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
460                    result = sourceIllegal;
461                    break;
462                } else {
463                    *target++ = UNI_REPLACEMENT_CHAR;
464                }
465            } else {
466                *target++ = (UTF16)ch; /* normal case */
467            }
468        } else if (ch > UNI_MAX_UTF16) {
469            if (flags == strictConversion) {
470                result = sourceIllegal;
471                source -= (extraBytesToRead+1); /* return to the start */
472                break; /* Bail out; shouldn't continue */
473            } else {
474                *target++ = UNI_REPLACEMENT_CHAR;
475            }
476        } else {
477            /* target is a character in range 0xFFFF - 0x10FFFF. */
478            if (target + 1 >= targetEnd) {
479                source -= (extraBytesToRead+1); /* Back up source pointer! */
480                result = targetExhausted; break;
481            }
482            ch -= halfBase;
483            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
484            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
485        }
486    }
487    *sourceStart = source;
488    *targetStart = target;
489    return result;
490}
491
492/* --------------------------------------------------------------------- */
493
494ConversionResult ConvertUTF8toUTF32 (
495        const UTF8** sourceStart, const UTF8* sourceEnd,
496        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
497    ConversionResult result = conversionOK;
498    const UTF8* source = *sourceStart;
499    UTF32* target = *targetStart;
500    while (source < sourceEnd) {
501        UTF32 ch = 0;
502        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
503        if (extraBytesToRead >= sourceEnd - source) {
504            result = sourceExhausted; break;
505        }
506        /* Do this check whether lenient or strict */
507        if (!isLegalUTF8(source, extraBytesToRead+1)) {
508            result = sourceIllegal;
509            break;
510        }
511        /*
512         * The cases all fall through. See "Note A" below.
513         */
514        switch (extraBytesToRead) {
515            case 5: ch += *source++; ch <<= 6;
516            case 4: ch += *source++; ch <<= 6;
517            case 3: ch += *source++; ch <<= 6;
518            case 2: ch += *source++; ch <<= 6;
519            case 1: ch += *source++; ch <<= 6;
520            case 0: ch += *source++;
521        }
522        ch -= offsetsFromUTF8[extraBytesToRead];
523
524        if (target >= targetEnd) {
525            source -= (extraBytesToRead+1); /* Back up the source pointer! */
526            result = targetExhausted; break;
527        }
528        if (ch <= UNI_MAX_LEGAL_UTF32) {
529            /*
530             * UTF-16 surrogate values are illegal in UTF-32, and anything
531             * over Plane 17 (> 0x10FFFF) is illegal.
532             */
533            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
534                if (flags == strictConversion) {
535                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
536                    result = sourceIllegal;
537                    break;
538                } else {
539                    *target++ = UNI_REPLACEMENT_CHAR;
540                }
541            } else {
542                *target++ = ch;
543            }
544        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
545            result = sourceIllegal;
546            *target++ = UNI_REPLACEMENT_CHAR;
547        }
548    }
549    *sourceStart = source;
550    *targetStart = target;
551    return result;
552}
553
554/* ---------------------------------------------------------------------
555
556    Note A.
557    The fall-through switches in UTF-8 reading code save a
558    temp variable, some decrements & conditionals.  The switches
559    are equivalent to the following loop:
560        {
561            int tmpBytesToRead = extraBytesToRead+1;
562            do {
563                ch += *source++;
564                --tmpBytesToRead;
565                if (tmpBytesToRead) ch <<= 6;
566            } while (tmpBytesToRead > 0);
567        }
568    In UTF-8 writing code, the switches on "bytesToWrite" are
569    similarly unrolled loops.
570
571   --------------------------------------------------------------------- */
572