ConvertUTFWrapper.cpp revision 263508
1243791Sdim//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----=== 2243791Sdim// 3243791Sdim// The LLVM Compiler Infrastructure 4243791Sdim// 5243791Sdim// This file is distributed under the University of Illinois Open Source 6243791Sdim// License. See LICENSE.TXT for details. 7243791Sdim// 8243791Sdim//===----------------------------------------------------------------------===// 9243791Sdim 10243791Sdim#include "llvm/Support/ConvertUTF.h" 11243791Sdim#include "llvm/Support/SwapByteOrder.h" 12243791Sdim#include <string> 13243791Sdim#include <vector> 14243791Sdim 15243791Sdimnamespace llvm { 16243791Sdim 17252723Sdimbool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, 18243791Sdim char *&ResultPtr, const UTF8 *&ErrorPtr) { 19252723Sdim assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4); 20252723Sdim ConversionResult result = conversionOK; 21252723Sdim // Copy the character span over. 22243791Sdim if (WideCharWidth == 1) { 23243791Sdim const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin()); 24252723Sdim if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) { 25252723Sdim result = sourceIllegal; 26252723Sdim ErrorPtr = Pos; 27252723Sdim } else { 28252723Sdim memcpy(ResultPtr, Source.data(), Source.size()); 29243791Sdim ResultPtr += Source.size(); 30243791Sdim } 31243791Sdim } else if (WideCharWidth == 2) { 32243791Sdim const UTF8 *sourceStart = (const UTF8*)Source.data(); 33243791Sdim // FIXME: Make the type of the result buffer correct instead of 34243791Sdim // using reinterpret_cast. 35243791Sdim UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr); 36243791Sdim ConversionFlags flags = strictConversion; 37243791Sdim result = ConvertUTF8toUTF16( 38243791Sdim &sourceStart, sourceStart + Source.size(), 39243791Sdim &targetStart, targetStart + 2*Source.size(), flags); 40243791Sdim if (result == conversionOK) 41243791Sdim ResultPtr = reinterpret_cast<char*>(targetStart); 42243791Sdim else 43243791Sdim ErrorPtr = sourceStart; 44243791Sdim } else if (WideCharWidth == 4) { 45243791Sdim const UTF8 *sourceStart = (const UTF8*)Source.data(); 46243791Sdim // FIXME: Make the type of the result buffer correct instead of 47243791Sdim // using reinterpret_cast. 48243791Sdim UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr); 49243791Sdim ConversionFlags flags = strictConversion; 50243791Sdim result = ConvertUTF8toUTF32( 51243791Sdim &sourceStart, sourceStart + Source.size(), 52243791Sdim &targetStart, targetStart + 4*Source.size(), flags); 53243791Sdim if (result == conversionOK) 54243791Sdim ResultPtr = reinterpret_cast<char*>(targetStart); 55243791Sdim else 56243791Sdim ErrorPtr = sourceStart; 57243791Sdim } 58243791Sdim assert((result != targetExhausted) 59243791Sdim && "ConvertUTF8toUTFXX exhausted target buffer"); 60243791Sdim return result == conversionOK; 61243791Sdim} 62243791Sdim 63243791Sdimbool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) { 64243791Sdim const UTF32 *SourceStart = &Source; 65243791Sdim const UTF32 *SourceEnd = SourceStart + 1; 66243791Sdim UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr); 67243791Sdim UTF8 *TargetEnd = TargetStart + 4; 68243791Sdim ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd, 69243791Sdim &TargetStart, TargetEnd, 70243791Sdim strictConversion); 71243791Sdim if (CR != conversionOK) 72243791Sdim return false; 73243791Sdim 74243791Sdim ResultPtr = reinterpret_cast<char*>(TargetStart); 75243791Sdim return true; 76243791Sdim} 77243791Sdim 78243791Sdimbool hasUTF16ByteOrderMark(ArrayRef<char> S) { 79243791Sdim return (S.size() >= 2 && 80243791Sdim ((S[0] == '\xff' && S[1] == '\xfe') || 81243791Sdim (S[0] == '\xfe' && S[1] == '\xff'))); 82243791Sdim} 83243791Sdim 84243791Sdimbool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { 85243791Sdim assert(Out.empty()); 86243791Sdim 87243791Sdim // Error out on an uneven byte count. 88243791Sdim if (SrcBytes.size() % 2) 89243791Sdim return false; 90243791Sdim 91243791Sdim // Avoid OOB by returning early on empty input. 92243791Sdim if (SrcBytes.empty()) 93243791Sdim return true; 94243791Sdim 95243791Sdim const UTF16 *Src = reinterpret_cast<const UTF16 *>(SrcBytes.begin()); 96243791Sdim const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(SrcBytes.end()); 97243791Sdim 98243791Sdim // Byteswap if necessary. 99243791Sdim std::vector<UTF16> ByteSwapped; 100243791Sdim if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) { 101243791Sdim ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd); 102243791Sdim for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I) 103263509Sdim ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]); 104243791Sdim Src = &ByteSwapped[0]; 105243791Sdim SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1; 106243791Sdim } 107243791Sdim 108243791Sdim // Skip the BOM for conversion. 109243791Sdim if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE) 110243791Sdim Src++; 111243791Sdim 112243791Sdim // Just allocate enough space up front. We'll shrink it later. 113243791Sdim Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 114243791Sdim UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]); 115243791Sdim UTF8 *DstEnd = Dst + Out.size(); 116243791Sdim 117243791Sdim ConversionResult CR = 118243791Sdim ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion); 119243791Sdim assert(CR != targetExhausted); 120243791Sdim 121243791Sdim if (CR != conversionOK) { 122243791Sdim Out.clear(); 123243791Sdim return false; 124243791Sdim } 125243791Sdim 126243791Sdim Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]); 127243791Sdim return true; 128243791Sdim} 129243791Sdim 130243791Sdim} // end namespace llvm 131243791Sdim 132243791Sdim