StringRef.cpp revision 263508
1//===-- StringRef.cpp - Lightweight String References ---------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9 10#include "llvm/ADT/StringRef.h" 11#include "llvm/ADT/APInt.h" 12#include "llvm/ADT/Hashing.h" 13#include "llvm/ADT/OwningPtr.h" 14#include "llvm/ADT/edit_distance.h" 15#include <bitset> 16 17using namespace llvm; 18 19// MSVC emits references to this into the translation units which reference it. 20#ifndef _MSC_VER 21const size_t StringRef::npos; 22#endif 23 24static char ascii_tolower(char x) { 25 if (x >= 'A' && x <= 'Z') 26 return x - 'A' + 'a'; 27 return x; 28} 29 30static char ascii_toupper(char x) { 31 if (x >= 'a' && x <= 'z') 32 return x - 'a' + 'A'; 33 return x; 34} 35 36static bool ascii_isdigit(char x) { 37 return x >= '0' && x <= '9'; 38} 39 40// strncasecmp() is not available on non-POSIX systems, so define an 41// alternative function here. 42static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 43 for (size_t I = 0; I < Length; ++I) { 44 unsigned char LHC = ascii_tolower(LHS[I]); 45 unsigned char RHC = ascii_tolower(RHS[I]); 46 if (LHC != RHC) 47 return LHC < RHC ? -1 : 1; 48 } 49 return 0; 50} 51 52/// compare_lower - Compare strings, ignoring case. 53int StringRef::compare_lower(StringRef RHS) const { 54 if (int Res = ascii_strncasecmp(Data, RHS.Data, min(Length, RHS.Length))) 55 return Res; 56 if (Length == RHS.Length) 57 return 0; 58 return Length < RHS.Length ? -1 : 1; 59} 60 61/// Check if this string starts with the given \p Prefix, ignoring case. 62bool StringRef::startswith_lower(StringRef Prefix) const { 63 return Length >= Prefix.Length && 64 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 65} 66 67/// Check if this string ends with the given \p Suffix, ignoring case. 68bool StringRef::endswith_lower(StringRef Suffix) const { 69 return Length >= Suffix.Length && 70 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 71} 72 73/// compare_numeric - Compare strings, handle embedded numbers. 74int StringRef::compare_numeric(StringRef RHS) const { 75 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 76 // Check for sequences of digits. 77 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 78 // The longer sequence of numbers is considered larger. 79 // This doesn't really handle prefixed zeros well. 80 size_t J; 81 for (J = I + 1; J != E + 1; ++J) { 82 bool ld = J < Length && ascii_isdigit(Data[J]); 83 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 84 if (ld != rd) 85 return rd ? -1 : 1; 86 if (!rd) 87 break; 88 } 89 // The two number sequences have the same length (J-I), just memcmp them. 90 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 91 return Res < 0 ? -1 : 1; 92 // Identical number sequences, continue search after the numbers. 93 I = J - 1; 94 continue; 95 } 96 if (Data[I] != RHS.Data[I]) 97 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 98 } 99 if (Length == RHS.Length) 100 return 0; 101 return Length < RHS.Length ? -1 : 1; 102} 103 104// Compute the edit distance between the two given strings. 105unsigned StringRef::edit_distance(llvm::StringRef Other, 106 bool AllowReplacements, 107 unsigned MaxEditDistance) const { 108 return llvm::ComputeEditDistance( 109 llvm::ArrayRef<char>(data(), size()), 110 llvm::ArrayRef<char>(Other.data(), Other.size()), 111 AllowReplacements, MaxEditDistance); 112} 113 114//===----------------------------------------------------------------------===// 115// String Operations 116//===----------------------------------------------------------------------===// 117 118std::string StringRef::lower() const { 119 std::string Result(size(), char()); 120 for (size_type i = 0, e = size(); i != e; ++i) { 121 Result[i] = ascii_tolower(Data[i]); 122 } 123 return Result; 124} 125 126std::string StringRef::upper() const { 127 std::string Result(size(), char()); 128 for (size_type i = 0, e = size(); i != e; ++i) { 129 Result[i] = ascii_toupper(Data[i]); 130 } 131 return Result; 132} 133 134//===----------------------------------------------------------------------===// 135// String Searching 136//===----------------------------------------------------------------------===// 137 138 139/// find - Search for the first string \arg Str in the string. 140/// 141/// \return - The index of the first occurrence of \arg Str, or npos if not 142/// found. 143size_t StringRef::find(StringRef Str, size_t From) const { 144 size_t N = Str.size(); 145 if (N > Length) 146 return npos; 147 148 // For short haystacks or unsupported needles fall back to the naive algorithm 149 if (Length < 16 || N > 255 || N == 0) { 150 for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) 151 if (substr(i, N).equals(Str)) 152 return i; 153 return npos; 154 } 155 156 if (From >= Length) 157 return npos; 158 159 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 160 uint8_t BadCharSkip[256]; 161 std::memset(BadCharSkip, N, 256); 162 for (unsigned i = 0; i != N-1; ++i) 163 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 164 165 unsigned Len = Length-From, Pos = From; 166 while (Len >= N) { 167 if (substr(Pos, N).equals(Str)) // See if this is the correct substring. 168 return Pos; 169 170 // Otherwise skip the appropriate number of bytes. 171 uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; 172 Len -= Skip; 173 Pos += Skip; 174 } 175 176 return npos; 177} 178 179/// rfind - Search for the last string \arg Str in the string. 180/// 181/// \return - The index of the last occurrence of \arg Str, or npos if not 182/// found. 183size_t StringRef::rfind(StringRef Str) const { 184 size_t N = Str.size(); 185 if (N > Length) 186 return npos; 187 for (size_t i = Length - N + 1, e = 0; i != e;) { 188 --i; 189 if (substr(i, N).equals(Str)) 190 return i; 191 } 192 return npos; 193} 194 195/// find_first_of - Find the first character in the string that is in \arg 196/// Chars, or npos if not found. 197/// 198/// Note: O(size() + Chars.size()) 199StringRef::size_type StringRef::find_first_of(StringRef Chars, 200 size_t From) const { 201 std::bitset<1 << CHAR_BIT> CharBits; 202 for (size_type i = 0; i != Chars.size(); ++i) 203 CharBits.set((unsigned char)Chars[i]); 204 205 for (size_type i = min(From, Length), e = Length; i != e; ++i) 206 if (CharBits.test((unsigned char)Data[i])) 207 return i; 208 return npos; 209} 210 211/// find_first_not_of - Find the first character in the string that is not 212/// \arg C or npos if not found. 213StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 214 for (size_type i = min(From, Length), e = Length; i != e; ++i) 215 if (Data[i] != C) 216 return i; 217 return npos; 218} 219 220/// find_first_not_of - Find the first character in the string that is not 221/// in the string \arg Chars, or npos if not found. 222/// 223/// Note: O(size() + Chars.size()) 224StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 225 size_t From) const { 226 std::bitset<1 << CHAR_BIT> CharBits; 227 for (size_type i = 0; i != Chars.size(); ++i) 228 CharBits.set((unsigned char)Chars[i]); 229 230 for (size_type i = min(From, Length), e = Length; i != e; ++i) 231 if (!CharBits.test((unsigned char)Data[i])) 232 return i; 233 return npos; 234} 235 236/// find_last_of - Find the last character in the string that is in \arg C, 237/// or npos if not found. 238/// 239/// Note: O(size() + Chars.size()) 240StringRef::size_type StringRef::find_last_of(StringRef Chars, 241 size_t From) const { 242 std::bitset<1 << CHAR_BIT> CharBits; 243 for (size_type i = 0; i != Chars.size(); ++i) 244 CharBits.set((unsigned char)Chars[i]); 245 246 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 247 if (CharBits.test((unsigned char)Data[i])) 248 return i; 249 return npos; 250} 251 252/// find_last_not_of - Find the last character in the string that is not 253/// \arg C, or npos if not found. 254StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 255 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 256 if (Data[i] != C) 257 return i; 258 return npos; 259} 260 261/// find_last_not_of - Find the last character in the string that is not in 262/// \arg Chars, or npos if not found. 263/// 264/// Note: O(size() + Chars.size()) 265StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 266 size_t From) const { 267 std::bitset<1 << CHAR_BIT> CharBits; 268 for (size_type i = 0, e = Chars.size(); i != e; ++i) 269 CharBits.set((unsigned char)Chars[i]); 270 271 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 272 if (!CharBits.test((unsigned char)Data[i])) 273 return i; 274 return npos; 275} 276 277void StringRef::split(SmallVectorImpl<StringRef> &A, 278 StringRef Separators, int MaxSplit, 279 bool KeepEmpty) const { 280 StringRef rest = *this; 281 282 // rest.data() is used to distinguish cases like "a," that splits into 283 // "a" + "" and "a" that splits into "a" + 0. 284 for (int splits = 0; 285 rest.data() != NULL && (MaxSplit < 0 || splits < MaxSplit); 286 ++splits) { 287 std::pair<StringRef, StringRef> p = rest.split(Separators); 288 289 if (KeepEmpty || p.first.size() != 0) 290 A.push_back(p.first); 291 rest = p.second; 292 } 293 // If we have a tail left, add it. 294 if (rest.data() != NULL && (rest.size() != 0 || KeepEmpty)) 295 A.push_back(rest); 296} 297 298//===----------------------------------------------------------------------===// 299// Helpful Algorithms 300//===----------------------------------------------------------------------===// 301 302/// count - Return the number of non-overlapped occurrences of \arg Str in 303/// the string. 304size_t StringRef::count(StringRef Str) const { 305 size_t Count = 0; 306 size_t N = Str.size(); 307 if (N > Length) 308 return 0; 309 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 310 if (substr(i, N).equals(Str)) 311 ++Count; 312 return Count; 313} 314 315static unsigned GetAutoSenseRadix(StringRef &Str) { 316 if (Str.startswith("0x")) { 317 Str = Str.substr(2); 318 return 16; 319 } 320 321 if (Str.startswith("0b")) { 322 Str = Str.substr(2); 323 return 2; 324 } 325 326 if (Str.startswith("0o")) { 327 Str = Str.substr(2); 328 return 8; 329 } 330 331 if (Str.startswith("0")) 332 return 8; 333 334 return 10; 335} 336 337 338/// GetAsUnsignedInteger - Workhorse method that converts a integer character 339/// sequence of radix up to 36 to an unsigned long long value. 340bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 341 unsigned long long &Result) { 342 // Autosense radix if not specified. 343 if (Radix == 0) 344 Radix = GetAutoSenseRadix(Str); 345 346 // Empty strings (after the radix autosense) are invalid. 347 if (Str.empty()) return true; 348 349 // Parse all the bytes of the string given this radix. Watch for overflow. 350 Result = 0; 351 while (!Str.empty()) { 352 unsigned CharVal; 353 if (Str[0] >= '0' && Str[0] <= '9') 354 CharVal = Str[0]-'0'; 355 else if (Str[0] >= 'a' && Str[0] <= 'z') 356 CharVal = Str[0]-'a'+10; 357 else if (Str[0] >= 'A' && Str[0] <= 'Z') 358 CharVal = Str[0]-'A'+10; 359 else 360 return true; 361 362 // If the parsed value is larger than the integer radix, the string is 363 // invalid. 364 if (CharVal >= Radix) 365 return true; 366 367 // Add in this character. 368 unsigned long long PrevResult = Result; 369 Result = Result*Radix+CharVal; 370 371 // Check for overflow by shifting back and seeing if bits were lost. 372 if (Result/Radix < PrevResult) 373 return true; 374 375 Str = Str.substr(1); 376 } 377 378 return false; 379} 380 381bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 382 long long &Result) { 383 unsigned long long ULLVal; 384 385 // Handle positive strings first. 386 if (Str.empty() || Str.front() != '-') { 387 if (getAsUnsignedInteger(Str, Radix, ULLVal) || 388 // Check for value so large it overflows a signed value. 389 (long long)ULLVal < 0) 390 return true; 391 Result = ULLVal; 392 return false; 393 } 394 395 // Get the positive part of the value. 396 if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || 397 // Reject values so large they'd overflow as negative signed, but allow 398 // "-0". This negates the unsigned so that the negative isn't undefined 399 // on signed overflow. 400 (long long)-ULLVal > 0) 401 return true; 402 403 Result = -ULLVal; 404 return false; 405} 406 407bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 408 StringRef Str = *this; 409 410 // Autosense radix if not specified. 411 if (Radix == 0) 412 Radix = GetAutoSenseRadix(Str); 413 414 assert(Radix > 1 && Radix <= 36); 415 416 // Empty strings (after the radix autosense) are invalid. 417 if (Str.empty()) return true; 418 419 // Skip leading zeroes. This can be a significant improvement if 420 // it means we don't need > 64 bits. 421 while (!Str.empty() && Str.front() == '0') 422 Str = Str.substr(1); 423 424 // If it was nothing but zeroes.... 425 if (Str.empty()) { 426 Result = APInt(64, 0); 427 return false; 428 } 429 430 // (Over-)estimate the required number of bits. 431 unsigned Log2Radix = 0; 432 while ((1U << Log2Radix) < Radix) Log2Radix++; 433 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 434 435 unsigned BitWidth = Log2Radix * Str.size(); 436 if (BitWidth < Result.getBitWidth()) 437 BitWidth = Result.getBitWidth(); // don't shrink the result 438 else if (BitWidth > Result.getBitWidth()) 439 Result = Result.zext(BitWidth); 440 441 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 442 if (!IsPowerOf2Radix) { 443 // These must have the same bit-width as Result. 444 RadixAP = APInt(BitWidth, Radix); 445 CharAP = APInt(BitWidth, 0); 446 } 447 448 // Parse all the bytes of the string given this radix. 449 Result = 0; 450 while (!Str.empty()) { 451 unsigned CharVal; 452 if (Str[0] >= '0' && Str[0] <= '9') 453 CharVal = Str[0]-'0'; 454 else if (Str[0] >= 'a' && Str[0] <= 'z') 455 CharVal = Str[0]-'a'+10; 456 else if (Str[0] >= 'A' && Str[0] <= 'Z') 457 CharVal = Str[0]-'A'+10; 458 else 459 return true; 460 461 // If the parsed value is larger than the integer radix, the string is 462 // invalid. 463 if (CharVal >= Radix) 464 return true; 465 466 // Add in this character. 467 if (IsPowerOf2Radix) { 468 Result <<= Log2Radix; 469 Result |= CharVal; 470 } else { 471 Result *= RadixAP; 472 CharAP = CharVal; 473 Result += CharAP; 474 } 475 476 Str = Str.substr(1); 477 } 478 479 return false; 480} 481 482 483// Implementation of StringRef hashing. 484hash_code llvm::hash_value(StringRef S) { 485 return hash_combine_range(S.begin(), S.end()); 486} 487