YAMLParser.cpp revision 263508
1//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements a YAML parser. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/Support/YAMLParser.h" 15#include "llvm/ADT/SmallVector.h" 16#include "llvm/ADT/StringExtras.h" 17#include "llvm/ADT/Twine.h" 18#include "llvm/ADT/ilist.h" 19#include "llvm/ADT/ilist_node.h" 20#include "llvm/Support/ErrorHandling.h" 21#include "llvm/Support/MemoryBuffer.h" 22#include "llvm/Support/SourceMgr.h" 23#include "llvm/Support/raw_ostream.h" 24 25using namespace llvm; 26using namespace yaml; 27 28enum UnicodeEncodingForm { 29 UEF_UTF32_LE, ///< UTF-32 Little Endian 30 UEF_UTF32_BE, ///< UTF-32 Big Endian 31 UEF_UTF16_LE, ///< UTF-16 Little Endian 32 UEF_UTF16_BE, ///< UTF-16 Big Endian 33 UEF_UTF8, ///< UTF-8 or ascii. 34 UEF_Unknown ///< Not a valid Unicode encoding. 35}; 36 37/// EncodingInfo - Holds the encoding type and length of the byte order mark if 38/// it exists. Length is in {0, 2, 3, 4}. 39typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 40 41/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 42/// encoding form of \a Input. 43/// 44/// @param Input A string of length 0 or more. 45/// @returns An EncodingInfo indicating the Unicode encoding form of the input 46/// and how long the byte order mark is if one exists. 47static EncodingInfo getUnicodeEncoding(StringRef Input) { 48 if (Input.size() == 0) 49 return std::make_pair(UEF_Unknown, 0); 50 51 switch (uint8_t(Input[0])) { 52 case 0x00: 53 if (Input.size() >= 4) { 54 if ( Input[1] == 0 55 && uint8_t(Input[2]) == 0xFE 56 && uint8_t(Input[3]) == 0xFF) 57 return std::make_pair(UEF_UTF32_BE, 4); 58 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 59 return std::make_pair(UEF_UTF32_BE, 0); 60 } 61 62 if (Input.size() >= 2 && Input[1] != 0) 63 return std::make_pair(UEF_UTF16_BE, 0); 64 return std::make_pair(UEF_Unknown, 0); 65 case 0xFF: 66 if ( Input.size() >= 4 67 && uint8_t(Input[1]) == 0xFE 68 && Input[2] == 0 69 && Input[3] == 0) 70 return std::make_pair(UEF_UTF32_LE, 4); 71 72 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 73 return std::make_pair(UEF_UTF16_LE, 2); 74 return std::make_pair(UEF_Unknown, 0); 75 case 0xFE: 76 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 77 return std::make_pair(UEF_UTF16_BE, 2); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xEF: 80 if ( Input.size() >= 3 81 && uint8_t(Input[1]) == 0xBB 82 && uint8_t(Input[2]) == 0xBF) 83 return std::make_pair(UEF_UTF8, 3); 84 return std::make_pair(UEF_Unknown, 0); 85 } 86 87 // It could still be utf-32 or utf-16. 88 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 89 return std::make_pair(UEF_UTF32_LE, 0); 90 91 if (Input.size() >= 2 && Input[1] == 0) 92 return std::make_pair(UEF_UTF16_LE, 0); 93 94 return std::make_pair(UEF_UTF8, 0); 95} 96 97namespace llvm { 98namespace yaml { 99/// Pin the vtables to this file. 100void Node::anchor() {} 101void NullNode::anchor() {} 102void ScalarNode::anchor() {} 103void KeyValueNode::anchor() {} 104void MappingNode::anchor() {} 105void SequenceNode::anchor() {} 106void AliasNode::anchor() {} 107 108/// Token - A single YAML token. 109struct Token : ilist_node<Token> { 110 enum TokenKind { 111 TK_Error, // Uninitialized token. 112 TK_StreamStart, 113 TK_StreamEnd, 114 TK_VersionDirective, 115 TK_TagDirective, 116 TK_DocumentStart, 117 TK_DocumentEnd, 118 TK_BlockEntry, 119 TK_BlockEnd, 120 TK_BlockSequenceStart, 121 TK_BlockMappingStart, 122 TK_FlowEntry, 123 TK_FlowSequenceStart, 124 TK_FlowSequenceEnd, 125 TK_FlowMappingStart, 126 TK_FlowMappingEnd, 127 TK_Key, 128 TK_Value, 129 TK_Scalar, 130 TK_Alias, 131 TK_Anchor, 132 TK_Tag 133 } Kind; 134 135 /// A string of length 0 or more whose begin() points to the logical location 136 /// of the token in the input. 137 StringRef Range; 138 139 Token() : Kind(TK_Error) {} 140}; 141} 142} 143 144namespace llvm { 145template<> 146struct ilist_sentinel_traits<Token> { 147 Token *createSentinel() const { 148 return &Sentinel; 149 } 150 static void destroySentinel(Token*) {} 151 152 Token *provideInitialHead() const { return createSentinel(); } 153 Token *ensureHead(Token*) const { return createSentinel(); } 154 static void noteHead(Token*, Token*) {} 155 156private: 157 mutable Token Sentinel; 158}; 159 160template<> 161struct ilist_node_traits<Token> { 162 Token *createNode(const Token &V) { 163 return new (Alloc.Allocate<Token>()) Token(V); 164 } 165 static void deleteNode(Token *V) {} 166 167 void addNodeToList(Token *) {} 168 void removeNodeFromList(Token *) {} 169 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 170 ilist_iterator<Token> /*first*/, 171 ilist_iterator<Token> /*last*/) {} 172 173 BumpPtrAllocator Alloc; 174}; 175} 176 177typedef ilist<Token> TokenQueueT; 178 179namespace { 180/// @brief This struct is used to track simple keys. 181/// 182/// Simple keys are handled by creating an entry in SimpleKeys for each Token 183/// which could legally be the start of a simple key. When peekNext is called, 184/// if the Token To be returned is referenced by a SimpleKey, we continue 185/// tokenizing until that potential simple key has either been found to not be 186/// a simple key (we moved on to the next line or went further than 1024 chars). 187/// Or when we run into a Value, and then insert a Key token (and possibly 188/// others) before the SimpleKey's Tok. 189struct SimpleKey { 190 TokenQueueT::iterator Tok; 191 unsigned Column; 192 unsigned Line; 193 unsigned FlowLevel; 194 bool IsRequired; 195 196 bool operator ==(const SimpleKey &Other) { 197 return Tok == Other.Tok; 198 } 199}; 200} 201 202/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 203/// subsequence and the subsequence's length in code units (uint8_t). 204/// A length of 0 represents an error. 205typedef std::pair<uint32_t, unsigned> UTF8Decoded; 206 207static UTF8Decoded decodeUTF8(StringRef Range) { 208 StringRef::iterator Position= Range.begin(); 209 StringRef::iterator End = Range.end(); 210 // 1 byte: [0x00, 0x7f] 211 // Bit pattern: 0xxxxxxx 212 if ((*Position & 0x80) == 0) { 213 return std::make_pair(*Position, 1); 214 } 215 // 2 bytes: [0x80, 0x7ff] 216 // Bit pattern: 110xxxxx 10xxxxxx 217 if (Position + 1 != End && 218 ((*Position & 0xE0) == 0xC0) && 219 ((*(Position + 1) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x1F) << 6) | 221 (*(Position + 1) & 0x3F); 222 if (codepoint >= 0x80) 223 return std::make_pair(codepoint, 2); 224 } 225 // 3 bytes: [0x8000, 0xffff] 226 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 227 if (Position + 2 != End && 228 ((*Position & 0xF0) == 0xE0) && 229 ((*(Position + 1) & 0xC0) == 0x80) && 230 ((*(Position + 2) & 0xC0) == 0x80)) { 231 uint32_t codepoint = ((*Position & 0x0F) << 12) | 232 ((*(Position + 1) & 0x3F) << 6) | 233 (*(Position + 2) & 0x3F); 234 // Codepoints between 0xD800 and 0xDFFF are invalid, as 235 // they are high / low surrogate halves used by UTF-16. 236 if (codepoint >= 0x800 && 237 (codepoint < 0xD800 || codepoint > 0xDFFF)) 238 return std::make_pair(codepoint, 3); 239 } 240 // 4 bytes: [0x10000, 0x10FFFF] 241 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 242 if (Position + 3 != End && 243 ((*Position & 0xF8) == 0xF0) && 244 ((*(Position + 1) & 0xC0) == 0x80) && 245 ((*(Position + 2) & 0xC0) == 0x80) && 246 ((*(Position + 3) & 0xC0) == 0x80)) { 247 uint32_t codepoint = ((*Position & 0x07) << 18) | 248 ((*(Position + 1) & 0x3F) << 12) | 249 ((*(Position + 2) & 0x3F) << 6) | 250 (*(Position + 3) & 0x3F); 251 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 252 return std::make_pair(codepoint, 4); 253 } 254 return std::make_pair(0, 0); 255} 256 257namespace llvm { 258namespace yaml { 259/// @brief Scans YAML tokens from a MemoryBuffer. 260class Scanner { 261public: 262 Scanner(const StringRef Input, SourceMgr &SM); 263 Scanner(MemoryBuffer *Buffer, SourceMgr &SM_); 264 265 /// @brief Parse the next token and return it without popping it. 266 Token &peekNext(); 267 268 /// @brief Parse the next token and pop it from the queue. 269 Token getNext(); 270 271 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 272 ArrayRef<SMRange> Ranges = None) { 273 SM.PrintMessage(Loc, Kind, Message, Ranges); 274 } 275 276 void setError(const Twine &Message, StringRef::iterator Position) { 277 if (Current >= End) 278 Current = End - 1; 279 280 // Don't print out more errors after the first one we encounter. The rest 281 // are just the result of the first, and have no meaning. 282 if (!Failed) 283 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 284 Failed = true; 285 } 286 287 void setError(const Twine &Message) { 288 setError(Message, Current); 289 } 290 291 /// @brief Returns true if an error occurred while parsing. 292 bool failed() { 293 return Failed; 294 } 295 296private: 297 StringRef currentInput() { 298 return StringRef(Current, End - Current); 299 } 300 301 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 302 /// at \a Position. 303 /// 304 /// If the UTF-8 code units starting at Position do not form a well-formed 305 /// code unit subsequence, then the Unicode scalar value is 0, and the length 306 /// is 0. 307 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 308 return ::decodeUTF8(StringRef(Position, End - Position)); 309 } 310 311 // The following functions are based on the gramar rules in the YAML spec. The 312 // style of the function names it meant to closely match how they are written 313 // in the spec. The number within the [] is the number of the grammar rule in 314 // the spec. 315 // 316 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 317 // 318 // c- 319 // A production starting and ending with a special character. 320 // b- 321 // A production matching a single line break. 322 // nb- 323 // A production starting and ending with a non-break character. 324 // s- 325 // A production starting and ending with a white space character. 326 // ns- 327 // A production starting and ending with a non-space character. 328 // l- 329 // A production matching complete line(s). 330 331 /// @brief Skip a single nb-char[27] starting at Position. 332 /// 333 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 334 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 335 /// 336 /// @returns The code unit after the nb-char, or Position if it's not an 337 /// nb-char. 338 StringRef::iterator skip_nb_char(StringRef::iterator Position); 339 340 /// @brief Skip a single b-break[28] starting at Position. 341 /// 342 /// A b-break is 0xD 0xA | 0xD | 0xA 343 /// 344 /// @returns The code unit after the b-break, or Position if it's not a 345 /// b-break. 346 StringRef::iterator skip_b_break(StringRef::iterator Position); 347 348 /// @brief Skip a single s-white[33] starting at Position. 349 /// 350 /// A s-white is 0x20 | 0x9 351 /// 352 /// @returns The code unit after the s-white, or Position if it's not a 353 /// s-white. 354 StringRef::iterator skip_s_white(StringRef::iterator Position); 355 356 /// @brief Skip a single ns-char[34] starting at Position. 357 /// 358 /// A ns-char is nb-char - s-white 359 /// 360 /// @returns The code unit after the ns-char, or Position if it's not a 361 /// ns-char. 362 StringRef::iterator skip_ns_char(StringRef::iterator Position); 363 364 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 365 /// @brief Skip minimal well-formed code unit subsequences until Func 366 /// returns its input. 367 /// 368 /// @returns The code unit after the last minimal well-formed code unit 369 /// subsequence that Func accepted. 370 StringRef::iterator skip_while( SkipWhileFunc Func 371 , StringRef::iterator Position); 372 373 /// @brief Scan ns-uri-char[39]s starting at Cur. 374 /// 375 /// This updates Cur and Column while scanning. 376 /// 377 /// @returns A StringRef starting at Cur which covers the longest contiguous 378 /// sequence of ns-uri-char. 379 StringRef scan_ns_uri_char(); 380 381 /// @brief Scan ns-plain-one-line[133] starting at \a Cur. 382 StringRef scan_ns_plain_one_line(); 383 384 /// @brief Consume a minimal well-formed code unit subsequence starting at 385 /// \a Cur. Return false if it is not the same Unicode scalar value as 386 /// \a Expected. This updates \a Column. 387 bool consume(uint32_t Expected); 388 389 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 390 void skip(uint32_t Distance); 391 392 /// @brief Return true if the minimal well-formed code unit subsequence at 393 /// Pos is whitespace or a new line 394 bool isBlankOrBreak(StringRef::iterator Position); 395 396 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 397 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 398 , unsigned AtColumn 399 , bool IsRequired); 400 401 /// @brief Remove simple keys that can no longer be valid simple keys. 402 /// 403 /// Invalid simple keys are not on the current line or are further than 1024 404 /// columns back. 405 void removeStaleSimpleKeyCandidates(); 406 407 /// @brief Remove all simple keys on FlowLevel \a Level. 408 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 409 410 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 411 /// tokens if needed. 412 bool unrollIndent(int ToColumn); 413 414 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 415 /// if needed. 416 bool rollIndent( int ToColumn 417 , Token::TokenKind Kind 418 , TokenQueueT::iterator InsertPoint); 419 420 /// @brief Skip whitespace and comments until the start of the next token. 421 void scanToNextToken(); 422 423 /// @brief Must be the first token generated. 424 bool scanStreamStart(); 425 426 /// @brief Generate tokens needed to close out the stream. 427 bool scanStreamEnd(); 428 429 /// @brief Scan a %BLAH directive. 430 bool scanDirective(); 431 432 /// @brief Scan a ... or ---. 433 bool scanDocumentIndicator(bool IsStart); 434 435 /// @brief Scan a [ or { and generate the proper flow collection start token. 436 bool scanFlowCollectionStart(bool IsSequence); 437 438 /// @brief Scan a ] or } and generate the proper flow collection end token. 439 bool scanFlowCollectionEnd(bool IsSequence); 440 441 /// @brief Scan the , that separates entries in a flow collection. 442 bool scanFlowEntry(); 443 444 /// @brief Scan the - that starts block sequence entries. 445 bool scanBlockEntry(); 446 447 /// @brief Scan an explicit ? indicating a key. 448 bool scanKey(); 449 450 /// @brief Scan an explicit : indicating a value. 451 bool scanValue(); 452 453 /// @brief Scan a quoted scalar. 454 bool scanFlowScalar(bool IsDoubleQuoted); 455 456 /// @brief Scan an unquoted scalar. 457 bool scanPlainScalar(); 458 459 /// @brief Scan an Alias or Anchor starting with * or &. 460 bool scanAliasOrAnchor(bool IsAlias); 461 462 /// @brief Scan a block scalar starting with | or >. 463 bool scanBlockScalar(bool IsLiteral); 464 465 /// @brief Scan a tag of the form !stuff. 466 bool scanTag(); 467 468 /// @brief Dispatch to the next scanning function based on \a *Cur. 469 bool fetchMoreTokens(); 470 471 /// @brief The SourceMgr used for diagnostics and buffer management. 472 SourceMgr &SM; 473 474 /// @brief The original input. 475 MemoryBuffer *InputBuffer; 476 477 /// @brief The current position of the scanner. 478 StringRef::iterator Current; 479 480 /// @brief The end of the input (one past the last character). 481 StringRef::iterator End; 482 483 /// @brief Current YAML indentation level in spaces. 484 int Indent; 485 486 /// @brief Current column number in Unicode code points. 487 unsigned Column; 488 489 /// @brief Current line number. 490 unsigned Line; 491 492 /// @brief How deep we are in flow style containers. 0 Means at block level. 493 unsigned FlowLevel; 494 495 /// @brief Are we at the start of the stream? 496 bool IsStartOfStream; 497 498 /// @brief Can the next token be the start of a simple key? 499 bool IsSimpleKeyAllowed; 500 501 /// @brief True if an error has occurred. 502 bool Failed; 503 504 /// @brief Queue of tokens. This is required to queue up tokens while looking 505 /// for the end of a simple key. And for cases where a single character 506 /// can produce multiple tokens (e.g. BlockEnd). 507 TokenQueueT TokenQueue; 508 509 /// @brief Indentation levels. 510 SmallVector<int, 4> Indents; 511 512 /// @brief Potential simple keys. 513 SmallVector<SimpleKey, 4> SimpleKeys; 514}; 515 516} // end namespace yaml 517} // end namespace llvm 518 519/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 520static void encodeUTF8( uint32_t UnicodeScalarValue 521 , SmallVectorImpl<char> &Result) { 522 if (UnicodeScalarValue <= 0x7F) { 523 Result.push_back(UnicodeScalarValue & 0x7F); 524 } else if (UnicodeScalarValue <= 0x7FF) { 525 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 526 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 527 Result.push_back(FirstByte); 528 Result.push_back(SecondByte); 529 } else if (UnicodeScalarValue <= 0xFFFF) { 530 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 531 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 532 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 533 Result.push_back(FirstByte); 534 Result.push_back(SecondByte); 535 Result.push_back(ThirdByte); 536 } else if (UnicodeScalarValue <= 0x10FFFF) { 537 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 538 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 539 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 540 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 541 Result.push_back(FirstByte); 542 Result.push_back(SecondByte); 543 Result.push_back(ThirdByte); 544 Result.push_back(FourthByte); 545 } 546} 547 548bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 549 SourceMgr SM; 550 Scanner scanner(Input, SM); 551 while (true) { 552 Token T = scanner.getNext(); 553 switch (T.Kind) { 554 case Token::TK_StreamStart: 555 OS << "Stream-Start: "; 556 break; 557 case Token::TK_StreamEnd: 558 OS << "Stream-End: "; 559 break; 560 case Token::TK_VersionDirective: 561 OS << "Version-Directive: "; 562 break; 563 case Token::TK_TagDirective: 564 OS << "Tag-Directive: "; 565 break; 566 case Token::TK_DocumentStart: 567 OS << "Document-Start: "; 568 break; 569 case Token::TK_DocumentEnd: 570 OS << "Document-End: "; 571 break; 572 case Token::TK_BlockEntry: 573 OS << "Block-Entry: "; 574 break; 575 case Token::TK_BlockEnd: 576 OS << "Block-End: "; 577 break; 578 case Token::TK_BlockSequenceStart: 579 OS << "Block-Sequence-Start: "; 580 break; 581 case Token::TK_BlockMappingStart: 582 OS << "Block-Mapping-Start: "; 583 break; 584 case Token::TK_FlowEntry: 585 OS << "Flow-Entry: "; 586 break; 587 case Token::TK_FlowSequenceStart: 588 OS << "Flow-Sequence-Start: "; 589 break; 590 case Token::TK_FlowSequenceEnd: 591 OS << "Flow-Sequence-End: "; 592 break; 593 case Token::TK_FlowMappingStart: 594 OS << "Flow-Mapping-Start: "; 595 break; 596 case Token::TK_FlowMappingEnd: 597 OS << "Flow-Mapping-End: "; 598 break; 599 case Token::TK_Key: 600 OS << "Key: "; 601 break; 602 case Token::TK_Value: 603 OS << "Value: "; 604 break; 605 case Token::TK_Scalar: 606 OS << "Scalar: "; 607 break; 608 case Token::TK_Alias: 609 OS << "Alias: "; 610 break; 611 case Token::TK_Anchor: 612 OS << "Anchor: "; 613 break; 614 case Token::TK_Tag: 615 OS << "Tag: "; 616 break; 617 case Token::TK_Error: 618 break; 619 } 620 OS << T.Range << "\n"; 621 if (T.Kind == Token::TK_StreamEnd) 622 break; 623 else if (T.Kind == Token::TK_Error) 624 return false; 625 } 626 return true; 627} 628 629bool yaml::scanTokens(StringRef Input) { 630 llvm::SourceMgr SM; 631 llvm::yaml::Scanner scanner(Input, SM); 632 for (;;) { 633 llvm::yaml::Token T = scanner.getNext(); 634 if (T.Kind == Token::TK_StreamEnd) 635 break; 636 else if (T.Kind == Token::TK_Error) 637 return false; 638 } 639 return true; 640} 641 642std::string yaml::escape(StringRef Input) { 643 std::string EscapedInput; 644 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 645 if (*i == '\\') 646 EscapedInput += "\\\\"; 647 else if (*i == '"') 648 EscapedInput += "\\\""; 649 else if (*i == 0) 650 EscapedInput += "\\0"; 651 else if (*i == 0x07) 652 EscapedInput += "\\a"; 653 else if (*i == 0x08) 654 EscapedInput += "\\b"; 655 else if (*i == 0x09) 656 EscapedInput += "\\t"; 657 else if (*i == 0x0A) 658 EscapedInput += "\\n"; 659 else if (*i == 0x0B) 660 EscapedInput += "\\v"; 661 else if (*i == 0x0C) 662 EscapedInput += "\\f"; 663 else if (*i == 0x0D) 664 EscapedInput += "\\r"; 665 else if (*i == 0x1B) 666 EscapedInput += "\\e"; 667 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 668 std::string HexStr = utohexstr(*i); 669 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 670 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 671 UTF8Decoded UnicodeScalarValue 672 = decodeUTF8(StringRef(i, Input.end() - i)); 673 if (UnicodeScalarValue.second == 0) { 674 // Found invalid char. 675 SmallString<4> Val; 676 encodeUTF8(0xFFFD, Val); 677 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 678 // FIXME: Error reporting. 679 return EscapedInput; 680 } 681 if (UnicodeScalarValue.first == 0x85) 682 EscapedInput += "\\N"; 683 else if (UnicodeScalarValue.first == 0xA0) 684 EscapedInput += "\\_"; 685 else if (UnicodeScalarValue.first == 0x2028) 686 EscapedInput += "\\L"; 687 else if (UnicodeScalarValue.first == 0x2029) 688 EscapedInput += "\\P"; 689 else { 690 std::string HexStr = utohexstr(UnicodeScalarValue.first); 691 if (HexStr.size() <= 2) 692 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 693 else if (HexStr.size() <= 4) 694 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 695 else if (HexStr.size() <= 8) 696 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 697 } 698 i += UnicodeScalarValue.second - 1; 699 } else 700 EscapedInput.push_back(*i); 701 } 702 return EscapedInput; 703} 704 705Scanner::Scanner(StringRef Input, SourceMgr &sm) 706 : SM(sm) 707 , Indent(-1) 708 , Column(0) 709 , Line(0) 710 , FlowLevel(0) 711 , IsStartOfStream(true) 712 , IsSimpleKeyAllowed(true) 713 , Failed(false) { 714 InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); 715 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 716 Current = InputBuffer->getBufferStart(); 717 End = InputBuffer->getBufferEnd(); 718} 719 720Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_) 721 : SM(SM_) 722 , InputBuffer(Buffer) 723 , Current(InputBuffer->getBufferStart()) 724 , End(InputBuffer->getBufferEnd()) 725 , Indent(-1) 726 , Column(0) 727 , Line(0) 728 , FlowLevel(0) 729 , IsStartOfStream(true) 730 , IsSimpleKeyAllowed(true) 731 , Failed(false) { 732 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 733} 734 735Token &Scanner::peekNext() { 736 // If the current token is a possible simple key, keep parsing until we 737 // can confirm. 738 bool NeedMore = false; 739 while (true) { 740 if (TokenQueue.empty() || NeedMore) { 741 if (!fetchMoreTokens()) { 742 TokenQueue.clear(); 743 TokenQueue.push_back(Token()); 744 return TokenQueue.front(); 745 } 746 } 747 assert(!TokenQueue.empty() && 748 "fetchMoreTokens lied about getting tokens!"); 749 750 removeStaleSimpleKeyCandidates(); 751 SimpleKey SK; 752 SK.Tok = TokenQueue.front(); 753 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 754 == SimpleKeys.end()) 755 break; 756 else 757 NeedMore = true; 758 } 759 return TokenQueue.front(); 760} 761 762Token Scanner::getNext() { 763 Token Ret = peekNext(); 764 // TokenQueue can be empty if there was an error getting the next token. 765 if (!TokenQueue.empty()) 766 TokenQueue.pop_front(); 767 768 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 769 // quick deallocation of them all. 770 if (TokenQueue.empty()) { 771 TokenQueue.Alloc.Reset(); 772 } 773 774 return Ret; 775} 776 777StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 778 if (Position == End) 779 return Position; 780 // Check 7 bit c-printable - b-char. 781 if ( *Position == 0x09 782 || (*Position >= 0x20 && *Position <= 0x7E)) 783 return Position + 1; 784 785 // Check for valid UTF-8. 786 if (uint8_t(*Position) & 0x80) { 787 UTF8Decoded u8d = decodeUTF8(Position); 788 if ( u8d.second != 0 789 && u8d.first != 0xFEFF 790 && ( u8d.first == 0x85 791 || ( u8d.first >= 0xA0 792 && u8d.first <= 0xD7FF) 793 || ( u8d.first >= 0xE000 794 && u8d.first <= 0xFFFD) 795 || ( u8d.first >= 0x10000 796 && u8d.first <= 0x10FFFF))) 797 return Position + u8d.second; 798 } 799 return Position; 800} 801 802StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 803 if (Position == End) 804 return Position; 805 if (*Position == 0x0D) { 806 if (Position + 1 != End && *(Position + 1) == 0x0A) 807 return Position + 2; 808 return Position + 1; 809 } 810 811 if (*Position == 0x0A) 812 return Position + 1; 813 return Position; 814} 815 816 817StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 818 if (Position == End) 819 return Position; 820 if (*Position == ' ' || *Position == '\t') 821 return Position + 1; 822 return Position; 823} 824 825StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 826 if (Position == End) 827 return Position; 828 if (*Position == ' ' || *Position == '\t') 829 return Position; 830 return skip_nb_char(Position); 831} 832 833StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 834 , StringRef::iterator Position) { 835 while (true) { 836 StringRef::iterator i = (this->*Func)(Position); 837 if (i == Position) 838 break; 839 Position = i; 840 } 841 return Position; 842} 843 844static bool is_ns_hex_digit(const char C) { 845 return (C >= '0' && C <= '9') 846 || (C >= 'a' && C <= 'z') 847 || (C >= 'A' && C <= 'Z'); 848} 849 850static bool is_ns_word_char(const char C) { 851 return C == '-' 852 || (C >= 'a' && C <= 'z') 853 || (C >= 'A' && C <= 'Z'); 854} 855 856StringRef Scanner::scan_ns_uri_char() { 857 StringRef::iterator Start = Current; 858 while (true) { 859 if (Current == End) 860 break; 861 if (( *Current == '%' 862 && Current + 2 < End 863 && is_ns_hex_digit(*(Current + 1)) 864 && is_ns_hex_digit(*(Current + 2))) 865 || is_ns_word_char(*Current) 866 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 867 != StringRef::npos) { 868 ++Current; 869 ++Column; 870 } else 871 break; 872 } 873 return StringRef(Start, Current - Start); 874} 875 876StringRef Scanner::scan_ns_plain_one_line() { 877 StringRef::iterator start = Current; 878 // The first character must already be verified. 879 ++Current; 880 while (true) { 881 if (Current == End) { 882 break; 883 } else if (*Current == ':') { 884 // Check if the next character is a ns-char. 885 if (Current + 1 == End) 886 break; 887 StringRef::iterator i = skip_ns_char(Current + 1); 888 if (Current + 1 != i) { 889 Current = i; 890 Column += 2; // Consume both the ':' and ns-char. 891 } else 892 break; 893 } else if (*Current == '#') { 894 // Check if the previous character was a ns-char. 895 // The & 0x80 check is to check for the trailing byte of a utf-8 896 if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { 897 ++Current; 898 ++Column; 899 } else 900 break; 901 } else { 902 StringRef::iterator i = skip_nb_char(Current); 903 if (i == Current) 904 break; 905 Current = i; 906 ++Column; 907 } 908 } 909 return StringRef(start, Current - start); 910} 911 912bool Scanner::consume(uint32_t Expected) { 913 if (Expected >= 0x80) 914 report_fatal_error("Not dealing with this yet"); 915 if (Current == End) 916 return false; 917 if (uint8_t(*Current) >= 0x80) 918 report_fatal_error("Not dealing with this yet"); 919 if (uint8_t(*Current) == Expected) { 920 ++Current; 921 ++Column; 922 return true; 923 } 924 return false; 925} 926 927void Scanner::skip(uint32_t Distance) { 928 Current += Distance; 929 Column += Distance; 930 assert(Current <= End && "Skipped past the end"); 931} 932 933bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 934 if (Position == End) 935 return false; 936 if ( *Position == ' ' || *Position == '\t' 937 || *Position == '\r' || *Position == '\n') 938 return true; 939 return false; 940} 941 942void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 943 , unsigned AtColumn 944 , bool IsRequired) { 945 if (IsSimpleKeyAllowed) { 946 SimpleKey SK; 947 SK.Tok = Tok; 948 SK.Line = Line; 949 SK.Column = AtColumn; 950 SK.IsRequired = IsRequired; 951 SK.FlowLevel = FlowLevel; 952 SimpleKeys.push_back(SK); 953 } 954} 955 956void Scanner::removeStaleSimpleKeyCandidates() { 957 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 958 i != SimpleKeys.end();) { 959 if (i->Line != Line || i->Column + 1024 < Column) { 960 if (i->IsRequired) 961 setError( "Could not find expected : for simple key" 962 , i->Tok->Range.begin()); 963 i = SimpleKeys.erase(i); 964 } else 965 ++i; 966 } 967} 968 969void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 970 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 971 SimpleKeys.pop_back(); 972} 973 974bool Scanner::unrollIndent(int ToColumn) { 975 Token T; 976 // Indentation is ignored in flow. 977 if (FlowLevel != 0) 978 return true; 979 980 while (Indent > ToColumn) { 981 T.Kind = Token::TK_BlockEnd; 982 T.Range = StringRef(Current, 1); 983 TokenQueue.push_back(T); 984 Indent = Indents.pop_back_val(); 985 } 986 987 return true; 988} 989 990bool Scanner::rollIndent( int ToColumn 991 , Token::TokenKind Kind 992 , TokenQueueT::iterator InsertPoint) { 993 if (FlowLevel) 994 return true; 995 if (Indent < ToColumn) { 996 Indents.push_back(Indent); 997 Indent = ToColumn; 998 999 Token T; 1000 T.Kind = Kind; 1001 T.Range = StringRef(Current, 0); 1002 TokenQueue.insert(InsertPoint, T); 1003 } 1004 return true; 1005} 1006 1007void Scanner::scanToNextToken() { 1008 while (true) { 1009 while (*Current == ' ' || *Current == '\t') { 1010 skip(1); 1011 } 1012 1013 // Skip comment. 1014 if (*Current == '#') { 1015 while (true) { 1016 // This may skip more than one byte, thus Column is only incremented 1017 // for code points. 1018 StringRef::iterator i = skip_nb_char(Current); 1019 if (i == Current) 1020 break; 1021 Current = i; 1022 ++Column; 1023 } 1024 } 1025 1026 // Skip EOL. 1027 StringRef::iterator i = skip_b_break(Current); 1028 if (i == Current) 1029 break; 1030 Current = i; 1031 ++Line; 1032 Column = 0; 1033 // New lines may start a simple key. 1034 if (!FlowLevel) 1035 IsSimpleKeyAllowed = true; 1036 } 1037} 1038 1039bool Scanner::scanStreamStart() { 1040 IsStartOfStream = false; 1041 1042 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1043 1044 Token T; 1045 T.Kind = Token::TK_StreamStart; 1046 T.Range = StringRef(Current, EI.second); 1047 TokenQueue.push_back(T); 1048 Current += EI.second; 1049 return true; 1050} 1051 1052bool Scanner::scanStreamEnd() { 1053 // Force an ending new line if one isn't present. 1054 if (Column != 0) { 1055 Column = 0; 1056 ++Line; 1057 } 1058 1059 unrollIndent(-1); 1060 SimpleKeys.clear(); 1061 IsSimpleKeyAllowed = false; 1062 1063 Token T; 1064 T.Kind = Token::TK_StreamEnd; 1065 T.Range = StringRef(Current, 0); 1066 TokenQueue.push_back(T); 1067 return true; 1068} 1069 1070bool Scanner::scanDirective() { 1071 // Reset the indentation level. 1072 unrollIndent(-1); 1073 SimpleKeys.clear(); 1074 IsSimpleKeyAllowed = false; 1075 1076 StringRef::iterator Start = Current; 1077 consume('%'); 1078 StringRef::iterator NameStart = Current; 1079 Current = skip_while(&Scanner::skip_ns_char, Current); 1080 StringRef Name(NameStart, Current - NameStart); 1081 Current = skip_while(&Scanner::skip_s_white, Current); 1082 1083 Token T; 1084 if (Name == "YAML") { 1085 Current = skip_while(&Scanner::skip_ns_char, Current); 1086 T.Kind = Token::TK_VersionDirective; 1087 T.Range = StringRef(Start, Current - Start); 1088 TokenQueue.push_back(T); 1089 return true; 1090 } else if(Name == "TAG") { 1091 Current = skip_while(&Scanner::skip_ns_char, Current); 1092 Current = skip_while(&Scanner::skip_s_white, Current); 1093 Current = skip_while(&Scanner::skip_ns_char, Current); 1094 T.Kind = Token::TK_TagDirective; 1095 T.Range = StringRef(Start, Current - Start); 1096 TokenQueue.push_back(T); 1097 return true; 1098 } 1099 return false; 1100} 1101 1102bool Scanner::scanDocumentIndicator(bool IsStart) { 1103 unrollIndent(-1); 1104 SimpleKeys.clear(); 1105 IsSimpleKeyAllowed = false; 1106 1107 Token T; 1108 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1109 T.Range = StringRef(Current, 3); 1110 skip(3); 1111 TokenQueue.push_back(T); 1112 return true; 1113} 1114 1115bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1116 Token T; 1117 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1118 : Token::TK_FlowMappingStart; 1119 T.Range = StringRef(Current, 1); 1120 skip(1); 1121 TokenQueue.push_back(T); 1122 1123 // [ and { may begin a simple key. 1124 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1125 1126 // And may also be followed by a simple key. 1127 IsSimpleKeyAllowed = true; 1128 ++FlowLevel; 1129 return true; 1130} 1131 1132bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1133 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1134 IsSimpleKeyAllowed = false; 1135 Token T; 1136 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1137 : Token::TK_FlowMappingEnd; 1138 T.Range = StringRef(Current, 1); 1139 skip(1); 1140 TokenQueue.push_back(T); 1141 if (FlowLevel) 1142 --FlowLevel; 1143 return true; 1144} 1145 1146bool Scanner::scanFlowEntry() { 1147 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1148 IsSimpleKeyAllowed = true; 1149 Token T; 1150 T.Kind = Token::TK_FlowEntry; 1151 T.Range = StringRef(Current, 1); 1152 skip(1); 1153 TokenQueue.push_back(T); 1154 return true; 1155} 1156 1157bool Scanner::scanBlockEntry() { 1158 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1159 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1160 IsSimpleKeyAllowed = true; 1161 Token T; 1162 T.Kind = Token::TK_BlockEntry; 1163 T.Range = StringRef(Current, 1); 1164 skip(1); 1165 TokenQueue.push_back(T); 1166 return true; 1167} 1168 1169bool Scanner::scanKey() { 1170 if (!FlowLevel) 1171 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1172 1173 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1174 IsSimpleKeyAllowed = !FlowLevel; 1175 1176 Token T; 1177 T.Kind = Token::TK_Key; 1178 T.Range = StringRef(Current, 1); 1179 skip(1); 1180 TokenQueue.push_back(T); 1181 return true; 1182} 1183 1184bool Scanner::scanValue() { 1185 // If the previous token could have been a simple key, insert the key token 1186 // into the token queue. 1187 if (!SimpleKeys.empty()) { 1188 SimpleKey SK = SimpleKeys.pop_back_val(); 1189 Token T; 1190 T.Kind = Token::TK_Key; 1191 T.Range = SK.Tok->Range; 1192 TokenQueueT::iterator i, e; 1193 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1194 if (i == SK.Tok) 1195 break; 1196 } 1197 assert(i != e && "SimpleKey not in token queue!"); 1198 i = TokenQueue.insert(i, T); 1199 1200 // We may also need to add a Block-Mapping-Start token. 1201 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1202 1203 IsSimpleKeyAllowed = false; 1204 } else { 1205 if (!FlowLevel) 1206 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1207 IsSimpleKeyAllowed = !FlowLevel; 1208 } 1209 1210 Token T; 1211 T.Kind = Token::TK_Value; 1212 T.Range = StringRef(Current, 1); 1213 skip(1); 1214 TokenQueue.push_back(T); 1215 return true; 1216} 1217 1218// Forbidding inlining improves performance by roughly 20%. 1219// FIXME: Remove once llvm optimizes this to the faster version without hints. 1220LLVM_ATTRIBUTE_NOINLINE static bool 1221wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1222 1223// Returns whether a character at 'Position' was escaped with a leading '\'. 1224// 'First' specifies the position of the first character in the string. 1225static bool wasEscaped(StringRef::iterator First, 1226 StringRef::iterator Position) { 1227 assert(Position - 1 >= First); 1228 StringRef::iterator I = Position - 1; 1229 // We calculate the number of consecutive '\'s before the current position 1230 // by iterating backwards through our string. 1231 while (I >= First && *I == '\\') --I; 1232 // (Position - 1 - I) now contains the number of '\'s before the current 1233 // position. If it is odd, the character at 'Position' was escaped. 1234 return (Position - 1 - I) % 2 == 1; 1235} 1236 1237bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1238 StringRef::iterator Start = Current; 1239 unsigned ColStart = Column; 1240 if (IsDoubleQuoted) { 1241 do { 1242 ++Current; 1243 while (Current != End && *Current != '"') 1244 ++Current; 1245 // Repeat until the previous character was not a '\' or was an escaped 1246 // backslash. 1247 } while ( Current != End 1248 && *(Current - 1) == '\\' 1249 && wasEscaped(Start + 1, Current)); 1250 } else { 1251 skip(1); 1252 while (true) { 1253 // Skip a ' followed by another '. 1254 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1255 skip(2); 1256 continue; 1257 } else if (*Current == '\'') 1258 break; 1259 StringRef::iterator i = skip_nb_char(Current); 1260 if (i == Current) { 1261 i = skip_b_break(Current); 1262 if (i == Current) 1263 break; 1264 Current = i; 1265 Column = 0; 1266 ++Line; 1267 } else { 1268 if (i == End) 1269 break; 1270 Current = i; 1271 ++Column; 1272 } 1273 } 1274 } 1275 1276 if (Current == End) { 1277 setError("Expected quote at end of scalar", Current); 1278 return false; 1279 } 1280 1281 skip(1); // Skip ending quote. 1282 Token T; 1283 T.Kind = Token::TK_Scalar; 1284 T.Range = StringRef(Start, Current - Start); 1285 TokenQueue.push_back(T); 1286 1287 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1288 1289 IsSimpleKeyAllowed = false; 1290 1291 return true; 1292} 1293 1294bool Scanner::scanPlainScalar() { 1295 StringRef::iterator Start = Current; 1296 unsigned ColStart = Column; 1297 unsigned LeadingBlanks = 0; 1298 assert(Indent >= -1 && "Indent must be >= -1 !"); 1299 unsigned indent = static_cast<unsigned>(Indent + 1); 1300 while (true) { 1301 if (*Current == '#') 1302 break; 1303 1304 while (!isBlankOrBreak(Current)) { 1305 if ( FlowLevel && *Current == ':' 1306 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1307 setError("Found unexpected ':' while scanning a plain scalar", Current); 1308 return false; 1309 } 1310 1311 // Check for the end of the plain scalar. 1312 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1313 || ( FlowLevel 1314 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1315 != StringRef::npos))) 1316 break; 1317 1318 StringRef::iterator i = skip_nb_char(Current); 1319 if (i == Current) 1320 break; 1321 Current = i; 1322 ++Column; 1323 } 1324 1325 // Are we at the end? 1326 if (!isBlankOrBreak(Current)) 1327 break; 1328 1329 // Eat blanks. 1330 StringRef::iterator Tmp = Current; 1331 while (isBlankOrBreak(Tmp)) { 1332 StringRef::iterator i = skip_s_white(Tmp); 1333 if (i != Tmp) { 1334 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1335 setError("Found invalid tab character in indentation", Tmp); 1336 return false; 1337 } 1338 Tmp = i; 1339 ++Column; 1340 } else { 1341 i = skip_b_break(Tmp); 1342 if (!LeadingBlanks) 1343 LeadingBlanks = 1; 1344 Tmp = i; 1345 Column = 0; 1346 ++Line; 1347 } 1348 } 1349 1350 if (!FlowLevel && Column < indent) 1351 break; 1352 1353 Current = Tmp; 1354 } 1355 if (Start == Current) { 1356 setError("Got empty plain scalar", Start); 1357 return false; 1358 } 1359 Token T; 1360 T.Kind = Token::TK_Scalar; 1361 T.Range = StringRef(Start, Current - Start); 1362 TokenQueue.push_back(T); 1363 1364 // Plain scalars can be simple keys. 1365 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1366 1367 IsSimpleKeyAllowed = false; 1368 1369 return true; 1370} 1371 1372bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1373 StringRef::iterator Start = Current; 1374 unsigned ColStart = Column; 1375 skip(1); 1376 while(true) { 1377 if ( *Current == '[' || *Current == ']' 1378 || *Current == '{' || *Current == '}' 1379 || *Current == ',' 1380 || *Current == ':') 1381 break; 1382 StringRef::iterator i = skip_ns_char(Current); 1383 if (i == Current) 1384 break; 1385 Current = i; 1386 ++Column; 1387 } 1388 1389 if (Start == Current) { 1390 setError("Got empty alias or anchor", Start); 1391 return false; 1392 } 1393 1394 Token T; 1395 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1396 T.Range = StringRef(Start, Current - Start); 1397 TokenQueue.push_back(T); 1398 1399 // Alias and anchors can be simple keys. 1400 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1401 1402 IsSimpleKeyAllowed = false; 1403 1404 return true; 1405} 1406 1407bool Scanner::scanBlockScalar(bool IsLiteral) { 1408 StringRef::iterator Start = Current; 1409 skip(1); // Eat | or > 1410 while(true) { 1411 StringRef::iterator i = skip_nb_char(Current); 1412 if (i == Current) { 1413 if (Column == 0) 1414 break; 1415 i = skip_b_break(Current); 1416 if (i != Current) { 1417 // We got a line break. 1418 Column = 0; 1419 ++Line; 1420 Current = i; 1421 continue; 1422 } else { 1423 // There was an error, which should already have been printed out. 1424 return false; 1425 } 1426 } 1427 Current = i; 1428 ++Column; 1429 } 1430 1431 if (Start == Current) { 1432 setError("Got empty block scalar", Start); 1433 return false; 1434 } 1435 1436 Token T; 1437 T.Kind = Token::TK_Scalar; 1438 T.Range = StringRef(Start, Current - Start); 1439 TokenQueue.push_back(T); 1440 return true; 1441} 1442 1443bool Scanner::scanTag() { 1444 StringRef::iterator Start = Current; 1445 unsigned ColStart = Column; 1446 skip(1); // Eat !. 1447 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1448 else if (*Current == '<') { 1449 skip(1); 1450 scan_ns_uri_char(); 1451 if (!consume('>')) 1452 return false; 1453 } else { 1454 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1455 Current = skip_while(&Scanner::skip_ns_char, Current); 1456 } 1457 1458 Token T; 1459 T.Kind = Token::TK_Tag; 1460 T.Range = StringRef(Start, Current - Start); 1461 TokenQueue.push_back(T); 1462 1463 // Tags can be simple keys. 1464 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1465 1466 IsSimpleKeyAllowed = false; 1467 1468 return true; 1469} 1470 1471bool Scanner::fetchMoreTokens() { 1472 if (IsStartOfStream) 1473 return scanStreamStart(); 1474 1475 scanToNextToken(); 1476 1477 if (Current == End) 1478 return scanStreamEnd(); 1479 1480 removeStaleSimpleKeyCandidates(); 1481 1482 unrollIndent(Column); 1483 1484 if (Column == 0 && *Current == '%') 1485 return scanDirective(); 1486 1487 if (Column == 0 && Current + 4 <= End 1488 && *Current == '-' 1489 && *(Current + 1) == '-' 1490 && *(Current + 2) == '-' 1491 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1492 return scanDocumentIndicator(true); 1493 1494 if (Column == 0 && Current + 4 <= End 1495 && *Current == '.' 1496 && *(Current + 1) == '.' 1497 && *(Current + 2) == '.' 1498 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1499 return scanDocumentIndicator(false); 1500 1501 if (*Current == '[') 1502 return scanFlowCollectionStart(true); 1503 1504 if (*Current == '{') 1505 return scanFlowCollectionStart(false); 1506 1507 if (*Current == ']') 1508 return scanFlowCollectionEnd(true); 1509 1510 if (*Current == '}') 1511 return scanFlowCollectionEnd(false); 1512 1513 if (*Current == ',') 1514 return scanFlowEntry(); 1515 1516 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1517 return scanBlockEntry(); 1518 1519 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1520 return scanKey(); 1521 1522 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1523 return scanValue(); 1524 1525 if (*Current == '*') 1526 return scanAliasOrAnchor(true); 1527 1528 if (*Current == '&') 1529 return scanAliasOrAnchor(false); 1530 1531 if (*Current == '!') 1532 return scanTag(); 1533 1534 if (*Current == '|' && !FlowLevel) 1535 return scanBlockScalar(true); 1536 1537 if (*Current == '>' && !FlowLevel) 1538 return scanBlockScalar(false); 1539 1540 if (*Current == '\'') 1541 return scanFlowScalar(false); 1542 1543 if (*Current == '"') 1544 return scanFlowScalar(true); 1545 1546 // Get a plain scalar. 1547 StringRef FirstChar(Current, 1); 1548 if (!(isBlankOrBreak(Current) 1549 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1550 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1551 || (!FlowLevel && (*Current == '?' || *Current == ':') 1552 && isBlankOrBreak(Current + 1)) 1553 || (!FlowLevel && *Current == ':' 1554 && Current + 2 < End 1555 && *(Current + 1) == ':' 1556 && !isBlankOrBreak(Current + 2))) 1557 return scanPlainScalar(); 1558 1559 setError("Unrecognized character while tokenizing."); 1560 return false; 1561} 1562 1563Stream::Stream(StringRef Input, SourceMgr &SM) 1564 : scanner(new Scanner(Input, SM)) 1565 , CurrentDoc(0) {} 1566 1567Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM) 1568 : scanner(new Scanner(InputBuffer, SM)) 1569 , CurrentDoc(0) {} 1570 1571Stream::~Stream() {} 1572 1573bool Stream::failed() { return scanner->failed(); } 1574 1575void Stream::printError(Node *N, const Twine &Msg) { 1576 SmallVector<SMRange, 1> Ranges; 1577 Ranges.push_back(N->getSourceRange()); 1578 scanner->printError( N->getSourceRange().Start 1579 , SourceMgr::DK_Error 1580 , Msg 1581 , Ranges); 1582} 1583 1584document_iterator Stream::begin() { 1585 if (CurrentDoc) 1586 report_fatal_error("Can only iterate over the stream once"); 1587 1588 // Skip Stream-Start. 1589 scanner->getNext(); 1590 1591 CurrentDoc.reset(new Document(*this)); 1592 return document_iterator(CurrentDoc); 1593} 1594 1595document_iterator Stream::end() { 1596 return document_iterator(); 1597} 1598 1599void Stream::skip() { 1600 for (document_iterator i = begin(), e = end(); i != e; ++i) 1601 i->skip(); 1602} 1603 1604Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A, StringRef T) 1605 : Doc(D) 1606 , TypeID(Type) 1607 , Anchor(A) 1608 , Tag(T) { 1609 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1610 SourceRange = SMRange(Start, Start); 1611} 1612 1613std::string Node::getVerbatimTag() const { 1614 StringRef Raw = getRawTag(); 1615 if (!Raw.empty() && Raw != "!") { 1616 std::string Ret; 1617 if (Raw.find_last_of('!') == 0) { 1618 Ret = Doc->getTagMap().find("!")->second; 1619 Ret += Raw.substr(1); 1620 return llvm_move(Ret); 1621 } else if (Raw.startswith("!!")) { 1622 Ret = Doc->getTagMap().find("!!")->second; 1623 Ret += Raw.substr(2); 1624 return llvm_move(Ret); 1625 } else { 1626 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1627 std::map<StringRef, StringRef>::const_iterator It = 1628 Doc->getTagMap().find(TagHandle); 1629 if (It != Doc->getTagMap().end()) 1630 Ret = It->second; 1631 else { 1632 Token T; 1633 T.Kind = Token::TK_Tag; 1634 T.Range = TagHandle; 1635 setError(Twine("Unknown tag handle ") + TagHandle, T); 1636 } 1637 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1638 return llvm_move(Ret); 1639 } 1640 } 1641 1642 switch (getType()) { 1643 case NK_Null: 1644 return "tag:yaml.org,2002:null"; 1645 case NK_Scalar: 1646 // TODO: Tag resolution. 1647 return "tag:yaml.org,2002:str"; 1648 case NK_Mapping: 1649 return "tag:yaml.org,2002:map"; 1650 case NK_Sequence: 1651 return "tag:yaml.org,2002:seq"; 1652 } 1653 1654 return ""; 1655} 1656 1657Token &Node::peekNext() { 1658 return Doc->peekNext(); 1659} 1660 1661Token Node::getNext() { 1662 return Doc->getNext(); 1663} 1664 1665Node *Node::parseBlockNode() { 1666 return Doc->parseBlockNode(); 1667} 1668 1669BumpPtrAllocator &Node::getAllocator() { 1670 return Doc->NodeAllocator; 1671} 1672 1673void Node::setError(const Twine &Msg, Token &Tok) const { 1674 Doc->setError(Msg, Tok); 1675} 1676 1677bool Node::failed() const { 1678 return Doc->failed(); 1679} 1680 1681 1682 1683StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1684 // TODO: Handle newlines properly. We need to remove leading whitespace. 1685 if (Value[0] == '"') { // Double quoted. 1686 // Pull off the leading and trailing "s. 1687 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1688 // Search for characters that would require unescaping the value. 1689 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1690 if (i != StringRef::npos) 1691 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1692 return UnquotedValue; 1693 } else if (Value[0] == '\'') { // Single quoted. 1694 // Pull off the leading and trailing 's. 1695 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1696 StringRef::size_type i = UnquotedValue.find('\''); 1697 if (i != StringRef::npos) { 1698 // We're going to need Storage. 1699 Storage.clear(); 1700 Storage.reserve(UnquotedValue.size()); 1701 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1702 StringRef Valid(UnquotedValue.begin(), i); 1703 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1704 Storage.push_back('\''); 1705 UnquotedValue = UnquotedValue.substr(i + 2); 1706 } 1707 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1708 return StringRef(Storage.begin(), Storage.size()); 1709 } 1710 return UnquotedValue; 1711 } 1712 // Plain or block. 1713 return Value.rtrim(" "); 1714} 1715 1716StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1717 , StringRef::size_type i 1718 , SmallVectorImpl<char> &Storage) 1719 const { 1720 // Use Storage to build proper value. 1721 Storage.clear(); 1722 Storage.reserve(UnquotedValue.size()); 1723 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1724 // Insert all previous chars into Storage. 1725 StringRef Valid(UnquotedValue.begin(), i); 1726 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1727 // Chop off inserted chars. 1728 UnquotedValue = UnquotedValue.substr(i); 1729 1730 assert(!UnquotedValue.empty() && "Can't be empty!"); 1731 1732 // Parse escape or line break. 1733 switch (UnquotedValue[0]) { 1734 case '\r': 1735 case '\n': 1736 Storage.push_back('\n'); 1737 if ( UnquotedValue.size() > 1 1738 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1739 UnquotedValue = UnquotedValue.substr(1); 1740 UnquotedValue = UnquotedValue.substr(1); 1741 break; 1742 default: 1743 if (UnquotedValue.size() == 1) 1744 // TODO: Report error. 1745 break; 1746 UnquotedValue = UnquotedValue.substr(1); 1747 switch (UnquotedValue[0]) { 1748 default: { 1749 Token T; 1750 T.Range = StringRef(UnquotedValue.begin(), 1); 1751 setError("Unrecognized escape code!", T); 1752 return ""; 1753 } 1754 case '\r': 1755 case '\n': 1756 // Remove the new line. 1757 if ( UnquotedValue.size() > 1 1758 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1759 UnquotedValue = UnquotedValue.substr(1); 1760 // If this was just a single byte newline, it will get skipped 1761 // below. 1762 break; 1763 case '0': 1764 Storage.push_back(0x00); 1765 break; 1766 case 'a': 1767 Storage.push_back(0x07); 1768 break; 1769 case 'b': 1770 Storage.push_back(0x08); 1771 break; 1772 case 't': 1773 case 0x09: 1774 Storage.push_back(0x09); 1775 break; 1776 case 'n': 1777 Storage.push_back(0x0A); 1778 break; 1779 case 'v': 1780 Storage.push_back(0x0B); 1781 break; 1782 case 'f': 1783 Storage.push_back(0x0C); 1784 break; 1785 case 'r': 1786 Storage.push_back(0x0D); 1787 break; 1788 case 'e': 1789 Storage.push_back(0x1B); 1790 break; 1791 case ' ': 1792 Storage.push_back(0x20); 1793 break; 1794 case '"': 1795 Storage.push_back(0x22); 1796 break; 1797 case '/': 1798 Storage.push_back(0x2F); 1799 break; 1800 case '\\': 1801 Storage.push_back(0x5C); 1802 break; 1803 case 'N': 1804 encodeUTF8(0x85, Storage); 1805 break; 1806 case '_': 1807 encodeUTF8(0xA0, Storage); 1808 break; 1809 case 'L': 1810 encodeUTF8(0x2028, Storage); 1811 break; 1812 case 'P': 1813 encodeUTF8(0x2029, Storage); 1814 break; 1815 case 'x': { 1816 if (UnquotedValue.size() < 3) 1817 // TODO: Report error. 1818 break; 1819 unsigned int UnicodeScalarValue; 1820 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1821 // TODO: Report error. 1822 UnicodeScalarValue = 0xFFFD; 1823 encodeUTF8(UnicodeScalarValue, Storage); 1824 UnquotedValue = UnquotedValue.substr(2); 1825 break; 1826 } 1827 case 'u': { 1828 if (UnquotedValue.size() < 5) 1829 // TODO: Report error. 1830 break; 1831 unsigned int UnicodeScalarValue; 1832 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1833 // TODO: Report error. 1834 UnicodeScalarValue = 0xFFFD; 1835 encodeUTF8(UnicodeScalarValue, Storage); 1836 UnquotedValue = UnquotedValue.substr(4); 1837 break; 1838 } 1839 case 'U': { 1840 if (UnquotedValue.size() < 9) 1841 // TODO: Report error. 1842 break; 1843 unsigned int UnicodeScalarValue; 1844 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1845 // TODO: Report error. 1846 UnicodeScalarValue = 0xFFFD; 1847 encodeUTF8(UnicodeScalarValue, Storage); 1848 UnquotedValue = UnquotedValue.substr(8); 1849 break; 1850 } 1851 } 1852 UnquotedValue = UnquotedValue.substr(1); 1853 } 1854 } 1855 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1856 return StringRef(Storage.begin(), Storage.size()); 1857} 1858 1859Node *KeyValueNode::getKey() { 1860 if (Key) 1861 return Key; 1862 // Handle implicit null keys. 1863 { 1864 Token &t = peekNext(); 1865 if ( t.Kind == Token::TK_BlockEnd 1866 || t.Kind == Token::TK_Value 1867 || t.Kind == Token::TK_Error) { 1868 return Key = new (getAllocator()) NullNode(Doc); 1869 } 1870 if (t.Kind == Token::TK_Key) 1871 getNext(); // skip TK_Key. 1872 } 1873 1874 // Handle explicit null keys. 1875 Token &t = peekNext(); 1876 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1877 return Key = new (getAllocator()) NullNode(Doc); 1878 } 1879 1880 // We've got a normal key. 1881 return Key = parseBlockNode(); 1882} 1883 1884Node *KeyValueNode::getValue() { 1885 if (Value) 1886 return Value; 1887 getKey()->skip(); 1888 if (failed()) 1889 return Value = new (getAllocator()) NullNode(Doc); 1890 1891 // Handle implicit null values. 1892 { 1893 Token &t = peekNext(); 1894 if ( t.Kind == Token::TK_BlockEnd 1895 || t.Kind == Token::TK_FlowMappingEnd 1896 || t.Kind == Token::TK_Key 1897 || t.Kind == Token::TK_FlowEntry 1898 || t.Kind == Token::TK_Error) { 1899 return Value = new (getAllocator()) NullNode(Doc); 1900 } 1901 1902 if (t.Kind != Token::TK_Value) { 1903 setError("Unexpected token in Key Value.", t); 1904 return Value = new (getAllocator()) NullNode(Doc); 1905 } 1906 getNext(); // skip TK_Value. 1907 } 1908 1909 // Handle explicit null values. 1910 Token &t = peekNext(); 1911 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1912 return Value = new (getAllocator()) NullNode(Doc); 1913 } 1914 1915 // We got a normal value. 1916 return Value = parseBlockNode(); 1917} 1918 1919void MappingNode::increment() { 1920 if (failed()) { 1921 IsAtEnd = true; 1922 CurrentEntry = 0; 1923 return; 1924 } 1925 if (CurrentEntry) { 1926 CurrentEntry->skip(); 1927 if (Type == MT_Inline) { 1928 IsAtEnd = true; 1929 CurrentEntry = 0; 1930 return; 1931 } 1932 } 1933 Token T = peekNext(); 1934 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1935 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1936 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1937 } else if (Type == MT_Block) { 1938 switch (T.Kind) { 1939 case Token::TK_BlockEnd: 1940 getNext(); 1941 IsAtEnd = true; 1942 CurrentEntry = 0; 1943 break; 1944 default: 1945 setError("Unexpected token. Expected Key or Block End", T); 1946 case Token::TK_Error: 1947 IsAtEnd = true; 1948 CurrentEntry = 0; 1949 } 1950 } else { 1951 switch (T.Kind) { 1952 case Token::TK_FlowEntry: 1953 // Eat the flow entry and recurse. 1954 getNext(); 1955 return increment(); 1956 case Token::TK_FlowMappingEnd: 1957 getNext(); 1958 case Token::TK_Error: 1959 // Set this to end iterator. 1960 IsAtEnd = true; 1961 CurrentEntry = 0; 1962 break; 1963 default: 1964 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 1965 "Mapping End." 1966 , T); 1967 IsAtEnd = true; 1968 CurrentEntry = 0; 1969 } 1970 } 1971} 1972 1973void SequenceNode::increment() { 1974 if (failed()) { 1975 IsAtEnd = true; 1976 CurrentEntry = 0; 1977 return; 1978 } 1979 if (CurrentEntry) 1980 CurrentEntry->skip(); 1981 Token T = peekNext(); 1982 if (SeqType == ST_Block) { 1983 switch (T.Kind) { 1984 case Token::TK_BlockEntry: 1985 getNext(); 1986 CurrentEntry = parseBlockNode(); 1987 if (CurrentEntry == 0) { // An error occurred. 1988 IsAtEnd = true; 1989 CurrentEntry = 0; 1990 } 1991 break; 1992 case Token::TK_BlockEnd: 1993 getNext(); 1994 IsAtEnd = true; 1995 CurrentEntry = 0; 1996 break; 1997 default: 1998 setError( "Unexpected token. Expected Block Entry or Block End." 1999 , T); 2000 case Token::TK_Error: 2001 IsAtEnd = true; 2002 CurrentEntry = 0; 2003 } 2004 } else if (SeqType == ST_Indentless) { 2005 switch (T.Kind) { 2006 case Token::TK_BlockEntry: 2007 getNext(); 2008 CurrentEntry = parseBlockNode(); 2009 if (CurrentEntry == 0) { // An error occurred. 2010 IsAtEnd = true; 2011 CurrentEntry = 0; 2012 } 2013 break; 2014 default: 2015 case Token::TK_Error: 2016 IsAtEnd = true; 2017 CurrentEntry = 0; 2018 } 2019 } else if (SeqType == ST_Flow) { 2020 switch (T.Kind) { 2021 case Token::TK_FlowEntry: 2022 // Eat the flow entry and recurse. 2023 getNext(); 2024 WasPreviousTokenFlowEntry = true; 2025 return increment(); 2026 case Token::TK_FlowSequenceEnd: 2027 getNext(); 2028 case Token::TK_Error: 2029 // Set this to end iterator. 2030 IsAtEnd = true; 2031 CurrentEntry = 0; 2032 break; 2033 case Token::TK_StreamEnd: 2034 case Token::TK_DocumentEnd: 2035 case Token::TK_DocumentStart: 2036 setError("Could not find closing ]!", T); 2037 // Set this to end iterator. 2038 IsAtEnd = true; 2039 CurrentEntry = 0; 2040 break; 2041 default: 2042 if (!WasPreviousTokenFlowEntry) { 2043 setError("Expected , between entries!", T); 2044 IsAtEnd = true; 2045 CurrentEntry = 0; 2046 break; 2047 } 2048 // Otherwise it must be a flow entry. 2049 CurrentEntry = parseBlockNode(); 2050 if (!CurrentEntry) { 2051 IsAtEnd = true; 2052 } 2053 WasPreviousTokenFlowEntry = false; 2054 break; 2055 } 2056 } 2057} 2058 2059Document::Document(Stream &S) : stream(S), Root(0) { 2060 // Tag maps starts with two default mappings. 2061 TagMap["!"] = "!"; 2062 TagMap["!!"] = "tag:yaml.org,2002:"; 2063 2064 if (parseDirectives()) 2065 expectToken(Token::TK_DocumentStart); 2066 Token &T = peekNext(); 2067 if (T.Kind == Token::TK_DocumentStart) 2068 getNext(); 2069} 2070 2071bool Document::skip() { 2072 if (stream.scanner->failed()) 2073 return false; 2074 if (!Root) 2075 getRoot(); 2076 Root->skip(); 2077 Token &T = peekNext(); 2078 if (T.Kind == Token::TK_StreamEnd) 2079 return false; 2080 if (T.Kind == Token::TK_DocumentEnd) { 2081 getNext(); 2082 return skip(); 2083 } 2084 return true; 2085} 2086 2087Token &Document::peekNext() { 2088 return stream.scanner->peekNext(); 2089} 2090 2091Token Document::getNext() { 2092 return stream.scanner->getNext(); 2093} 2094 2095void Document::setError(const Twine &Message, Token &Location) const { 2096 stream.scanner->setError(Message, Location.Range.begin()); 2097} 2098 2099bool Document::failed() const { 2100 return stream.scanner->failed(); 2101} 2102 2103Node *Document::parseBlockNode() { 2104 Token T = peekNext(); 2105 // Handle properties. 2106 Token AnchorInfo; 2107 Token TagInfo; 2108parse_property: 2109 switch (T.Kind) { 2110 case Token::TK_Alias: 2111 getNext(); 2112 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2113 case Token::TK_Anchor: 2114 if (AnchorInfo.Kind == Token::TK_Anchor) { 2115 setError("Already encountered an anchor for this node!", T); 2116 return 0; 2117 } 2118 AnchorInfo = getNext(); // Consume TK_Anchor. 2119 T = peekNext(); 2120 goto parse_property; 2121 case Token::TK_Tag: 2122 if (TagInfo.Kind == Token::TK_Tag) { 2123 setError("Already encountered a tag for this node!", T); 2124 return 0; 2125 } 2126 TagInfo = getNext(); // Consume TK_Tag. 2127 T = peekNext(); 2128 goto parse_property; 2129 default: 2130 break; 2131 } 2132 2133 switch (T.Kind) { 2134 case Token::TK_BlockEntry: 2135 // We got an unindented BlockEntry sequence. This is not terminated with 2136 // a BlockEnd. 2137 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2138 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2139 , AnchorInfo.Range.substr(1) 2140 , TagInfo.Range 2141 , SequenceNode::ST_Indentless); 2142 case Token::TK_BlockSequenceStart: 2143 getNext(); 2144 return new (NodeAllocator) 2145 SequenceNode( stream.CurrentDoc 2146 , AnchorInfo.Range.substr(1) 2147 , TagInfo.Range 2148 , SequenceNode::ST_Block); 2149 case Token::TK_BlockMappingStart: 2150 getNext(); 2151 return new (NodeAllocator) 2152 MappingNode( stream.CurrentDoc 2153 , AnchorInfo.Range.substr(1) 2154 , TagInfo.Range 2155 , MappingNode::MT_Block); 2156 case Token::TK_FlowSequenceStart: 2157 getNext(); 2158 return new (NodeAllocator) 2159 SequenceNode( stream.CurrentDoc 2160 , AnchorInfo.Range.substr(1) 2161 , TagInfo.Range 2162 , SequenceNode::ST_Flow); 2163 case Token::TK_FlowMappingStart: 2164 getNext(); 2165 return new (NodeAllocator) 2166 MappingNode( stream.CurrentDoc 2167 , AnchorInfo.Range.substr(1) 2168 , TagInfo.Range 2169 , MappingNode::MT_Flow); 2170 case Token::TK_Scalar: 2171 getNext(); 2172 return new (NodeAllocator) 2173 ScalarNode( stream.CurrentDoc 2174 , AnchorInfo.Range.substr(1) 2175 , TagInfo.Range 2176 , T.Range); 2177 case Token::TK_Key: 2178 // Don't eat the TK_Key, KeyValueNode expects it. 2179 return new (NodeAllocator) 2180 MappingNode( stream.CurrentDoc 2181 , AnchorInfo.Range.substr(1) 2182 , TagInfo.Range 2183 , MappingNode::MT_Inline); 2184 case Token::TK_DocumentStart: 2185 case Token::TK_DocumentEnd: 2186 case Token::TK_StreamEnd: 2187 default: 2188 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2189 // !!null null. 2190 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2191 case Token::TK_Error: 2192 return 0; 2193 } 2194 llvm_unreachable("Control flow shouldn't reach here."); 2195 return 0; 2196} 2197 2198bool Document::parseDirectives() { 2199 bool isDirective = false; 2200 while (true) { 2201 Token T = peekNext(); 2202 if (T.Kind == Token::TK_TagDirective) { 2203 parseTAGDirective(); 2204 isDirective = true; 2205 } else if (T.Kind == Token::TK_VersionDirective) { 2206 parseYAMLDirective(); 2207 isDirective = true; 2208 } else 2209 break; 2210 } 2211 return isDirective; 2212} 2213 2214void Document::parseYAMLDirective() { 2215 getNext(); // Eat %YAML <version> 2216} 2217 2218void Document::parseTAGDirective() { 2219 Token Tag = getNext(); // %TAG <handle> <prefix> 2220 StringRef T = Tag.Range; 2221 // Strip %TAG 2222 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2223 std::size_t HandleEnd = T.find_first_of(" \t"); 2224 StringRef TagHandle = T.substr(0, HandleEnd); 2225 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2226 TagMap[TagHandle] = TagPrefix; 2227} 2228 2229bool Document::expectToken(int TK) { 2230 Token T = getNext(); 2231 if (T.Kind != TK) { 2232 setError("Unexpected token", T); 2233 return false; 2234 } 2235 return true; 2236} 2237