YAMLParser.cpp revision 263508
1//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This file implements a YAML parser.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Support/YAMLParser.h"
15#include "llvm/ADT/SmallVector.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/Twine.h"
18#include "llvm/ADT/ilist.h"
19#include "llvm/ADT/ilist_node.h"
20#include "llvm/Support/ErrorHandling.h"
21#include "llvm/Support/MemoryBuffer.h"
22#include "llvm/Support/SourceMgr.h"
23#include "llvm/Support/raw_ostream.h"
24
25using namespace llvm;
26using namespace yaml;
27
28enum UnicodeEncodingForm {
29  UEF_UTF32_LE, ///< UTF-32 Little Endian
30  UEF_UTF32_BE, ///< UTF-32 Big Endian
31  UEF_UTF16_LE, ///< UTF-16 Little Endian
32  UEF_UTF16_BE, ///< UTF-16 Big Endian
33  UEF_UTF8,     ///< UTF-8 or ascii.
34  UEF_Unknown   ///< Not a valid Unicode encoding.
35};
36
37/// EncodingInfo - Holds the encoding type and length of the byte order mark if
38///                it exists. Length is in {0, 2, 3, 4}.
39typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
40
41/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
42///                      encoding form of \a Input.
43///
44/// @param Input A string of length 0 or more.
45/// @returns An EncodingInfo indicating the Unicode encoding form of the input
46///          and how long the byte order mark is if one exists.
47static EncodingInfo getUnicodeEncoding(StringRef Input) {
48  if (Input.size() == 0)
49    return std::make_pair(UEF_Unknown, 0);
50
51  switch (uint8_t(Input[0])) {
52  case 0x00:
53    if (Input.size() >= 4) {
54      if (  Input[1] == 0
55         && uint8_t(Input[2]) == 0xFE
56         && uint8_t(Input[3]) == 0xFF)
57        return std::make_pair(UEF_UTF32_BE, 4);
58      if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
59        return std::make_pair(UEF_UTF32_BE, 0);
60    }
61
62    if (Input.size() >= 2 && Input[1] != 0)
63      return std::make_pair(UEF_UTF16_BE, 0);
64    return std::make_pair(UEF_Unknown, 0);
65  case 0xFF:
66    if (  Input.size() >= 4
67       && uint8_t(Input[1]) == 0xFE
68       && Input[2] == 0
69       && Input[3] == 0)
70      return std::make_pair(UEF_UTF32_LE, 4);
71
72    if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
73      return std::make_pair(UEF_UTF16_LE, 2);
74    return std::make_pair(UEF_Unknown, 0);
75  case 0xFE:
76    if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
77      return std::make_pair(UEF_UTF16_BE, 2);
78    return std::make_pair(UEF_Unknown, 0);
79  case 0xEF:
80    if (  Input.size() >= 3
81       && uint8_t(Input[1]) == 0xBB
82       && uint8_t(Input[2]) == 0xBF)
83      return std::make_pair(UEF_UTF8, 3);
84    return std::make_pair(UEF_Unknown, 0);
85  }
86
87  // It could still be utf-32 or utf-16.
88  if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
89    return std::make_pair(UEF_UTF32_LE, 0);
90
91  if (Input.size() >= 2 && Input[1] == 0)
92    return std::make_pair(UEF_UTF16_LE, 0);
93
94  return std::make_pair(UEF_UTF8, 0);
95}
96
97namespace llvm {
98namespace yaml {
99/// Pin the vtables to this file.
100void Node::anchor() {}
101void NullNode::anchor() {}
102void ScalarNode::anchor() {}
103void KeyValueNode::anchor() {}
104void MappingNode::anchor() {}
105void SequenceNode::anchor() {}
106void AliasNode::anchor() {}
107
108/// Token - A single YAML token.
109struct Token : ilist_node<Token> {
110  enum TokenKind {
111    TK_Error, // Uninitialized token.
112    TK_StreamStart,
113    TK_StreamEnd,
114    TK_VersionDirective,
115    TK_TagDirective,
116    TK_DocumentStart,
117    TK_DocumentEnd,
118    TK_BlockEntry,
119    TK_BlockEnd,
120    TK_BlockSequenceStart,
121    TK_BlockMappingStart,
122    TK_FlowEntry,
123    TK_FlowSequenceStart,
124    TK_FlowSequenceEnd,
125    TK_FlowMappingStart,
126    TK_FlowMappingEnd,
127    TK_Key,
128    TK_Value,
129    TK_Scalar,
130    TK_Alias,
131    TK_Anchor,
132    TK_Tag
133  } Kind;
134
135  /// A string of length 0 or more whose begin() points to the logical location
136  /// of the token in the input.
137  StringRef Range;
138
139  Token() : Kind(TK_Error) {}
140};
141}
142}
143
144namespace llvm {
145template<>
146struct ilist_sentinel_traits<Token> {
147  Token *createSentinel() const {
148    return &Sentinel;
149  }
150  static void destroySentinel(Token*) {}
151
152  Token *provideInitialHead() const { return createSentinel(); }
153  Token *ensureHead(Token*) const { return createSentinel(); }
154  static void noteHead(Token*, Token*) {}
155
156private:
157  mutable Token Sentinel;
158};
159
160template<>
161struct ilist_node_traits<Token> {
162  Token *createNode(const Token &V) {
163    return new (Alloc.Allocate<Token>()) Token(V);
164  }
165  static void deleteNode(Token *V) {}
166
167  void addNodeToList(Token *) {}
168  void removeNodeFromList(Token *) {}
169  void transferNodesFromList(ilist_node_traits &    /*SrcTraits*/,
170                             ilist_iterator<Token> /*first*/,
171                             ilist_iterator<Token> /*last*/) {}
172
173  BumpPtrAllocator Alloc;
174};
175}
176
177typedef ilist<Token> TokenQueueT;
178
179namespace {
180/// @brief This struct is used to track simple keys.
181///
182/// Simple keys are handled by creating an entry in SimpleKeys for each Token
183/// which could legally be the start of a simple key. When peekNext is called,
184/// if the Token To be returned is referenced by a SimpleKey, we continue
185/// tokenizing until that potential simple key has either been found to not be
186/// a simple key (we moved on to the next line or went further than 1024 chars).
187/// Or when we run into a Value, and then insert a Key token (and possibly
188/// others) before the SimpleKey's Tok.
189struct SimpleKey {
190  TokenQueueT::iterator Tok;
191  unsigned Column;
192  unsigned Line;
193  unsigned FlowLevel;
194  bool IsRequired;
195
196  bool operator ==(const SimpleKey &Other) {
197    return Tok == Other.Tok;
198  }
199};
200}
201
202/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
203///        subsequence and the subsequence's length in code units (uint8_t).
204///        A length of 0 represents an error.
205typedef std::pair<uint32_t, unsigned> UTF8Decoded;
206
207static UTF8Decoded decodeUTF8(StringRef Range) {
208  StringRef::iterator Position= Range.begin();
209  StringRef::iterator End = Range.end();
210  // 1 byte: [0x00, 0x7f]
211  // Bit pattern: 0xxxxxxx
212  if ((*Position & 0x80) == 0) {
213     return std::make_pair(*Position, 1);
214  }
215  // 2 bytes: [0x80, 0x7ff]
216  // Bit pattern: 110xxxxx 10xxxxxx
217  if (Position + 1 != End &&
218      ((*Position & 0xE0) == 0xC0) &&
219      ((*(Position + 1) & 0xC0) == 0x80)) {
220    uint32_t codepoint = ((*Position & 0x1F) << 6) |
221                          (*(Position + 1) & 0x3F);
222    if (codepoint >= 0x80)
223      return std::make_pair(codepoint, 2);
224  }
225  // 3 bytes: [0x8000, 0xffff]
226  // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
227  if (Position + 2 != End &&
228      ((*Position & 0xF0) == 0xE0) &&
229      ((*(Position + 1) & 0xC0) == 0x80) &&
230      ((*(Position + 2) & 0xC0) == 0x80)) {
231    uint32_t codepoint = ((*Position & 0x0F) << 12) |
232                         ((*(Position + 1) & 0x3F) << 6) |
233                          (*(Position + 2) & 0x3F);
234    // Codepoints between 0xD800 and 0xDFFF are invalid, as
235    // they are high / low surrogate halves used by UTF-16.
236    if (codepoint >= 0x800 &&
237        (codepoint < 0xD800 || codepoint > 0xDFFF))
238      return std::make_pair(codepoint, 3);
239  }
240  // 4 bytes: [0x10000, 0x10FFFF]
241  // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
242  if (Position + 3 != End &&
243      ((*Position & 0xF8) == 0xF0) &&
244      ((*(Position + 1) & 0xC0) == 0x80) &&
245      ((*(Position + 2) & 0xC0) == 0x80) &&
246      ((*(Position + 3) & 0xC0) == 0x80)) {
247    uint32_t codepoint = ((*Position & 0x07) << 18) |
248                         ((*(Position + 1) & 0x3F) << 12) |
249                         ((*(Position + 2) & 0x3F) << 6) |
250                          (*(Position + 3) & 0x3F);
251    if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
252      return std::make_pair(codepoint, 4);
253  }
254  return std::make_pair(0, 0);
255}
256
257namespace llvm {
258namespace yaml {
259/// @brief Scans YAML tokens from a MemoryBuffer.
260class Scanner {
261public:
262  Scanner(const StringRef Input, SourceMgr &SM);
263  Scanner(MemoryBuffer *Buffer, SourceMgr &SM_);
264
265  /// @brief Parse the next token and return it without popping it.
266  Token &peekNext();
267
268  /// @brief Parse the next token and pop it from the queue.
269  Token getNext();
270
271  void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
272                  ArrayRef<SMRange> Ranges = None) {
273    SM.PrintMessage(Loc, Kind, Message, Ranges);
274  }
275
276  void setError(const Twine &Message, StringRef::iterator Position) {
277    if (Current >= End)
278      Current = End - 1;
279
280    // Don't print out more errors after the first one we encounter. The rest
281    // are just the result of the first, and have no meaning.
282    if (!Failed)
283      printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
284    Failed = true;
285  }
286
287  void setError(const Twine &Message) {
288    setError(Message, Current);
289  }
290
291  /// @brief Returns true if an error occurred while parsing.
292  bool failed() {
293    return Failed;
294  }
295
296private:
297  StringRef currentInput() {
298    return StringRef(Current, End - Current);
299  }
300
301  /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
302  ///        at \a Position.
303  ///
304  /// If the UTF-8 code units starting at Position do not form a well-formed
305  /// code unit subsequence, then the Unicode scalar value is 0, and the length
306  /// is 0.
307  UTF8Decoded decodeUTF8(StringRef::iterator Position) {
308    return ::decodeUTF8(StringRef(Position, End - Position));
309  }
310
311  // The following functions are based on the gramar rules in the YAML spec. The
312  // style of the function names it meant to closely match how they are written
313  // in the spec. The number within the [] is the number of the grammar rule in
314  // the spec.
315  //
316  // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
317  //
318  // c-
319  //   A production starting and ending with a special character.
320  // b-
321  //   A production matching a single line break.
322  // nb-
323  //   A production starting and ending with a non-break character.
324  // s-
325  //   A production starting and ending with a white space character.
326  // ns-
327  //   A production starting and ending with a non-space character.
328  // l-
329  //   A production matching complete line(s).
330
331  /// @brief Skip a single nb-char[27] starting at Position.
332  ///
333  /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
334  ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
335  ///
336  /// @returns The code unit after the nb-char, or Position if it's not an
337  ///          nb-char.
338  StringRef::iterator skip_nb_char(StringRef::iterator Position);
339
340  /// @brief Skip a single b-break[28] starting at Position.
341  ///
342  /// A b-break is 0xD 0xA | 0xD | 0xA
343  ///
344  /// @returns The code unit after the b-break, or Position if it's not a
345  ///          b-break.
346  StringRef::iterator skip_b_break(StringRef::iterator Position);
347
348  /// @brief Skip a single s-white[33] starting at Position.
349  ///
350  /// A s-white is 0x20 | 0x9
351  ///
352  /// @returns The code unit after the s-white, or Position if it's not a
353  ///          s-white.
354  StringRef::iterator skip_s_white(StringRef::iterator Position);
355
356  /// @brief Skip a single ns-char[34] starting at Position.
357  ///
358  /// A ns-char is nb-char - s-white
359  ///
360  /// @returns The code unit after the ns-char, or Position if it's not a
361  ///          ns-char.
362  StringRef::iterator skip_ns_char(StringRef::iterator Position);
363
364  typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
365  /// @brief Skip minimal well-formed code unit subsequences until Func
366  ///        returns its input.
367  ///
368  /// @returns The code unit after the last minimal well-formed code unit
369  ///          subsequence that Func accepted.
370  StringRef::iterator skip_while( SkipWhileFunc Func
371                                , StringRef::iterator Position);
372
373  /// @brief Scan ns-uri-char[39]s starting at Cur.
374  ///
375  /// This updates Cur and Column while scanning.
376  ///
377  /// @returns A StringRef starting at Cur which covers the longest contiguous
378  ///          sequence of ns-uri-char.
379  StringRef scan_ns_uri_char();
380
381  /// @brief Scan ns-plain-one-line[133] starting at \a Cur.
382  StringRef scan_ns_plain_one_line();
383
384  /// @brief Consume a minimal well-formed code unit subsequence starting at
385  ///        \a Cur. Return false if it is not the same Unicode scalar value as
386  ///        \a Expected. This updates \a Column.
387  bool consume(uint32_t Expected);
388
389  /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
390  void skip(uint32_t Distance);
391
392  /// @brief Return true if the minimal well-formed code unit subsequence at
393  ///        Pos is whitespace or a new line
394  bool isBlankOrBreak(StringRef::iterator Position);
395
396  /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
397  void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
398                             , unsigned AtColumn
399                             , bool IsRequired);
400
401  /// @brief Remove simple keys that can no longer be valid simple keys.
402  ///
403  /// Invalid simple keys are not on the current line or are further than 1024
404  /// columns back.
405  void removeStaleSimpleKeyCandidates();
406
407  /// @brief Remove all simple keys on FlowLevel \a Level.
408  void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
409
410  /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
411  ///        tokens if needed.
412  bool unrollIndent(int ToColumn);
413
414  /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
415  ///        if needed.
416  bool rollIndent( int ToColumn
417                 , Token::TokenKind Kind
418                 , TokenQueueT::iterator InsertPoint);
419
420  /// @brief Skip whitespace and comments until the start of the next token.
421  void scanToNextToken();
422
423  /// @brief Must be the first token generated.
424  bool scanStreamStart();
425
426  /// @brief Generate tokens needed to close out the stream.
427  bool scanStreamEnd();
428
429  /// @brief Scan a %BLAH directive.
430  bool scanDirective();
431
432  /// @brief Scan a ... or ---.
433  bool scanDocumentIndicator(bool IsStart);
434
435  /// @brief Scan a [ or { and generate the proper flow collection start token.
436  bool scanFlowCollectionStart(bool IsSequence);
437
438  /// @brief Scan a ] or } and generate the proper flow collection end token.
439  bool scanFlowCollectionEnd(bool IsSequence);
440
441  /// @brief Scan the , that separates entries in a flow collection.
442  bool scanFlowEntry();
443
444  /// @brief Scan the - that starts block sequence entries.
445  bool scanBlockEntry();
446
447  /// @brief Scan an explicit ? indicating a key.
448  bool scanKey();
449
450  /// @brief Scan an explicit : indicating a value.
451  bool scanValue();
452
453  /// @brief Scan a quoted scalar.
454  bool scanFlowScalar(bool IsDoubleQuoted);
455
456  /// @brief Scan an unquoted scalar.
457  bool scanPlainScalar();
458
459  /// @brief Scan an Alias or Anchor starting with * or &.
460  bool scanAliasOrAnchor(bool IsAlias);
461
462  /// @brief Scan a block scalar starting with | or >.
463  bool scanBlockScalar(bool IsLiteral);
464
465  /// @brief Scan a tag of the form !stuff.
466  bool scanTag();
467
468  /// @brief Dispatch to the next scanning function based on \a *Cur.
469  bool fetchMoreTokens();
470
471  /// @brief The SourceMgr used for diagnostics and buffer management.
472  SourceMgr &SM;
473
474  /// @brief The original input.
475  MemoryBuffer *InputBuffer;
476
477  /// @brief The current position of the scanner.
478  StringRef::iterator Current;
479
480  /// @brief The end of the input (one past the last character).
481  StringRef::iterator End;
482
483  /// @brief Current YAML indentation level in spaces.
484  int Indent;
485
486  /// @brief Current column number in Unicode code points.
487  unsigned Column;
488
489  /// @brief Current line number.
490  unsigned Line;
491
492  /// @brief How deep we are in flow style containers. 0 Means at block level.
493  unsigned FlowLevel;
494
495  /// @brief Are we at the start of the stream?
496  bool IsStartOfStream;
497
498  /// @brief Can the next token be the start of a simple key?
499  bool IsSimpleKeyAllowed;
500
501  /// @brief True if an error has occurred.
502  bool Failed;
503
504  /// @brief Queue of tokens. This is required to queue up tokens while looking
505  ///        for the end of a simple key. And for cases where a single character
506  ///        can produce multiple tokens (e.g. BlockEnd).
507  TokenQueueT TokenQueue;
508
509  /// @brief Indentation levels.
510  SmallVector<int, 4> Indents;
511
512  /// @brief Potential simple keys.
513  SmallVector<SimpleKey, 4> SimpleKeys;
514};
515
516} // end namespace yaml
517} // end namespace llvm
518
519/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
520static void encodeUTF8( uint32_t UnicodeScalarValue
521                      , SmallVectorImpl<char> &Result) {
522  if (UnicodeScalarValue <= 0x7F) {
523    Result.push_back(UnicodeScalarValue & 0x7F);
524  } else if (UnicodeScalarValue <= 0x7FF) {
525    uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
526    uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
527    Result.push_back(FirstByte);
528    Result.push_back(SecondByte);
529  } else if (UnicodeScalarValue <= 0xFFFF) {
530    uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
531    uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
532    uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
533    Result.push_back(FirstByte);
534    Result.push_back(SecondByte);
535    Result.push_back(ThirdByte);
536  } else if (UnicodeScalarValue <= 0x10FFFF) {
537    uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
538    uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
539    uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
540    uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
541    Result.push_back(FirstByte);
542    Result.push_back(SecondByte);
543    Result.push_back(ThirdByte);
544    Result.push_back(FourthByte);
545  }
546}
547
548bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
549  SourceMgr SM;
550  Scanner scanner(Input, SM);
551  while (true) {
552    Token T = scanner.getNext();
553    switch (T.Kind) {
554    case Token::TK_StreamStart:
555      OS << "Stream-Start: ";
556      break;
557    case Token::TK_StreamEnd:
558      OS << "Stream-End: ";
559      break;
560    case Token::TK_VersionDirective:
561      OS << "Version-Directive: ";
562      break;
563    case Token::TK_TagDirective:
564      OS << "Tag-Directive: ";
565      break;
566    case Token::TK_DocumentStart:
567      OS << "Document-Start: ";
568      break;
569    case Token::TK_DocumentEnd:
570      OS << "Document-End: ";
571      break;
572    case Token::TK_BlockEntry:
573      OS << "Block-Entry: ";
574      break;
575    case Token::TK_BlockEnd:
576      OS << "Block-End: ";
577      break;
578    case Token::TK_BlockSequenceStart:
579      OS << "Block-Sequence-Start: ";
580      break;
581    case Token::TK_BlockMappingStart:
582      OS << "Block-Mapping-Start: ";
583      break;
584    case Token::TK_FlowEntry:
585      OS << "Flow-Entry: ";
586      break;
587    case Token::TK_FlowSequenceStart:
588      OS << "Flow-Sequence-Start: ";
589      break;
590    case Token::TK_FlowSequenceEnd:
591      OS << "Flow-Sequence-End: ";
592      break;
593    case Token::TK_FlowMappingStart:
594      OS << "Flow-Mapping-Start: ";
595      break;
596    case Token::TK_FlowMappingEnd:
597      OS << "Flow-Mapping-End: ";
598      break;
599    case Token::TK_Key:
600      OS << "Key: ";
601      break;
602    case Token::TK_Value:
603      OS << "Value: ";
604      break;
605    case Token::TK_Scalar:
606      OS << "Scalar: ";
607      break;
608    case Token::TK_Alias:
609      OS << "Alias: ";
610      break;
611    case Token::TK_Anchor:
612      OS << "Anchor: ";
613      break;
614    case Token::TK_Tag:
615      OS << "Tag: ";
616      break;
617    case Token::TK_Error:
618      break;
619    }
620    OS << T.Range << "\n";
621    if (T.Kind == Token::TK_StreamEnd)
622      break;
623    else if (T.Kind == Token::TK_Error)
624      return false;
625  }
626  return true;
627}
628
629bool yaml::scanTokens(StringRef Input) {
630  llvm::SourceMgr SM;
631  llvm::yaml::Scanner scanner(Input, SM);
632  for (;;) {
633    llvm::yaml::Token T = scanner.getNext();
634    if (T.Kind == Token::TK_StreamEnd)
635      break;
636    else if (T.Kind == Token::TK_Error)
637      return false;
638  }
639  return true;
640}
641
642std::string yaml::escape(StringRef Input) {
643  std::string EscapedInput;
644  for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
645    if (*i == '\\')
646      EscapedInput += "\\\\";
647    else if (*i == '"')
648      EscapedInput += "\\\"";
649    else if (*i == 0)
650      EscapedInput += "\\0";
651    else if (*i == 0x07)
652      EscapedInput += "\\a";
653    else if (*i == 0x08)
654      EscapedInput += "\\b";
655    else if (*i == 0x09)
656      EscapedInput += "\\t";
657    else if (*i == 0x0A)
658      EscapedInput += "\\n";
659    else if (*i == 0x0B)
660      EscapedInput += "\\v";
661    else if (*i == 0x0C)
662      EscapedInput += "\\f";
663    else if (*i == 0x0D)
664      EscapedInput += "\\r";
665    else if (*i == 0x1B)
666      EscapedInput += "\\e";
667    else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
668      std::string HexStr = utohexstr(*i);
669      EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
670    } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
671      UTF8Decoded UnicodeScalarValue
672        = decodeUTF8(StringRef(i, Input.end() - i));
673      if (UnicodeScalarValue.second == 0) {
674        // Found invalid char.
675        SmallString<4> Val;
676        encodeUTF8(0xFFFD, Val);
677        EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
678        // FIXME: Error reporting.
679        return EscapedInput;
680      }
681      if (UnicodeScalarValue.first == 0x85)
682        EscapedInput += "\\N";
683      else if (UnicodeScalarValue.first == 0xA0)
684        EscapedInput += "\\_";
685      else if (UnicodeScalarValue.first == 0x2028)
686        EscapedInput += "\\L";
687      else if (UnicodeScalarValue.first == 0x2029)
688        EscapedInput += "\\P";
689      else {
690        std::string HexStr = utohexstr(UnicodeScalarValue.first);
691        if (HexStr.size() <= 2)
692          EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
693        else if (HexStr.size() <= 4)
694          EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
695        else if (HexStr.size() <= 8)
696          EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
697      }
698      i += UnicodeScalarValue.second - 1;
699    } else
700      EscapedInput.push_back(*i);
701  }
702  return EscapedInput;
703}
704
705Scanner::Scanner(StringRef Input, SourceMgr &sm)
706  : SM(sm)
707  , Indent(-1)
708  , Column(0)
709  , Line(0)
710  , FlowLevel(0)
711  , IsStartOfStream(true)
712  , IsSimpleKeyAllowed(true)
713  , Failed(false) {
714  InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML");
715  SM.AddNewSourceBuffer(InputBuffer, SMLoc());
716  Current = InputBuffer->getBufferStart();
717  End = InputBuffer->getBufferEnd();
718}
719
720Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_)
721  : SM(SM_)
722  , InputBuffer(Buffer)
723  , Current(InputBuffer->getBufferStart())
724  , End(InputBuffer->getBufferEnd())
725  , Indent(-1)
726  , Column(0)
727  , Line(0)
728  , FlowLevel(0)
729  , IsStartOfStream(true)
730  , IsSimpleKeyAllowed(true)
731  , Failed(false) {
732    SM.AddNewSourceBuffer(InputBuffer, SMLoc());
733}
734
735Token &Scanner::peekNext() {
736  // If the current token is a possible simple key, keep parsing until we
737  // can confirm.
738  bool NeedMore = false;
739  while (true) {
740    if (TokenQueue.empty() || NeedMore) {
741      if (!fetchMoreTokens()) {
742        TokenQueue.clear();
743        TokenQueue.push_back(Token());
744        return TokenQueue.front();
745      }
746    }
747    assert(!TokenQueue.empty() &&
748            "fetchMoreTokens lied about getting tokens!");
749
750    removeStaleSimpleKeyCandidates();
751    SimpleKey SK;
752    SK.Tok = TokenQueue.front();
753    if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK)
754        == SimpleKeys.end())
755      break;
756    else
757      NeedMore = true;
758  }
759  return TokenQueue.front();
760}
761
762Token Scanner::getNext() {
763  Token Ret = peekNext();
764  // TokenQueue can be empty if there was an error getting the next token.
765  if (!TokenQueue.empty())
766    TokenQueue.pop_front();
767
768  // There cannot be any referenced Token's if the TokenQueue is empty. So do a
769  // quick deallocation of them all.
770  if (TokenQueue.empty()) {
771    TokenQueue.Alloc.Reset();
772  }
773
774  return Ret;
775}
776
777StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
778  if (Position == End)
779    return Position;
780  // Check 7 bit c-printable - b-char.
781  if (   *Position == 0x09
782      || (*Position >= 0x20 && *Position <= 0x7E))
783    return Position + 1;
784
785  // Check for valid UTF-8.
786  if (uint8_t(*Position) & 0x80) {
787    UTF8Decoded u8d = decodeUTF8(Position);
788    if (   u8d.second != 0
789        && u8d.first != 0xFEFF
790        && ( u8d.first == 0x85
791          || ( u8d.first >= 0xA0
792            && u8d.first <= 0xD7FF)
793          || ( u8d.first >= 0xE000
794            && u8d.first <= 0xFFFD)
795          || ( u8d.first >= 0x10000
796            && u8d.first <= 0x10FFFF)))
797      return Position + u8d.second;
798  }
799  return Position;
800}
801
802StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
803  if (Position == End)
804    return Position;
805  if (*Position == 0x0D) {
806    if (Position + 1 != End && *(Position + 1) == 0x0A)
807      return Position + 2;
808    return Position + 1;
809  }
810
811  if (*Position == 0x0A)
812    return Position + 1;
813  return Position;
814}
815
816
817StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
818  if (Position == End)
819    return Position;
820  if (*Position == ' ' || *Position == '\t')
821    return Position + 1;
822  return Position;
823}
824
825StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
826  if (Position == End)
827    return Position;
828  if (*Position == ' ' || *Position == '\t')
829    return Position;
830  return skip_nb_char(Position);
831}
832
833StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
834                                       , StringRef::iterator Position) {
835  while (true) {
836    StringRef::iterator i = (this->*Func)(Position);
837    if (i == Position)
838      break;
839    Position = i;
840  }
841  return Position;
842}
843
844static bool is_ns_hex_digit(const char C) {
845  return    (C >= '0' && C <= '9')
846         || (C >= 'a' && C <= 'z')
847         || (C >= 'A' && C <= 'Z');
848}
849
850static bool is_ns_word_char(const char C) {
851  return    C == '-'
852         || (C >= 'a' && C <= 'z')
853         || (C >= 'A' && C <= 'Z');
854}
855
856StringRef Scanner::scan_ns_uri_char() {
857  StringRef::iterator Start = Current;
858  while (true) {
859    if (Current == End)
860      break;
861    if ((   *Current == '%'
862          && Current + 2 < End
863          && is_ns_hex_digit(*(Current + 1))
864          && is_ns_hex_digit(*(Current + 2)))
865        || is_ns_word_char(*Current)
866        || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
867          != StringRef::npos) {
868      ++Current;
869      ++Column;
870    } else
871      break;
872  }
873  return StringRef(Start, Current - Start);
874}
875
876StringRef Scanner::scan_ns_plain_one_line() {
877  StringRef::iterator start = Current;
878  // The first character must already be verified.
879  ++Current;
880  while (true) {
881    if (Current == End) {
882      break;
883    } else if (*Current == ':') {
884      // Check if the next character is a ns-char.
885      if (Current + 1 == End)
886        break;
887      StringRef::iterator i = skip_ns_char(Current + 1);
888      if (Current + 1 != i) {
889        Current = i;
890        Column += 2; // Consume both the ':' and ns-char.
891      } else
892        break;
893    } else if (*Current == '#') {
894      // Check if the previous character was a ns-char.
895      // The & 0x80 check is to check for the trailing byte of a utf-8
896      if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) {
897        ++Current;
898        ++Column;
899      } else
900        break;
901    } else {
902      StringRef::iterator i = skip_nb_char(Current);
903      if (i == Current)
904        break;
905      Current = i;
906      ++Column;
907    }
908  }
909  return StringRef(start, Current - start);
910}
911
912bool Scanner::consume(uint32_t Expected) {
913  if (Expected >= 0x80)
914    report_fatal_error("Not dealing with this yet");
915  if (Current == End)
916    return false;
917  if (uint8_t(*Current) >= 0x80)
918    report_fatal_error("Not dealing with this yet");
919  if (uint8_t(*Current) == Expected) {
920    ++Current;
921    ++Column;
922    return true;
923  }
924  return false;
925}
926
927void Scanner::skip(uint32_t Distance) {
928  Current += Distance;
929  Column += Distance;
930  assert(Current <= End && "Skipped past the end");
931}
932
933bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
934  if (Position == End)
935    return false;
936  if (   *Position == ' ' || *Position == '\t'
937      || *Position == '\r' || *Position == '\n')
938    return true;
939  return false;
940}
941
942void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
943                                    , unsigned AtColumn
944                                    , bool IsRequired) {
945  if (IsSimpleKeyAllowed) {
946    SimpleKey SK;
947    SK.Tok = Tok;
948    SK.Line = Line;
949    SK.Column = AtColumn;
950    SK.IsRequired = IsRequired;
951    SK.FlowLevel = FlowLevel;
952    SimpleKeys.push_back(SK);
953  }
954}
955
956void Scanner::removeStaleSimpleKeyCandidates() {
957  for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
958                                            i != SimpleKeys.end();) {
959    if (i->Line != Line || i->Column + 1024 < Column) {
960      if (i->IsRequired)
961        setError( "Could not find expected : for simple key"
962                , i->Tok->Range.begin());
963      i = SimpleKeys.erase(i);
964    } else
965      ++i;
966  }
967}
968
969void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
970  if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
971    SimpleKeys.pop_back();
972}
973
974bool Scanner::unrollIndent(int ToColumn) {
975  Token T;
976  // Indentation is ignored in flow.
977  if (FlowLevel != 0)
978    return true;
979
980  while (Indent > ToColumn) {
981    T.Kind = Token::TK_BlockEnd;
982    T.Range = StringRef(Current, 1);
983    TokenQueue.push_back(T);
984    Indent = Indents.pop_back_val();
985  }
986
987  return true;
988}
989
990bool Scanner::rollIndent( int ToColumn
991                        , Token::TokenKind Kind
992                        , TokenQueueT::iterator InsertPoint) {
993  if (FlowLevel)
994    return true;
995  if (Indent < ToColumn) {
996    Indents.push_back(Indent);
997    Indent = ToColumn;
998
999    Token T;
1000    T.Kind = Kind;
1001    T.Range = StringRef(Current, 0);
1002    TokenQueue.insert(InsertPoint, T);
1003  }
1004  return true;
1005}
1006
1007void Scanner::scanToNextToken() {
1008  while (true) {
1009    while (*Current == ' ' || *Current == '\t') {
1010      skip(1);
1011    }
1012
1013    // Skip comment.
1014    if (*Current == '#') {
1015      while (true) {
1016        // This may skip more than one byte, thus Column is only incremented
1017        // for code points.
1018        StringRef::iterator i = skip_nb_char(Current);
1019        if (i == Current)
1020          break;
1021        Current = i;
1022        ++Column;
1023      }
1024    }
1025
1026    // Skip EOL.
1027    StringRef::iterator i = skip_b_break(Current);
1028    if (i == Current)
1029      break;
1030    Current = i;
1031    ++Line;
1032    Column = 0;
1033    // New lines may start a simple key.
1034    if (!FlowLevel)
1035      IsSimpleKeyAllowed = true;
1036  }
1037}
1038
1039bool Scanner::scanStreamStart() {
1040  IsStartOfStream = false;
1041
1042  EncodingInfo EI = getUnicodeEncoding(currentInput());
1043
1044  Token T;
1045  T.Kind = Token::TK_StreamStart;
1046  T.Range = StringRef(Current, EI.second);
1047  TokenQueue.push_back(T);
1048  Current += EI.second;
1049  return true;
1050}
1051
1052bool Scanner::scanStreamEnd() {
1053  // Force an ending new line if one isn't present.
1054  if (Column != 0) {
1055    Column = 0;
1056    ++Line;
1057  }
1058
1059  unrollIndent(-1);
1060  SimpleKeys.clear();
1061  IsSimpleKeyAllowed = false;
1062
1063  Token T;
1064  T.Kind = Token::TK_StreamEnd;
1065  T.Range = StringRef(Current, 0);
1066  TokenQueue.push_back(T);
1067  return true;
1068}
1069
1070bool Scanner::scanDirective() {
1071  // Reset the indentation level.
1072  unrollIndent(-1);
1073  SimpleKeys.clear();
1074  IsSimpleKeyAllowed = false;
1075
1076  StringRef::iterator Start = Current;
1077  consume('%');
1078  StringRef::iterator NameStart = Current;
1079  Current = skip_while(&Scanner::skip_ns_char, Current);
1080  StringRef Name(NameStart, Current - NameStart);
1081  Current = skip_while(&Scanner::skip_s_white, Current);
1082
1083  Token T;
1084  if (Name == "YAML") {
1085    Current = skip_while(&Scanner::skip_ns_char, Current);
1086    T.Kind = Token::TK_VersionDirective;
1087    T.Range = StringRef(Start, Current - Start);
1088    TokenQueue.push_back(T);
1089    return true;
1090  } else if(Name == "TAG") {
1091    Current = skip_while(&Scanner::skip_ns_char, Current);
1092    Current = skip_while(&Scanner::skip_s_white, Current);
1093    Current = skip_while(&Scanner::skip_ns_char, Current);
1094    T.Kind = Token::TK_TagDirective;
1095    T.Range = StringRef(Start, Current - Start);
1096    TokenQueue.push_back(T);
1097    return true;
1098  }
1099  return false;
1100}
1101
1102bool Scanner::scanDocumentIndicator(bool IsStart) {
1103  unrollIndent(-1);
1104  SimpleKeys.clear();
1105  IsSimpleKeyAllowed = false;
1106
1107  Token T;
1108  T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
1109  T.Range = StringRef(Current, 3);
1110  skip(3);
1111  TokenQueue.push_back(T);
1112  return true;
1113}
1114
1115bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1116  Token T;
1117  T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1118                      : Token::TK_FlowMappingStart;
1119  T.Range = StringRef(Current, 1);
1120  skip(1);
1121  TokenQueue.push_back(T);
1122
1123  // [ and { may begin a simple key.
1124  saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false);
1125
1126  // And may also be followed by a simple key.
1127  IsSimpleKeyAllowed = true;
1128  ++FlowLevel;
1129  return true;
1130}
1131
1132bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1133  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1134  IsSimpleKeyAllowed = false;
1135  Token T;
1136  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1137                      : Token::TK_FlowMappingEnd;
1138  T.Range = StringRef(Current, 1);
1139  skip(1);
1140  TokenQueue.push_back(T);
1141  if (FlowLevel)
1142    --FlowLevel;
1143  return true;
1144}
1145
1146bool Scanner::scanFlowEntry() {
1147  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1148  IsSimpleKeyAllowed = true;
1149  Token T;
1150  T.Kind = Token::TK_FlowEntry;
1151  T.Range = StringRef(Current, 1);
1152  skip(1);
1153  TokenQueue.push_back(T);
1154  return true;
1155}
1156
1157bool Scanner::scanBlockEntry() {
1158  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1159  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1160  IsSimpleKeyAllowed = true;
1161  Token T;
1162  T.Kind = Token::TK_BlockEntry;
1163  T.Range = StringRef(Current, 1);
1164  skip(1);
1165  TokenQueue.push_back(T);
1166  return true;
1167}
1168
1169bool Scanner::scanKey() {
1170  if (!FlowLevel)
1171    rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1172
1173  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1174  IsSimpleKeyAllowed = !FlowLevel;
1175
1176  Token T;
1177  T.Kind = Token::TK_Key;
1178  T.Range = StringRef(Current, 1);
1179  skip(1);
1180  TokenQueue.push_back(T);
1181  return true;
1182}
1183
1184bool Scanner::scanValue() {
1185  // If the previous token could have been a simple key, insert the key token
1186  // into the token queue.
1187  if (!SimpleKeys.empty()) {
1188    SimpleKey SK = SimpleKeys.pop_back_val();
1189    Token T;
1190    T.Kind = Token::TK_Key;
1191    T.Range = SK.Tok->Range;
1192    TokenQueueT::iterator i, e;
1193    for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1194      if (i == SK.Tok)
1195        break;
1196    }
1197    assert(i != e && "SimpleKey not in token queue!");
1198    i = TokenQueue.insert(i, T);
1199
1200    // We may also need to add a Block-Mapping-Start token.
1201    rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1202
1203    IsSimpleKeyAllowed = false;
1204  } else {
1205    if (!FlowLevel)
1206      rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1207    IsSimpleKeyAllowed = !FlowLevel;
1208  }
1209
1210  Token T;
1211  T.Kind = Token::TK_Value;
1212  T.Range = StringRef(Current, 1);
1213  skip(1);
1214  TokenQueue.push_back(T);
1215  return true;
1216}
1217
1218// Forbidding inlining improves performance by roughly 20%.
1219// FIXME: Remove once llvm optimizes this to the faster version without hints.
1220LLVM_ATTRIBUTE_NOINLINE static bool
1221wasEscaped(StringRef::iterator First, StringRef::iterator Position);
1222
1223// Returns whether a character at 'Position' was escaped with a leading '\'.
1224// 'First' specifies the position of the first character in the string.
1225static bool wasEscaped(StringRef::iterator First,
1226                       StringRef::iterator Position) {
1227  assert(Position - 1 >= First);
1228  StringRef::iterator I = Position - 1;
1229  // We calculate the number of consecutive '\'s before the current position
1230  // by iterating backwards through our string.
1231  while (I >= First && *I == '\\') --I;
1232  // (Position - 1 - I) now contains the number of '\'s before the current
1233  // position. If it is odd, the character at 'Position' was escaped.
1234  return (Position - 1 - I) % 2 == 1;
1235}
1236
1237bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1238  StringRef::iterator Start = Current;
1239  unsigned ColStart = Column;
1240  if (IsDoubleQuoted) {
1241    do {
1242      ++Current;
1243      while (Current != End && *Current != '"')
1244        ++Current;
1245      // Repeat until the previous character was not a '\' or was an escaped
1246      // backslash.
1247    } while (   Current != End
1248             && *(Current - 1) == '\\'
1249             && wasEscaped(Start + 1, Current));
1250  } else {
1251    skip(1);
1252    while (true) {
1253      // Skip a ' followed by another '.
1254      if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1255        skip(2);
1256        continue;
1257      } else if (*Current == '\'')
1258        break;
1259      StringRef::iterator i = skip_nb_char(Current);
1260      if (i == Current) {
1261        i = skip_b_break(Current);
1262        if (i == Current)
1263          break;
1264        Current = i;
1265        Column = 0;
1266        ++Line;
1267      } else {
1268        if (i == End)
1269          break;
1270        Current = i;
1271        ++Column;
1272      }
1273    }
1274  }
1275
1276  if (Current == End) {
1277    setError("Expected quote at end of scalar", Current);
1278    return false;
1279  }
1280
1281  skip(1); // Skip ending quote.
1282  Token T;
1283  T.Kind = Token::TK_Scalar;
1284  T.Range = StringRef(Start, Current - Start);
1285  TokenQueue.push_back(T);
1286
1287  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1288
1289  IsSimpleKeyAllowed = false;
1290
1291  return true;
1292}
1293
1294bool Scanner::scanPlainScalar() {
1295  StringRef::iterator Start = Current;
1296  unsigned ColStart = Column;
1297  unsigned LeadingBlanks = 0;
1298  assert(Indent >= -1 && "Indent must be >= -1 !");
1299  unsigned indent = static_cast<unsigned>(Indent + 1);
1300  while (true) {
1301    if (*Current == '#')
1302      break;
1303
1304    while (!isBlankOrBreak(Current)) {
1305      if (  FlowLevel && *Current == ':'
1306          && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
1307        setError("Found unexpected ':' while scanning a plain scalar", Current);
1308        return false;
1309      }
1310
1311      // Check for the end of the plain scalar.
1312      if (  (*Current == ':' && isBlankOrBreak(Current + 1))
1313          || (  FlowLevel
1314          && (StringRef(Current, 1).find_first_of(",:?[]{}")
1315              != StringRef::npos)))
1316        break;
1317
1318      StringRef::iterator i = skip_nb_char(Current);
1319      if (i == Current)
1320        break;
1321      Current = i;
1322      ++Column;
1323    }
1324
1325    // Are we at the end?
1326    if (!isBlankOrBreak(Current))
1327      break;
1328
1329    // Eat blanks.
1330    StringRef::iterator Tmp = Current;
1331    while (isBlankOrBreak(Tmp)) {
1332      StringRef::iterator i = skip_s_white(Tmp);
1333      if (i != Tmp) {
1334        if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1335          setError("Found invalid tab character in indentation", Tmp);
1336          return false;
1337        }
1338        Tmp = i;
1339        ++Column;
1340      } else {
1341        i = skip_b_break(Tmp);
1342        if (!LeadingBlanks)
1343          LeadingBlanks = 1;
1344        Tmp = i;
1345        Column = 0;
1346        ++Line;
1347      }
1348    }
1349
1350    if (!FlowLevel && Column < indent)
1351      break;
1352
1353    Current = Tmp;
1354  }
1355  if (Start == Current) {
1356    setError("Got empty plain scalar", Start);
1357    return false;
1358  }
1359  Token T;
1360  T.Kind = Token::TK_Scalar;
1361  T.Range = StringRef(Start, Current - Start);
1362  TokenQueue.push_back(T);
1363
1364  // Plain scalars can be simple keys.
1365  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1366
1367  IsSimpleKeyAllowed = false;
1368
1369  return true;
1370}
1371
1372bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1373  StringRef::iterator Start = Current;
1374  unsigned ColStart = Column;
1375  skip(1);
1376  while(true) {
1377    if (   *Current == '[' || *Current == ']'
1378        || *Current == '{' || *Current == '}'
1379        || *Current == ','
1380        || *Current == ':')
1381      break;
1382    StringRef::iterator i = skip_ns_char(Current);
1383    if (i == Current)
1384      break;
1385    Current = i;
1386    ++Column;
1387  }
1388
1389  if (Start == Current) {
1390    setError("Got empty alias or anchor", Start);
1391    return false;
1392  }
1393
1394  Token T;
1395  T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1396  T.Range = StringRef(Start, Current - Start);
1397  TokenQueue.push_back(T);
1398
1399  // Alias and anchors can be simple keys.
1400  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1401
1402  IsSimpleKeyAllowed = false;
1403
1404  return true;
1405}
1406
1407bool Scanner::scanBlockScalar(bool IsLiteral) {
1408  StringRef::iterator Start = Current;
1409  skip(1); // Eat | or >
1410  while(true) {
1411    StringRef::iterator i = skip_nb_char(Current);
1412    if (i == Current) {
1413      if (Column == 0)
1414        break;
1415      i = skip_b_break(Current);
1416      if (i != Current) {
1417        // We got a line break.
1418        Column = 0;
1419        ++Line;
1420        Current = i;
1421        continue;
1422      } else {
1423        // There was an error, which should already have been printed out.
1424        return false;
1425      }
1426    }
1427    Current = i;
1428    ++Column;
1429  }
1430
1431  if (Start == Current) {
1432    setError("Got empty block scalar", Start);
1433    return false;
1434  }
1435
1436  Token T;
1437  T.Kind = Token::TK_Scalar;
1438  T.Range = StringRef(Start, Current - Start);
1439  TokenQueue.push_back(T);
1440  return true;
1441}
1442
1443bool Scanner::scanTag() {
1444  StringRef::iterator Start = Current;
1445  unsigned ColStart = Column;
1446  skip(1); // Eat !.
1447  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1448  else if (*Current == '<') {
1449    skip(1);
1450    scan_ns_uri_char();
1451    if (!consume('>'))
1452      return false;
1453  } else {
1454    // FIXME: Actually parse the c-ns-shorthand-tag rule.
1455    Current = skip_while(&Scanner::skip_ns_char, Current);
1456  }
1457
1458  Token T;
1459  T.Kind = Token::TK_Tag;
1460  T.Range = StringRef(Start, Current - Start);
1461  TokenQueue.push_back(T);
1462
1463  // Tags can be simple keys.
1464  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1465
1466  IsSimpleKeyAllowed = false;
1467
1468  return true;
1469}
1470
1471bool Scanner::fetchMoreTokens() {
1472  if (IsStartOfStream)
1473    return scanStreamStart();
1474
1475  scanToNextToken();
1476
1477  if (Current == End)
1478    return scanStreamEnd();
1479
1480  removeStaleSimpleKeyCandidates();
1481
1482  unrollIndent(Column);
1483
1484  if (Column == 0 && *Current == '%')
1485    return scanDirective();
1486
1487  if (Column == 0 && Current + 4 <= End
1488      && *Current == '-'
1489      && *(Current + 1) == '-'
1490      && *(Current + 2) == '-'
1491      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1492    return scanDocumentIndicator(true);
1493
1494  if (Column == 0 && Current + 4 <= End
1495      && *Current == '.'
1496      && *(Current + 1) == '.'
1497      && *(Current + 2) == '.'
1498      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1499    return scanDocumentIndicator(false);
1500
1501  if (*Current == '[')
1502    return scanFlowCollectionStart(true);
1503
1504  if (*Current == '{')
1505    return scanFlowCollectionStart(false);
1506
1507  if (*Current == ']')
1508    return scanFlowCollectionEnd(true);
1509
1510  if (*Current == '}')
1511    return scanFlowCollectionEnd(false);
1512
1513  if (*Current == ',')
1514    return scanFlowEntry();
1515
1516  if (*Current == '-' && isBlankOrBreak(Current + 1))
1517    return scanBlockEntry();
1518
1519  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
1520    return scanKey();
1521
1522  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
1523    return scanValue();
1524
1525  if (*Current == '*')
1526    return scanAliasOrAnchor(true);
1527
1528  if (*Current == '&')
1529    return scanAliasOrAnchor(false);
1530
1531  if (*Current == '!')
1532    return scanTag();
1533
1534  if (*Current == '|' && !FlowLevel)
1535    return scanBlockScalar(true);
1536
1537  if (*Current == '>' && !FlowLevel)
1538    return scanBlockScalar(false);
1539
1540  if (*Current == '\'')
1541    return scanFlowScalar(false);
1542
1543  if (*Current == '"')
1544    return scanFlowScalar(true);
1545
1546  // Get a plain scalar.
1547  StringRef FirstChar(Current, 1);
1548  if (!(isBlankOrBreak(Current)
1549        || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
1550      || (*Current == '-' && !isBlankOrBreak(Current + 1))
1551      || (!FlowLevel && (*Current == '?' || *Current == ':')
1552          && isBlankOrBreak(Current + 1))
1553      || (!FlowLevel && *Current == ':'
1554                      && Current + 2 < End
1555                      && *(Current + 1) == ':'
1556                      && !isBlankOrBreak(Current + 2)))
1557    return scanPlainScalar();
1558
1559  setError("Unrecognized character while tokenizing.");
1560  return false;
1561}
1562
1563Stream::Stream(StringRef Input, SourceMgr &SM)
1564  : scanner(new Scanner(Input, SM))
1565  , CurrentDoc(0) {}
1566
1567Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM)
1568  : scanner(new Scanner(InputBuffer, SM))
1569  , CurrentDoc(0) {}
1570
1571Stream::~Stream() {}
1572
1573bool Stream::failed() { return scanner->failed(); }
1574
1575void Stream::printError(Node *N, const Twine &Msg) {
1576  SmallVector<SMRange, 1> Ranges;
1577  Ranges.push_back(N->getSourceRange());
1578  scanner->printError( N->getSourceRange().Start
1579                     , SourceMgr::DK_Error
1580                     , Msg
1581                     , Ranges);
1582}
1583
1584document_iterator Stream::begin() {
1585  if (CurrentDoc)
1586    report_fatal_error("Can only iterate over the stream once");
1587
1588  // Skip Stream-Start.
1589  scanner->getNext();
1590
1591  CurrentDoc.reset(new Document(*this));
1592  return document_iterator(CurrentDoc);
1593}
1594
1595document_iterator Stream::end() {
1596  return document_iterator();
1597}
1598
1599void Stream::skip() {
1600  for (document_iterator i = begin(), e = end(); i != e; ++i)
1601    i->skip();
1602}
1603
1604Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A, StringRef T)
1605  : Doc(D)
1606  , TypeID(Type)
1607  , Anchor(A)
1608  , Tag(T) {
1609  SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1610  SourceRange = SMRange(Start, Start);
1611}
1612
1613std::string Node::getVerbatimTag() const {
1614  StringRef Raw = getRawTag();
1615  if (!Raw.empty() && Raw != "!") {
1616    std::string Ret;
1617    if (Raw.find_last_of('!') == 0) {
1618      Ret = Doc->getTagMap().find("!")->second;
1619      Ret += Raw.substr(1);
1620      return llvm_move(Ret);
1621    } else if (Raw.startswith("!!")) {
1622      Ret = Doc->getTagMap().find("!!")->second;
1623      Ret += Raw.substr(2);
1624      return llvm_move(Ret);
1625    } else {
1626      StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1627      std::map<StringRef, StringRef>::const_iterator It =
1628          Doc->getTagMap().find(TagHandle);
1629      if (It != Doc->getTagMap().end())
1630        Ret = It->second;
1631      else {
1632        Token T;
1633        T.Kind = Token::TK_Tag;
1634        T.Range = TagHandle;
1635        setError(Twine("Unknown tag handle ") + TagHandle, T);
1636      }
1637      Ret += Raw.substr(Raw.find_last_of('!') + 1);
1638      return llvm_move(Ret);
1639    }
1640  }
1641
1642  switch (getType()) {
1643  case NK_Null:
1644    return "tag:yaml.org,2002:null";
1645  case NK_Scalar:
1646    // TODO: Tag resolution.
1647    return "tag:yaml.org,2002:str";
1648  case NK_Mapping:
1649    return "tag:yaml.org,2002:map";
1650  case NK_Sequence:
1651    return "tag:yaml.org,2002:seq";
1652  }
1653
1654  return "";
1655}
1656
1657Token &Node::peekNext() {
1658  return Doc->peekNext();
1659}
1660
1661Token Node::getNext() {
1662  return Doc->getNext();
1663}
1664
1665Node *Node::parseBlockNode() {
1666  return Doc->parseBlockNode();
1667}
1668
1669BumpPtrAllocator &Node::getAllocator() {
1670  return Doc->NodeAllocator;
1671}
1672
1673void Node::setError(const Twine &Msg, Token &Tok) const {
1674  Doc->setError(Msg, Tok);
1675}
1676
1677bool Node::failed() const {
1678  return Doc->failed();
1679}
1680
1681
1682
1683StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
1684  // TODO: Handle newlines properly. We need to remove leading whitespace.
1685  if (Value[0] == '"') { // Double quoted.
1686    // Pull off the leading and trailing "s.
1687    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1688    // Search for characters that would require unescaping the value.
1689    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
1690    if (i != StringRef::npos)
1691      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
1692    return UnquotedValue;
1693  } else if (Value[0] == '\'') { // Single quoted.
1694    // Pull off the leading and trailing 's.
1695    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1696    StringRef::size_type i = UnquotedValue.find('\'');
1697    if (i != StringRef::npos) {
1698      // We're going to need Storage.
1699      Storage.clear();
1700      Storage.reserve(UnquotedValue.size());
1701      for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
1702        StringRef Valid(UnquotedValue.begin(), i);
1703        Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1704        Storage.push_back('\'');
1705        UnquotedValue = UnquotedValue.substr(i + 2);
1706      }
1707      Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1708      return StringRef(Storage.begin(), Storage.size());
1709    }
1710    return UnquotedValue;
1711  }
1712  // Plain or block.
1713  return Value.rtrim(" ");
1714}
1715
1716StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
1717                                          , StringRef::size_type i
1718                                          , SmallVectorImpl<char> &Storage)
1719                                          const {
1720  // Use Storage to build proper value.
1721  Storage.clear();
1722  Storage.reserve(UnquotedValue.size());
1723  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
1724    // Insert all previous chars into Storage.
1725    StringRef Valid(UnquotedValue.begin(), i);
1726    Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1727    // Chop off inserted chars.
1728    UnquotedValue = UnquotedValue.substr(i);
1729
1730    assert(!UnquotedValue.empty() && "Can't be empty!");
1731
1732    // Parse escape or line break.
1733    switch (UnquotedValue[0]) {
1734    case '\r':
1735    case '\n':
1736      Storage.push_back('\n');
1737      if (   UnquotedValue.size() > 1
1738          && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1739        UnquotedValue = UnquotedValue.substr(1);
1740      UnquotedValue = UnquotedValue.substr(1);
1741      break;
1742    default:
1743      if (UnquotedValue.size() == 1)
1744        // TODO: Report error.
1745        break;
1746      UnquotedValue = UnquotedValue.substr(1);
1747      switch (UnquotedValue[0]) {
1748      default: {
1749          Token T;
1750          T.Range = StringRef(UnquotedValue.begin(), 1);
1751          setError("Unrecognized escape code!", T);
1752          return "";
1753        }
1754      case '\r':
1755      case '\n':
1756        // Remove the new line.
1757        if (   UnquotedValue.size() > 1
1758            && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1759          UnquotedValue = UnquotedValue.substr(1);
1760        // If this was just a single byte newline, it will get skipped
1761        // below.
1762        break;
1763      case '0':
1764        Storage.push_back(0x00);
1765        break;
1766      case 'a':
1767        Storage.push_back(0x07);
1768        break;
1769      case 'b':
1770        Storage.push_back(0x08);
1771        break;
1772      case 't':
1773      case 0x09:
1774        Storage.push_back(0x09);
1775        break;
1776      case 'n':
1777        Storage.push_back(0x0A);
1778        break;
1779      case 'v':
1780        Storage.push_back(0x0B);
1781        break;
1782      case 'f':
1783        Storage.push_back(0x0C);
1784        break;
1785      case 'r':
1786        Storage.push_back(0x0D);
1787        break;
1788      case 'e':
1789        Storage.push_back(0x1B);
1790        break;
1791      case ' ':
1792        Storage.push_back(0x20);
1793        break;
1794      case '"':
1795        Storage.push_back(0x22);
1796        break;
1797      case '/':
1798        Storage.push_back(0x2F);
1799        break;
1800      case '\\':
1801        Storage.push_back(0x5C);
1802        break;
1803      case 'N':
1804        encodeUTF8(0x85, Storage);
1805        break;
1806      case '_':
1807        encodeUTF8(0xA0, Storage);
1808        break;
1809      case 'L':
1810        encodeUTF8(0x2028, Storage);
1811        break;
1812      case 'P':
1813        encodeUTF8(0x2029, Storage);
1814        break;
1815      case 'x': {
1816          if (UnquotedValue.size() < 3)
1817            // TODO: Report error.
1818            break;
1819          unsigned int UnicodeScalarValue;
1820          if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
1821            // TODO: Report error.
1822            UnicodeScalarValue = 0xFFFD;
1823          encodeUTF8(UnicodeScalarValue, Storage);
1824          UnquotedValue = UnquotedValue.substr(2);
1825          break;
1826        }
1827      case 'u': {
1828          if (UnquotedValue.size() < 5)
1829            // TODO: Report error.
1830            break;
1831          unsigned int UnicodeScalarValue;
1832          if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
1833            // TODO: Report error.
1834            UnicodeScalarValue = 0xFFFD;
1835          encodeUTF8(UnicodeScalarValue, Storage);
1836          UnquotedValue = UnquotedValue.substr(4);
1837          break;
1838        }
1839      case 'U': {
1840          if (UnquotedValue.size() < 9)
1841            // TODO: Report error.
1842            break;
1843          unsigned int UnicodeScalarValue;
1844          if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
1845            // TODO: Report error.
1846            UnicodeScalarValue = 0xFFFD;
1847          encodeUTF8(UnicodeScalarValue, Storage);
1848          UnquotedValue = UnquotedValue.substr(8);
1849          break;
1850        }
1851      }
1852      UnquotedValue = UnquotedValue.substr(1);
1853    }
1854  }
1855  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1856  return StringRef(Storage.begin(), Storage.size());
1857}
1858
1859Node *KeyValueNode::getKey() {
1860  if (Key)
1861    return Key;
1862  // Handle implicit null keys.
1863  {
1864    Token &t = peekNext();
1865    if (   t.Kind == Token::TK_BlockEnd
1866        || t.Kind == Token::TK_Value
1867        || t.Kind == Token::TK_Error) {
1868      return Key = new (getAllocator()) NullNode(Doc);
1869    }
1870    if (t.Kind == Token::TK_Key)
1871      getNext(); // skip TK_Key.
1872  }
1873
1874  // Handle explicit null keys.
1875  Token &t = peekNext();
1876  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
1877    return Key = new (getAllocator()) NullNode(Doc);
1878  }
1879
1880  // We've got a normal key.
1881  return Key = parseBlockNode();
1882}
1883
1884Node *KeyValueNode::getValue() {
1885  if (Value)
1886    return Value;
1887  getKey()->skip();
1888  if (failed())
1889    return Value = new (getAllocator()) NullNode(Doc);
1890
1891  // Handle implicit null values.
1892  {
1893    Token &t = peekNext();
1894    if (   t.Kind == Token::TK_BlockEnd
1895        || t.Kind == Token::TK_FlowMappingEnd
1896        || t.Kind == Token::TK_Key
1897        || t.Kind == Token::TK_FlowEntry
1898        || t.Kind == Token::TK_Error) {
1899      return Value = new (getAllocator()) NullNode(Doc);
1900    }
1901
1902    if (t.Kind != Token::TK_Value) {
1903      setError("Unexpected token in Key Value.", t);
1904      return Value = new (getAllocator()) NullNode(Doc);
1905    }
1906    getNext(); // skip TK_Value.
1907  }
1908
1909  // Handle explicit null values.
1910  Token &t = peekNext();
1911  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
1912    return Value = new (getAllocator()) NullNode(Doc);
1913  }
1914
1915  // We got a normal value.
1916  return Value = parseBlockNode();
1917}
1918
1919void MappingNode::increment() {
1920  if (failed()) {
1921    IsAtEnd = true;
1922    CurrentEntry = 0;
1923    return;
1924  }
1925  if (CurrentEntry) {
1926    CurrentEntry->skip();
1927    if (Type == MT_Inline) {
1928      IsAtEnd = true;
1929      CurrentEntry = 0;
1930      return;
1931    }
1932  }
1933  Token T = peekNext();
1934  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
1935    // KeyValueNode eats the TK_Key. That way it can detect null keys.
1936    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
1937  } else if (Type == MT_Block) {
1938    switch (T.Kind) {
1939    case Token::TK_BlockEnd:
1940      getNext();
1941      IsAtEnd = true;
1942      CurrentEntry = 0;
1943      break;
1944    default:
1945      setError("Unexpected token. Expected Key or Block End", T);
1946    case Token::TK_Error:
1947      IsAtEnd = true;
1948      CurrentEntry = 0;
1949    }
1950  } else {
1951    switch (T.Kind) {
1952    case Token::TK_FlowEntry:
1953      // Eat the flow entry and recurse.
1954      getNext();
1955      return increment();
1956    case Token::TK_FlowMappingEnd:
1957      getNext();
1958    case Token::TK_Error:
1959      // Set this to end iterator.
1960      IsAtEnd = true;
1961      CurrentEntry = 0;
1962      break;
1963    default:
1964      setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
1965                "Mapping End."
1966              , T);
1967      IsAtEnd = true;
1968      CurrentEntry = 0;
1969    }
1970  }
1971}
1972
1973void SequenceNode::increment() {
1974  if (failed()) {
1975    IsAtEnd = true;
1976    CurrentEntry = 0;
1977    return;
1978  }
1979  if (CurrentEntry)
1980    CurrentEntry->skip();
1981  Token T = peekNext();
1982  if (SeqType == ST_Block) {
1983    switch (T.Kind) {
1984    case Token::TK_BlockEntry:
1985      getNext();
1986      CurrentEntry = parseBlockNode();
1987      if (CurrentEntry == 0) { // An error occurred.
1988        IsAtEnd = true;
1989        CurrentEntry = 0;
1990      }
1991      break;
1992    case Token::TK_BlockEnd:
1993      getNext();
1994      IsAtEnd = true;
1995      CurrentEntry = 0;
1996      break;
1997    default:
1998      setError( "Unexpected token. Expected Block Entry or Block End."
1999              , T);
2000    case Token::TK_Error:
2001      IsAtEnd = true;
2002      CurrentEntry = 0;
2003    }
2004  } else if (SeqType == ST_Indentless) {
2005    switch (T.Kind) {
2006    case Token::TK_BlockEntry:
2007      getNext();
2008      CurrentEntry = parseBlockNode();
2009      if (CurrentEntry == 0) { // An error occurred.
2010        IsAtEnd = true;
2011        CurrentEntry = 0;
2012      }
2013      break;
2014    default:
2015    case Token::TK_Error:
2016      IsAtEnd = true;
2017      CurrentEntry = 0;
2018    }
2019  } else if (SeqType == ST_Flow) {
2020    switch (T.Kind) {
2021    case Token::TK_FlowEntry:
2022      // Eat the flow entry and recurse.
2023      getNext();
2024      WasPreviousTokenFlowEntry = true;
2025      return increment();
2026    case Token::TK_FlowSequenceEnd:
2027      getNext();
2028    case Token::TK_Error:
2029      // Set this to end iterator.
2030      IsAtEnd = true;
2031      CurrentEntry = 0;
2032      break;
2033    case Token::TK_StreamEnd:
2034    case Token::TK_DocumentEnd:
2035    case Token::TK_DocumentStart:
2036      setError("Could not find closing ]!", T);
2037      // Set this to end iterator.
2038      IsAtEnd = true;
2039      CurrentEntry = 0;
2040      break;
2041    default:
2042      if (!WasPreviousTokenFlowEntry) {
2043        setError("Expected , between entries!", T);
2044        IsAtEnd = true;
2045        CurrentEntry = 0;
2046        break;
2047      }
2048      // Otherwise it must be a flow entry.
2049      CurrentEntry = parseBlockNode();
2050      if (!CurrentEntry) {
2051        IsAtEnd = true;
2052      }
2053      WasPreviousTokenFlowEntry = false;
2054      break;
2055    }
2056  }
2057}
2058
2059Document::Document(Stream &S) : stream(S), Root(0) {
2060  // Tag maps starts with two default mappings.
2061  TagMap["!"] = "!";
2062  TagMap["!!"] = "tag:yaml.org,2002:";
2063
2064  if (parseDirectives())
2065    expectToken(Token::TK_DocumentStart);
2066  Token &T = peekNext();
2067  if (T.Kind == Token::TK_DocumentStart)
2068    getNext();
2069}
2070
2071bool Document::skip()  {
2072  if (stream.scanner->failed())
2073    return false;
2074  if (!Root)
2075    getRoot();
2076  Root->skip();
2077  Token &T = peekNext();
2078  if (T.Kind == Token::TK_StreamEnd)
2079    return false;
2080  if (T.Kind == Token::TK_DocumentEnd) {
2081    getNext();
2082    return skip();
2083  }
2084  return true;
2085}
2086
2087Token &Document::peekNext() {
2088  return stream.scanner->peekNext();
2089}
2090
2091Token Document::getNext() {
2092  return stream.scanner->getNext();
2093}
2094
2095void Document::setError(const Twine &Message, Token &Location) const {
2096  stream.scanner->setError(Message, Location.Range.begin());
2097}
2098
2099bool Document::failed() const {
2100  return stream.scanner->failed();
2101}
2102
2103Node *Document::parseBlockNode() {
2104  Token T = peekNext();
2105  // Handle properties.
2106  Token AnchorInfo;
2107  Token TagInfo;
2108parse_property:
2109  switch (T.Kind) {
2110  case Token::TK_Alias:
2111    getNext();
2112    return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2113  case Token::TK_Anchor:
2114    if (AnchorInfo.Kind == Token::TK_Anchor) {
2115      setError("Already encountered an anchor for this node!", T);
2116      return 0;
2117    }
2118    AnchorInfo = getNext(); // Consume TK_Anchor.
2119    T = peekNext();
2120    goto parse_property;
2121  case Token::TK_Tag:
2122    if (TagInfo.Kind == Token::TK_Tag) {
2123      setError("Already encountered a tag for this node!", T);
2124      return 0;
2125    }
2126    TagInfo = getNext(); // Consume TK_Tag.
2127    T = peekNext();
2128    goto parse_property;
2129  default:
2130    break;
2131  }
2132
2133  switch (T.Kind) {
2134  case Token::TK_BlockEntry:
2135    // We got an unindented BlockEntry sequence. This is not terminated with
2136    // a BlockEnd.
2137    // Don't eat the TK_BlockEntry, SequenceNode needs it.
2138    return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2139                                           , AnchorInfo.Range.substr(1)
2140                                           , TagInfo.Range
2141                                           , SequenceNode::ST_Indentless);
2142  case Token::TK_BlockSequenceStart:
2143    getNext();
2144    return new (NodeAllocator)
2145      SequenceNode( stream.CurrentDoc
2146                  , AnchorInfo.Range.substr(1)
2147                  , TagInfo.Range
2148                  , SequenceNode::ST_Block);
2149  case Token::TK_BlockMappingStart:
2150    getNext();
2151    return new (NodeAllocator)
2152      MappingNode( stream.CurrentDoc
2153                 , AnchorInfo.Range.substr(1)
2154                 , TagInfo.Range
2155                 , MappingNode::MT_Block);
2156  case Token::TK_FlowSequenceStart:
2157    getNext();
2158    return new (NodeAllocator)
2159      SequenceNode( stream.CurrentDoc
2160                  , AnchorInfo.Range.substr(1)
2161                  , TagInfo.Range
2162                  , SequenceNode::ST_Flow);
2163  case Token::TK_FlowMappingStart:
2164    getNext();
2165    return new (NodeAllocator)
2166      MappingNode( stream.CurrentDoc
2167                 , AnchorInfo.Range.substr(1)
2168                 , TagInfo.Range
2169                 , MappingNode::MT_Flow);
2170  case Token::TK_Scalar:
2171    getNext();
2172    return new (NodeAllocator)
2173      ScalarNode( stream.CurrentDoc
2174                , AnchorInfo.Range.substr(1)
2175                , TagInfo.Range
2176                , T.Range);
2177  case Token::TK_Key:
2178    // Don't eat the TK_Key, KeyValueNode expects it.
2179    return new (NodeAllocator)
2180      MappingNode( stream.CurrentDoc
2181                 , AnchorInfo.Range.substr(1)
2182                 , TagInfo.Range
2183                 , MappingNode::MT_Inline);
2184  case Token::TK_DocumentStart:
2185  case Token::TK_DocumentEnd:
2186  case Token::TK_StreamEnd:
2187  default:
2188    // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2189    //       !!null null.
2190    return new (NodeAllocator) NullNode(stream.CurrentDoc);
2191  case Token::TK_Error:
2192    return 0;
2193  }
2194  llvm_unreachable("Control flow shouldn't reach here.");
2195  return 0;
2196}
2197
2198bool Document::parseDirectives() {
2199  bool isDirective = false;
2200  while (true) {
2201    Token T = peekNext();
2202    if (T.Kind == Token::TK_TagDirective) {
2203      parseTAGDirective();
2204      isDirective = true;
2205    } else if (T.Kind == Token::TK_VersionDirective) {
2206      parseYAMLDirective();
2207      isDirective = true;
2208    } else
2209      break;
2210  }
2211  return isDirective;
2212}
2213
2214void Document::parseYAMLDirective() {
2215  getNext(); // Eat %YAML <version>
2216}
2217
2218void Document::parseTAGDirective() {
2219  Token Tag = getNext(); // %TAG <handle> <prefix>
2220  StringRef T = Tag.Range;
2221  // Strip %TAG
2222  T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2223  std::size_t HandleEnd = T.find_first_of(" \t");
2224  StringRef TagHandle = T.substr(0, HandleEnd);
2225  StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2226  TagMap[TagHandle] = TagPrefix;
2227}
2228
2229bool Document::expectToken(int TK) {
2230  Token T = getNext();
2231  if (T.Kind != TK) {
2232    setError("Unexpected token", T);
2233    return false;
2234  }
2235  return true;
2236}
2237