1//===--- YAMLParser.h - Simple YAML parser --------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This is a YAML 1.2 parser.
11//
12//  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
13//
14//  This currently does not implement the following:
15//    * Multi-line literal folding.
16//    * Tag resolution.
17//    * UTF-16.
18//    * BOMs anywhere other than the first Unicode scalar value in the file.
19//
20//  The most important class here is Stream. This represents a YAML stream with
21//  0, 1, or many documents.
22//
23//  SourceMgr sm;
24//  StringRef input = getInput();
25//  yaml::Stream stream(input, sm);
26//
27//  for (yaml::document_iterator di = stream.begin(), de = stream.end();
28//       di != de; ++di) {
29//    yaml::Node *n = di->getRoot();
30//    if (n) {
31//      // Do something with n...
32//    } else
33//      break;
34//  }
35//
36//===----------------------------------------------------------------------===//
37
38#ifndef LLVM_SUPPORT_YAMLPARSER_H
39#define LLVM_SUPPORT_YAMLPARSER_H
40
41#include "llvm/ADT/OwningPtr.h"
42#include "llvm/ADT/SmallString.h"
43#include "llvm/ADT/StringRef.h"
44#include "llvm/Support/Allocator.h"
45#include "llvm/Support/SMLoc.h"
46#include <limits>
47#include <utility>
48
49namespace llvm {
50class MemoryBuffer;
51class SourceMgr;
52class raw_ostream;
53class Twine;
54
55namespace yaml {
56
57class document_iterator;
58class Document;
59class Node;
60class Scanner;
61struct Token;
62
63/// @brief Dump all the tokens in this stream to OS.
64/// @returns true if there was an error, false otherwise.
65bool dumpTokens(StringRef Input, raw_ostream &);
66
67/// @brief Scans all tokens in input without outputting anything. This is used
68///        for benchmarking the tokenizer.
69/// @returns true if there was an error, false otherwise.
70bool scanTokens(StringRef Input);
71
72/// @brief Escape \a Input for a double quoted scalar.
73std::string escape(StringRef Input);
74
75/// @brief This class represents a YAML stream potentially containing multiple
76///        documents.
77class Stream {
78public:
79  /// @brief This keeps a reference to the string referenced by \p Input.
80  Stream(StringRef Input, SourceMgr &);
81
82  /// @brief This takes ownership of \p InputBuffer.
83  Stream(MemoryBuffer *InputBuffer, SourceMgr &);
84  ~Stream();
85
86  document_iterator begin();
87  document_iterator end();
88  void skip();
89  bool failed();
90  bool validate() {
91    skip();
92    return !failed();
93  }
94
95  void printError(Node *N, const Twine &Msg);
96
97private:
98  OwningPtr<Scanner> scanner;
99  OwningPtr<Document> CurrentDoc;
100
101  friend class Document;
102
103  /// @brief Validate a %YAML x.x directive.
104  void handleYAMLDirective(const Token &);
105};
106
107/// @brief Abstract base class for all Nodes.
108class Node {
109public:
110  enum NodeKind {
111    NK_Null,
112    NK_Scalar,
113    NK_KeyValue,
114    NK_Mapping,
115    NK_Sequence,
116    NK_Alias
117  };
118
119  Node(unsigned int Type, OwningPtr<Document>&, StringRef Anchor);
120
121  /// @brief Get the value of the anchor attached to this node. If it does not
122  ///        have one, getAnchor().size() will be 0.
123  StringRef getAnchor() const { return Anchor; }
124
125  SMRange getSourceRange() const { return SourceRange; }
126  void setSourceRange(SMRange SR) { SourceRange = SR; }
127
128  // These functions forward to Document and Scanner.
129  Token &peekNext();
130  Token getNext();
131  Node *parseBlockNode();
132  BumpPtrAllocator &getAllocator();
133  void setError(const Twine &Message, Token &Location) const;
134  bool failed() const;
135
136  virtual void skip() {}
137
138  unsigned int getType() const { return TypeID; }
139
140  void *operator new ( size_t Size
141                     , BumpPtrAllocator &Alloc
142                     , size_t Alignment = 16) throw() {
143    return Alloc.Allocate(Size, Alignment);
144  }
145
146  void operator delete(void *Ptr, BumpPtrAllocator &Alloc, size_t) throw() {
147    Alloc.Deallocate(Ptr);
148  }
149
150protected:
151  OwningPtr<Document> &Doc;
152  SMRange SourceRange;
153
154  void operator delete(void *) throw() {}
155
156  virtual ~Node() {}
157
158private:
159  unsigned int TypeID;
160  StringRef Anchor;
161};
162
163/// @brief A null value.
164///
165/// Example:
166///   !!null null
167class NullNode : public Node {
168public:
169  NullNode(OwningPtr<Document> &D) : Node(NK_Null, D, StringRef()) {}
170
171  static inline bool classof(const Node *N) {
172    return N->getType() == NK_Null;
173  }
174};
175
176/// @brief A scalar node is an opaque datum that can be presented as a
177///        series of zero or more Unicode scalar values.
178///
179/// Example:
180///   Adena
181class ScalarNode : public Node {
182public:
183  ScalarNode(OwningPtr<Document> &D, StringRef Anchor, StringRef Val)
184    : Node(NK_Scalar, D, Anchor)
185    , Value(Val) {
186    SMLoc Start = SMLoc::getFromPointer(Val.begin());
187    SMLoc End = SMLoc::getFromPointer(Val.end());
188    SourceRange = SMRange(Start, End);
189  }
190
191  // Return Value without any escaping or folding or other fun YAML stuff. This
192  // is the exact bytes that are contained in the file (after conversion to
193  // utf8).
194  StringRef getRawValue() const { return Value; }
195
196  /// @brief Gets the value of this node as a StringRef.
197  ///
198  /// @param Storage is used to store the content of the returned StringRef iff
199  ///        it requires any modification from how it appeared in the source.
200  ///        This happens with escaped characters and multi-line literals.
201  StringRef getValue(SmallVectorImpl<char> &Storage) const;
202
203  static inline bool classof(const Node *N) {
204    return N->getType() == NK_Scalar;
205  }
206
207private:
208  StringRef Value;
209
210  StringRef unescapeDoubleQuoted( StringRef UnquotedValue
211                                , StringRef::size_type Start
212                                , SmallVectorImpl<char> &Storage) const;
213};
214
215/// @brief A key and value pair. While not technically a Node under the YAML
216///        representation graph, it is easier to treat them this way.
217///
218/// TODO: Consider making this not a child of Node.
219///
220/// Example:
221///   Section: .text
222class KeyValueNode : public Node {
223public:
224  KeyValueNode(OwningPtr<Document> &D)
225    : Node(NK_KeyValue, D, StringRef())
226    , Key(0)
227    , Value(0)
228  {}
229
230  /// @brief Parse and return the key.
231  ///
232  /// This may be called multiple times.
233  ///
234  /// @returns The key, or nullptr if failed() == true.
235  Node *getKey();
236
237  /// @brief Parse and return the value.
238  ///
239  /// This may be called multiple times.
240  ///
241  /// @returns The value, or nullptr if failed() == true.
242  Node *getValue();
243
244  virtual void skip() LLVM_OVERRIDE {
245    getKey()->skip();
246    getValue()->skip();
247  }
248
249  static inline bool classof(const Node *N) {
250    return N->getType() == NK_KeyValue;
251  }
252
253private:
254  Node *Key;
255  Node *Value;
256};
257
258/// @brief This is an iterator abstraction over YAML collections shared by both
259///        sequences and maps.
260///
261/// BaseT must have a ValueT* member named CurrentEntry and a member function
262/// increment() which must set CurrentEntry to 0 to create an end iterator.
263template <class BaseT, class ValueT>
264class basic_collection_iterator
265  : public std::iterator<std::forward_iterator_tag, ValueT> {
266public:
267  basic_collection_iterator() : Base(0) {}
268  basic_collection_iterator(BaseT *B) : Base(B) {}
269
270  ValueT *operator ->() const {
271    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
272    return Base->CurrentEntry;
273  }
274
275  ValueT &operator *() const {
276    assert(Base && Base->CurrentEntry &&
277           "Attempted to dereference end iterator!");
278    return *Base->CurrentEntry;
279  }
280
281  operator ValueT*() const {
282    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
283    return Base->CurrentEntry;
284  }
285
286  bool operator !=(const basic_collection_iterator &Other) const {
287    if(Base != Other.Base)
288      return true;
289    return (Base && Other.Base) && Base->CurrentEntry
290                                   != Other.Base->CurrentEntry;
291  }
292
293  basic_collection_iterator &operator++() {
294    assert(Base && "Attempted to advance iterator past end!");
295    Base->increment();
296    // Create an end iterator.
297    if (Base->CurrentEntry == 0)
298      Base = 0;
299    return *this;
300  }
301
302private:
303  BaseT *Base;
304};
305
306// The following two templates are used for both MappingNode and Sequence Node.
307template <class CollectionType>
308typename CollectionType::iterator begin(CollectionType &C) {
309  assert(C.IsAtBeginning && "You may only iterate over a collection once!");
310  C.IsAtBeginning = false;
311  typename CollectionType::iterator ret(&C);
312  ++ret;
313  return ret;
314}
315
316template <class CollectionType>
317void skip(CollectionType &C) {
318  // TODO: support skipping from the middle of a parsed collection ;/
319  assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
320  if (C.IsAtBeginning)
321    for (typename CollectionType::iterator i = begin(C), e = C.end();
322                                           i != e; ++i)
323      i->skip();
324}
325
326/// @brief Represents a YAML map created from either a block map for a flow map.
327///
328/// This parses the YAML stream as increment() is called.
329///
330/// Example:
331///   Name: _main
332///   Scope: Global
333class MappingNode : public Node {
334public:
335  enum MappingType {
336    MT_Block,
337    MT_Flow,
338    MT_Inline ///< An inline mapping node is used for "[key: value]".
339  };
340
341  MappingNode(OwningPtr<Document> &D, StringRef Anchor, MappingType MT)
342    : Node(NK_Mapping, D, Anchor)
343    , Type(MT)
344    , IsAtBeginning(true)
345    , IsAtEnd(false)
346    , CurrentEntry(0)
347  {}
348
349  friend class basic_collection_iterator<MappingNode, KeyValueNode>;
350  typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
351  template <class T> friend typename T::iterator yaml::begin(T &);
352  template <class T> friend void yaml::skip(T &);
353
354  iterator begin() {
355    return yaml::begin(*this);
356  }
357
358  iterator end() { return iterator(); }
359
360  virtual void skip() LLVM_OVERRIDE {
361    yaml::skip(*this);
362  }
363
364  static inline bool classof(const Node *N) {
365    return N->getType() == NK_Mapping;
366  }
367
368private:
369  MappingType Type;
370  bool IsAtBeginning;
371  bool IsAtEnd;
372  KeyValueNode *CurrentEntry;
373
374  void increment();
375};
376
377/// @brief Represents a YAML sequence created from either a block sequence for a
378///        flow sequence.
379///
380/// This parses the YAML stream as increment() is called.
381///
382/// Example:
383///   - Hello
384///   - World
385class SequenceNode : public Node {
386public:
387  enum SequenceType {
388    ST_Block,
389    ST_Flow,
390    // Use for:
391    //
392    // key:
393    // - val1
394    // - val2
395    //
396    // As a BlockMappingEntry and BlockEnd are not created in this case.
397    ST_Indentless
398  };
399
400  SequenceNode(OwningPtr<Document> &D, StringRef Anchor, SequenceType ST)
401    : Node(NK_Sequence, D, Anchor)
402    , SeqType(ST)
403    , IsAtBeginning(true)
404    , IsAtEnd(false)
405    , WasPreviousTokenFlowEntry(true) // Start with an imaginary ','.
406    , CurrentEntry(0)
407  {}
408
409  friend class basic_collection_iterator<SequenceNode, Node>;
410  typedef basic_collection_iterator<SequenceNode, Node> iterator;
411  template <class T> friend typename T::iterator yaml::begin(T &);
412  template <class T> friend void yaml::skip(T &);
413
414  void increment();
415
416  iterator begin() {
417    return yaml::begin(*this);
418  }
419
420  iterator end() { return iterator(); }
421
422  virtual void skip() LLVM_OVERRIDE {
423    yaml::skip(*this);
424  }
425
426  static inline bool classof(const Node *N) {
427    return N->getType() == NK_Sequence;
428  }
429
430private:
431  SequenceType SeqType;
432  bool IsAtBeginning;
433  bool IsAtEnd;
434  bool WasPreviousTokenFlowEntry;
435  Node *CurrentEntry;
436};
437
438/// @brief Represents an alias to a Node with an anchor.
439///
440/// Example:
441///   *AnchorName
442class AliasNode : public Node {
443public:
444  AliasNode(OwningPtr<Document> &D, StringRef Val)
445    : Node(NK_Alias, D, StringRef()), Name(Val) {}
446
447  StringRef getName() const { return Name; }
448  Node *getTarget();
449
450  static inline bool classof(const Node *N) {
451    return N->getType() == NK_Alias;
452  }
453
454private:
455  StringRef Name;
456};
457
458/// @brief A YAML Stream is a sequence of Documents. A document contains a root
459///        node.
460class Document {
461public:
462  /// @brief Root for parsing a node. Returns a single node.
463  Node *parseBlockNode();
464
465  Document(Stream &ParentStream);
466
467  /// @brief Finish parsing the current document and return true if there are
468  ///        more. Return false otherwise.
469  bool skip();
470
471  /// @brief Parse and return the root level node.
472  Node *getRoot() {
473    if (Root)
474      return Root;
475    return Root = parseBlockNode();
476  }
477
478private:
479  friend class Node;
480  friend class document_iterator;
481
482  /// @brief Stream to read tokens from.
483  Stream &stream;
484
485  /// @brief Used to allocate nodes to. All are destroyed without calling their
486  ///        destructor when the document is destroyed.
487  BumpPtrAllocator NodeAllocator;
488
489  /// @brief The root node. Used to support skipping a partially parsed
490  ///        document.
491  Node *Root;
492
493  Token &peekNext();
494  Token getNext();
495  void setError(const Twine &Message, Token &Location) const;
496  bool failed() const;
497
498  void handleTagDirective(const Token &Tag) {
499    // TODO: Track tags.
500  }
501
502  /// @brief Parse %BLAH directives and return true if any were encountered.
503  bool parseDirectives();
504
505  /// @brief Consume the next token and error if it is not \a TK.
506  bool expectToken(int TK);
507};
508
509/// @brief Iterator abstraction for Documents over a Stream.
510class document_iterator {
511public:
512  document_iterator() : Doc(0) {}
513  document_iterator(OwningPtr<Document> &D) : Doc(&D) {}
514
515  bool operator ==(const document_iterator &Other) {
516    if (isAtEnd() || Other.isAtEnd())
517      return isAtEnd() && Other.isAtEnd();
518
519    return *Doc == *Other.Doc;
520  }
521  bool operator !=(const document_iterator &Other) {
522    return !(*this == Other);
523  }
524
525  document_iterator operator ++() {
526    assert(Doc != 0 && "incrementing iterator past the end.");
527    if (!(*Doc)->skip()) {
528      Doc->reset(0);
529    } else {
530      Stream &S = (*Doc)->stream;
531      Doc->reset(new Document(S));
532    }
533    return *this;
534  }
535
536  Document &operator *() {
537    return *Doc->get();
538  }
539
540  OwningPtr<Document> &operator ->() {
541    return *Doc;
542  }
543
544private:
545  bool isAtEnd() const {
546    return Doc == 0 || *Doc == 0;
547  }
548
549  OwningPtr<Document> *Doc;
550};
551
552}
553}
554
555#endif
556