//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  This file defines the lexer for structured comments and the supporting
//  token class.
//
//===----------------------------------------------------------------------===//
12135SN/A
13135SN/A#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14135SN/A#define LLVM_CLANG_AST_COMMENTLEXER_H
15135SN/A
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
22135SN/A
23135SN/Anamespace clang {
24135SN/Anamespace comments {
25135SN/A
// Forward declarations.  Lexer and TextTokenRetokenizer are named as friends
// of Token; CommandInfo and CommandTraits are only used through pointers and
// references in this header, so their definitions are not required here.
class Lexer;
class TextTokenRetokenizer;
struct CommandInfo;
class CommandTraits;
30427SN/A
namespace tok {
/// Kind of a comment token.
///
/// The kind selects which payload accessors of Token are valid: every payload
/// getter/setter of Token asserts that the token has the matching kind.
enum TokenKind {
  eof,
  newline,
  text,              // Carries text.
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.
  verbatim_block_begin, // Carries a verbatim block ID.
  verbatim_block_line,  // Carries the text of one line.
  verbatim_block_end,   // Carries a verbatim block ID.
  verbatim_line_name,   // Carries a verbatim line ID.
  verbatim_line_text,   // Carries the text of the line.
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
53789SN/A
54789SN/A/// Comment token.
55789SN/Aclass Token {
56789SN/A  friend class Lexer;
57789SN/A  friend class TextTokenRetokenizer;
58789SN/A
59789SN/A  /// The location of the token.
60135SN/A  SourceLocation Loc;
61135SN/A
62440SN/A  /// The actual kind of the token.
63135SN/A  tok::TokenKind Kind;
64135SN/A
65440SN/A  /// Integer value associated with a token.
66789SN/A  ///
67135SN/A  /// If the token is a known command, contains command ID and TextPtr is
68135SN/A  /// unused (command spelling can be found with CommandTraits).  Otherwise,
69135SN/A  /// contains the length of the string that starts at TextPtr.
70741SN/A  unsigned IntVal;
71789SN/A
72789SN/A  /// Length of the token spelling in comment.  Can be 0 for synthenized
73789SN/A  /// tokens.
74789SN/A  unsigned Length;
75789SN/A
76789SN/A  /// Contains text value associated with a token.
77789SN/A  const char *TextPtr;
78789SN/A
79135SN/Apublic:
80135SN/A  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81135SN/A  void setLocation(SourceLocation SL) { Loc = SL; }
82135SN/A
83135SN/A  SourceLocation getEndLocation() const LLVM_READONLY {
84135SN/A    if (Length == 0 || Length == 1)
85135SN/A      return Loc;
86440SN/A    return Loc.getLocWithOffset(Length - 1);
87440SN/A  }
88440SN/A
89440SN/A  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90440SN/A  void setKind(tok::TokenKind K) { Kind = K; }
91741SN/A
92741SN/A  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93741SN/A  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94741SN/A
95741SN/A  unsigned getLength() const LLVM_READONLY { return Length; }
96741SN/A  void setLength(unsigned L) { Length = L; }
97741SN/A
98741SN/A  StringRef getText() const LLVM_READONLY {
99741SN/A    assert(is(tok::text));
100741SN/A    return StringRef(TextPtr, IntVal);
101741SN/A  }
102741SN/A
103741SN/A  void setText(StringRef Text) {
104741SN/A    assert(is(tok::text));
105741SN/A    TextPtr = Text.data();
106741SN/A    IntVal = Text.size();
107741SN/A  }
108974Ssundar
109974Ssundar  StringRef getUnknownCommandName() const LLVM_READONLY {
110974Ssundar    assert(is(tok::unknown_command));
111974Ssundar    return StringRef(TextPtr, IntVal);
112974Ssundar  }
113974Ssundar
114440SN/A  void setUnknownCommandName(StringRef Name) {
115741SN/A    assert(is(tok::unknown_command));
116440SN/A    TextPtr = Name.data();
117440SN/A    IntVal = Name.size();
118440SN/A  }
119440SN/A
1201472Sattila  unsigned getCommandID() const LLVM_READONLY {
121974Ssundar    assert(is(tok::backslash_command) || is(tok::at_command));
122974Ssundar    return IntVal;
123974Ssundar  }
124974Ssundar
125974Ssundar  void setCommandID(unsigned ID) {
126974Ssundar    assert(is(tok::backslash_command) || is(tok::at_command));
127135SN/A    IntVal = ID;
128135SN/A  }
1291472Sattila
130427SN/A  unsigned getVerbatimBlockID() const LLVM_READONLY {
131427SN/A    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
1321472Sattila    return IntVal;
1331805Sattila  }
1341483Sattila
1351805Sattila  void setVerbatimBlockID(unsigned ID) {
1361483Sattila    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
137427SN/A    IntVal = ID;
138440SN/A  }
139427SN/A
140440SN/A  StringRef getVerbatimBlockText() const LLVM_READONLY {
141135SN/A    assert(is(tok::verbatim_block_line));
142440SN/A    return StringRef(TextPtr, IntVal);
143440SN/A  }
144440SN/A
145491SN/A  void setVerbatimBlockText(StringRef Text) {
146135SN/A    assert(is(tok::verbatim_block_line));
147135SN/A    TextPtr = Text.data();
148    IntVal = Text.size();
149  }
150
151  unsigned getVerbatimLineID() const LLVM_READONLY {
152    assert(is(tok::verbatim_line_name));
153    return IntVal;
154  }
155
156  void setVerbatimLineID(unsigned ID) {
157    assert(is(tok::verbatim_line_name));
158    IntVal = ID;
159  }
160
161  StringRef getVerbatimLineText() const LLVM_READONLY {
162    assert(is(tok::verbatim_line_text));
163    return StringRef(TextPtr, IntVal);
164  }
165
166  void setVerbatimLineText(StringRef Text) {
167    assert(is(tok::verbatim_line_text));
168    TextPtr = Text.data();
169    IntVal = Text.size();
170  }
171
172  StringRef getHTMLTagStartName() const LLVM_READONLY {
173    assert(is(tok::html_start_tag));
174    return StringRef(TextPtr, IntVal);
175  }
176
177  void setHTMLTagStartName(StringRef Name) {
178    assert(is(tok::html_start_tag));
179    TextPtr = Name.data();
180    IntVal = Name.size();
181  }
182
183  StringRef getHTMLIdent() const LLVM_READONLY {
184    assert(is(tok::html_ident));
185    return StringRef(TextPtr, IntVal);
186  }
187
188  void setHTMLIdent(StringRef Name) {
189    assert(is(tok::html_ident));
190    TextPtr = Name.data();
191    IntVal = Name.size();
192  }
193
194  StringRef getHTMLQuotedString() const LLVM_READONLY {
195    assert(is(tok::html_quoted_string));
196    return StringRef(TextPtr, IntVal);
197  }
198
199  void setHTMLQuotedString(StringRef Str) {
200    assert(is(tok::html_quoted_string));
201    TextPtr = Str.data();
202    IntVal = Str.size();
203  }
204
205  StringRef getHTMLTagEndName() const LLVM_READONLY {
206    assert(is(tok::html_end_tag));
207    return StringRef(TextPtr, IntVal);
208  }
209
210  void setHTMLTagEndName(StringRef Name) {
211    assert(is(tok::html_end_tag));
212    TextPtr = Name.data();
213    IntVal = Name.size();
214  }
215
216  void dump(const Lexer &L, const SourceManager &SM) const;
217};
218
219/// Comment lexer.
220class Lexer {
221private:
222  Lexer(const Lexer &) = delete;
223  void operator=(const Lexer &) = delete;
224
225  /// Allocator for strings that are semantic values of tokens and have to be
226  /// computed (for example, resolved decimal character references).
227  llvm::BumpPtrAllocator &Allocator;
228
229  DiagnosticsEngine &Diags;
230
231  const CommandTraits &Traits;
232
233  const char *const BufferStart;
234  const char *const BufferEnd;
235
236  const char *BufferPtr;
237
238  /// One past end pointer for the current comment.  For BCPL comments points
239  /// to newline or BufferEnd, for C comments points to star in '*/'.
240  const char *CommentEnd;
241
242  SourceLocation FileLoc;
243
244  /// If true, the commands, html tags, etc will be parsed and reported as
245  /// separate tokens inside the comment body. If false, the comment text will
246  /// be parsed into text and newline tokens.
247  bool ParseCommands;
248
249  enum LexerCommentState : uint8_t {
250    LCS_BeforeComment,
251    LCS_InsideBCPLComment,
252    LCS_InsideCComment,
253    LCS_BetweenComments
254  };
255
256  /// Low-level lexer state, track if we are inside or outside of comment.
257  LexerCommentState CommentState;
258
259  enum LexerState : uint8_t {
260    /// Lexing normal comment text
261    LS_Normal,
262
263    /// Finished lexing verbatim block beginning command, will lex first body
264    /// line.
265    LS_VerbatimBlockFirstLine,
266
267    /// Lexing verbatim block body line-by-line, skipping line-starting
268    /// decorations.
269    LS_VerbatimBlockBody,
270
271    /// Finished lexing verbatim line beginning command, will lex text (one
272    /// line).
273    LS_VerbatimLineText,
274
275    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
276    LS_HTMLStartTag,
277
278    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
279    LS_HTMLEndTag
280  };
281
282  /// Current lexing mode.
283  LexerState State;
284
285  /// If State is LS_VerbatimBlock, contains the name of verbatim end
286  /// command, including command marker.
287  SmallString<16> VerbatimBlockEndCommandName;
288
289  /// Given a character reference name (e.g., "lt"), return the character that
290  /// it stands for (e.g., "<").
291  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
292
293  /// Given a Unicode codepoint as base-10 integer, return the character.
294  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
295
296  /// Given a Unicode codepoint as base-16 integer, return the character.
297  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
298
299  void formTokenWithChars(Token &Result, const char *TokEnd,
300                          tok::TokenKind Kind);
301
302  void formTextToken(Token &Result, const char *TokEnd) {
303    StringRef Text(BufferPtr, TokEnd - BufferPtr);
304    formTokenWithChars(Result, TokEnd, tok::text);
305    Result.setText(Text);
306  }
307
308  SourceLocation getSourceLocation(const char *Loc) const {
309    assert(Loc >= BufferStart && Loc <= BufferEnd &&
310           "Location out of range for this buffer!");
311
312    const unsigned CharNo = Loc - BufferStart;
313    return FileLoc.getLocWithOffset(CharNo);
314  }
315
316  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
317    return Diags.Report(Loc, DiagID);
318  }
319
320  /// Eat string matching regexp \code \s*\* \endcode.
321  void skipLineStartingDecorations();
322
323  /// Skip over pure text.
324  const char *skipTextToken();
325
326  /// Lex comment text, including commands if ParseCommands is set to true.
327  void lexCommentText(Token &T);
328
329  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
330                                const CommandInfo *Info);
331
332  void lexVerbatimBlockFirstLine(Token &T);
333
334  void lexVerbatimBlockBody(Token &T);
335
336  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
337                               const CommandInfo *Info);
338
339  void lexVerbatimLineText(Token &T);
340
341  void lexHTMLCharacterReference(Token &T);
342
343  void setupAndLexHTMLStartTag(Token &T);
344
345  void lexHTMLStartTag(Token &T);
346
347  void setupAndLexHTMLEndTag(Token &T);
348
349  void lexHTMLEndTag(Token &T);
350
351public:
352  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
353        const CommandTraits &Traits, SourceLocation FileLoc,
354        const char *BufferStart, const char *BufferEnd,
355        bool ParseCommands = true);
356
357  void lex(Token &T);
358
359  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
360};
361
362} // end namespace comments
363} // end namespace clang
364
365#endif
366
367