1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
//  This file defines the lexer for structured comments and the supporting
//  token class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14#define LLVM_CLANG_AST_COMMENTLEXER_H
15
16#include "clang/Basic/Diagnostic.h"
17#include "clang/Basic/SourceManager.h"
18#include "llvm/ADT/SmallString.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/Support/Allocator.h"
21#include "llvm/Support/raw_ostream.h"
22
23namespace clang {
24namespace comments {
25
26class Lexer;
27class TextTokenRetokenizer;
28struct CommandInfo;
29class CommandTraits;
30
31namespace tok {
32enum TokenKind {
33  eof,
34  newline,
35  text,
36  unknown_command,   // Command that does not have an ID.
37  backslash_command, // Command with an ID, that used backslash marker.
38  at_command,        // Command with an ID, that used 'at' marker.
39  verbatim_block_begin,
40  verbatim_block_line,
41  verbatim_block_end,
42  verbatim_line_name,
43  verbatim_line_text,
44  html_start_tag,     // <tag
45  html_ident,         // attr
46  html_equals,        // =
47  html_quoted_string, // "blah\"blah" or 'blah\'blah'
48  html_greater,       // >
49  html_slash_greater, // />
50  html_end_tag        // </tag
51};
52} // end namespace tok
53
54/// Comment token.
55class Token {
56  friend class Lexer;
57  friend class TextTokenRetokenizer;
58
59  /// The location of the token.
60  SourceLocation Loc;
61
62  /// The actual kind of the token.
63  tok::TokenKind Kind;
64
65  /// Integer value associated with a token.
66  ///
67  /// If the token is a known command, contains command ID and TextPtr is
68  /// unused (command spelling can be found with CommandTraits).  Otherwise,
69  /// contains the length of the string that starts at TextPtr.
70  unsigned IntVal;
71
72  /// Length of the token spelling in comment.  Can be 0 for synthenized
73  /// tokens.
74  unsigned Length;
75
76  /// Contains text value associated with a token.
77  const char *TextPtr;
78
79public:
80  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81  void setLocation(SourceLocation SL) { Loc = SL; }
82
83  SourceLocation getEndLocation() const LLVM_READONLY {
84    if (Length == 0 || Length == 1)
85      return Loc;
86    return Loc.getLocWithOffset(Length - 1);
87  }
88
89  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90  void setKind(tok::TokenKind K) { Kind = K; }
91
92  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94
95  unsigned getLength() const LLVM_READONLY { return Length; }
96  void setLength(unsigned L) { Length = L; }
97
98  StringRef getText() const LLVM_READONLY {
99    assert(is(tok::text));
100    return StringRef(TextPtr, IntVal);
101  }
102
103  void setText(StringRef Text) {
104    assert(is(tok::text));
105    TextPtr = Text.data();
106    IntVal = Text.size();
107  }
108
109  StringRef getUnknownCommandName() const LLVM_READONLY {
110    assert(is(tok::unknown_command));
111    return StringRef(TextPtr, IntVal);
112  }
113
114  void setUnknownCommandName(StringRef Name) {
115    assert(is(tok::unknown_command));
116    TextPtr = Name.data();
117    IntVal = Name.size();
118  }
119
120  unsigned getCommandID() const LLVM_READONLY {
121    assert(is(tok::backslash_command) || is(tok::at_command));
122    return IntVal;
123  }
124
125  void setCommandID(unsigned ID) {
126    assert(is(tok::backslash_command) || is(tok::at_command));
127    IntVal = ID;
128  }
129
130  unsigned getVerbatimBlockID() const LLVM_READONLY {
131    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
132    return IntVal;
133  }
134
135  void setVerbatimBlockID(unsigned ID) {
136    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
137    IntVal = ID;
138  }
139
140  StringRef getVerbatimBlockText() const LLVM_READONLY {
141    assert(is(tok::verbatim_block_line));
142    return StringRef(TextPtr, IntVal);
143  }
144
145  void setVerbatimBlockText(StringRef Text) {
146    assert(is(tok::verbatim_block_line));
147    TextPtr = Text.data();
148    IntVal = Text.size();
149  }
150
151  unsigned getVerbatimLineID() const LLVM_READONLY {
152    assert(is(tok::verbatim_line_name));
153    return IntVal;
154  }
155
156  void setVerbatimLineID(unsigned ID) {
157    assert(is(tok::verbatim_line_name));
158    IntVal = ID;
159  }
160
161  StringRef getVerbatimLineText() const LLVM_READONLY {
162    assert(is(tok::verbatim_line_text));
163    return StringRef(TextPtr, IntVal);
164  }
165
166  void setVerbatimLineText(StringRef Text) {
167    assert(is(tok::verbatim_line_text));
168    TextPtr = Text.data();
169    IntVal = Text.size();
170  }
171
172  StringRef getHTMLTagStartName() const LLVM_READONLY {
173    assert(is(tok::html_start_tag));
174    return StringRef(TextPtr, IntVal);
175  }
176
177  void setHTMLTagStartName(StringRef Name) {
178    assert(is(tok::html_start_tag));
179    TextPtr = Name.data();
180    IntVal = Name.size();
181  }
182
183  StringRef getHTMLIdent() const LLVM_READONLY {
184    assert(is(tok::html_ident));
185    return StringRef(TextPtr, IntVal);
186  }
187
188  void setHTMLIdent(StringRef Name) {
189    assert(is(tok::html_ident));
190    TextPtr = Name.data();
191    IntVal = Name.size();
192  }
193
194  StringRef getHTMLQuotedString() const LLVM_READONLY {
195    assert(is(tok::html_quoted_string));
196    return StringRef(TextPtr, IntVal);
197  }
198
199  void setHTMLQuotedString(StringRef Str) {
200    assert(is(tok::html_quoted_string));
201    TextPtr = Str.data();
202    IntVal = Str.size();
203  }
204
205  StringRef getHTMLTagEndName() const LLVM_READONLY {
206    assert(is(tok::html_end_tag));
207    return StringRef(TextPtr, IntVal);
208  }
209
210  void setHTMLTagEndName(StringRef Name) {
211    assert(is(tok::html_end_tag));
212    TextPtr = Name.data();
213    IntVal = Name.size();
214  }
215
216  void dump(const Lexer &L, const SourceManager &SM) const;
217};
218
219/// Comment lexer.
220class Lexer {
221private:
222  Lexer(const Lexer &) = delete;
223  void operator=(const Lexer &) = delete;
224
225  /// Allocator for strings that are semantic values of tokens and have to be
226  /// computed (for example, resolved decimal character references).
227  llvm::BumpPtrAllocator &Allocator;
228
229  DiagnosticsEngine &Diags;
230
231  const CommandTraits &Traits;
232
233  const char *const BufferStart;
234  const char *const BufferEnd;
235
236  const char *BufferPtr;
237
238  /// One past end pointer for the current comment.  For BCPL comments points
239  /// to newline or BufferEnd, for C comments points to star in '*/'.
240  const char *CommentEnd;
241
242  SourceLocation FileLoc;
243
244  /// If true, the commands, html tags, etc will be parsed and reported as
245  /// separate tokens inside the comment body. If false, the comment text will
246  /// be parsed into text and newline tokens.
247  bool ParseCommands;
248
249  enum LexerCommentState : uint8_t {
250    LCS_BeforeComment,
251    LCS_InsideBCPLComment,
252    LCS_InsideCComment,
253    LCS_BetweenComments
254  };
255
256  /// Low-level lexer state, track if we are inside or outside of comment.
257  LexerCommentState CommentState;
258
259  enum LexerState : uint8_t {
260    /// Lexing normal comment text
261    LS_Normal,
262
263    /// Finished lexing verbatim block beginning command, will lex first body
264    /// line.
265    LS_VerbatimBlockFirstLine,
266
267    /// Lexing verbatim block body line-by-line, skipping line-starting
268    /// decorations.
269    LS_VerbatimBlockBody,
270
271    /// Finished lexing verbatim line beginning command, will lex text (one
272    /// line).
273    LS_VerbatimLineText,
274
275    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
276    LS_HTMLStartTag,
277
278    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
279    LS_HTMLEndTag
280  };
281
282  /// Current lexing mode.
283  LexerState State;
284
285  /// If State is LS_VerbatimBlock, contains the name of verbatim end
286  /// command, including command marker.
287  SmallString<16> VerbatimBlockEndCommandName;
288
289  /// Given a character reference name (e.g., "lt"), return the character that
290  /// it stands for (e.g., "<").
291  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
292
293  /// Given a Unicode codepoint as base-10 integer, return the character.
294  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
295
296  /// Given a Unicode codepoint as base-16 integer, return the character.
297  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
298
299  void formTokenWithChars(Token &Result, const char *TokEnd,
300                          tok::TokenKind Kind);
301
302  void formTextToken(Token &Result, const char *TokEnd) {
303    StringRef Text(BufferPtr, TokEnd - BufferPtr);
304    formTokenWithChars(Result, TokEnd, tok::text);
305    Result.setText(Text);
306  }
307
308  SourceLocation getSourceLocation(const char *Loc) const {
309    assert(Loc >= BufferStart && Loc <= BufferEnd &&
310           "Location out of range for this buffer!");
311
312    const unsigned CharNo = Loc - BufferStart;
313    return FileLoc.getLocWithOffset(CharNo);
314  }
315
316  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
317    return Diags.Report(Loc, DiagID);
318  }
319
320  /// Eat string matching regexp \code \s*\* \endcode.
321  void skipLineStartingDecorations();
322
323  /// Skip over pure text.
324  const char *skipTextToken();
325
326  /// Lex comment text, including commands if ParseCommands is set to true.
327  void lexCommentText(Token &T);
328
329  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
330                                const CommandInfo *Info);
331
332  void lexVerbatimBlockFirstLine(Token &T);
333
334  void lexVerbatimBlockBody(Token &T);
335
336  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
337                               const CommandInfo *Info);
338
339  void lexVerbatimLineText(Token &T);
340
341  void lexHTMLCharacterReference(Token &T);
342
343  void setupAndLexHTMLStartTag(Token &T);
344
345  void lexHTMLStartTag(Token &T);
346
347  void setupAndLexHTMLEndTag(Token &T);
348
349  void lexHTMLEndTag(Token &T);
350
351public:
352  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
353        const CommandTraits &Traits, SourceLocation FileLoc,
354        const char *BufferStart, const char *BufferEnd,
355        bool ParseCommands = true);
356
357  void lex(Token &T);
358
359  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
360};
361
362} // end namespace comments
363} // end namespace clang
364
365#endif
366
367