1135SN/A//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2135SN/A// 3135SN/A// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4135SN/A// See https://llvm.org/LICENSE.txt for license information. 5135SN/A// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6135SN/A// 7135SN/A//===----------------------------------------------------------------------===// 8135SN/A// 9135SN/A// This file defines lexer for structured comments and supporting token class. 10135SN/A// 11135SN/A//===----------------------------------------------------------------------===// 12135SN/A 13135SN/A#ifndef LLVM_CLANG_AST_COMMENTLEXER_H 14135SN/A#define LLVM_CLANG_AST_COMMENTLEXER_H 15135SN/A 16135SN/A#include "clang/Basic/Diagnostic.h" 17135SN/A#include "clang/Basic/SourceManager.h" 18135SN/A#include "llvm/ADT/SmallString.h" 19135SN/A#include "llvm/ADT/StringRef.h" 20135SN/A#include "llvm/Support/Allocator.h" 21135SN/A#include "llvm/Support/raw_ostream.h" 22135SN/A 23135SN/Anamespace clang { 24135SN/Anamespace comments { 25135SN/A 26135SN/Aclass Lexer; 27135SN/Aclass TextTokenRetokenizer; 28974Ssundarstruct CommandInfo; 29974Ssundarclass CommandTraits; 30427SN/A 31741SN/Anamespace tok { 321551Sattilaenum TokenKind { 331805Sattila eof, 341551Sattila newline, 351551Sattila text, 361551Sattila unknown_command, // Command that does not have an ID. 371551Sattila backslash_command, // Command with an ID, that used backslash marker. 381551Sattila at_command, // Command with an ID, that used 'at' marker. 39974Ssundar verbatim_block_begin, 401470Sattila verbatim_block_line, 41427SN/A verbatim_block_end, 42135SN/A verbatim_line_name, 43135SN/A verbatim_line_text, 44135SN/A html_start_tag, // <tag 45135SN/A html_ident, // attr 46135SN/A html_equals, // = 47789SN/A html_quoted_string, // "blah\"blah" or 'blah\'blah' 48789SN/A html_greater, // > 49789SN/A html_slash_greater, // /> 50789SN/A html_end_tag // </tag 51789SN/A}; 52789SN/A} // end namespace tok 53789SN/A 54789SN/A/// Comment token. 55789SN/Aclass Token { 56789SN/A friend class Lexer; 57789SN/A friend class TextTokenRetokenizer; 58789SN/A 59789SN/A /// The location of the token. 60135SN/A SourceLocation Loc; 61135SN/A 62440SN/A /// The actual kind of the token. 63135SN/A tok::TokenKind Kind; 64135SN/A 65440SN/A /// Integer value associated with a token. 66789SN/A /// 67135SN/A /// If the token is a known command, contains command ID and TextPtr is 68135SN/A /// unused (command spelling can be found with CommandTraits). Otherwise, 69135SN/A /// contains the length of the string that starts at TextPtr. 70741SN/A unsigned IntVal; 71789SN/A 72789SN/A /// Length of the token spelling in comment. Can be 0 for synthenized 73789SN/A /// tokens. 74789SN/A unsigned Length; 75789SN/A 76789SN/A /// Contains text value associated with a token. 77789SN/A const char *TextPtr; 78789SN/A 79135SN/Apublic: 80135SN/A SourceLocation getLocation() const LLVM_READONLY { return Loc; } 81135SN/A void setLocation(SourceLocation SL) { Loc = SL; } 82135SN/A 83135SN/A SourceLocation getEndLocation() const LLVM_READONLY { 84135SN/A if (Length == 0 || Length == 1) 85135SN/A return Loc; 86440SN/A return Loc.getLocWithOffset(Length - 1); 87440SN/A } 88440SN/A 89440SN/A tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 90440SN/A void setKind(tok::TokenKind K) { Kind = K; } 91741SN/A 92741SN/A bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 93741SN/A bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 94741SN/A 95741SN/A unsigned getLength() const LLVM_READONLY { return Length; } 96741SN/A void setLength(unsigned L) { Length = L; } 97741SN/A 98741SN/A StringRef getText() const LLVM_READONLY { 99741SN/A assert(is(tok::text)); 100741SN/A return StringRef(TextPtr, IntVal); 101741SN/A } 102741SN/A 103741SN/A void setText(StringRef Text) { 104741SN/A assert(is(tok::text)); 105741SN/A TextPtr = Text.data(); 106741SN/A IntVal = Text.size(); 107741SN/A } 108974Ssundar 109974Ssundar StringRef getUnknownCommandName() const LLVM_READONLY { 110974Ssundar assert(is(tok::unknown_command)); 111974Ssundar return StringRef(TextPtr, IntVal); 112974Ssundar } 113974Ssundar 114440SN/A void setUnknownCommandName(StringRef Name) { 115741SN/A assert(is(tok::unknown_command)); 116440SN/A TextPtr = Name.data(); 117440SN/A IntVal = Name.size(); 118440SN/A } 119440SN/A 1201472Sattila unsigned getCommandID() const LLVM_READONLY { 121974Ssundar assert(is(tok::backslash_command) || is(tok::at_command)); 122974Ssundar return IntVal; 123974Ssundar } 124974Ssundar 125974Ssundar void setCommandID(unsigned ID) { 126974Ssundar assert(is(tok::backslash_command) || is(tok::at_command)); 127135SN/A IntVal = ID; 128135SN/A } 1291472Sattila 130427SN/A unsigned getVerbatimBlockID() const LLVM_READONLY { 131427SN/A assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 1321472Sattila return IntVal; 1331805Sattila } 1341483Sattila 1351805Sattila void setVerbatimBlockID(unsigned ID) { 1361483Sattila assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 137427SN/A IntVal = ID; 138440SN/A } 139427SN/A 140440SN/A StringRef getVerbatimBlockText() const LLVM_READONLY { 141135SN/A assert(is(tok::verbatim_block_line)); 142440SN/A return StringRef(TextPtr, IntVal); 143440SN/A } 144440SN/A 145491SN/A void setVerbatimBlockText(StringRef Text) { 146135SN/A assert(is(tok::verbatim_block_line)); 147135SN/A TextPtr = Text.data(); 148 IntVal = Text.size(); 149 } 150 151 unsigned getVerbatimLineID() const LLVM_READONLY { 152 assert(is(tok::verbatim_line_name)); 153 return IntVal; 154 } 155 156 void setVerbatimLineID(unsigned ID) { 157 assert(is(tok::verbatim_line_name)); 158 IntVal = ID; 159 } 160 161 StringRef getVerbatimLineText() const LLVM_READONLY { 162 assert(is(tok::verbatim_line_text)); 163 return StringRef(TextPtr, IntVal); 164 } 165 166 void setVerbatimLineText(StringRef Text) { 167 assert(is(tok::verbatim_line_text)); 168 TextPtr = Text.data(); 169 IntVal = Text.size(); 170 } 171 172 StringRef getHTMLTagStartName() const LLVM_READONLY { 173 assert(is(tok::html_start_tag)); 174 return StringRef(TextPtr, IntVal); 175 } 176 177 void setHTMLTagStartName(StringRef Name) { 178 assert(is(tok::html_start_tag)); 179 TextPtr = Name.data(); 180 IntVal = Name.size(); 181 } 182 183 StringRef getHTMLIdent() const LLVM_READONLY { 184 assert(is(tok::html_ident)); 185 return StringRef(TextPtr, IntVal); 186 } 187 188 void setHTMLIdent(StringRef Name) { 189 assert(is(tok::html_ident)); 190 TextPtr = Name.data(); 191 IntVal = Name.size(); 192 } 193 194 StringRef getHTMLQuotedString() const LLVM_READONLY { 195 assert(is(tok::html_quoted_string)); 196 return StringRef(TextPtr, IntVal); 197 } 198 199 void setHTMLQuotedString(StringRef Str) { 200 assert(is(tok::html_quoted_string)); 201 TextPtr = Str.data(); 202 IntVal = Str.size(); 203 } 204 205 StringRef getHTMLTagEndName() const LLVM_READONLY { 206 assert(is(tok::html_end_tag)); 207 return StringRef(TextPtr, IntVal); 208 } 209 210 void setHTMLTagEndName(StringRef Name) { 211 assert(is(tok::html_end_tag)); 212 TextPtr = Name.data(); 213 IntVal = Name.size(); 214 } 215 216 void dump(const Lexer &L, const SourceManager &SM) const; 217}; 218 219/// Comment lexer. 220class Lexer { 221private: 222 Lexer(const Lexer &) = delete; 223 void operator=(const Lexer &) = delete; 224 225 /// Allocator for strings that are semantic values of tokens and have to be 226 /// computed (for example, resolved decimal character references). 227 llvm::BumpPtrAllocator &Allocator; 228 229 DiagnosticsEngine &Diags; 230 231 const CommandTraits &Traits; 232 233 const char *const BufferStart; 234 const char *const BufferEnd; 235 236 const char *BufferPtr; 237 238 /// One past end pointer for the current comment. For BCPL comments points 239 /// to newline or BufferEnd, for C comments points to star in '*/'. 240 const char *CommentEnd; 241 242 SourceLocation FileLoc; 243 244 /// If true, the commands, html tags, etc will be parsed and reported as 245 /// separate tokens inside the comment body. If false, the comment text will 246 /// be parsed into text and newline tokens. 247 bool ParseCommands; 248 249 enum LexerCommentState : uint8_t { 250 LCS_BeforeComment, 251 LCS_InsideBCPLComment, 252 LCS_InsideCComment, 253 LCS_BetweenComments 254 }; 255 256 /// Low-level lexer state, track if we are inside or outside of comment. 257 LexerCommentState CommentState; 258 259 enum LexerState : uint8_t { 260 /// Lexing normal comment text 261 LS_Normal, 262 263 /// Finished lexing verbatim block beginning command, will lex first body 264 /// line. 265 LS_VerbatimBlockFirstLine, 266 267 /// Lexing verbatim block body line-by-line, skipping line-starting 268 /// decorations. 269 LS_VerbatimBlockBody, 270 271 /// Finished lexing verbatim line beginning command, will lex text (one 272 /// line). 273 LS_VerbatimLineText, 274 275 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 276 LS_HTMLStartTag, 277 278 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 279 LS_HTMLEndTag 280 }; 281 282 /// Current lexing mode. 283 LexerState State; 284 285 /// If State is LS_VerbatimBlock, contains the name of verbatim end 286 /// command, including command marker. 287 SmallString<16> VerbatimBlockEndCommandName; 288 289 /// Given a character reference name (e.g., "lt"), return the character that 290 /// it stands for (e.g., "<"). 291 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 292 293 /// Given a Unicode codepoint as base-10 integer, return the character. 294 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 295 296 /// Given a Unicode codepoint as base-16 integer, return the character. 297 StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 298 299 void formTokenWithChars(Token &Result, const char *TokEnd, 300 tok::TokenKind Kind); 301 302 void formTextToken(Token &Result, const char *TokEnd) { 303 StringRef Text(BufferPtr, TokEnd - BufferPtr); 304 formTokenWithChars(Result, TokEnd, tok::text); 305 Result.setText(Text); 306 } 307 308 SourceLocation getSourceLocation(const char *Loc) const { 309 assert(Loc >= BufferStart && Loc <= BufferEnd && 310 "Location out of range for this buffer!"); 311 312 const unsigned CharNo = Loc - BufferStart; 313 return FileLoc.getLocWithOffset(CharNo); 314 } 315 316 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { 317 return Diags.Report(Loc, DiagID); 318 } 319 320 /// Eat string matching regexp \code \s*\* \endcode. 321 void skipLineStartingDecorations(); 322 323 /// Skip over pure text. 324 const char *skipTextToken(); 325 326 /// Lex comment text, including commands if ParseCommands is set to true. 327 void lexCommentText(Token &T); 328 329 void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, 330 const CommandInfo *Info); 331 332 void lexVerbatimBlockFirstLine(Token &T); 333 334 void lexVerbatimBlockBody(Token &T); 335 336 void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 337 const CommandInfo *Info); 338 339 void lexVerbatimLineText(Token &T); 340 341 void lexHTMLCharacterReference(Token &T); 342 343 void setupAndLexHTMLStartTag(Token &T); 344 345 void lexHTMLStartTag(Token &T); 346 347 void setupAndLexHTMLEndTag(Token &T); 348 349 void lexHTMLEndTag(Token &T); 350 351public: 352 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 353 const CommandTraits &Traits, SourceLocation FileLoc, 354 const char *BufferStart, const char *BufferEnd, 355 bool ParseCommands = true); 356 357 void lex(Token &T); 358 359 StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const; 360}; 361 362} // end namespace comments 363} // end namespace clang 364 365#endif 366 367