Tokens.h revision 360784
1//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// Record tokens that a preprocessor emits and define operations to map between 9// the tokens written in a file and tokens produced by the preprocessor. 10// 11// When running the compiler, there are two token streams we are interested in: 12// - "spelled" tokens directly correspond to a substring written in some 13// source file. 14// - "expanded" tokens represent the result of preprocessing, parses consumes 15// this token stream to produce the AST. 16// 17// Expanded tokens correspond directly to locations found in the AST, allowing 18// to find subranges of the token stream covered by various AST nodes. Spelled 19// tokens correspond directly to the source code written by the user. 20// 21// To allow composing these two use-cases, we also define operations that map 22// between expanded and spelled tokens that produced them (macro calls, 23// directives, etc). 24// 25//===----------------------------------------------------------------------===// 26 27#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H 28#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H 29 30#include "clang/Basic/FileManager.h" 31#include "clang/Basic/LangOptions.h" 32#include "clang/Basic/SourceLocation.h" 33#include "clang/Basic/SourceManager.h" 34#include "clang/Basic/TokenKinds.h" 35#include "clang/Lex/Token.h" 36#include "llvm/ADT/ArrayRef.h" 37#include "llvm/ADT/Optional.h" 38#include "llvm/ADT/StringRef.h" 39#include "llvm/Support/Compiler.h" 40#include "llvm/Support/raw_ostream.h" 41#include <cstdint> 42#include <tuple> 43 44namespace clang { 45class Preprocessor; 46 47namespace syntax { 48 49/// A half-open character range inside a particular file, the start offset is 50/// included and the end offset is excluded from the range. 51struct FileRange { 52 /// EXPECTS: File.isValid() && Begin <= End. 53 FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); 54 /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). 55 FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); 56 /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files 57 /// are the same. 58 FileRange(const SourceManager &SM, SourceLocation BeginLoc, 59 SourceLocation EndLoc); 60 61 FileID file() const { return File; } 62 /// Start is a start offset (inclusive) in the corresponding file. 63 unsigned beginOffset() const { return Begin; } 64 /// End offset (exclusive) in the corresponding file. 65 unsigned endOffset() const { return End; } 66 67 unsigned length() const { return End - Begin; } 68 69 /// Check if \p Offset is inside the range. 70 bool contains(unsigned Offset) const { 71 return Begin <= Offset && Offset < End; 72 } 73 /// Check \p Offset is inside the range or equal to its endpoint. 74 bool touches(unsigned Offset) const { 75 return Begin <= Offset && Offset <= End; 76 } 77 78 /// Gets the substring that this FileRange refers to. 79 llvm::StringRef text(const SourceManager &SM) const; 80 81 /// Convert to the clang range. The returned range is always a char range, 82 /// never a token range. 83 CharSourceRange toCharRange(const SourceManager &SM) const; 84 85 friend bool operator==(const FileRange &L, const FileRange &R) { 86 return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); 87 } 88 friend bool operator!=(const FileRange &L, const FileRange &R) { 89 return !(L == R); 90 } 91 92private: 93 FileID File; 94 unsigned Begin; 95 unsigned End; 96}; 97 98/// For debugging purposes. 99llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); 100 101/// A token coming directly from a file or from a macro invocation. Has just 102/// enough information to locate the token in the source code. 103/// Can represent both expanded and spelled tokens. 104class Token { 105public: 106 Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind); 107 /// EXPECTS: clang::Token is not an annotation token. 108 explicit Token(const clang::Token &T); 109 110 tok::TokenKind kind() const { return Kind; } 111 /// Location of the first character of a token. 112 SourceLocation location() const { return Location; } 113 /// Location right after the last character of a token. 114 SourceLocation endLocation() const { 115 return Location.getLocWithOffset(Length); 116 } 117 unsigned length() const { return Length; } 118 119 /// Get the substring covered by the token. Note that will include all 120 /// digraphs, newline continuations, etc. E.g. tokens for 'int' and 121 /// in\ 122 /// t 123 /// both have the same kind tok::kw_int, but results of text() are different. 124 llvm::StringRef text(const SourceManager &SM) const; 125 126 /// Gets a range of this token. 127 /// EXPECTS: token comes from a file, not from a macro expansion. 128 FileRange range(const SourceManager &SM) const; 129 130 /// Given two tokens inside the same file, returns a file range that starts at 131 /// \p First and ends at \p Last. 132 /// EXPECTS: First and Last are file tokens from the same file, Last starts 133 /// after First. 134 static FileRange range(const SourceManager &SM, const syntax::Token &First, 135 const syntax::Token &Last); 136 137 std::string dumpForTests(const SourceManager &SM) const; 138 /// For debugging purposes. 139 std::string str() const; 140 141private: 142 SourceLocation Location; 143 unsigned Length; 144 tok::TokenKind Kind; 145}; 146/// For debugging purposes. Equivalent to a call to Token::str(). 147llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); 148 149/// A list of tokens obtained by preprocessing a text buffer and operations to 150/// map between the expanded and spelled tokens, i.e. TokenBuffer has 151/// information about two token streams: 152/// 1. Expanded tokens: tokens produced by the preprocessor after all macro 153/// replacements, 154/// 2. Spelled tokens: corresponding directly to the source code of a file 155/// before any macro replacements occurred. 156/// Here's an example to illustrate a difference between those two: 157/// #define FOO 10 158/// int a = FOO; 159/// 160/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. 161/// Expanded tokens are {'int','a','=','10',';','eof'}. 162/// 163/// Note that the expanded token stream has a tok::eof token at the end, the 164/// spelled tokens never store a 'eof' token. 165/// 166/// The full list expanded tokens can be obtained with expandedTokens(). Spelled 167/// tokens for each of the files can be obtained via spelledTokens(FileID). 168/// 169/// To map between the expanded and spelled tokens use findSpelledByExpanded(). 170/// 171/// To build a token buffer use the TokenCollector class. You can also compute 172/// the spelled tokens of a file using the tokenize() helper. 173/// 174/// FIXME: allow to map from spelled to expanded tokens when use-case shows up. 175/// FIXME: allow mappings into macro arguments. 176class TokenBuffer { 177public: 178 TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} 179 /// All tokens produced by the preprocessor after all macro replacements, 180 /// directives, etc. Source locations found in the clang AST will always 181 /// point to one of these tokens. 182 /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()). 183 /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split 184 /// into two '>' tokens by the parser. However, TokenBuffer currently 185 /// keeps it as a single '>>' token. 186 llvm::ArrayRef<syntax::Token> expandedTokens() const { 187 return ExpandedTokens; 188 } 189 190 /// Returns the subrange of expandedTokens() corresponding to the closed 191 /// token range R. 192 llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const; 193 194 /// Find the subrange of spelled tokens that produced the corresponding \p 195 /// Expanded tokens. 196 /// 197 /// EXPECTS: \p Expanded is a subrange of expandedTokens(). 198 /// 199 /// Will fail if the expanded tokens do not correspond to a 200 /// sequence of spelled tokens. E.g. for the following example: 201 /// 202 /// #define FIRST f1 f2 f3 203 /// #define SECOND s1 s2 s3 204 /// 205 /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c 206 /// 207 /// the results would be: 208 /// expanded => spelled 209 /// ------------------------ 210 /// a => a 211 /// s1 s2 s3 => SECOND 212 /// a f1 f2 f3 => a FIRST 213 /// a f1 => can't map 214 /// s1 s2 => can't map 215 /// 216 /// If \p Expanded is empty, the returned value is llvm::None. 217 /// Complexity is logarithmic. 218 llvm::Optional<llvm::ArrayRef<syntax::Token>> 219 spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; 220 221 /// An expansion produced by the preprocessor, includes macro expansions and 222 /// preprocessor directives. Preprocessor always maps a non-empty range of 223 /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a 224 /// few examples of expansions: 225 /// #pragma once // Expands to an empty range. 226 /// #define FOO 1 2 3 // Expands an empty range. 227 /// FOO // Expands to "1 2 3". 228 /// FIXME(ibiryukov): implement this, currently #include expansions are empty. 229 /// #include <vector> // Expands to tokens produced by the include. 230 struct Expansion { 231 llvm::ArrayRef<syntax::Token> Spelled; 232 llvm::ArrayRef<syntax::Token> Expanded; 233 }; 234 /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting 235 /// a preprocessor directive) return the subrange of expanded tokens that the 236 /// macro expands to. 237 llvm::Optional<Expansion> 238 expansionStartingAt(const syntax::Token *Spelled) const; 239 240 /// Lexed tokens of a file before preprocessing. E.g. for the following input 241 /// #define DECL(name) int name = 10 242 /// DECL(a); 243 /// spelledTokens() returns 244 /// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10", 245 /// "DECL", "(", "a", ")", ";"} 246 llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const; 247 248 /// Get all tokens that expand a macro in \p FID. For the following input 249 /// #define FOO B 250 /// #define FOO2(X) int X 251 /// FOO2(XY) 252 /// int B; 253 /// FOO; 254 /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5 255 /// respecitvely). 256 std::vector<const syntax::Token *> macroExpansions(FileID FID) const; 257 258 const SourceManager &sourceManager() const { return *SourceMgr; } 259 260 std::string dumpForTests() const; 261 262private: 263 /// Describes a mapping between a continuous subrange of spelled tokens and 264 /// expanded tokens. Represents macro expansions, preprocessor directives, 265 /// conditionally disabled pp regions, etc. 266 /// #define FOO 1+2 267 /// #define BAR(a) a + 1 268 /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. 269 /// BAR(1) // invocation #2, tokens = {'a', '+', '1'}, 270 /// macroTokens = {'BAR', '(', '1', ')'}. 271 struct Mapping { 272 // Positions in the corresponding spelled token stream. The corresponding 273 // range is never empty. 274 unsigned BeginSpelled = 0; 275 unsigned EndSpelled = 0; 276 // Positions in the expanded token stream. The corresponding range can be 277 // empty. 278 unsigned BeginExpanded = 0; 279 unsigned EndExpanded = 0; 280 281 /// For debugging purposes. 282 std::string str() const; 283 }; 284 /// Spelled tokens of the file with information about the subranges. 285 struct MarkedFile { 286 /// Lexed, but not preprocessed, tokens of the file. These map directly to 287 /// text in the corresponding files and include tokens of all preprocessor 288 /// directives. 289 /// FIXME: spelled tokens don't change across FileID that map to the same 290 /// FileEntry. We could consider deduplicating them to save memory. 291 std::vector<syntax::Token> SpelledTokens; 292 /// A sorted list to convert between the spelled and expanded token streams. 293 std::vector<Mapping> Mappings; 294 /// The first expanded token produced for this FileID. 295 unsigned BeginExpanded = 0; 296 unsigned EndExpanded = 0; 297 }; 298 299 friend class TokenCollector; 300 301 /// Maps a single expanded token to its spelled counterpart or a mapping that 302 /// produced it. 303 std::pair<const syntax::Token *, const Mapping *> 304 spelledForExpandedToken(const syntax::Token *Expanded) const; 305 306 /// Token stream produced after preprocessing, conceputally this captures the 307 /// same stream as 'clang -E' (excluding the preprocessor directives like 308 /// #file, etc.). 309 std::vector<syntax::Token> ExpandedTokens; 310 llvm::DenseMap<FileID, MarkedFile> Files; 311 // The value is never null, pointer instead of reference to avoid disabling 312 // implicit assignment operator. 313 const SourceManager *SourceMgr; 314}; 315 316/// The spelled tokens that overlap or touch a spelling location Loc. 317/// This always returns 0-2 tokens. 318llvm::ArrayRef<syntax::Token> 319spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens); 320 321/// The identifier token that overlaps or touches a spelling location Loc. 322/// If there is none, returns nullptr. 323const syntax::Token * 324spelledIdentifierTouching(SourceLocation Loc, 325 const syntax::TokenBuffer &Tokens); 326 327/// Lex the text buffer, corresponding to \p FID, in raw mode and record the 328/// resulting spelled tokens. Does minimal post-processing on raw identifiers, 329/// setting the appropriate token kind (instead of the raw_identifier reported 330/// by lexer in raw mode). This is a very low-level function, most users should 331/// prefer to use TokenCollector. Lexing in raw mode produces wildly different 332/// results from what one might expect when running a C++ frontend, e.g. 333/// preprocessor does not run at all. 334/// The result will *not* have a 'eof' token at the end. 335std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, 336 const LangOptions &LO); 337 338/// Collects tokens for the main file while running the frontend action. An 339/// instance of this object should be created on 340/// FrontendAction::BeginSourceFile() and the results should be consumed after 341/// FrontendAction::Execute() finishes. 342class TokenCollector { 343public: 344 /// Adds the hooks to collect the tokens. Should be called before the 345 /// preprocessing starts, i.e. as a part of BeginSourceFile() or 346 /// CreateASTConsumer(). 347 TokenCollector(Preprocessor &P); 348 349 /// Finalizes token collection. Should be called after preprocessing is 350 /// finished, i.e. after running Execute(). 351 LLVM_NODISCARD TokenBuffer consume() &&; 352 353private: 354 /// Maps from a start to an end spelling location of transformations 355 /// performed by the preprocessor. These include: 356 /// 1. range from '#' to the last token in the line for PP directives, 357 /// 2. macro name and arguments for macro expansions. 358 /// Note that we record only top-level macro expansions, intermediate 359 /// expansions (e.g. inside macro arguments) are ignored. 360 /// 361 /// Used to find correct boundaries of macro calls and directives when 362 /// building mappings from spelled to expanded tokens. 363 /// 364 /// Logically, at each point of the preprocessor execution there is a stack of 365 /// macro expansions being processed and we could use it to recover the 366 /// location information we need. However, the public preprocessor API only 367 /// exposes the points when macro expansions start (when we push a macro onto 368 /// the stack) and not when they end (when we pop a macro from the stack). 369 /// To workaround this limitation, we rely on source location information 370 /// stored in this map. 371 using PPExpansions = llvm::DenseMap</*SourceLocation*/ int, SourceLocation>; 372 class Builder; 373 class CollectPPExpansions; 374 375 std::vector<syntax::Token> Expanded; 376 // FIXME: we only store macro expansions, also add directives(#pragma, etc.) 377 PPExpansions Expansions; 378 Preprocessor &PP; 379 CollectPPExpansions *Collector; 380}; 381 382} // namespace syntax 383} // namespace clang 384 385#endif 386