Tokens.h revision 360784
1//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// Record tokens that a preprocessor emits and define operations to map between
9// the tokens written in a file and tokens produced by the preprocessor.
10//
11// When running the compiler, there are two token streams we are interested in:
12//   - "spelled" tokens directly correspond to a substring written in some
13//     source file.
14//   - "expanded" tokens represent the result of preprocessing, parses consumes
15//     this token stream to produce the AST.
16//
17// Expanded tokens correspond directly to locations found in the AST, allowing
18// to find subranges of the token stream covered by various AST nodes. Spelled
19// tokens correspond directly to the source code written by the user.
20//
21// To allow composing these two use-cases, we also define operations that map
22// between expanded and spelled tokens that produced them (macro calls,
23// directives, etc).
24//
25//===----------------------------------------------------------------------===//
26
27#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
28#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
29
30#include "clang/Basic/FileManager.h"
31#include "clang/Basic/LangOptions.h"
32#include "clang/Basic/SourceLocation.h"
33#include "clang/Basic/SourceManager.h"
34#include "clang/Basic/TokenKinds.h"
35#include "clang/Lex/Token.h"
36#include "llvm/ADT/ArrayRef.h"
37#include "llvm/ADT/Optional.h"
38#include "llvm/ADT/StringRef.h"
39#include "llvm/Support/Compiler.h"
40#include "llvm/Support/raw_ostream.h"
41#include <cstdint>
42#include <tuple>
43
44namespace clang {
45class Preprocessor;
46
47namespace syntax {
48
49/// A half-open character range inside a particular file, the start offset is
50/// included and the end offset is excluded from the range.
51struct FileRange {
52  /// EXPECTS: File.isValid() && Begin <= End.
53  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
54  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
55  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
56  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
57  ///          are the same.
58  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
59            SourceLocation EndLoc);
60
61  FileID file() const { return File; }
62  /// Start is a start offset (inclusive) in the corresponding file.
63  unsigned beginOffset() const { return Begin; }
64  /// End offset (exclusive) in the corresponding file.
65  unsigned endOffset() const { return End; }
66
67  unsigned length() const { return End - Begin; }
68
69  /// Check if \p Offset is inside the range.
70  bool contains(unsigned Offset) const {
71    return Begin <= Offset && Offset < End;
72  }
73  /// Check \p Offset is inside the range or equal to its endpoint.
74  bool touches(unsigned Offset) const {
75    return Begin <= Offset && Offset <= End;
76  }
77
78  /// Gets the substring that this FileRange refers to.
79  llvm::StringRef text(const SourceManager &SM) const;
80
81  /// Convert to the clang range. The returned range is always a char range,
82  /// never a token range.
83  CharSourceRange toCharRange(const SourceManager &SM) const;
84
85  friend bool operator==(const FileRange &L, const FileRange &R) {
86    return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
87  }
88  friend bool operator!=(const FileRange &L, const FileRange &R) {
89    return !(L == R);
90  }
91
92private:
93  FileID File;
94  unsigned Begin;
95  unsigned End;
96};
97
98/// For debugging purposes.
99llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
100
101/// A token coming directly from a file or from a macro invocation. Has just
102/// enough information to locate the token in the source code.
103/// Can represent both expanded and spelled tokens.
104class Token {
105public:
106  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
107  /// EXPECTS: clang::Token is not an annotation token.
108  explicit Token(const clang::Token &T);
109
110  tok::TokenKind kind() const { return Kind; }
111  /// Location of the first character of a token.
112  SourceLocation location() const { return Location; }
113  /// Location right after the last character of a token.
114  SourceLocation endLocation() const {
115    return Location.getLocWithOffset(Length);
116  }
117  unsigned length() const { return Length; }
118
119  /// Get the substring covered by the token. Note that will include all
120  /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
121  ///    in\
122  ///    t
123  /// both have the same kind tok::kw_int, but results of text() are different.
124  llvm::StringRef text(const SourceManager &SM) const;
125
126  /// Gets a range of this token.
127  /// EXPECTS: token comes from a file, not from a macro expansion.
128  FileRange range(const SourceManager &SM) const;
129
130  /// Given two tokens inside the same file, returns a file range that starts at
131  /// \p First and ends at \p Last.
132  /// EXPECTS: First and Last are file tokens from the same file, Last starts
133  ///          after First.
134  static FileRange range(const SourceManager &SM, const syntax::Token &First,
135                         const syntax::Token &Last);
136
137  std::string dumpForTests(const SourceManager &SM) const;
138  /// For debugging purposes.
139  std::string str() const;
140
141private:
142  SourceLocation Location;
143  unsigned Length;
144  tok::TokenKind Kind;
145};
146/// For debugging purposes. Equivalent to a call to Token::str().
147llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
148
149/// A list of tokens obtained by preprocessing a text buffer and operations to
150/// map between the expanded and spelled tokens, i.e. TokenBuffer has
151/// information about two token streams:
152///    1. Expanded tokens: tokens produced by the preprocessor after all macro
153///       replacements,
154///    2. Spelled tokens: corresponding directly to the source code of a file
155///       before any macro replacements occurred.
156/// Here's an example to illustrate a difference between those two:
157///     #define FOO 10
158///     int a = FOO;
159///
160/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
161/// Expanded tokens are {'int','a','=','10',';','eof'}.
162///
163/// Note that the expanded token stream has a tok::eof token at the end, the
164/// spelled tokens never store a 'eof' token.
165///
166/// The full list expanded tokens can be obtained with expandedTokens(). Spelled
167/// tokens for each of the files can be obtained via spelledTokens(FileID).
168///
169/// To map between the expanded and spelled tokens use findSpelledByExpanded().
170///
171/// To build a token buffer use the TokenCollector class. You can also compute
172/// the spelled tokens of a file using the tokenize() helper.
173///
174/// FIXME: allow to map from spelled to expanded tokens when use-case shows up.
175/// FIXME: allow mappings into macro arguments.
176class TokenBuffer {
177public:
178  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
179  /// All tokens produced by the preprocessor after all macro replacements,
180  /// directives, etc. Source locations found in the clang AST will always
181  /// point to one of these tokens.
182  /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
183  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
184  ///        into two '>' tokens by the parser. However, TokenBuffer currently
185  ///        keeps it as a single '>>' token.
186  llvm::ArrayRef<syntax::Token> expandedTokens() const {
187    return ExpandedTokens;
188  }
189
190  /// Returns the subrange of expandedTokens() corresponding to the closed
191  /// token range R.
192  llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;
193
194  /// Find the subrange of spelled tokens that produced the corresponding \p
195  /// Expanded tokens.
196  ///
197  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
198  ///
199  /// Will fail if the expanded tokens do not correspond to a
200  /// sequence of spelled tokens. E.g. for the following example:
201  ///
202  ///   #define FIRST f1 f2 f3
203  ///   #define SECOND s1 s2 s3
204  ///
205  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
206  ///
207  /// the results would be:
208  ///   expanded   => spelled
209  ///   ------------------------
210  ///            a => a
211  ///     s1 s2 s3 => SECOND
212  ///   a f1 f2 f3 => a FIRST
213  ///         a f1 => can't map
214  ///        s1 s2 => can't map
215  ///
216  /// If \p Expanded is empty, the returned value is llvm::None.
217  /// Complexity is logarithmic.
218  llvm::Optional<llvm::ArrayRef<syntax::Token>>
219  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
220
221  /// An expansion produced by the preprocessor, includes macro expansions and
222  /// preprocessor directives. Preprocessor always maps a non-empty range of
223  /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a
224  /// few examples of expansions:
225  ///    #pragma once      // Expands to an empty range.
226  ///    #define FOO 1 2 3 // Expands an empty range.
227  ///    FOO               // Expands to "1 2 3".
228  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
229  ///    #include <vector> // Expands to tokens produced by the include.
230  struct Expansion {
231    llvm::ArrayRef<syntax::Token> Spelled;
232    llvm::ArrayRef<syntax::Token> Expanded;
233  };
234  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
235  /// a preprocessor directive) return the subrange of expanded tokens that the
236  /// macro expands to.
237  llvm::Optional<Expansion>
238  expansionStartingAt(const syntax::Token *Spelled) const;
239
240  /// Lexed tokens of a file before preprocessing. E.g. for the following input
241  ///     #define DECL(name) int name = 10
242  ///     DECL(a);
243  /// spelledTokens() returns
244  ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
245  ///     "DECL", "(", "a", ")", ";"}
246  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
247
248  /// Get all tokens that expand a macro in \p FID. For the following input
249  ///     #define FOO B
250  ///     #define FOO2(X) int X
251  ///     FOO2(XY)
252  ///     int B;
253  ///     FOO;
254  /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
255  /// respecitvely).
256  std::vector<const syntax::Token *> macroExpansions(FileID FID) const;
257
258  const SourceManager &sourceManager() const { return *SourceMgr; }
259
260  std::string dumpForTests() const;
261
262private:
263  /// Describes a mapping between a continuous subrange of spelled tokens and
264  /// expanded tokens. Represents macro expansions, preprocessor directives,
265  /// conditionally disabled pp regions, etc.
266  ///   #define FOO 1+2
267  ///   #define BAR(a) a + 1
268  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
269  ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'},
270  ///                            macroTokens = {'BAR', '(', '1', ')'}.
271  struct Mapping {
272    // Positions in the corresponding spelled token stream. The corresponding
273    // range is never empty.
274    unsigned BeginSpelled = 0;
275    unsigned EndSpelled = 0;
276    // Positions in the expanded token stream. The corresponding range can be
277    // empty.
278    unsigned BeginExpanded = 0;
279    unsigned EndExpanded = 0;
280
281    /// For debugging purposes.
282    std::string str() const;
283  };
284  /// Spelled tokens of the file with information about the subranges.
285  struct MarkedFile {
286    /// Lexed, but not preprocessed, tokens of the file. These map directly to
287    /// text in the corresponding files and include tokens of all preprocessor
288    /// directives.
289    /// FIXME: spelled tokens don't change across FileID that map to the same
290    ///        FileEntry. We could consider deduplicating them to save memory.
291    std::vector<syntax::Token> SpelledTokens;
292    /// A sorted list to convert between the spelled and expanded token streams.
293    std::vector<Mapping> Mappings;
294    /// The first expanded token produced for this FileID.
295    unsigned BeginExpanded = 0;
296    unsigned EndExpanded = 0;
297  };
298
299  friend class TokenCollector;
300
301  /// Maps a single expanded token to its spelled counterpart or a mapping that
302  /// produced it.
303  std::pair<const syntax::Token *, const Mapping *>
304  spelledForExpandedToken(const syntax::Token *Expanded) const;
305
306  /// Token stream produced after preprocessing, conceputally this captures the
307  /// same stream as 'clang -E' (excluding the preprocessor directives like
308  /// #file, etc.).
309  std::vector<syntax::Token> ExpandedTokens;
310  llvm::DenseMap<FileID, MarkedFile> Files;
311  // The value is never null, pointer instead of reference to avoid disabling
312  // implicit assignment operator.
313  const SourceManager *SourceMgr;
314};
315
316/// The spelled tokens that overlap or touch a spelling location Loc.
317/// This always returns 0-2 tokens.
318llvm::ArrayRef<syntax::Token>
319spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
320
321/// The identifier token that overlaps or touches a spelling location Loc.
322/// If there is none, returns nullptr.
323const syntax::Token *
324spelledIdentifierTouching(SourceLocation Loc,
325                          const syntax::TokenBuffer &Tokens);
326
327/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
328/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
329/// setting the appropriate token kind (instead of the raw_identifier reported
330/// by lexer in raw mode). This is a very low-level function, most users should
331/// prefer to use TokenCollector. Lexing in raw mode produces wildly different
332/// results from what one might expect when running a C++ frontend, e.g.
333/// preprocessor does not run at all.
334/// The result will *not* have a 'eof' token at the end.
335std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
336                                    const LangOptions &LO);
337
338/// Collects tokens for the main file while running the frontend action. An
339/// instance of this object should be created on
340/// FrontendAction::BeginSourceFile() and the results should be consumed after
341/// FrontendAction::Execute() finishes.
342class TokenCollector {
343public:
344  /// Adds the hooks to collect the tokens. Should be called before the
345  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
346  /// CreateASTConsumer().
347  TokenCollector(Preprocessor &P);
348
349  /// Finalizes token collection. Should be called after preprocessing is
350  /// finished, i.e. after running Execute().
351  LLVM_NODISCARD TokenBuffer consume() &&;
352
353private:
354  /// Maps from a start to an end spelling location of transformations
355  /// performed by the preprocessor. These include:
356  ///   1. range from '#' to the last token in the line for PP directives,
357  ///   2. macro name and arguments for macro expansions.
358  /// Note that we record only top-level macro expansions, intermediate
359  /// expansions (e.g. inside macro arguments) are ignored.
360  ///
361  /// Used to find correct boundaries of macro calls and directives when
362  /// building mappings from spelled to expanded tokens.
363  ///
364  /// Logically, at each point of the preprocessor execution there is a stack of
365  /// macro expansions being processed and we could use it to recover the
366  /// location information we need. However, the public preprocessor API only
367  /// exposes the points when macro expansions start (when we push a macro onto
368  /// the stack) and not when they end (when we pop a macro from the stack).
369  /// To workaround this limitation, we rely on source location information
370  /// stored in this map.
371  using PPExpansions = llvm::DenseMap</*SourceLocation*/ int, SourceLocation>;
372  class Builder;
373  class CollectPPExpansions;
374
375  std::vector<syntax::Token> Expanded;
376  // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
377  PPExpansions Expansions;
378  Preprocessor &PP;
379  CollectPPExpansions *Collector;
380};
381
382} // namespace syntax
383} // namespace clang
384
385#endif
386