ClangHighlighter.cpp revision 360784
1//===-- ClangHighlighter.cpp ------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ClangHighlighter.h"
10
11#include "lldb/Host/FileSystem.h"
12#include "lldb/Target/Language.h"
13#include "lldb/Utility/AnsiTerminal.h"
14#include "lldb/Utility/StreamString.h"
15
16#include "clang/Basic/SourceManager.h"
17#include "clang/Lex/Lexer.h"
18#include "llvm/ADT/StringSet.h"
19#include "llvm/Support/MemoryBuffer.h"
20
21using namespace lldb_private;
22
23bool ClangHighlighter::isKeyword(llvm::StringRef token) const {
24  return keywords.find(token) != keywords.end();
25}
26
27ClangHighlighter::ClangHighlighter() {
28#define KEYWORD(X, N) keywords.insert(#X);
29#include "clang/Basic/TokenKinds.def"
30}
31
32/// Determines which style should be applied to the given token.
33/// \param highlighter
34///     The current highlighter that should use the style.
35/// \param token
36///     The current token.
37/// \param tok_str
38///     The string in the source code the token represents.
39/// \param options
40///     The style we use for coloring the source code.
41/// \param in_pp_directive
42///     If we are currently in a preprocessor directive. NOTE: This is
43///     passed by reference and will be updated if the current token starts
44///     or ends a preprocessor directive.
45/// \return
46///     The ColorStyle that should be applied to the token.
47static HighlightStyle::ColorStyle
48determineClangStyle(const ClangHighlighter &highlighter,
49                    const clang::Token &token, llvm::StringRef tok_str,
50                    const HighlightStyle &options, bool &in_pp_directive) {
51  using namespace clang;
52
53  if (token.is(tok::comment)) {
54    // If we were in a preprocessor directive before, we now left it.
55    in_pp_directive = false;
56    return options.comment;
57  } else if (in_pp_directive || token.getKind() == tok::hash) {
58    // Let's assume that the rest of the line is a PP directive.
59    in_pp_directive = true;
60    // Preprocessor directives are hard to match, so we have to hack this in.
61    return options.pp_directive;
62  } else if (tok::isStringLiteral(token.getKind()))
63    return options.string_literal;
64  else if (tok::isLiteral(token.getKind()))
65    return options.scalar_literal;
66  else if (highlighter.isKeyword(tok_str))
67    return options.keyword;
68  else
69    switch (token.getKind()) {
70    case tok::raw_identifier:
71    case tok::identifier:
72      return options.identifier;
73    case tok::l_brace:
74    case tok::r_brace:
75      return options.braces;
76    case tok::l_square:
77    case tok::r_square:
78      return options.square_brackets;
79    case tok::l_paren:
80    case tok::r_paren:
81      return options.parentheses;
82    case tok::comma:
83      return options.comma;
84    case tok::coloncolon:
85    case tok::colon:
86      return options.colon;
87
88    case tok::amp:
89    case tok::ampamp:
90    case tok::ampequal:
91    case tok::star:
92    case tok::starequal:
93    case tok::plus:
94    case tok::plusplus:
95    case tok::plusequal:
96    case tok::minus:
97    case tok::arrow:
98    case tok::minusminus:
99    case tok::minusequal:
100    case tok::tilde:
101    case tok::exclaim:
102    case tok::exclaimequal:
103    case tok::slash:
104    case tok::slashequal:
105    case tok::percent:
106    case tok::percentequal:
107    case tok::less:
108    case tok::lessless:
109    case tok::lessequal:
110    case tok::lesslessequal:
111    case tok::spaceship:
112    case tok::greater:
113    case tok::greatergreater:
114    case tok::greaterequal:
115    case tok::greatergreaterequal:
116    case tok::caret:
117    case tok::caretequal:
118    case tok::pipe:
119    case tok::pipepipe:
120    case tok::pipeequal:
121    case tok::question:
122    case tok::equal:
123    case tok::equalequal:
124      return options.operators;
125    default:
126      break;
127    }
128  return HighlightStyle::ColorStyle();
129}
130
131void ClangHighlighter::Highlight(const HighlightStyle &options,
132                                 llvm::StringRef line,
133                                 llvm::Optional<size_t> cursor_pos,
134                                 llvm::StringRef previous_lines,
135                                 Stream &result) const {
136  using namespace clang;
137
138  FileSystemOptions file_opts;
139  FileManager file_mgr(file_opts,
140                       FileSystem::Instance().GetVirtualFileSystem());
141
142  // The line might end in a backslash which would cause Clang to drop the
143  // backslash and the terminating new line. This makes sense when parsing C++,
144  // but when highlighting we care about preserving the backslash/newline. To
145  // not lose this information we remove the new line here so that Clang knows
146  // this is just a single line we are highlighting. We add back the newline
147  // after tokenizing.
148  llvm::StringRef line_ending = "";
149  // There are a few legal line endings Clang recognizes and we need to
150  // temporarily remove from the string.
151  if (line.consume_back("\r\n"))
152    line_ending = "\r\n";
153  else if (line.consume_back("\n"))
154    line_ending = "\n";
155  else if (line.consume_back("\r"))
156    line_ending = "\r";
157
158  unsigned line_number = previous_lines.count('\n') + 1U;
159
160  // Let's build the actual source code Clang needs and setup some utility
161  // objects.
162  std::string full_source = previous_lines.str() + line.str();
163  llvm::IntrusiveRefCntPtr<DiagnosticIDs> diag_ids(new DiagnosticIDs());
164  llvm::IntrusiveRefCntPtr<DiagnosticOptions> diags_opts(
165      new DiagnosticOptions());
166  DiagnosticsEngine diags(diag_ids, diags_opts);
167  clang::SourceManager SM(diags, file_mgr);
168  auto buf = llvm::MemoryBuffer::getMemBuffer(full_source);
169
170  FileID FID = SM.createFileID(clang::SourceManager::Unowned, buf.get());
171
172  // Let's just enable the latest ObjC and C++ which should get most tokens
173  // right.
174  LangOptions Opts;
175  Opts.ObjC = true;
176  // FIXME: This should probably set CPlusPlus, CPlusPlus11, ... too
177  Opts.CPlusPlus17 = true;
178  Opts.LineComment = true;
179
180  Lexer lex(FID, buf.get(), SM, Opts);
181  // The lexer should keep whitespace around.
182  lex.SetKeepWhitespaceMode(true);
183
184  // Keeps track if we have entered a PP directive.
185  bool in_pp_directive = false;
186
187  // True once we actually lexed the user provided line.
188  bool found_user_line = false;
189
190  // True if we already highlighted the token under the cursor, false otherwise.
191  bool highlighted_cursor = false;
192  Token token;
193  bool exit = false;
194  while (!exit) {
195    // Returns true if this is the last token we get from the lexer.
196    exit = lex.LexFromRawLexer(token);
197
198    bool invalid = false;
199    unsigned current_line_number =
200        SM.getSpellingLineNumber(token.getLocation(), &invalid);
201    if (current_line_number != line_number)
202      continue;
203    found_user_line = true;
204
205    // We don't need to print any tokens without a spelling line number.
206    if (invalid)
207      continue;
208
209    // Same as above but with the column number.
210    invalid = false;
211    unsigned start = SM.getSpellingColumnNumber(token.getLocation(), &invalid);
212    if (invalid)
213      continue;
214    // Column numbers start at 1, but indexes in our string start at 0.
215    --start;
216
217    // Annotations don't have a length, so let's skip them.
218    if (token.isAnnotation())
219      continue;
220
221    // Extract the token string from our source code.
222    llvm::StringRef tok_str = line.substr(start, token.getLength());
223
224    // If the token is just an empty string, we can skip all the work below.
225    if (tok_str.empty())
226      continue;
227
228    // If the cursor is inside this token, we have to apply the 'selected'
229    // highlight style before applying the actual token color.
230    llvm::StringRef to_print = tok_str;
231    StreamString storage;
232    auto end = start + token.getLength();
233    if (cursor_pos && end > *cursor_pos && !highlighted_cursor) {
234      highlighted_cursor = true;
235      options.selected.Apply(storage, tok_str);
236      to_print = storage.GetString();
237    }
238
239    // See how we are supposed to highlight this token.
240    HighlightStyle::ColorStyle color =
241        determineClangStyle(*this, token, tok_str, options, in_pp_directive);
242
243    color.Apply(result, to_print);
244  }
245
246  // Add the line ending we trimmed before tokenizing.
247  result << line_ending;
248
249  // If we went over the whole file but couldn't find our own file, then
250  // somehow our setup was wrong. When we're in release mode we just give the
251  // user the normal line and pretend we don't know how to highlight it. In
252  // debug mode we bail out with an assert as this should never happen.
253  if (!found_user_line) {
254    result << line;
255    assert(false && "We couldn't find the user line in the input file?");
256  }
257}
258