1//===- GsymCreator.cpp ----------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//===----------------------------------------------------------------------===//
7
8#include "llvm/DebugInfo/GSYM/GsymCreator.h"
9#include "llvm/DebugInfo/GSYM/FileWriter.h"
10#include "llvm/DebugInfo/GSYM/Header.h"
11#include "llvm/DebugInfo/GSYM/LineTable.h"
12#include "llvm/MC/StringTableBuilder.h"
13#include "llvm/Support/raw_ostream.h"
14
15#include <algorithm>
16#include <cassert>
17#include <functional>
18#include <vector>
19
20using namespace llvm;
21using namespace gsym;
22
23GsymCreator::GsymCreator(bool Quiet)
24    : StrTab(StringTableBuilder::ELF), Quiet(Quiet) {
25  insertFile(StringRef());
26}
27
28uint32_t GsymCreator::insertFile(StringRef Path, llvm::sys::path::Style Style) {
29  llvm::StringRef directory = llvm::sys::path::parent_path(Path, Style);
30  llvm::StringRef filename = llvm::sys::path::filename(Path, Style);
31  // We must insert the strings first, then call the FileEntry constructor.
32  // If we inline the insertString() function call into the constructor, the
33  // call order is undefined due to parameter lists not having any ordering
34  // requirements.
35  const uint32_t Dir = insertString(directory);
36  const uint32_t Base = insertString(filename);
37  return insertFileEntry(FileEntry(Dir, Base));
38}
39
40uint32_t GsymCreator::insertFileEntry(FileEntry FE) {
41  std::lock_guard<std::mutex> Guard(Mutex);
42  const auto NextIndex = Files.size();
43  // Find FE in hash map and insert if not present.
44  auto R = FileEntryToIndex.insert(std::make_pair(FE, NextIndex));
45  if (R.second)
46    Files.emplace_back(FE);
47  return R.first->second;
48}
49
50uint32_t GsymCreator::copyFile(const GsymCreator &SrcGC, uint32_t FileIdx) {
51  // File index zero is reserved for a FileEntry with no directory and no
52  // filename. Any other file and we need to copy the strings for the directory
53  // and filename.
54  if (FileIdx == 0)
55    return 0;
56  const FileEntry SrcFE = SrcGC.Files[FileIdx];
57  // Copy the strings for the file and then add the newly converted file entry.
58  uint32_t Dir = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Dir)->second);
59  uint32_t Base = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Base)->second);
60  FileEntry DstFE(Dir, Base);
61  return insertFileEntry(DstFE);
62}
63
64llvm::Error GsymCreator::save(StringRef Path, llvm::endianness ByteOrder,
65                              std::optional<uint64_t> SegmentSize) const {
66  if (SegmentSize)
67    return saveSegments(Path, ByteOrder, *SegmentSize);
68  std::error_code EC;
69  raw_fd_ostream OutStrm(Path, EC);
70  if (EC)
71    return llvm::errorCodeToError(EC);
72  FileWriter O(OutStrm, ByteOrder);
73  return encode(O);
74}
75
76llvm::Error GsymCreator::encode(FileWriter &O) const {
77  std::lock_guard<std::mutex> Guard(Mutex);
78  if (Funcs.empty())
79    return createStringError(std::errc::invalid_argument,
80                             "no functions to encode");
81  if (!Finalized)
82    return createStringError(std::errc::invalid_argument,
83                             "GsymCreator wasn't finalized prior to encoding");
84
85  if (Funcs.size() > UINT32_MAX)
86    return createStringError(std::errc::invalid_argument,
87                             "too many FunctionInfos");
88
89  std::optional<uint64_t> BaseAddress = getBaseAddress();
90  // Base address should be valid if we have any functions.
91  if (!BaseAddress)
92    return createStringError(std::errc::invalid_argument,
93                             "invalid base address");
94  Header Hdr;
95  Hdr.Magic = GSYM_MAGIC;
96  Hdr.Version = GSYM_VERSION;
97  Hdr.AddrOffSize = getAddressOffsetSize();
98  Hdr.UUIDSize = static_cast<uint8_t>(UUID.size());
99  Hdr.BaseAddress = *BaseAddress;
100  Hdr.NumAddresses = static_cast<uint32_t>(Funcs.size());
101  Hdr.StrtabOffset = 0; // We will fix this up later.
102  Hdr.StrtabSize = 0;   // We will fix this up later.
103  memset(Hdr.UUID, 0, sizeof(Hdr.UUID));
104  if (UUID.size() > sizeof(Hdr.UUID))
105    return createStringError(std::errc::invalid_argument,
106                             "invalid UUID size %u", (uint32_t)UUID.size());
107  // Copy the UUID value if we have one.
108  if (UUID.size() > 0)
109    memcpy(Hdr.UUID, UUID.data(), UUID.size());
110  // Write out the header.
111  llvm::Error Err = Hdr.encode(O);
112  if (Err)
113    return Err;
114
115  const uint64_t MaxAddressOffset = getMaxAddressOffset();
116  // Write out the address offsets.
117  O.alignTo(Hdr.AddrOffSize);
118  for (const auto &FuncInfo : Funcs) {
119    uint64_t AddrOffset = FuncInfo.startAddress() - Hdr.BaseAddress;
120    // Make sure we calculated the address offsets byte size correctly by
121    // verifying the current address offset is within ranges. We have seen bugs
122    // introduced when the code changes that can cause problems here so it is
123    // good to catch this during testing.
124    assert(AddrOffset <= MaxAddressOffset);
125    (void)MaxAddressOffset;
126    switch (Hdr.AddrOffSize) {
127    case 1:
128      O.writeU8(static_cast<uint8_t>(AddrOffset));
129      break;
130    case 2:
131      O.writeU16(static_cast<uint16_t>(AddrOffset));
132      break;
133    case 4:
134      O.writeU32(static_cast<uint32_t>(AddrOffset));
135      break;
136    case 8:
137      O.writeU64(AddrOffset);
138      break;
139    }
140  }
141
142  // Write out all zeros for the AddrInfoOffsets.
143  O.alignTo(4);
144  const off_t AddrInfoOffsetsOffset = O.tell();
145  for (size_t i = 0, n = Funcs.size(); i < n; ++i)
146    O.writeU32(0);
147
148  // Write out the file table
149  O.alignTo(4);
150  assert(!Files.empty());
151  assert(Files[0].Dir == 0);
152  assert(Files[0].Base == 0);
153  size_t NumFiles = Files.size();
154  if (NumFiles > UINT32_MAX)
155    return createStringError(std::errc::invalid_argument, "too many files");
156  O.writeU32(static_cast<uint32_t>(NumFiles));
157  for (auto File : Files) {
158    O.writeU32(File.Dir);
159    O.writeU32(File.Base);
160  }
161
162  // Write out the string table.
163  const off_t StrtabOffset = O.tell();
164  StrTab.write(O.get_stream());
165  const off_t StrtabSize = O.tell() - StrtabOffset;
166  std::vector<uint32_t> AddrInfoOffsets;
167
168  // Write out the address infos for each function info.
169  for (const auto &FuncInfo : Funcs) {
170    if (Expected<uint64_t> OffsetOrErr = FuncInfo.encode(O))
171      AddrInfoOffsets.push_back(OffsetOrErr.get());
172    else
173      return OffsetOrErr.takeError();
174  }
175  // Fixup the string table offset and size in the header
176  O.fixup32((uint32_t)StrtabOffset, offsetof(Header, StrtabOffset));
177  O.fixup32((uint32_t)StrtabSize, offsetof(Header, StrtabSize));
178
179  // Fixup all address info offsets
180  uint64_t Offset = 0;
181  for (auto AddrInfoOffset : AddrInfoOffsets) {
182    O.fixup32(AddrInfoOffset, AddrInfoOffsetsOffset + Offset);
183    Offset += 4;
184  }
185  return ErrorSuccess();
186}
187
188llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) {
189  std::lock_guard<std::mutex> Guard(Mutex);
190  if (Finalized)
191    return createStringError(std::errc::invalid_argument, "already finalized");
192  Finalized = true;
193
194  // Don't let the string table indexes change by finalizing in order.
195  StrTab.finalizeInOrder();
196
197  // Remove duplicates function infos that have both entries from debug info
198  // (DWARF or Breakpad) and entries from the SymbolTable.
199  //
200  // Also handle overlapping function. Usually there shouldn't be any, but they
201  // can and do happen in some rare cases.
202  //
203  // (a)          (b)         (c)
204  //     ^  ^       ^            ^
205  //     |X |Y      |X ^         |X
206  //     |  |       |  |Y        |  ^
207  //     |  |       |  v         v  |Y
208  //     v  v       v               v
209  //
210  // In (a) and (b), Y is ignored and X will be reported for the full range.
211  // In (c), both functions will be included in the result and lookups for an
212  // address in the intersection will return Y because of binary search.
213  //
214  // Note that in case of (b), we cannot include Y in the result because then
215  // we wouldn't find any function for range (end of Y, end of X)
216  // with binary search
217
218  const auto NumBefore = Funcs.size();
219  // Only sort and unique if this isn't a segment. If this is a segment we
220  // already finalized the main GsymCreator with all of the function infos
221  // and then the already sorted and uniqued function infos were added to this
222  // object.
223  if (!IsSegment) {
224    if (NumBefore > 1) {
225      // Sort function infos so we can emit sorted functions.
226      llvm::sort(Funcs);
227      std::vector<FunctionInfo> FinalizedFuncs;
228      FinalizedFuncs.reserve(Funcs.size());
229      FinalizedFuncs.emplace_back(std::move(Funcs.front()));
230      for (size_t Idx=1; Idx < NumBefore; ++Idx) {
231        FunctionInfo &Prev = FinalizedFuncs.back();
232        FunctionInfo &Curr = Funcs[Idx];
233        // Empty ranges won't intersect, but we still need to
234        // catch the case where we have multiple symbols at the
235        // same address and coalesce them.
236        const bool ranges_equal = Prev.Range == Curr.Range;
237        if (ranges_equal || Prev.Range.intersects(Curr.Range)) {
238          // Overlapping ranges or empty identical ranges.
239          if (ranges_equal) {
240            // Same address range. Check if one is from debug
241            // info and the other is from a symbol table. If
242            // so, then keep the one with debug info. Our
243            // sorting guarantees that entries with matching
244            // address ranges that have debug info are last in
245            // the sort.
246            if (!(Prev == Curr)) {
247              if (Prev.hasRichInfo() && Curr.hasRichInfo()) {
248                if (!Quiet) {
249                  OS << "warning: same address range contains "
250                        "different debug "
251                    << "info. Removing:\n"
252                    << Prev << "\nIn favor of this one:\n"
253                    << Curr << "\n";
254                }
255              }
256              // We want to swap the current entry with the previous since
257              // later entries with the same range always have more debug info
258              // or different debug info.
259              std::swap(Prev, Curr);
260            }
261          } else {
262            if (!Quiet) { // print warnings about overlaps
263              OS << "warning: function ranges overlap:\n"
264                << Prev << "\n"
265                << Curr << "\n";
266            }
267            FinalizedFuncs.emplace_back(std::move(Curr));
268          }
269        } else {
270          if (Prev.Range.size() == 0 && Curr.Range.contains(Prev.Range.start())) {
271            // Symbols on macOS don't have address ranges, so if the range
272            // doesn't match and the size is zero, then we replace the empty
273            // symbol function info with the current one.
274            std::swap(Prev, Curr);
275          } else {
276            FinalizedFuncs.emplace_back(std::move(Curr));
277          }
278        }
279      }
280      std::swap(Funcs, FinalizedFuncs);
281    }
282    // If our last function info entry doesn't have a size and if we have valid
283    // text ranges, we should set the size of the last entry since any search for
284    // a high address might match our last entry. By fixing up this size, we can
285    // help ensure we don't cause lookups to always return the last symbol that
286    // has no size when doing lookups.
287    if (!Funcs.empty() && Funcs.back().Range.size() == 0 && ValidTextRanges) {
288      if (auto Range =
289              ValidTextRanges->getRangeThatContains(Funcs.back().Range.start())) {
290        Funcs.back().Range = {Funcs.back().Range.start(), Range->end()};
291      }
292    }
293    OS << "Pruned " << NumBefore - Funcs.size() << " functions, ended with "
294      << Funcs.size() << " total\n";
295  }
296  return Error::success();
297}
298
299uint32_t GsymCreator::copyString(const GsymCreator &SrcGC, uint32_t StrOff) {
300  // String offset at zero is always the empty string, no copying needed.
301  if (StrOff == 0)
302    return 0;
303  return StrTab.add(SrcGC.StringOffsetMap.find(StrOff)->second);
304}
305
306uint32_t GsymCreator::insertString(StringRef S, bool Copy) {
307  if (S.empty())
308    return 0;
309
310  // The hash can be calculated outside the lock.
311  CachedHashStringRef CHStr(S);
312  std::lock_guard<std::mutex> Guard(Mutex);
313  if (Copy) {
314    // We need to provide backing storage for the string if requested
315    // since StringTableBuilder stores references to strings. Any string
316    // that comes from a section in an object file doesn't need to be
317    // copied, but any string created by code will need to be copied.
318    // This allows GsymCreator to be really fast when parsing DWARF and
319    // other object files as most strings don't need to be copied.
320    if (!StrTab.contains(CHStr))
321      CHStr = CachedHashStringRef{StringStorage.insert(S).first->getKey(),
322                                  CHStr.hash()};
323  }
324  const uint32_t StrOff = StrTab.add(CHStr);
325  // Save a mapping of string offsets to the cached string reference in case
326  // we need to segment the GSYM file and copy string from one string table to
327  // another.
328  if (StringOffsetMap.count(StrOff) == 0)
329    StringOffsetMap.insert(std::make_pair(StrOff, CHStr));
330  return StrOff;
331}
332
333void GsymCreator::addFunctionInfo(FunctionInfo &&FI) {
334  std::lock_guard<std::mutex> Guard(Mutex);
335  Funcs.emplace_back(std::move(FI));
336}
337
338void GsymCreator::forEachFunctionInfo(
339    std::function<bool(FunctionInfo &)> const &Callback) {
340  std::lock_guard<std::mutex> Guard(Mutex);
341  for (auto &FI : Funcs) {
342    if (!Callback(FI))
343      break;
344  }
345}
346
347void GsymCreator::forEachFunctionInfo(
348    std::function<bool(const FunctionInfo &)> const &Callback) const {
349  std::lock_guard<std::mutex> Guard(Mutex);
350  for (const auto &FI : Funcs) {
351    if (!Callback(FI))
352      break;
353  }
354}
355
356size_t GsymCreator::getNumFunctionInfos() const {
357  std::lock_guard<std::mutex> Guard(Mutex);
358  return Funcs.size();
359}
360
361bool GsymCreator::IsValidTextAddress(uint64_t Addr) const {
362  if (ValidTextRanges)
363    return ValidTextRanges->contains(Addr);
364  return true; // No valid text ranges has been set, so accept all ranges.
365}
366
367std::optional<uint64_t> GsymCreator::getFirstFunctionAddress() const {
368  // If we have finalized then Funcs are sorted. If we are a segment then
369  // Funcs will be sorted as well since function infos get added from an
370  // already finalized GsymCreator object where its functions were sorted and
371  // uniqued.
372  if ((Finalized || IsSegment) && !Funcs.empty())
373    return std::optional<uint64_t>(Funcs.front().startAddress());
374  return std::nullopt;
375}
376
377std::optional<uint64_t> GsymCreator::getLastFunctionAddress() const {
378  // If we have finalized then Funcs are sorted. If we are a segment then
379  // Funcs will be sorted as well since function infos get added from an
380  // already finalized GsymCreator object where its functions were sorted and
381  // uniqued.
382  if ((Finalized || IsSegment) && !Funcs.empty())
383    return std::optional<uint64_t>(Funcs.back().startAddress());
384  return std::nullopt;
385}
386
387std::optional<uint64_t> GsymCreator::getBaseAddress() const {
388  if (BaseAddress)
389    return BaseAddress;
390  return getFirstFunctionAddress();
391}
392
393uint64_t GsymCreator::getMaxAddressOffset() const {
394  switch (getAddressOffsetSize()) {
395    case 1: return UINT8_MAX;
396    case 2: return UINT16_MAX;
397    case 4: return UINT32_MAX;
398    case 8: return UINT64_MAX;
399  }
400  llvm_unreachable("invalid address offset");
401}
402
403uint8_t GsymCreator::getAddressOffsetSize() const {
404  const std::optional<uint64_t> BaseAddress = getBaseAddress();
405  const std::optional<uint64_t> LastFuncAddr = getLastFunctionAddress();
406  if (BaseAddress && LastFuncAddr) {
407    const uint64_t AddrDelta = *LastFuncAddr - *BaseAddress;
408    if (AddrDelta <= UINT8_MAX)
409      return 1;
410    else if (AddrDelta <= UINT16_MAX)
411      return 2;
412    else if (AddrDelta <= UINT32_MAX)
413      return 4;
414    return 8;
415  }
416  return 1;
417}
418
419uint64_t GsymCreator::calculateHeaderAndTableSize() const {
420  uint64_t Size = sizeof(Header);
421  const size_t NumFuncs = Funcs.size();
422  // Add size of address offset table
423  Size += NumFuncs * getAddressOffsetSize();
424  // Add size of address info offsets which are 32 bit integers in version 1.
425  Size += NumFuncs * sizeof(uint32_t);
426  // Add file table size
427  Size += Files.size() * sizeof(FileEntry);
428  // Add string table size
429  Size += StrTab.getSize();
430
431  return Size;
432}
433
434// This function takes a InlineInfo class that was copy constructed from an
435// InlineInfo from the \a SrcGC and updates all members that point to strings
436// and files to point to strings and files from this GsymCreator.
437void GsymCreator::fixupInlineInfo(const GsymCreator &SrcGC, InlineInfo &II) {
438  II.Name = copyString(SrcGC, II.Name);
439  II.CallFile = copyFile(SrcGC, II.CallFile);
440  for (auto &ChildII: II.Children)
441    fixupInlineInfo(SrcGC, ChildII);
442}
443
444uint64_t GsymCreator::copyFunctionInfo(const GsymCreator &SrcGC, size_t FuncIdx) {
445  // To copy a function info we need to copy any files and strings over into
446  // this GsymCreator and then copy the function info and update the string
447  // table offsets to match the new offsets.
448  const FunctionInfo &SrcFI = SrcGC.Funcs[FuncIdx];
449
450  FunctionInfo DstFI;
451  DstFI.Range = SrcFI.Range;
452  DstFI.Name = copyString(SrcGC, SrcFI.Name);
453  // Copy the line table if there is one.
454  if (SrcFI.OptLineTable) {
455    // Copy the entire line table.
456    DstFI.OptLineTable = LineTable(SrcFI.OptLineTable.value());
457    // Fixup all LineEntry::File entries which are indexes in the the file table
458    // from SrcGC and must be converted to file indexes from this GsymCreator.
459    LineTable &DstLT = DstFI.OptLineTable.value();
460    const size_t NumLines = DstLT.size();
461    for (size_t I=0; I<NumLines; ++I) {
462      LineEntry &LE = DstLT.get(I);
463      LE.File = copyFile(SrcGC, LE.File);
464    }
465  }
466  // Copy the inline information if needed.
467  if (SrcFI.Inline) {
468    // Make a copy of the source inline information.
469    DstFI.Inline = SrcFI.Inline.value();
470    // Fixup all strings and files in the copied inline information.
471    fixupInlineInfo(SrcGC, *DstFI.Inline);
472  }
473  std::lock_guard<std::mutex> Guard(Mutex);
474  Funcs.emplace_back(DstFI);
475  return Funcs.back().cacheEncoding();
476}
477
478llvm::Error GsymCreator::saveSegments(StringRef Path,
479                                      llvm::endianness ByteOrder,
480                                      uint64_t SegmentSize) const {
481  if (SegmentSize == 0)
482    return createStringError(std::errc::invalid_argument,
483                             "invalid segment size zero");
484
485  size_t FuncIdx = 0;
486  const size_t NumFuncs = Funcs.size();
487  while (FuncIdx < NumFuncs) {
488    llvm::Expected<std::unique_ptr<GsymCreator>> ExpectedGC =
489        createSegment(SegmentSize, FuncIdx);
490    if (ExpectedGC) {
491      GsymCreator *GC = ExpectedGC->get();
492      if (GC == NULL)
493        break; // We had not more functions to encode.
494      raw_null_ostream ErrorStrm;
495      llvm::Error Err = GC->finalize(ErrorStrm);
496      if (Err)
497        return Err;
498      std::string SegmentedGsymPath;
499      raw_string_ostream SGP(SegmentedGsymPath);
500      std::optional<uint64_t> FirstFuncAddr = GC->getFirstFunctionAddress();
501      if (FirstFuncAddr) {
502        SGP << Path << "-" << llvm::format_hex(*FirstFuncAddr, 1);
503        SGP.flush();
504        Err = GC->save(SegmentedGsymPath, ByteOrder, std::nullopt);
505        if (Err)
506          return Err;
507      }
508    } else {
509      return ExpectedGC.takeError();
510    }
511  }
512  return Error::success();
513}
514
515llvm::Expected<std::unique_ptr<GsymCreator>>
516GsymCreator::createSegment(uint64_t SegmentSize, size_t &FuncIdx) const {
517  // No function entries, return empty unique pointer
518  if (FuncIdx >= Funcs.size())
519    return std::unique_ptr<GsymCreator>();
520
521  std::unique_ptr<GsymCreator> GC(new GsymCreator(/*Quiet=*/true));
522
523  // Tell the creator that this is a segment.
524  GC->setIsSegment();
525
526  // Set the base address if there is one.
527  if (BaseAddress)
528    GC->setBaseAddress(*BaseAddress);
529  // Copy the UUID value from this object into the new creator.
530  GC->setUUID(UUID);
531  const size_t NumFuncs = Funcs.size();
532  // Track how big the function infos are for the current segment so we can
533  // emit segments that are close to the requested size. It is quick math to
534  // determine the current header and tables sizes, so we can do that each loop.
535  uint64_t SegmentFuncInfosSize = 0;
536  for (; FuncIdx < NumFuncs; ++FuncIdx) {
537    const uint64_t HeaderAndTableSize = GC->calculateHeaderAndTableSize();
538    if (HeaderAndTableSize + SegmentFuncInfosSize >= SegmentSize) {
539      if (SegmentFuncInfosSize == 0)
540        return createStringError(std::errc::invalid_argument,
541                                 "a segment size of %" PRIu64 " is to small to "
542                                 "fit any function infos, specify a larger value",
543                                 SegmentSize);
544
545      break;
546    }
547    SegmentFuncInfosSize += alignTo(GC->copyFunctionInfo(*this, FuncIdx), 4);
548  }
549  return std::move(GC);
550}
551