1//===- ExportTrie.cpp -----------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is a partial implementation of the Mach-O export trie format. It's
10// essentially a symbol table encoded as a compressed prefix trie, meaning that
11// the common prefixes of each symbol name are shared for a more compact
12// representation. The prefixes are stored on the edges of the trie, and one
13// edge can represent multiple characters. For example, given two exported
14// symbols _bar and _baz, we will have a trie like this (terminal nodes are
15// marked with an asterisk):
16//
17//              +-+-+
18//              |   | // root node
19//              +-+-+
20//                |
21//                | _ba
22//                |
23//              +-+-+
24//              |   |
25//              +-+-+
26//           r /     \ z
27//            /       \
28//        +-+-+       +-+-+
29//        | * |       | * |
30//        +-+-+       +-+-+
31//
32// More documentation of the format can be found in
33// llvm/tools/obj2yaml/macho2yaml.cpp.
34//
35//===----------------------------------------------------------------------===//
36
37#include "ExportTrie.h"
38#include "Symbols.h"
39
40#include "lld/Common/ErrorHandler.h"
41#include "lld/Common/Memory.h"
42#include "llvm/BinaryFormat/MachO.h"
43#include "llvm/Support/LEB128.h"
44#include <optional>
45
46using namespace llvm;
47using namespace lld;
48using namespace lld::macho;
49
50namespace {
51
52struct Edge {
53  Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
54
55  StringRef substring;
56  struct TrieNode *child;
57};
58
59struct ExportInfo {
60  uint64_t address;
61  uint64_t ordinal = 0;
62  uint8_t flags = 0;
63  ExportInfo(const Symbol &sym, uint64_t imageBase)
64      : address(sym.getVA() - imageBase) {
65    using namespace llvm::MachO;
66    if (sym.isWeakDef())
67      flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
68    if (sym.isTlv())
69      flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
70    // TODO: Add proper support for stub-and-resolver flags.
71
72    if (auto *defined = dyn_cast<Defined>(&sym)) {
73      if (defined->isAbsolute())
74        flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
75    } else if (auto *dysym = dyn_cast<DylibSymbol>(&sym)) {
76      flags |= EXPORT_SYMBOL_FLAGS_REEXPORT;
77      if (!dysym->isDynamicLookup())
78        ordinal = dysym->getFile()->ordinal;
79    }
80  }
81};
82
83} // namespace
84
85struct macho::TrieNode {
86  std::vector<Edge> edges;
87  std::optional<ExportInfo> info;
88  // Estimated offset from the start of the serialized trie to the current node.
89  // This will converge to the true offset when updateOffset() is run to a
90  // fixpoint.
91  size_t offset = 0;
92
93  uint32_t getTerminalSize() const;
94  // Returns whether the new estimated offset differs from the old one.
95  bool updateOffset(size_t &nextOffset);
96  void writeTo(uint8_t *buf) const;
97};
98
99// For regular symbols, the node layout (excluding the children) is
100//
101//   uleb128 terminalSize;
102//   uleb128 flags;
103//   uleb128 address;
104//
105// For re-exported symbols, the layout is
106//
107//   uleb128 terminalSize;
108//   uleb128 flags;
109//   uleb128 ordinal;
110//   char[] originalName;
111//
112// If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias
113// _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo
114// re-exports _bar directly (i.e. not via an alias), then originalName will be
115// the empty string.
116//
117// TODO: Support aliased re-exports. (Since we don't yet support these,
118// originalName will always be the empty string.)
119//
120// For stub-and-resolver nodes, the layout is
121//
122//   uleb128 terminalSize;
123//   uleb128 flags;
124//   uleb128 stubAddress;
125//   uleb128 resolverAddress;
126//
127// TODO: Support stub-and-resolver nodes.
128uint32_t TrieNode::getTerminalSize() const {
129  uint32_t size = getULEB128Size(info->flags);
130  if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT)
131    size += getULEB128Size(info->ordinal) + 1; // + 1 for the null-terminator
132  else
133    size += getULEB128Size(info->address);
134  return size;
135}
136
137bool TrieNode::updateOffset(size_t &nextOffset) {
138  // Size of the whole node (including the terminalSize and the outgoing edges.)
139  // In contrast, terminalSize only records the size of the other data in the
140  // node.
141  size_t nodeSize;
142  if (info) {
143    uint32_t terminalSize = getTerminalSize();
144    // Overall node size so far is the uleb128 size of the length of the symbol
145    // info + the symbol info itself.
146    nodeSize = terminalSize + getULEB128Size(terminalSize);
147  } else {
148    nodeSize = 1; // Size of terminalSize (which has a value of 0)
149  }
150  // Compute size of all child edges.
151  ++nodeSize; // Byte for number of children.
152  for (const Edge &edge : edges) {
153    nodeSize += edge.substring.size() + 1             // String length.
154                + getULEB128Size(edge.child->offset); // Offset len.
155  }
156  // On input, 'nextOffset' is the new preferred location for this node.
157  bool result = (offset != nextOffset);
158  // Store new location in node object for use by parents.
159  offset = nextOffset;
160  nextOffset += nodeSize;
161  return result;
162}
163
164void TrieNode::writeTo(uint8_t *buf) const {
165  buf += offset;
166  if (info) {
167    uint32_t terminalSize = getTerminalSize();
168    buf += encodeULEB128(terminalSize, buf);
169    buf += encodeULEB128(info->flags, buf);
170    if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {
171      buf += encodeULEB128(info->ordinal, buf);
172      *buf++ = 0; // empty originalName string
173    } else {
174      buf += encodeULEB128(info->address, buf);
175    }
176  } else {
177    // TrieNode with no Symbol info.
178    *buf++ = 0; // terminalSize
179  }
180  // Add number of children. TODO: Handle case where we have more than 256.
181  assert(edges.size() < 256);
182  *buf++ = edges.size();
183  // Append each child edge substring and node offset.
184  for (const Edge &edge : edges) {
185    memcpy(buf, edge.substring.data(), edge.substring.size());
186    buf += edge.substring.size();
187    *buf++ = '\0';
188    buf += encodeULEB128(edge.child->offset, buf);
189  }
190}
191
192TrieBuilder::~TrieBuilder() {
193  for (TrieNode *node : nodes)
194    delete node;
195}
196
197TrieNode *TrieBuilder::makeNode() {
198  auto *node = new TrieNode();
199  nodes.emplace_back(node);
200  return node;
201}
202
203static int charAt(const Symbol *sym, size_t pos) {
204  StringRef str = sym->getName();
205  if (pos >= str.size())
206    return -1;
207  return str[pos];
208}
209
210// Build the trie by performing a three-way radix quicksort: We start by sorting
211// the strings by their first characters, then sort the strings with the same
212// first characters by their second characters, and so on recursively. Each
213// time the prefixes diverge, we add a node to the trie.
214//
215// node:    The most recently created node along this path in the trie (i.e.
216//          the furthest from the root.)
217// lastPos: The prefix length of the most recently created node, i.e. the number
218//          of characters along its path from the root.
219// pos:     The string index we are currently sorting on. Note that each symbol
220//          S contained in vec has the same prefix S[0...pos).
221void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
222                               TrieNode *node, size_t lastPos, size_t pos) {
223tailcall:
224  if (vec.empty())
225    return;
226
227  // Partition items so that items in [0, i) are less than the pivot,
228  // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
229  // the pivot.
230  const Symbol *pivotSymbol = vec[vec.size() / 2];
231  int pivot = charAt(pivotSymbol, pos);
232  size_t i = 0;
233  size_t j = vec.size();
234  for (size_t k = 0; k < j;) {
235    int c = charAt(vec[k], pos);
236    if (c < pivot)
237      std::swap(vec[i++], vec[k++]);
238    else if (c > pivot)
239      std::swap(vec[--j], vec[k]);
240    else
241      k++;
242  }
243
244  bool isTerminal = pivot == -1;
245  bool prefixesDiverge = i != 0 || j != vec.size();
246  if (lastPos != pos && (isTerminal || prefixesDiverge)) {
247    TrieNode *newNode = makeNode();
248    node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
249                             newNode);
250    node = newNode;
251    lastPos = pos;
252  }
253
254  sortAndBuild(vec.slice(0, i), node, lastPos, pos);
255  sortAndBuild(vec.slice(j), node, lastPos, pos);
256
257  if (isTerminal) {
258    assert(j - i == 1); // no duplicate symbols
259    node->info = ExportInfo(*pivotSymbol, imageBase);
260  } else {
261    // This is the tail-call-optimized version of the following:
262    // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
263    vec = vec.slice(i, j - i);
264    ++pos;
265    goto tailcall;
266  }
267}
268
269size_t TrieBuilder::build() {
270  if (exported.empty())
271    return 0;
272
273  TrieNode *root = makeNode();
274  sortAndBuild(exported, root, 0, 0);
275
276  // Assign each node in the vector an offset in the trie stream, iterating
277  // until all uleb128 sizes have stabilized.
278  size_t offset;
279  bool more;
280  do {
281    offset = 0;
282    more = false;
283    for (TrieNode *node : nodes)
284      more |= node->updateOffset(offset);
285  } while (more);
286
287  return offset;
288}
289
290void TrieBuilder::writeTo(uint8_t *buf) const {
291  for (TrieNode *node : nodes)
292    node->writeTo(buf);
293}
294
295namespace {
296
297// Parse a serialized trie and invoke a callback for each entry.
298class TrieParser {
299public:
300  TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
301      : start(buf), end(start + size), callback(callback) {}
302
303  void parse(const uint8_t *buf, const Twine &cumulativeString);
304
305  void parse() { parse(start, ""); }
306
307  const uint8_t *start;
308  const uint8_t *end;
309  const TrieEntryCallback &callback;
310};
311
312} // namespace
313
314void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
315  if (buf >= end)
316    fatal("Node offset points outside export section");
317
318  unsigned ulebSize;
319  uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
320  buf += ulebSize;
321  uint64_t flags = 0;
322  size_t offset;
323  if (terminalSize != 0) {
324    flags = decodeULEB128(buf, &ulebSize);
325    callback(cumulativeString, flags);
326  }
327  buf += terminalSize;
328  uint8_t numEdges = *buf++;
329  for (uint8_t i = 0; i < numEdges; ++i) {
330    const char *cbuf = reinterpret_cast<const char *>(buf);
331    StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
332    buf += substring.size() + 1;
333    offset = decodeULEB128(buf, &ulebSize);
334    buf += ulebSize;
335    parse(start + offset, cumulativeString + substring);
336  }
337}
338
339void macho::parseTrie(const uint8_t *buf, size_t size,
340                      const TrieEntryCallback &callback) {
341  if (size == 0)
342    return;
343
344  TrieParser(buf, size, callback).parse();
345}
346