1#!/usr/bin/env python
2
3"""A tool for extracting a list of symbols to export
4
5When exporting symbols from a dll or exe we either need to mark the symbols in
6the source code as __declspec(dllexport) or supply a list of symbols to the
7linker. This program automates the latter by inspecting the symbol tables of a
8list of link inputs and deciding which of those symbols need to be exported.
9
10We can't just export all the defined symbols, as there's a limit of 65535
11exported symbols and in clang we go way over that, particularly in a debug
build. Therefore a large part of the work is pruning symbols which either can't
be imported, or which we think have definitions in public header files (i.e.
template instantiations) and so would get defined in the thing importing these
symbols anyway.
16"""
17
18from __future__ import print_function
19import sys
20import re
21import os
22import subprocess
23import multiprocessing
24import argparse
25
26# Define functions which extract a list of symbols from a library using several
27# different tools. We use subprocess.Popen and yield a symbol at a time instead
28# of using subprocess.check_output and returning a list as, especially on
29# Windows, waiting for the entire output to be ready can take a significant
30# amount of time.
31
def dumpbin_get_symbols(lib):
    """Yield the defined external symbols of lib as reported by dumpbin.

    Runs 'dumpbin /symbols' on lib and lazily yields the symbol name from every
    output line that describes an External symbol defined in some section.
    Raises OSError if dumpbin cannot be started.
    """
    # Compile once, outside the per-line loop. A raw string is used so that the
    # regex escapes (\s, \S, \|) aren't interpreted as string escapes.
    pattern = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
    process = subprocess.Popen(['dumpbin','/symbols',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = pattern.match(line)
            if match:
                yield match.group(1)
    finally:
        # This also runs when the consumer abandons the generator early, so the
        # pipe is always released and the child process is always reaped.
        process.stdout.close()
        process.wait()
43
def nm_get_symbols(lib):
    """Yield the defined external symbols of lib as reported by nm.

    Runs 'nm -P' on lib (with the extra flags '-Xany -C -p' when running on
    AIX) and lazily yields the name from every output line whose symbol type is
    one of B, D, G, R, S, T, V or W. Raises OSError if nm cannot be started.
    """
    command = ['nm','-P']
    if sys.platform.startswith('aix'):
        command += ['-Xany','-C','-p']
    command.append(lib)
    # Look for external symbols that are defined in some section
    # The POSIX format is:
    #   name   type   value   size
    # The -P flag displays the size field for symbols only when applicable,
    # so the last field is optional. There's no space after the value field,
    # but \s+ match newline also, so \s+\S* will match the optional size field.
    # The pattern is a raw string so that the regex escapes aren't interpreted
    # as string escapes, and is compiled once outside the per-line loop.
    pattern = re.compile(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S*$")
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    try:
        for line in process.stdout:
            match = pattern.match(line)
            if match:
                yield match.group(1)
    finally:
        # This also runs when the consumer abandons the generator early, so the
        # pipe is always released and the child process is always reaped.
        process.stdout.close()
        process.wait()
65
def readobj_get_symbols(lib):
    """Yield the defined external symbols of lib as reported by llvm-readobj.

    Runs 'llvm-readobj --symbols' on lib and lazily yields the name of every
    symbol whose storage class is External and whose section is neither
    IMAGE_SYM_ABSOLUTE nor IMAGE_SYM_UNDEFINED. Raises OSError if llvm-readobj
    cannot be started.
    """
    # Raw strings so that \S is a regex escape rather than a string escape;
    # compiled once outside the per-line loop.
    name_re = re.compile(r'Name: (\S+)')
    section_re = re.compile(r'Section: (\S+)')
    storageclass_re = re.compile(r'StorageClass: (\S+)')
    process = subprocess.Popen(['llvm-readobj','--symbols',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    # Start with nothing recorded, so that a StorageClass line seen before any
    # Name/Section line is skipped instead of raising UnboundLocalError.
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to see
            # Name, Section, then StorageClass, so record Name and Section when
            # we see them and decide if this is a defined external symbol when
            # we see StorageClass.
            match = name_re.search(line)
            if match:
                name = match.group(1)
            match = section_re.search(line)
            if match:
                section = match.group(1)
            match = storageclass_re.search(line)
            if match:
                storageclass = match.group(1)
                if name is not None and section is not None and \
                   section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        # This also runs when the consumer abandons the generator early, so the
        # pipe is always released and the child process is always reaped.
        process.stdout.close()
        process.wait()
90
91# Define functions which determine if the target is 32-bit Windows (as that's
92# where calling convention name decoration happens).
93
def dumpbin_is_32bit_windows(lib):
    """Return True if dumpbin reports the machine type of lib as x86.

    Returns False if the machine type is anything else, or if no 'machine' line
    is found in the output. Raises OSError if dumpbin cannot be started.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin','/headers',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    retval = False
    try:
        for line in process.stdout:
            # Raw string so that \( and \S are regex escapes, not string escapes
            match = re.match(r'.+machine \((\S+)\)', line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        # Always release the pipe and reap the child, even if reading fails
        process.stdout.close()
        process.wait()
    return retval
110
def objdump_is_32bit_windows(lib):
    """Return True if 'objdump -f' reports the file format of lib as pe-i386.

    Only the first 'file format' line in the output is considered; False is
    returned if there is no such line. Raises OSError if objdump cannot be
    started and subprocess.CalledProcessError if it exits with an error.
    """
    output = subprocess.check_output(['objdump','-f',lib],
                                     universal_newlines=True)
    for line in output.splitlines():
        # Raw string so that \S is a regex escape rather than a string escape
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False
119
def readobj_is_32bit_windows(lib):
    """Return True if llvm-readobj reports the format of lib as COFF-i386.

    Only the first line of 'llvm-readobj --file-header' output that starts with
    'Format: ' is considered; False is returned if there is no such line.
    Raises OSError if llvm-readobj cannot be started and
    subprocess.CalledProcessError if it exits with an error.
    """
    output = subprocess.check_output(['llvm-readobj','--file-header',lib],
                                     universal_newlines=True)
    for line in output.splitlines():
        # Raw string so that \S is a regex escape rather than a string escape
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False
128
129# On AIX, there isn't an easy way to detect 32-bit windows objects with the system toolchain,
130# so just assume false.
def aix_is_32bit_windows(lib):
    """Always report that lib is not a 32-bit Windows object.

    lib is accepted only so that the signature matches the other
    *_is_32bit_windows functions; it is never inspected.
    """
    return False
133
134# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
135# identifier/type mangling we can decide which symbols could possibly be
136# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol using the Microsoft mangling should be exported.

    symbol is the name as read from the symbol table, and
    calling_convention_decoration says whether unmangled names carry calling
    convention decoration that needs to be removed. Returns the name to export
    (the symbol itself, or the undecorated name), or None if the symbol should
    be discarded.

    All patterns are raw strings so that the regex escapes (\\?, \\$, \\w)
    aren't interpreted as string escapes.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations of
    # clang::Type::getAs, as some of them are explicit specializations that are
    # defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed that
    # the definition is public
    if re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    if symbol.startswith(('??_G', '??_E')):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    if symbol.startswith(('??0?$', '??1?$')):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None
189
190# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
191# demangle the identifier mangling to identify symbols that can be safely
192# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol using the Itanium mangling should be exported.

    symbol is the name as read from the symbol table, and
    calling_convention_decoration says whether a leading underscore has to be
    removed from it first. Returns the name to export (with any calling
    convention decoration removed), or None if the symbol should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check guards
    # against a nested name consisting of a lone constructor/destructor name,
    # where there is no enclosing class to look at and names[-2] would raise
    # IndexError.
    if re.match('[CD][123]', names[-1][0]) and len(names) > 1 and names[-2][1]:
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    elif symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    elif names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    elif names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    else:
        return None
231
232# Certain kinds of complex manglings we assume cannot be part of a public
233# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised for a mangling too complex to be part of a public interface."""
236
237# Parse an itanium mangled name from the start of a string and return a
238# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Parse one itanium mangled name from the start of arg.

    Returns a (name, rest of string) pair. A normal name keeps its length
    prefix (e.g. '4llvm'). If nothing can be parsed then (None, arg) is
    returned.
    """
    # Check for a normal name, i.e. a decimal length followed by that many
    # characters. Raw string so that \d is a regex escape rather than a string
    # escape.
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1)+match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match('([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match('([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
258
259# Parse an itanium mangled template argument list from the start of a string
260# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip the itanium mangled template argument list at the start of arg.

    arg must start with 'I'. Returns the rest of the string after the E that
    ends the template argument list, or None if the string runs out before
    that E is found. Raises TooComplexName if the argument list contains an
    expression.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names. Raw string so that \d is a regex escape rather than
        # a string escape.
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match('S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of input without finding the end of the template
    return None
292
293# Parse an itanium mangled nested name and transform it into a list of pairs of
294# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse the itanium mangled nested name at the start of arg.

    arg must start with 'N'. Returns (components, rest of string), where
    components is a list of (name, is_template) pairs in the order they appear
    in the mangling. Returns (None, None) if the name can't be demangled.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg

    # Drop the leading N, together with a substitution if one follows it
    substitution = re.match('NS[A-Z0-9]*_(.+)', arg)
    remaining = substitution.group(1) if substitution else arg[1:]

    # Drop any CV-qualifiers and ref qualifiers
    qualifiers = re.match('[rVKRO]*(.+)', remaining)
    if qualifiers:
        remaining = qualifiers.group(1)

    # Consume one name at a time until the E that ends the nested name
    components = []
    while remaining:
        if remaining.startswith('E'):
            return components, remaining[1:]
        name_part, remaining = parse_itanium_name(remaining)
        if not name_part:
            # We don't know how to demangle this
            return None, None
        # A name directly followed by I is a template: note that and skip over
        # its template arguments
        is_template = remaining.startswith('I')
        if is_template:
            remaining = skip_itanium_template(remaining)
        components.append((name_part, is_template))

    # The string ran out before the nested name ended, so something went wrong
    return None, None
334
def extract_symbols(arg):
    """Count the symbols of one library that should be kept.

    arg is a single (get_symbols, should_keep_symbol,
    calling_convention_decoration, lib) tuple. Every symbol that
    get_symbols(lib) produces is passed through should_keep_symbol, and the
    result is a dict mapping each kept name to the number of times it was seen.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    counts = {}
    for raw_symbol in get_symbols(lib):
        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
        if not kept:
            # Discarded symbol
            continue
        counts[kept] = counts.get(kept, 0) + 1
    return counts
343
if __name__ == '__main__':
    # Command-line driver: pick the tools to use, extract the symbols of every
    # input library in parallel, then print the symbols that should be exported.
    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium','microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    get_symbols = None
    is_32bit_windows = aix_is_32bit_windows if sys.platform.startswith('aix') else None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
            # Keep going until we have a tool to use for both get_symbols and
            # is_32bit_windows
            if not get_symbols:
                get_symbols = tools[exe][0]
            if not is_32bit_windows:
                is_32bit_windows = tools[exe][1]
            if get_symbols and is_32bit_windows:
                break
        except OSError:
            continue
    # sys.exit is used rather than the exit() builtin, as the latter is only
    # provided by the site module and isn't intended for use in programs.
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    suffixes = ('.lib','.a','.obj','.o')
    libs = list()
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib+s):
                    lib = lib+s
                    break
                if os.path.exists('lib'+lib+s):
                    lib = 'lib'+lib+s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k,v in this_lib_symbols.items():
            symbols[k] = v + symbols.setdefault(k,0)

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    # Symbols that aren't member functions of templates all map to "", whose
    # count stays at zero so that they're never filtered out below.
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            # (Raw string so that the regex escapes aren't string escapes.)
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match('_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # A name with a single component has no second-to-last
                    # name, so check the length to avoid an IndexError.
                    if names and len(names) > 1 and names[-2][1]:
                        name = ''.join([x for x,_ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name,0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o,'w')
    else:
        outfile = sys.stdout
    try:
        for k,v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Make sure the output is flushed and closed, but only close what we
        # opened ourselves: sys.stdout is left alone.
        if outfile is not sys.stdout:
            outfile.close()
520