1#!/usr/bin/env python 2 3"""A tool for extracting a list of symbols to export 4 5When exporting symbols from a dll or exe we either need to mark the symbols in 6the source code as __declspec(dllexport) or supply a list of symbols to the 7linker. This program automates the latter by inspecting the symbol tables of a 8list of link inputs and deciding which of those symbols need to be exported. 9 10We can't just export all the defined symbols, as there's a limit of 65535 11exported symbols and in clang we go way over that, particularly in a debug 12build. Therefore a large part of the work is pruning symbols either which can't 13be imported, or which we think are things that have definitions in public header 14files (i.e. template instantiations) and we would get defined in the thing 15importing these symbols anyway. 16""" 17 18from __future__ import print_function 19import sys 20import re 21import os 22import subprocess 23import multiprocessing 24import argparse 25 26# Define functions which extract a list of symbols from a library using several 27# different tools. We use subprocess.Popen and yield a symbol at a time instead 28# of using subprocess.check_output and returning a list as, especially on 29# Windows, waiting for the entire output to be ready can take a significant 30# amount of time. 
# Every tool is run the same way: text-mode, line-buffered output on a pipe,
# with stdin closed straight away so the tool can never block waiting for
# input.
def _open_tool(args):
    """Start the command ``args`` and return its ``subprocess.Popen`` object.

    The caller reads ``stdout`` and is responsible for closing it and waiting
    for the process (see ``_finish_tool``).
    """
    process = subprocess.Popen(args, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    return process


def _finish_tool(process):
    """Close the output pipe of ``process`` and wait for it to exit."""
    process.stdout.close()
    process.wait()


# The patterns are raw strings (so that sequences such as \s reach the regex
# engine without relying on Python passing unknown string escapes through) and
# are compiled once, as they are applied to every line of the tools' output.
_DUMPBIN_SYMBOL_RE = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
_NM_SYMBOL_RE = re.compile(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S*$")
_READOBJ_NAME_RE = re.compile(r'Name: (\S+)')
_READOBJ_SECTION_RE = re.compile(r'Section: (\S+)')
_READOBJ_STORAGECLASS_RE = re.compile(r'StorageClass: (\S+)')
_DUMPBIN_MACHINE_RE = re.compile(r'.+machine \((\S+)\)')
_OBJDUMP_FORMAT_RE = re.compile(r'.+file format (\S+)')
_READOBJ_FORMAT_RE = re.compile(r'Format: (\S+)')


def dumpbin_get_symbols(lib):
    """Yield the defined external symbols of ``lib`` using ``dumpbin``.

    The tool is only started when the first symbol is requested. Its output
    pipe is closed and the process waited for when iteration finishes, even
    if the consumer stops early.
    """
    process = _open_tool(['dumpbin', '/symbols', lib])
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = _DUMPBIN_SYMBOL_RE.match(line)
            if match:
                yield match.group(1)
    finally:
        _finish_tool(process)


def nm_get_symbols(lib):
    """Yield the defined external symbols of ``lib`` using ``nm -P``.

    On AIX the extra flags ``-Xany -C -p`` are passed to ``nm``. The tool is
    only started when the first symbol is requested, and is cleaned up when
    iteration finishes, even if the consumer stops early.
    """
    if sys.platform.startswith('aix'):
        args = ['nm', '-P', '-Xany', '-C', '-p', lib]
    else:
        args = ['nm', '-P', lib]
    process = _open_tool(args)
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section.
            # The POSIX format is:
            #   name   type   value   size
            # The -P flag displays the size field for symbols only when
            # applicable, so the last field is optional. There's no space
            # after the value field, but \s+ matches the newline too, so
            # \s+\S* will match the optional size field.
            match = _NM_SYMBOL_RE.match(line)
            if match:
                yield match.group(1)
    finally:
        _finish_tool(process)


def readobj_get_symbols(lib):
    """Yield the defined external symbols of ``lib`` using ``llvm-readobj``.

    The tool is only started when the first symbol is requested, and is
    cleaned up when iteration finishes, even if the consumer stops early.
    """
    process = _open_tool(['llvm-readobj', '--symbols', lib])
    # Most recently seen Name and Section values. They start out as None so
    # that a StorageClass line seen before either of them is skipped instead
    # of failing on an unassigned variable.
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to see
            # Name, Section, then StorageClass, so record Name and Section
            # when we see them and decide if this is a defined external symbol
            # when we see StorageClass.
            match = _READOBJ_NAME_RE.search(line)
            if match:
                name = match.group(1)
            match = _READOBJ_SECTION_RE.search(line)
            if match:
                section = match.group(1)
            match = _READOBJ_STORAGECLASS_RE.search(line)
            if match:
                storageclass = match.group(1)
                if name is not None and section is not None and \
                   section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        _finish_tool(process)


# Define functions which determine if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).

def dumpbin_is_32bit_windows(lib):
    """Return True if ``dumpbin /headers`` reports the machine of ``lib`` as
    x86, and False otherwise (including when no machine line is found)."""
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = _open_tool(['dumpbin', '/headers', lib])
    retval = False
    try:
        for line in process.stdout:
            match = _DUMPBIN_MACHINE_RE.match(line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        _finish_tool(process)
    return retval


def objdump_is_32bit_windows(lib):
    """Return True if ``objdump -f`` reports the first file format of ``lib``
    as pe-i386, and False otherwise."""
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    for line in output.splitlines():
        match = _OBJDUMP_FORMAT_RE.match(line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False


def readobj_is_32bit_windows(lib):
    """Return True if ``llvm-readobj --file-header`` reports the format of
    ``lib`` as COFF-i386, and False otherwise."""
    output = subprocess.check_output(['llvm-readobj', '--file-header', lib],
                                     universal_newlines=True)
    for line in output.splitlines():
        match = _READOBJ_FORMAT_RE.match(line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False


# On AIX, there isn't an easy way to detect 32-bit windows objects with the
# system toolchain, so just assume false.
def aix_is_32bit_windows(lib):
    """Return False unconditionally; ``lib`` is not inspected."""
    return False


# MSVC mangles names to ?<identifier_mangling>@<type_mangling>.
# By examining the identifier/type mangling we can decide which symbols could
# possibly be required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol from a Microsoft-mangled object is exported.

    Returns the name to export, or None if the symbol should be discarded.
    For unmangled names the returned name has the calling convention
    decoration removed when ``calling_convention_decoration`` is true.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations
    # of clang::Type::getAs, as some of them are explicit specializations that
    # are defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed
    # that the definition is public
    elif re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    elif symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    elif symbol.startswith(('??_G', '??_E')):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to
    # be defined in headers and not required to be kept
    elif symbol.startswith(('??0?$', '??1?$')):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    elif re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is
    # a bit of a mess and imprecise, but that avoids having to completely
    # demangle the symbol name. The outermost namespace is at the end of the
    # identifier mangling, and the identifier mangling is followed by the type
    # mangling, so we look for (llvm|clang)@@ followed by something that looks
    # like a function type mangling. To spot a function type we use (this is
    # derived from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    elif re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$',
                   symbol):
        return symbol
    return None


# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol from an Itanium-mangled object is exported.

    Returns the name to export (with one leading underscore removed when
    ``calling_convention_decoration`` is true), or None if the symbol should
    be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check keeps
    # names[-2] from raising IndexError when the constructor/destructor is the
    # only parsed component (e.g. when the class is a substitution).
    if re.match(r'[CD][123]', names[-1][0]) and \
       len(names) > 1 and names[-2][1]:
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    elif symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    elif names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    elif names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    else:
        return None


# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised for a mangled name that is too complex to be a public symbol."""


# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Parse one itanium mangled name from the start of ``arg``.

    Returns a (name, rest of string) pair. A length-prefixed name keeps its
    length digits in the returned name (e.g. '4llvm'). If nothing can be
    parsed the result is (None, arg).
    """
    # Check for a normal name
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r'([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r'([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg


# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip the template argument list at the start of ``arg``.

    ``arg`` must start with 'I'. Returns the text following the 'E' that
    closes the list, or None if the string runs out (or a nested name inside
    it can't be parsed) before the list is closed. Raises TooComplexName if
    the list contains an expression (an argument starting with 'L' or 'X').
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match(r'S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    return None


# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse the itanium mangled nested name at the start of ``arg``.

    ``arg`` must start with 'N'. Returns (names, rest of string), where
    ``names`` is a list of (name, is_template) pairs ordered from the
    outermost scope to the innermost. Returns (None, None) if the name can't
    be parsed. TooComplexName may propagate from skipping template arguments.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r'NS[A-Z0-9]*_(.+)', arg)
    if match:
        tmp = match.group(1)
    else:
        tmp = arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r'[rVKRO]*(.+)', tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith('E'):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None


def extract_symbols(arg):
    """Return a dict mapping each kept symbol of one library to the number of
    times it was seen.

    ``arg`` is a (get_symbols, should_keep_symbol,
    calling_convention_decoration, lib) tuple; it is a single argument so
    that this function can be used with a multiprocessing pool's map.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    symbols = dict()
    for symbol in get_symbols(lib):
        symbol = should_keep_symbol(symbol, calling_convention_decoration)
        if symbol:
            symbols[symbol] = symbols.get(symbol, 0) + 1
    return symbols


# Filename suffixes that identify a link input we know how to read.
_LIB_SUFFIXES = ('.lib', '.a', '.obj', '.o')


def _find_tools(tool_exes, tools, is_32bit_windows):
    """Pick the first available tools; return (get_symbols, is_32bit_windows),
    either of which is None if no usable tool provides it."""
    get_symbols = None
    for exe in tool_exes:
        # Find a tool to use by trying each in turn until we find one that
        # exists (subprocess.Popen will throw OSError when the program does
        # not exist)
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
        except OSError:
            continue
        # Keep going until we have a tool to use for both get_symbols and
        # is_32bit_windows
        if not get_symbols:
            get_symbols = tools[exe][0]
        if not is_32bit_windows:
            is_32bit_windows = tools[exe][1]
        if get_symbols and is_32bit_windows:
            break
    return get_symbols, is_32bit_windows


def _resolve_lib(lib):
    """Return the filename to read for the argument ``lib``, or None if it
    has no known suffix and no matching file exists."""
    if lib.endswith(_LIB_SUFFIXES):
        return lib
    # When invoked by cmake the arguments are the cmake target names of the
    # libraries, so we need to add .lib/.a to the end and maybe lib to the
    # start to get the filename. Also allow objects.
    for s in _LIB_SUFFIXES:
        if os.path.exists(lib + s):
            return lib + s
        if os.path.exists('lib' + lib + s):
            return 'lib' + lib + s
    return None


def _template_member_key(symbol, mangling):
    """Return the function+class key of ``symbol`` if it is a member function
    of a template class, otherwise None."""
    if mangling == 'microsoft':
        # Member functions of templates start with
        # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
        # As manglings go from the innermost scope to the outermost scope
        # this means:
        #  * When we have a function member of a subclass of a template
        #    class then <fn_name> will actually contain the mangling of
        #    both the subclass and the function member. This is fine.
        #  * When we have a function member of a template subclass of a
        #    (possibly template) class then it's the innermost template
        #    subclass that becomes <class_name>. This should be OK so long
        #    as we don't have multiple classes with a template subclass of
        #    the same name.
        match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", symbol)
        if match:
            return match.group(1)
        return None
    # Find member functions of templates by demangling the name and checking
    # if the second-to-last name in the list is a template.
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        # Manglings that are too complex should already have been filtered
        # out, but if we happen to somehow see one here just leave it as-is.
        return None
    # The length check keeps names[-2] from raising IndexError when only a
    # single name was parsed.
    if names and len(names) > 1 and names[-2][1]:
        return ''.join([x for x, _ in names])
    return None


def main():
    """Parse the command line, extract the symbols of every input and print
    the ones that should be exported."""
    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium','microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    if sys.platform.startswith('aix'):
        is_32bit_windows = aix_is_32bit_windows
    else:
        is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    get_symbols, is_32bit_windows = _find_tools(tool_exes, tools,
                                                is_32bit_windows)
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    for lib in args.libs:
        filename = _resolve_lib(lib)
        if filename is None:
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(filename)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol,
                 calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = symbols.get(k, 0) + v

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    # Symbols that aren't template member functions all map to "", whose count
    # stays at zero.
    template_function_count[""] = 0
    for k in symbols:
        name = _template_member_key(k, args.mangling)
        if name:
            old_count = template_function_count.setdefault(name, 0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o, 'w')
    else:
        outfile = sys.stdout
    try:
        for k, v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Make sure a file we opened is flushed and closed; leave stdout alone.
        if outfile is not sys.stdout:
            outfile.close()


if __name__ == '__main__':
    main()