Chromium Code Reviews| Index: tools/cygprofile/symbolize.py |
| diff --git a/tools/cygprofile/symbolize.py b/tools/cygprofile/symbolize.py |
| index 4f7dfe7a63d3fe30f50b39b11273456724902041..22ddfdd19dfd5562a311da97c734c98ea2ab78c9 100755 |
| --- a/tools/cygprofile/symbolize.py |
| +++ b/tools/cygprofile/symbolize.py |
| @@ -5,20 +5,21 @@ |
| """Symbolize log file produced by cypgofile instrumentation. |
|
pasko
2015/01/30 13:34:48
nit: cypgofile instrumentation (mergetraces.py)
Benoit L
2015/01/30 16:22:38
Done.
|
| -Given a log file and the binary being profiled (e.g. executable, shared |
| -library), the script can produce three different outputs: 1) symbols for the |
| -addresses, 2) function and line numbers for the addresses, or 3) an order file. |
| +Given a log file and the binary being profiled, create an order file. |
|
pasko
2015/01/30 13:34:48
it's a misleading name for the file, since symboli
Benoit L
2015/01/30 16:22:38
Done.
|
| """ |
| +import logging |
| +import multiprocessing |
| import optparse |
| import os |
| import string |
| -import subprocess |
| import sys |
| +import symbol_extractor |
| -def ParseLogLines(log_file_lines): |
| - """Parse a log file produced by the profiled run of clank. |
| + |
| +def _ParseLogLines(log_file_lines): |
| + """Parse a log file produced by the profiled run of chrome. |
|
pasko
2015/01/30 13:34:48
nit: maybe enhance the comment with:
(i.e. "merge
Benoit L
2015/01/30 16:22:38
Done.
|
| Args: |
| log_file_lines: array of lines in log file produced by profiled run |
| @@ -41,11 +42,10 @@ def ParseLogLines(log_file_lines): |
| call_lines = [] |
| vm_start = 0 |
| line = log_file_lines[0] |
| - assert("r-xp" in line) |
| + assert "r-xp" in line |
|
pasko
2015/01/30 13:34:48
s/"/'/
Benoit L
2015/01/30 16:22:38
Done.
|
| end_index = line.find('-') |
| vm_start = int(line[:end_index], 16) |
| for line in log_file_lines[2:]: |
| - # print hex(vm_start) |
| fields = line.split() |
| if len(fields) == 4: |
| call_lines.append(fields) |
| @@ -62,122 +62,66 @@ def ParseLogLines(log_file_lines): |
| return call_info |
| -def GetStdOutputLines(cmd): |
| - p = subprocess.Popen(cmd, stdout=subprocess.PIPE) |
| - output = p.communicate()[0] |
| - return output.split('\n') |
| - |
| -def ParseLibSymbols(lib_file): |
| - """Get output from running nm and greping for text symbols. |
| - |
| - Args: |
| - lib_file: the library or executable that contains the profiled code |
| - Returns: |
| - list of sorted unique addresses and corresponding size of function symbols |
| - in lib_file and map of addresses to all symbols at a particular address |
| - """ |
| - cmd = ['nm', '-S', '-n', lib_file] |
| - nm_lines = GetStdOutputLines(cmd) |
| - |
| - nm_symbols = [] |
| - for nm_line in nm_lines: |
| - if any(str in nm_line for str in (' t ', ' W ', ' T ')): |
| - nm_symbols.append(nm_line) |
| - |
| - nm_index = 0 |
| - unique_addrs = [] |
| - address_map = {} |
| - while nm_index < len(nm_symbols): |
| - |
| - # If the length of the split line is not 4, then it does not contain all the |
| - # information needed to symbolize (i.e. address, size and symbol name). |
| - if len(nm_symbols[nm_index].split()) == 4: |
| - (addr, size) = [int(x, 16) for x in nm_symbols[nm_index].split()[0:2]] |
| - |
| - # Multiple symbols may be at the same address. This is do to aliasing |
| - # done by the compiler. Since there is no way to be sure which one was |
| - # called in profiled run, we will symbolize to include all symbol names at |
| - # a particular address. |
| - fnames = [] |
| - while (nm_index < len(nm_symbols) and |
| - addr == int(nm_symbols[nm_index].split()[0], 16)): |
| - if len(nm_symbols[nm_index].split()) == 4: |
| - fnames.append(nm_symbols[nm_index].split()[3]) |
| - nm_index += 1 |
| - address_map[addr] = fnames |
| - unique_addrs.append((addr, size)) |
| - else: |
| - nm_index += 1 |
| +def _GroupLibrarySymbolInfosByOffset(lib_filename): |
| + """Return a dict {offset: [SymbolInfo]} from a library.""" |
| + symbol_infos = symbol_extractor.SymbolInfosFromBinary(lib_filename) |
| + return symbol_extractor.GroupSymbolInfosByOffset(symbol_infos) |
| - return (unique_addrs, address_map) |
| class SymbolNotFoundException(Exception): |
| - def __init__(self,value): |
| - super(SymbolNotFoundException,self).__init__(value) |
| + def __init__(self, value): |
| + super(SymbolNotFoundException, self).__init__(value) |
| self.value = value |
| + |
| def __str__(self): |
| return repr(self.value) |
| -def BinarySearchAddresses(addr, start, end, arr): |
| - """Find starting address of a symbol at a particular address. |
| - The reason we can not directly use the address provided by the log file is |
| - that the log file may give an address after the start of the symbol. The |
| - logged address is often one byte after the start. By using this search |
| - function rather than just subtracting one from the logged address allows |
| - the logging instrumentation to log any address in a function. |
| +def _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset): |
| + """Find all SymbolInfo at a given offset. |
| Args: |
| - addr: the address being searched for |
| - start: the starting index for the binary search |
| - end: the ending index for the binary search |
| - arr: the list being searched containing tuple of address and size |
| + offset_to_symbol_infos: {offset: [SymbolInfo]} |
| + offset: offset to look the symbols at |
| Returns: |
| - the starting address of the symbol at address addr |
| + The list of SymbolInfo at the given offset |
| Raises: |
| - Exception: if address not found. Functions expects all logged addresses |
| - to be found |
| + SymbolNotFoundException if the offset doesn't match any symbol. |
| """ |
| - # print "addr: " + str(addr) + " start: " + str(start) + " end: " + str(end) |
| - if start >= end or start == end - 1: |
| - # arr[i] is a tuple of address and size. Check if addr inside range |
| - if addr >= arr[start][0] and addr < arr[start][0] + arr[start][1]: |
| - return arr[start][0] |
| - elif addr >= arr[end][0] and addr < arr[end][0] + arr[end][1]: |
| - return arr[end][0] |
| - else: |
| - raise SymbolNotFoundException(addr) |
| + if offset in offset_to_symbol_infos: |
| + return offset_to_symbol_infos[offset] |
| + elif offset % 2 and (offset - 1) in offset_to_symbol_infos: |
| + # On ARM, odd addresses are used to signal thumb instruction. They are |
| + # generated by setting the LSB to 1 (see |
| + # http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0471e/Babfjhia.html). |
| + # TODO(lizeb): Make sure this hack doesn't propagate to other archs. |
| + return offset_to_symbol_infos[offset - 1] |
| else: |
| - halfway = (start + end) / 2 |
| - (nm_addr, size) = arr[halfway] |
| - # print "nm_addr: " + str(nm_addr) + " halfway: " + str(halfway) |
| - if addr >= nm_addr and addr < nm_addr + size: |
| - return nm_addr |
| - elif addr < nm_addr: |
| - return BinarySearchAddresses(addr, start, halfway-1, arr) |
| - else: |
| - # Condition (addr >= nm_addr + size) must be true. |
| - return BinarySearchAddresses(addr, halfway+1, end, arr) |
| + raise SymbolNotFoundException(offset) |
|
pasko
2015/01/30 13:34:48
If we raise an exception like this, it will be ign
Benoit L
2015/01/30 16:22:38
Done.
|
| -def FindFunctions(addr, unique_addrs, address_map): |
| - """Find function symbol names at address addr.""" |
| - return address_map[BinarySearchAddresses(addr, 0, len(unique_addrs) - 1, |
| - unique_addrs)] |
| +class WarningCollector(object): |
| + """Collect warnings, but limit the number printed to a set value.""" |
| + def __init__(self, max_warnings): |
| + self._warnings = 0 |
| + self._max_warnings = max_warnings |
| + |
| + def Write(self, message): |
| + if self._warnings < self._max_warnings: |
| + logging.warning(message) |
| + self._warnings += 1 |
| + def WriteEnd(self, message): |
| + if self._warnings > self._max_warnings: |
| + logging.warning('%d more warnings for %s' % ( |
| + self._warnings - self._max_warnings, message)) |
| -def AddrToLine(addr, lib_file): |
| - """Use addr2line to determine line info of a particular address.""" |
| - cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)] |
| - output = GetStdOutputLines(cmd) |
| - assert(len(output) == 2) |
| - return ':'.join(output) |
| -def GetObjectFileNames(obj_dir): |
| - """ Gets the list of object files in the output directory. """ |
| +def _GetObjectFileNames(obj_dir): |
| + """Return the list of object files in a directory.""" |
| obj_files = [] |
| for (dirpath, _, filenames) in os.walk(obj_dir): |
| for file_name in filenames: |
| @@ -185,134 +129,118 @@ def GetObjectFileNames(obj_dir): |
| obj_files.append(os.path.join(dirpath, file_name)) |
| return obj_files |
| -class WarningCollector(object): |
| - def __init__(self, max_warnings): |
| - self._warnings = 0 |
| - self._max_warnings = max_warnings |
| - def Write(self, message): |
| - if self._warnings < self._max_warnings: |
| - sys.stderr.write(message + '\n') |
| - self._warnings += 1 |
| +def _AllSymbolInfos(object_filenames): |
| + """Return a list of SymbolInfo from an iterable of filenames.""" |
| + pool = multiprocessing.Pool() |
| + symbol_infos_nested = pool.map( |
| + symbol_extractor.SymbolInfosFromBinary, object_filenames) |
| + result = [] |
| + for symbol_infos in symbol_infos_nested: |
| + result += symbol_infos |
| + return result |
| - def WriteEnd(self, message): |
| - if self._warnings > self._max_warnings: |
| - sys.stderr.write(str(self._warnings - self._max_warnings) + |
| - ' more warnings for: ' + message + '\n') |
| -def SymbolToSection(obj_dir): |
| - """ Gets a mapping from symbol to linker section name by scanning all |
| - of the object files. """ |
| - object_files = GetObjectFileNames(obj_dir) |
| +def _GetSymbolToSectionMapFromObjectFiles(obj_dir): |
| + """ Create a mapping from symbol to linker section name by scanning all |
| + the object files. |
| + """ |
| + object_files = _GetObjectFileNames(obj_dir) |
| symbol_to_section_map = {} |
| symbol_warnings = WarningCollector(300) |
| - for obj_file in object_files: |
| - cmd = ['objdump', '-w', '-t', obj_file] |
| - symbol_lines = GetStdOutputLines(cmd) |
| - for symbol_line in symbol_lines: |
| - items = symbol_line.split() |
| - # All of the symbol lines we care about are in the form |
| - # 0000000000 g F .text.foo 000000000 [.hidden] foo |
| - # where g (global) might also be l (local) or w (weak). |
| - if len(items) > 4 and items[2] == 'F': |
| - # This symbol is a function |
| - symbol = items[len(items) - 1] |
| - if symbol.startswith('.LTHUNK'): |
| - continue |
| - section = items[3] |
| - if ((symbol in symbol_to_section_map) and |
| - (symbol_to_section_map[symbol] != section)): |
| - symbol_warnings.Write('WARNING: Symbol ' + symbol + |
| - ' in conflicting sections ' + section + |
| - ' and ' + symbol_to_section_map[symbol]) |
| - elif not section.startswith('.text.'): |
| - symbol_warnings.Write('WARNING: Symbol ' + symbol + |
| - ' in incorrect section ' + section) |
| - else: |
| - symbol_to_section_map[symbol] = section |
| + symbol_infos = _AllSymbolInfos(object_files) |
| + symbol_infos = [s for s in symbol_infos if not s.name.startswith('.LTHUNK')] |
| + for symbol_info in symbol_infos: |
| + symbol = symbol_info.name |
| + section = symbol_info.section |
| + if ((symbol in symbol_to_section_map) and |
| + (symbol_to_section_map[symbol] != symbol_info.section)): |
| + symbol_warnings.Write('Symbol ' + symbol + |
| + ' in conflicting sections ' + section + |
| + ' and ' + symbol_to_section_map[symbol]) |
| + elif not section.startswith('.text'): |
| + symbol_warnings.Write('Symbol ' + symbol + |
| + ' in incorrect section ' + section) |
| + else: |
| + symbol_to_section_map[symbol] = section |
| symbol_warnings.WriteEnd('bad sections') |
| return symbol_to_section_map |
| -def main(): |
| - """Write output for profiled run to standard out. |
| - The format of the output depends on the output type specified as the third |
| - command line argument. The default output type is to symbolize the addresses |
| - of the functions called. |
| +def _WarnAboutDuplicates(offsets): |
| + """Warn about duplicate offsets. |
| + |
| + Args: |
| + offsets: list of offsets to check for duplicates |
| + |
| + Returns: |
| + True if there are no duplicates, False otherwise. |
| """ |
| + seen_offsets = set() |
| + ok = True |
| + for offset in offsets: |
| + if offset not in seen_offsets: |
| + seen_offsets.add(offset) |
| + else: |
| + ok = False |
| + logging.warning('Duplicate offset: ' + hex(offset)) |
| + return ok |
| + |
| + |
| +def _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map, |
| + output_file): |
| + """Output the order file to output_file. |
| + |
| + Args: |
| + offsets: Iterable of offsets to match to section names |
| + offset_to_symbol_infos: {offset: [SymbolInfo]} |
| + symbol_to_section_map: {name: section} |
| + output_file: file-like object to write the results to |
| + """ |
| + unknown_symbol_warnings = WarningCollector(300) |
| + symbol_not_found_warnings = WarningCollector(300) |
| + for offset in offsets: |
| + try: |
| + symbol_infos = _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset) |
| + for symbol_info in symbol_infos: |
| + if symbol_info.name in symbol_to_section_map: |
| + output_file.write(symbol_to_section_map[symbol_info.name] + '\n') |
| + else: |
| + unknown_symbol_warnings.Write( |
| + 'No known section for symbol ' + symbol_info.name) |
| + except SymbolNotFoundException: |
| + symbol_not_found_warnings.Write( |
| + 'Did not find function in binary. offset: ' + hex(offset)) |
| + unknown_symbol_warnings.WriteEnd('no known section for symbol') |
| + symbol_not_found_warnings.WriteEnd('did not find function') |
| + |
| + |
| +def main(): |
| parser = optparse.OptionParser('usage: %prog [options] log_file lib_file') |
| - parser.add_option('-t', '--outputType', dest='output_type', |
| - default='symbolize', type='string', |
| + # TODO(lizeb): Remove this unused flag once the client script is updated. |
| + parser.add_option('-t', dest='output_type', choices=['orderfile'], |
| help='lineize or symbolize or orderfile') |
| # Option for output type. The log file and lib file arguments are required |
| # by the script and therefore are not options. |
| - (options, args) = parser.parse_args() |
| + (_, args) = parser.parse_args() |
| if len(args) != 2: |
| parser.error('expected 2 args: log_file lib_file') |
| - |
| (log_file, lib_file) = args |
| - output_type = options.output_type |
| obj_dir = os.path.abspath(os.path.join(os.path.dirname(lib_file), '../obj')) |
| log_file_lines = map(string.rstrip, open(log_file).readlines()) |
| - call_info = ParseLogLines(log_file_lines) |
| - (unique_addrs, address_map) = ParseLibSymbols(lib_file) |
| - |
| - # Check for duplicate addresses in the log file, and print a warning if |
| - # duplicates are found. The instrumentation that produces the log file |
| - # should only print the first time a function is entered. |
| - addr_list = [] |
| - for call in call_info: |
| - addr = call[3] |
| - if addr not in addr_list: |
| - addr_list.append(addr) |
| - else: |
| - print('WARNING: Address ' + hex(addr) + ' (line= ' + |
| - AddrToLine(addr, lib_file) + ') already profiled.') |
| + call_info = _ParseLogLines(log_file_lines) |
| + offsets = [call[3] for call in call_info] |
| + offset_to_symbol_infos = _GroupLibrarySymbolInfosByOffset(lib_file) |
| - symbol_to_section_map = SymbolToSection(obj_dir) |
| + _WarnAboutDuplicates(offsets) |
|
pasko
2015/01/30 13:34:48
how many warnings are produced in practice?
Benoit L
2015/01/30 16:22:38
0 +/- 0 (95% CL)
pasko
2015/01/30 19:31:32
CI ?
|
| + |
| + symbol_to_section_map = _GetSymbolToSectionMapFromObjectFiles(obj_dir) |
| + _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map, |
| + sys.stdout) |
| - unknown_symbol_warnings = WarningCollector(300) |
| - symbol_not_found_warnings = WarningCollector(300) |
| - for call in call_info: |
| - addr = call[3] |
| - if output_type == 'lineize': |
| - symbol = AddrToLine(addr, lib_file) |
| - print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t' |
| - + symbol) |
| - elif output_type == 'orderfile': |
| - try: |
| - symbols = FindFunctions(addr, unique_addrs, address_map) |
| - for symbol in symbols: |
| - if symbol in symbol_to_section_map: |
| - print symbol_to_section_map[symbol] |
| - else: |
| - unknown_symbol_warnings.Write( |
| - 'WARNING: No known section for symbol ' + symbol) |
| - print '' |
| - except SymbolNotFoundException: |
| - symbol_not_found_warnings.Write( |
| - 'WARNING: Did not find function in binary. addr: ' |
| - + hex(addr)) |
| - else: |
| - try: |
| - symbols = FindFunctions(addr, unique_addrs, address_map) |
| - print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t' |
| - + symbols[0]) |
| - first_symbol = True |
| - for symbol in symbols: |
| - if not first_symbol: |
| - print '\t\t\t\t\t' + symbol |
| - else: |
| - first_symbol = False |
| - except SymbolNotFoundException: |
| - symbol_not_found_warnings.Write( |
| - 'WARNING: Did not find function in binary. addr: ' |
| - + hex(addr)) |
| - unknown_symbol_warnings.WriteEnd('no known section for symbol') |
| - symbol_not_found_warnings.WriteEnd('did not find function') |
| if __name__ == '__main__': |
| main() |
|
pasko
2015/01/30 13:34:48
shouldn't we also set the logging level above this
Benoit L
2015/01/30 16:22:38
Done.
|