Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(14)

Unified Diff: tools/cygprofile/symbolize.py

Issue 874683004: Refactor the symbolize step of the orderfile generation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Naming and typos. Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | tools/cygprofile/symbolize_unittest.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: tools/cygprofile/symbolize.py
diff --git a/tools/cygprofile/symbolize.py b/tools/cygprofile/symbolize.py
index 4f7dfe7a63d3fe30f50b39b11273456724902041..22ddfdd19dfd5562a311da97c734c98ea2ab78c9 100755
--- a/tools/cygprofile/symbolize.py
+++ b/tools/cygprofile/symbolize.py
@@ -5,20 +5,21 @@
"""Symbolize log file produced by cypgofile instrumentation.
pasko 2015/01/30 13:34:48 nit: cypgofile instrumentation (mergetraces.py)
Benoit L 2015/01/30 16:22:38 Done.
-Given a log file and the binary being profiled (e.g. executable, shared
-library), the script can produce three different outputs: 1) symbols for the
-addresses, 2) function and line numbers for the addresses, or 3) an order file.
+Given a log file and the binary being profiled, create an order file.
pasko 2015/01/30 13:34:48 it's a misleading name for the file, since symboli
Benoit L 2015/01/30 16:22:38 Done.
"""
+import logging
+import multiprocessing
import optparse
import os
import string
-import subprocess
import sys
+import symbol_extractor
-def ParseLogLines(log_file_lines):
- """Parse a log file produced by the profiled run of clank.
+
+def _ParseLogLines(log_file_lines):
+ """Parse a log file produced by the profiled run of chrome.
pasko 2015/01/30 13:34:48 nit: maybe enhance the comment with: (i.e. "merge
Benoit L 2015/01/30 16:22:38 Done.
Args:
log_file_lines: array of lines in log file produced by profiled run
@@ -41,11 +42,10 @@ def ParseLogLines(log_file_lines):
call_lines = []
vm_start = 0
line = log_file_lines[0]
- assert("r-xp" in line)
+ assert "r-xp" in line
pasko 2015/01/30 13:34:48 s/"/'/
Benoit L 2015/01/30 16:22:38 Done.
end_index = line.find('-')
vm_start = int(line[:end_index], 16)
for line in log_file_lines[2:]:
- # print hex(vm_start)
fields = line.split()
if len(fields) == 4:
call_lines.append(fields)
@@ -62,122 +62,66 @@ def ParseLogLines(log_file_lines):
return call_info
-def GetStdOutputLines(cmd):
- p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
- output = p.communicate()[0]
- return output.split('\n')
-
-def ParseLibSymbols(lib_file):
- """Get output from running nm and greping for text symbols.
-
- Args:
- lib_file: the library or executable that contains the profiled code
- Returns:
- list of sorted unique addresses and corresponding size of function symbols
- in lib_file and map of addresses to all symbols at a particular address
- """
- cmd = ['nm', '-S', '-n', lib_file]
- nm_lines = GetStdOutputLines(cmd)
-
- nm_symbols = []
- for nm_line in nm_lines:
- if any(str in nm_line for str in (' t ', ' W ', ' T ')):
- nm_symbols.append(nm_line)
-
- nm_index = 0
- unique_addrs = []
- address_map = {}
- while nm_index < len(nm_symbols):
-
- # If the length of the split line is not 4, then it does not contain all the
- # information needed to symbolize (i.e. address, size and symbol name).
- if len(nm_symbols[nm_index].split()) == 4:
- (addr, size) = [int(x, 16) for x in nm_symbols[nm_index].split()[0:2]]
-
- # Multiple symbols may be at the same address. This is do to aliasing
- # done by the compiler. Since there is no way to be sure which one was
- # called in profiled run, we will symbolize to include all symbol names at
- # a particular address.
- fnames = []
- while (nm_index < len(nm_symbols) and
- addr == int(nm_symbols[nm_index].split()[0], 16)):
- if len(nm_symbols[nm_index].split()) == 4:
- fnames.append(nm_symbols[nm_index].split()[3])
- nm_index += 1
- address_map[addr] = fnames
- unique_addrs.append((addr, size))
- else:
- nm_index += 1
+def _GroupLibrarySymbolInfosByOffset(lib_filename):
+ """Return a dict {offset: [SymbolInfo]} from a library."""
+ symbol_infos = symbol_extractor.SymbolInfosFromBinary(lib_filename)
+ return symbol_extractor.GroupSymbolInfosByOffset(symbol_infos)
- return (unique_addrs, address_map)
class SymbolNotFoundException(Exception):
- def __init__(self,value):
- super(SymbolNotFoundException,self).__init__(value)
+ def __init__(self, value):
+ super(SymbolNotFoundException, self).__init__(value)
self.value = value
+
def __str__(self):
return repr(self.value)
-def BinarySearchAddresses(addr, start, end, arr):
- """Find starting address of a symbol at a particular address.
- The reason we can not directly use the address provided by the log file is
- that the log file may give an address after the start of the symbol. The
- logged address is often one byte after the start. By using this search
- function rather than just subtracting one from the logged address allows
- the logging instrumentation to log any address in a function.
+def _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset):
+ """Find all SymbolInfo at a given offset.
Args:
- addr: the address being searched for
- start: the starting index for the binary search
- end: the ending index for the binary search
- arr: the list being searched containing tuple of address and size
+ offset_to_symbol_infos: {offset: [SymbolInfo]}
+ offset: offset to look the symbols at
Returns:
- the starting address of the symbol at address addr
+ The list of SymbolInfo at the given offset
Raises:
- Exception: if address not found. Functions expects all logged addresses
- to be found
+ SymbolNotFoundException if the offset doesn't match any symbol.
"""
- # print "addr: " + str(addr) + " start: " + str(start) + " end: " + str(end)
- if start >= end or start == end - 1:
- # arr[i] is a tuple of address and size. Check if addr inside range
- if addr >= arr[start][0] and addr < arr[start][0] + arr[start][1]:
- return arr[start][0]
- elif addr >= arr[end][0] and addr < arr[end][0] + arr[end][1]:
- return arr[end][0]
- else:
- raise SymbolNotFoundException(addr)
+ if offset in offset_to_symbol_infos:
+ return offset_to_symbol_infos[offset]
+ elif offset % 2 and (offset - 1) in offset_to_symbol_infos:
+ # On ARM, odd addresses are used to signal thumb instruction. They are
+ # generated by setting the LSB to 1 (see
+ # http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0471e/Babfjhia.html).
+ # TODO(lizeb): Make sure this hack doesn't propagate to other archs.
+ return offset_to_symbol_infos[offset - 1]
else:
- halfway = (start + end) / 2
- (nm_addr, size) = arr[halfway]
- # print "nm_addr: " + str(nm_addr) + " halfway: " + str(halfway)
- if addr >= nm_addr and addr < nm_addr + size:
- return nm_addr
- elif addr < nm_addr:
- return BinarySearchAddresses(addr, start, halfway-1, arr)
- else:
- # Condition (addr >= nm_addr + size) must be true.
- return BinarySearchAddresses(addr, halfway+1, end, arr)
+ raise SymbolNotFoundException(offset)
pasko 2015/01/30 13:34:48 If we raise an exception like this, it will be ign
Benoit L 2015/01/30 16:22:38 Done.
-def FindFunctions(addr, unique_addrs, address_map):
- """Find function symbol names at address addr."""
- return address_map[BinarySearchAddresses(addr, 0, len(unique_addrs) - 1,
- unique_addrs)]
+class WarningCollector(object):
+ """Collect warnings, but limit the number printed to a set value."""
+ def __init__(self, max_warnings):
+ self._warnings = 0
+ self._max_warnings = max_warnings
+
+ def Write(self, message):
+ if self._warnings < self._max_warnings:
+ logging.warning(message)
+ self._warnings += 1
+ def WriteEnd(self, message):
+ if self._warnings > self._max_warnings:
+ logging.warning('%d more warnings for %s' % (
+ self._warnings - self._max_warnings, message))
-def AddrToLine(addr, lib_file):
- """Use addr2line to determine line info of a particular address."""
- cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)]
- output = GetStdOutputLines(cmd)
- assert(len(output) == 2)
- return ':'.join(output)
-def GetObjectFileNames(obj_dir):
- """ Gets the list of object files in the output directory. """
+def _GetObjectFileNames(obj_dir):
+ """Return the list of object files in a directory."""
obj_files = []
for (dirpath, _, filenames) in os.walk(obj_dir):
for file_name in filenames:
@@ -185,134 +129,118 @@ def GetObjectFileNames(obj_dir):
obj_files.append(os.path.join(dirpath, file_name))
return obj_files
-class WarningCollector(object):
- def __init__(self, max_warnings):
- self._warnings = 0
- self._max_warnings = max_warnings
- def Write(self, message):
- if self._warnings < self._max_warnings:
- sys.stderr.write(message + '\n')
- self._warnings += 1
+def _AllSymbolInfos(object_filenames):
+ """Return a list of SymbolInfo from an iterable of filenames."""
+ pool = multiprocessing.Pool()
+ symbol_infos_nested = pool.map(
+ symbol_extractor.SymbolInfosFromBinary, object_filenames)
+ result = []
+ for symbol_infos in symbol_infos_nested:
+ result += symbol_infos
+ return result
- def WriteEnd(self, message):
- if self._warnings > self._max_warnings:
- sys.stderr.write(str(self._warnings - self._max_warnings) +
- ' more warnings for: ' + message + '\n')
-def SymbolToSection(obj_dir):
- """ Gets a mapping from symbol to linker section name by scanning all
- of the object files. """
- object_files = GetObjectFileNames(obj_dir)
+def _GetSymbolToSectionMapFromObjectFiles(obj_dir):
+ """ Create a mapping from symbol to linker section name by scanning all
+ the object files.
+ """
+ object_files = _GetObjectFileNames(obj_dir)
symbol_to_section_map = {}
symbol_warnings = WarningCollector(300)
- for obj_file in object_files:
- cmd = ['objdump', '-w', '-t', obj_file]
- symbol_lines = GetStdOutputLines(cmd)
- for symbol_line in symbol_lines:
- items = symbol_line.split()
- # All of the symbol lines we care about are in the form
- # 0000000000 g F .text.foo 000000000 [.hidden] foo
- # where g (global) might also be l (local) or w (weak).
- if len(items) > 4 and items[2] == 'F':
- # This symbol is a function
- symbol = items[len(items) - 1]
- if symbol.startswith('.LTHUNK'):
- continue
- section = items[3]
- if ((symbol in symbol_to_section_map) and
- (symbol_to_section_map[symbol] != section)):
- symbol_warnings.Write('WARNING: Symbol ' + symbol +
- ' in conflicting sections ' + section +
- ' and ' + symbol_to_section_map[symbol])
- elif not section.startswith('.text.'):
- symbol_warnings.Write('WARNING: Symbol ' + symbol +
- ' in incorrect section ' + section)
- else:
- symbol_to_section_map[symbol] = section
+ symbol_infos = _AllSymbolInfos(object_files)
+ symbol_infos = [s for s in symbol_infos if not s.name.startswith('.LTHUNK')]
+ for symbol_info in symbol_infos:
+ symbol = symbol_info.name
+ section = symbol_info.section
+ if ((symbol in symbol_to_section_map) and
+ (symbol_to_section_map[symbol] != symbol_info.section)):
+ symbol_warnings.Write('Symbol ' + symbol +
+ ' in conflicting sections ' + section +
+ ' and ' + symbol_to_section_map[symbol])
+ elif not section.startswith('.text'):
+ symbol_warnings.Write('Symbol ' + symbol +
+ ' in incorrect section ' + section)
+ else:
+ symbol_to_section_map[symbol] = section
symbol_warnings.WriteEnd('bad sections')
return symbol_to_section_map
-def main():
- """Write output for profiled run to standard out.
- The format of the output depends on the output type specified as the third
- command line argument. The default output type is to symbolize the addresses
- of the functions called.
+def _WarnAboutDuplicates(offsets):
+ """Warn about duplicate offsets.
+
+ Args:
+ offsets: list of offsets to check for duplicates
+
+ Returns:
+ True if there are no duplicates, False otherwise.
"""
+ seen_offsets = set()
+ ok = True
+ for offset in offsets:
+ if offset not in seen_offsets:
+ seen_offsets.add(offset)
+ else:
+ ok = False
+ logging.warning('Duplicate offset: ' + hex(offset))
+ return ok
+
+
+def _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map,
+ output_file):
+ """Output the order file to output_file.
+
+ Args:
+ offsets: Iterable of offsets to match to section names
+ offset_to_symbol_infos: {offset: [SymbolInfo]}
+ symbol_to_section_map: {name: section}
+ output_file: file-like object to write the results to
+ """
+ unknown_symbol_warnings = WarningCollector(300)
+ symbol_not_found_warnings = WarningCollector(300)
+ for offset in offsets:
+ try:
+ symbol_infos = _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset)
+ for symbol_info in symbol_infos:
+ if symbol_info.name in symbol_to_section_map:
+ output_file.write(symbol_to_section_map[symbol_info.name] + '\n')
+ else:
+ unknown_symbol_warnings.Write(
+ 'No known section for symbol ' + symbol_info.name)
+ except SymbolNotFoundException:
+ symbol_not_found_warnings.Write(
+ 'Did not find function in binary. offset: ' + hex(offset))
+ unknown_symbol_warnings.WriteEnd('no known section for symbol')
+ symbol_not_found_warnings.WriteEnd('did not find function')
+
+
+def main():
parser = optparse.OptionParser('usage: %prog [options] log_file lib_file')
- parser.add_option('-t', '--outputType', dest='output_type',
- default='symbolize', type='string',
+ # TODO(lizeb): Remove this unused flag once the client script is updated.
+ parser.add_option('-t', dest='output_type', choices=['orderfile'],
help='lineize or symbolize or orderfile')
# Option for output type. The log file and lib file arguments are required
# by the script and therefore are not options.
- (options, args) = parser.parse_args()
+ (_, args) = parser.parse_args()
if len(args) != 2:
parser.error('expected 2 args: log_file lib_file')
-
(log_file, lib_file) = args
- output_type = options.output_type
obj_dir = os.path.abspath(os.path.join(os.path.dirname(lib_file), '../obj'))
log_file_lines = map(string.rstrip, open(log_file).readlines())
- call_info = ParseLogLines(log_file_lines)
- (unique_addrs, address_map) = ParseLibSymbols(lib_file)
-
- # Check for duplicate addresses in the log file, and print a warning if
- # duplicates are found. The instrumentation that produces the log file
- # should only print the first time a function is entered.
- addr_list = []
- for call in call_info:
- addr = call[3]
- if addr not in addr_list:
- addr_list.append(addr)
- else:
- print('WARNING: Address ' + hex(addr) + ' (line= ' +
- AddrToLine(addr, lib_file) + ') already profiled.')
+ call_info = _ParseLogLines(log_file_lines)
+ offsets = [call[3] for call in call_info]
+ offset_to_symbol_infos = _GroupLibrarySymbolInfosByOffset(lib_file)
- symbol_to_section_map = SymbolToSection(obj_dir)
+ _WarnAboutDuplicates(offsets)
pasko 2015/01/30 13:34:48 how many warnings are produced in practice?
Benoit L 2015/01/30 16:22:38 0 +/- 0 (95% CL)
pasko 2015/01/30 19:31:32 CI ?
+
+ symbol_to_section_map = _GetSymbolToSectionMapFromObjectFiles(obj_dir)
+ _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map,
+ sys.stdout)
- unknown_symbol_warnings = WarningCollector(300)
- symbol_not_found_warnings = WarningCollector(300)
- for call in call_info:
- addr = call[3]
- if output_type == 'lineize':
- symbol = AddrToLine(addr, lib_file)
- print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'
- + symbol)
- elif output_type == 'orderfile':
- try:
- symbols = FindFunctions(addr, unique_addrs, address_map)
- for symbol in symbols:
- if symbol in symbol_to_section_map:
- print symbol_to_section_map[symbol]
- else:
- unknown_symbol_warnings.Write(
- 'WARNING: No known section for symbol ' + symbol)
- print ''
- except SymbolNotFoundException:
- symbol_not_found_warnings.Write(
- 'WARNING: Did not find function in binary. addr: '
- + hex(addr))
- else:
- try:
- symbols = FindFunctions(addr, unique_addrs, address_map)
- print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'
- + symbols[0])
- first_symbol = True
- for symbol in symbols:
- if not first_symbol:
- print '\t\t\t\t\t' + symbol
- else:
- first_symbol = False
- except SymbolNotFoundException:
- symbol_not_found_warnings.Write(
- 'WARNING: Did not find function in binary. addr: '
- + hex(addr))
- unknown_symbol_warnings.WriteEnd('no known section for symbol')
- symbol_not_found_warnings.WriteEnd('did not find function')
if __name__ == '__main__':
main()
pasko 2015/01/30 13:34:48 shouldn't we also set the logging level above this
Benoit L 2015/01/30 16:22:38 Done.
« no previous file with comments | « no previous file | tools/cygprofile/symbolize_unittest.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698