tools/cygprofile/symbolize.py - Issue 874683004: Refactor the symbolize step of the orderfile generation.

Unified Diff: tools/cygprofile/symbolize.py

Issue 874683004: Refactor the symbolize step of the orderfile generation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Naming and typos. Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: tools/cygprofile/symbolize.py

diff --git a/tools/cygprofile/symbolize.py b/tools/cygprofile/symbolize.py

index 4f7dfe7a63d3fe30f50b39b11273456724902041..22ddfdd19dfd5562a311da97c734c98ea2ab78c9 100755

--- a/tools/cygprofile/symbolize.py

+++ b/tools/cygprofile/symbolize.py

@@ -5,20 +5,21 @@

"""Symbolize log file produced by cypgofile instrumentation.

pasko 2015/01/30 13:34:48 nit: cypgofile instrumentation (mergetraces.py)

Benoit L 2015/01/30 16:22:38 Done.

-Given a log file and the binary being profiled (e.g. executable, shared

-library), the script can produce three different outputs: 1) symbols for the

-addresses, 2) function and line numbers for the addresses, or 3) an order file.

+Given a log file and the binary being profiled, create an order file.

pasko 2015/01/30 13:34:48 it's a misleading name for the file, since symboli

Benoit L 2015/01/30 16:22:38 Done.

"""

+import logging

+import multiprocessing

import optparse

import os

import string

-import subprocess

import sys

+import symbol_extractor

-def ParseLogLines(log_file_lines):

- """Parse a log file produced by the profiled run of clank.

+def _ParseLogLines(log_file_lines):

+ """Parse a log file produced by the profiled run of chrome.

pasko 2015/01/30 13:34:48 nit: maybe enhance the comment with: (i.e. "merge

Benoit L 2015/01/30 16:22:38 Done.

Args:

log_file_lines: array of lines in log file produced by profiled run

@@ -41,11 +42,10 @@ def ParseLogLines(log_file_lines):

call_lines = []

vm_start = 0

line = log_file_lines[0]

- assert("r-xp" in line)

+ assert "r-xp" in line

pasko 2015/01/30 13:34:48 s/"/'/

Benoit L 2015/01/30 16:22:38 Done.

end_index = line.find('-')

vm_start = int(line[:end_index], 16)

for line in log_file_lines[2:]:

- # print hex(vm_start)

fields = line.split()

if len(fields) == 4:

call_lines.append(fields)

@@ -62,122 +62,66 @@ def ParseLogLines(log_file_lines):

return call_info

-def GetStdOutputLines(cmd):

- p = subprocess.Popen(cmd, stdout=subprocess.PIPE)

- output = p.communicate()[0]

- return output.split('\n')

-def ParseLibSymbols(lib_file):

- """Get output from running nm and greping for text symbols.

- Args:

- lib_file: the library or executable that contains the profiled code

- Returns:

- list of sorted unique addresses and corresponding size of function symbols

- in lib_file and map of addresses to all symbols at a particular address

- """

- cmd = ['nm', '-S', '-n', lib_file]

- nm_lines = GetStdOutputLines(cmd)

- nm_symbols = []

- for nm_line in nm_lines:

- if any(str in nm_line for str in (' t ', ' W ', ' T ')):

- nm_symbols.append(nm_line)

- nm_index = 0

- unique_addrs = []

- address_map = {}

- while nm_index < len(nm_symbols):

- # If the length of the split line is not 4, then it does not contain all the

- # information needed to symbolize (i.e. address, size and symbol name).

- if len(nm_symbols[nm_index].split()) == 4:

- (addr, size) = [int(x, 16) for x in nm_symbols[nm_index].split()[0:2]]

- # Multiple symbols may be at the same address. This is do to aliasing

- # done by the compiler. Since there is no way to be sure which one was

- # called in profiled run, we will symbolize to include all symbol names at

- # a particular address.

- fnames = []

- while (nm_index < len(nm_symbols) and

- addr == int(nm_symbols[nm_index].split()[0], 16)):

- if len(nm_symbols[nm_index].split()) == 4:

- fnames.append(nm_symbols[nm_index].split()[3])

- nm_index += 1

- address_map[addr] = fnames

- unique_addrs.append((addr, size))

- else:

- nm_index += 1

+def _GroupLibrarySymbolInfosByOffset(lib_filename):

+ """Return a dict {offset: [SymbolInfo]} from a library."""

+ symbol_infos = symbol_extractor.SymbolInfosFromBinary(lib_filename)

+ return symbol_extractor.GroupSymbolInfosByOffset(symbol_infos)

- return (unique_addrs, address_map)

class SymbolNotFoundException(Exception):

- def __init__(self,value):

- super(SymbolNotFoundException,self).__init__(value)

+ def __init__(self, value):

+ super(SymbolNotFoundException, self).__init__(value)

self.value = value

def __str__(self):

return repr(self.value)

-def BinarySearchAddresses(addr, start, end, arr):

- """Find starting address of a symbol at a particular address.

- The reason we can not directly use the address provided by the log file is

- that the log file may give an address after the start of the symbol. The

- logged address is often one byte after the start. By using this search

- function rather than just subtracting one from the logged address allows

- the logging instrumentation to log any address in a function.

+def _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset):

+ """Find all SymbolInfo at a given offset.

Args:

- addr: the address being searched for

- start: the starting index for the binary search

- end: the ending index for the binary search

- arr: the list being searched containing tuple of address and size

+ offset_to_symbol_infos: {offset: [SymbolInfo]}

+ offset: offset to look the symbols at

Returns:

- the starting address of the symbol at address addr

+ The list of SymbolInfo at the given offset

Raises:

- Exception: if address not found. Functions expects all logged addresses

- to be found

+ SymbolNotFoundException if the offset doesn't match any symbol.

"""

- # print "addr: " + str(addr) + " start: " + str(start) + " end: " + str(end)

- if start >= end or start == end - 1:

- # arr[i] is a tuple of address and size. Check if addr inside range

- if addr >= arr[start][0] and addr < arr[start][0] + arr[start][1]:

- return arr[start][0]

- elif addr >= arr[end][0] and addr < arr[end][0] + arr[end][1]:

- return arr[end][0]

- else:

- raise SymbolNotFoundException(addr)

+ if offset in offset_to_symbol_infos:

+ return offset_to_symbol_infos[offset]

+ elif offset % 2 and (offset - 1) in offset_to_symbol_infos:

+ # On ARM, odd addresses are used to signal thumb instruction. They are

+ # generated by setting the LSB to 1 (see

+ # http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0471e/Babfjhia.html).

+ # TODO(lizeb): Make sure this hack doesn't propagate to other archs.

+ return offset_to_symbol_infos[offset - 1]

else:

- halfway = (start + end) / 2

- (nm_addr, size) = arr[halfway]

- # print "nm_addr: " + str(nm_addr) + " halfway: " + str(halfway)

- if addr >= nm_addr and addr < nm_addr + size:

- return nm_addr

- elif addr < nm_addr:

- return BinarySearchAddresses(addr, start, halfway-1, arr)

- else:

- # Condition (addr >= nm_addr + size) must be true.

- return BinarySearchAddresses(addr, halfway+1, end, arr)

+ raise SymbolNotFoundException(offset)

pasko 2015/01/30 13:34:48 If we raise an exception like this, it will be ign

Benoit L 2015/01/30 16:22:38 Done.

-def FindFunctions(addr, unique_addrs, address_map):

- """Find function symbol names at address addr."""

- return address_map[BinarySearchAddresses(addr, 0, len(unique_addrs) - 1,

- unique_addrs)]

+class WarningCollector(object):

+ """Collect warnings, but limit the number printed to a set value."""

+ def __init__(self, max_warnings):

+ self._warnings = 0

+ self._max_warnings = max_warnings

+ def Write(self, message):

+ if self._warnings < self._max_warnings:

+ logging.warning(message)

+ self._warnings += 1

+ def WriteEnd(self, message):

+ if self._warnings > self._max_warnings:

+ logging.warning('%d more warnings for %s' % (

+ self._warnings - self._max_warnings, message))

-def AddrToLine(addr, lib_file):

- """Use addr2line to determine line info of a particular address."""

- cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)]

- output = GetStdOutputLines(cmd)

- assert(len(output) == 2)

- return ':'.join(output)

-def GetObjectFileNames(obj_dir):

- """ Gets the list of object files in the output directory. """

+def _GetObjectFileNames(obj_dir):

+ """Return the list of object files in a directory."""

obj_files = []

for (dirpath, _, filenames) in os.walk(obj_dir):

for file_name in filenames:

@@ -185,134 +129,118 @@ def GetObjectFileNames(obj_dir):

obj_files.append(os.path.join(dirpath, file_name))

return obj_files

-class WarningCollector(object):

- def __init__(self, max_warnings):

- self._warnings = 0

- self._max_warnings = max_warnings

- def Write(self, message):

- if self._warnings < self._max_warnings:

- sys.stderr.write(message + '\n')

- self._warnings += 1

+def _AllSymbolInfos(object_filenames):

+ """Return a list of SymbolInfo from an iterable of filenames."""

+ pool = multiprocessing.Pool()

+ symbol_infos_nested = pool.map(

+ symbol_extractor.SymbolInfosFromBinary, object_filenames)

+ result = []

+ for symbol_infos in symbol_infos_nested:

+ result += symbol_infos

+ return result

- def WriteEnd(self, message):

- if self._warnings > self._max_warnings:

- sys.stderr.write(str(self._warnings - self._max_warnings) +

- ' more warnings for: ' + message + '\n')

-def SymbolToSection(obj_dir):

- """ Gets a mapping from symbol to linker section name by scanning all

- of the object files. """

- object_files = GetObjectFileNames(obj_dir)

+def _GetSymbolToSectionMapFromObjectFiles(obj_dir):

+ """ Create a mapping from symbol to linker section name by scanning all

+ the object files.

+ """

+ object_files = _GetObjectFileNames(obj_dir)

symbol_to_section_map = {}

symbol_warnings = WarningCollector(300)

- for obj_file in object_files:

- cmd = ['objdump', '-w', '-t', obj_file]

- symbol_lines = GetStdOutputLines(cmd)

- for symbol_line in symbol_lines:

- items = symbol_line.split()

- # All of the symbol lines we care about are in the form

- # 0000000000 g F .text.foo 000000000 [.hidden] foo

- # where g (global) might also be l (local) or w (weak).

- if len(items) > 4 and items[2] == 'F':

- # This symbol is a function

- symbol = items[len(items) - 1]

- if symbol.startswith('.LTHUNK'):

- continue

- section = items[3]

- if ((symbol in symbol_to_section_map) and

- (symbol_to_section_map[symbol] != section)):

- symbol_warnings.Write('WARNING: Symbol ' + symbol +

- ' in conflicting sections ' + section +

- ' and ' + symbol_to_section_map[symbol])

- elif not section.startswith('.text.'):

- symbol_warnings.Write('WARNING: Symbol ' + symbol +

- ' in incorrect section ' + section)

- else:

- symbol_to_section_map[symbol] = section

+ symbol_infos = _AllSymbolInfos(object_files)

+ symbol_infos = [s for s in symbol_infos if not s.name.startswith('.LTHUNK')]

+ for symbol_info in symbol_infos:

+ symbol = symbol_info.name

+ section = symbol_info.section

+ if ((symbol in symbol_to_section_map) and

+ (symbol_to_section_map[symbol] != symbol_info.section)):

+ symbol_warnings.Write('Symbol ' + symbol +

+ ' in conflicting sections ' + section +

+ ' and ' + symbol_to_section_map[symbol])

+ elif not section.startswith('.text'):

+ symbol_warnings.Write('Symbol ' + symbol +

+ ' in incorrect section ' + section)

+ else:

+ symbol_to_section_map[symbol] = section

symbol_warnings.WriteEnd('bad sections')

return symbol_to_section_map

-def main():

- """Write output for profiled run to standard out.

- The format of the output depends on the output type specified as the third

- command line argument. The default output type is to symbolize the addresses

- of the functions called.

+def _WarnAboutDuplicates(offsets):

+ """Warn about duplicate offsets.

+ Args:

+ offsets: list of offsets to check for duplicates

+ Returns:

+ True if there are no duplicates, False otherwise.

"""

+ seen_offsets = set()

+ ok = True

+ for offset in offsets:

+ if offset not in seen_offsets:

+ seen_offsets.add(offset)

+ else:

+ ok = False

+ logging.warning('Duplicate offset: ' + hex(offset))

+ return ok

+def _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map,

+ output_file):

+ """Output the order file to output_file.

+ Args:

+ offsets: Iterable of offsets to match to section names

+ offset_to_symbol_infos: {offset: [SymbolInfo]}

+ symbol_to_section_map: {name: section}

+ output_file: file-like object to write the results to

+ """

+ unknown_symbol_warnings = WarningCollector(300)

+ symbol_not_found_warnings = WarningCollector(300)

+ for offset in offsets:

+ try:

+ symbol_infos = _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset)

+ for symbol_info in symbol_infos:

+ if symbol_info.name in symbol_to_section_map:

+ output_file.write(symbol_to_section_map[symbol_info.name] + '\n')

+ else:

+ unknown_symbol_warnings.Write(

+ 'No known section for symbol ' + symbol_info.name)

+ except SymbolNotFoundException:

+ symbol_not_found_warnings.Write(

+ 'Did not find function in binary. offset: ' + hex(offset))

+ unknown_symbol_warnings.WriteEnd('no known section for symbol')

+ symbol_not_found_warnings.WriteEnd('did not find function')

+def main():

parser = optparse.OptionParser('usage: %prog [options] log_file lib_file')

- parser.add_option('-t', '--outputType', dest='output_type',

- default='symbolize', type='string',

+ # TODO(lizeb): Remove this unused flag once the client script is updated.

+ parser.add_option('-t', dest='output_type', choices=['orderfile'],

help='lineize or symbolize or orderfile')

# Option for output type. The log file and lib file arguments are required

# by the script and therefore are not options.

- (options, args) = parser.parse_args()

+ (_, args) = parser.parse_args()

if len(args) != 2:

parser.error('expected 2 args: log_file lib_file')

(log_file, lib_file) = args

- output_type = options.output_type

obj_dir = os.path.abspath(os.path.join(os.path.dirname(lib_file), '../obj'))

log_file_lines = map(string.rstrip, open(log_file).readlines())

- call_info = ParseLogLines(log_file_lines)

- (unique_addrs, address_map) = ParseLibSymbols(lib_file)

- # Check for duplicate addresses in the log file, and print a warning if

- # duplicates are found. The instrumentation that produces the log file

- # should only print the first time a function is entered.

- addr_list = []

- for call in call_info:

- addr = call[3]

- if addr not in addr_list:

- addr_list.append(addr)

- else:

- print('WARNING: Address ' + hex(addr) + ' (line= ' +

- AddrToLine(addr, lib_file) + ') already profiled.')

+ call_info = _ParseLogLines(log_file_lines)

+ offsets = [call[3] for call in call_info]

+ offset_to_symbol_infos = _GroupLibrarySymbolInfosByOffset(lib_file)

- symbol_to_section_map = SymbolToSection(obj_dir)

+ _WarnAboutDuplicates(offsets)

pasko 2015/01/30 13:34:48 how many warnings are produced in practice?

Benoit L 2015/01/30 16:22:38 0 +/- 0 (95% CL)

pasko 2015/01/30 19:31:32 CI ?

+ symbol_to_section_map = _GetSymbolToSectionMapFromObjectFiles(obj_dir)

+ _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map,

+ sys.stdout)

- unknown_symbol_warnings = WarningCollector(300)

- symbol_not_found_warnings = WarningCollector(300)

- for call in call_info:

- addr = call[3]

- if output_type == 'lineize':

- symbol = AddrToLine(addr, lib_file)

- print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'

- + symbol)

- elif output_type == 'orderfile':

- try:

- symbols = FindFunctions(addr, unique_addrs, address_map)

- for symbol in symbols:

- if symbol in symbol_to_section_map:

- print symbol_to_section_map[symbol]

- else:

- unknown_symbol_warnings.Write(

- 'WARNING: No known section for symbol ' + symbol)

- print ''

- except SymbolNotFoundException:

- symbol_not_found_warnings.Write(

- 'WARNING: Did not find function in binary. addr: '

- + hex(addr))

- else:

- try:

- symbols = FindFunctions(addr, unique_addrs, address_map)

- print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'

- + symbols[0])

- first_symbol = True

- for symbol in symbols:

- if not first_symbol:

- print '\t\t\t\t\t' + symbol

- else:

- first_symbol = False

- except SymbolNotFoundException:

- symbol_not_found_warnings.Write(

- 'WARNING: Did not find function in binary. addr: '

- + hex(addr))

- unknown_symbol_warnings.WriteEnd('no known section for symbol')

- symbol_not_found_warnings.WriteEnd('did not find function')

if __name__ == '__main__':

main()

pasko 2015/01/30 13:34:48 shouldn't we also set the logging level above this

Benoit L 2015/01/30 16:22:38 Done.

« no previous file with comments | « no previous file | tools/cygprofile/symbolize_unittest.py » ('j') | no next file with comments »