Chromium Code Reviews| Index: tracing/bin/symbolize_trace |
| diff --git a/tracing/bin/symbolize_trace b/tracing/bin/symbolize_trace |
| index 7c6f5a4e37b04144fc5bf255bb5509857023654a..a7416d2b54e0960143898ae75a693674da69d823 100755 |
| --- a/tracing/bin/symbolize_trace |
| +++ b/tracing/bin/symbolize_trace |
| @@ -7,6 +7,7 @@ import argparse |
| import bisect |
| import collections |
| import gzip |
| +import itertools |
| import json |
| import os |
| import re |
| @@ -26,165 +27,7 @@ import symbolize_trace_atos_regex |
| import symbolize_trace_macho_reader |
| -# Relevant trace event phases from Chromium's |
| -# src/base/trace_event/common/trace_event_common.h. |
| -TRACE_EVENT_PHASE_METADATA = 'M' |
| -TRACE_EVENT_PHASE_MEMORY_DUMP = 'v' |
| - |
| - |
| -# Matches Android library paths, supports both K (/data/app-lib/<>/lib.so) |
| -# as well as L+ (/data/app/<>/lib/<>/lib.so). Library name is available |
| -# via 'name' group. |
| -ANDROID_PATH_MATCHER = re.compile( |
| - r'^/data/(?:' |
| - r'app/[^/]+/lib/[^/]+/|' |
| - r'app-lib/[^/]+/|' |
| - r'data/[^/]+/incremental-install-files/lib/' |
| - r')(?P<name>.*\.so)') |
| - |
| -# Subpath of output path where unstripped libraries are stored. |
| -ANDROID_UNSTRIPPED_SUBPATH = 'lib.unstripped' |
| - |
| - |
| -def FindInSystemPath(binary_name): |
| - paths = os.environ['PATH'].split(os.pathsep) |
| - for path in paths: |
| - binary_path = os.path.join(path, binary_name) |
| - if os.path.isfile(binary_path): |
| - return binary_path |
| - return None |
| - |
| - |
| -class Symbolizer(object): |
| - # Encapsulates platform-specific symbolization logic. |
| - def __init__(self): |
| - self.is_mac = sys.platform == 'darwin' |
| - self.is_win = sys.platform == 'win32' |
| - if self.is_mac: |
| - self.binary = 'atos' |
| - self._matcher = symbolize_trace_atos_regex.AtosRegexMatcher() |
| - elif self.is_win: |
| - self.binary = 'addr2line-pdb.exe' |
| - else: |
| - self.binary = 'addr2line' |
| - self.symbolizer_path = FindInSystemPath(self.binary) |
| - |
| - def _SymbolizeLinuxAndAndroid(self, symfile, unsymbolized_name): |
| - def _SymbolizerCallback(sym_info, frames): |
| - # Unwind inline chain to the top. |
| - while sym_info.inlined_by: |
| - sym_info = sym_info.inlined_by |
| - |
| - symbolized_name = sym_info.name if sym_info.name else unsymbolized_name |
| - for frame in frames: |
| - frame.name = symbolized_name |
| - |
| - symbolizer = elf_symbolizer.ELFSymbolizer(symfile.symbolizable_path, |
| - self.symbolizer_path, |
| - _SymbolizerCallback, |
| - inlines=True) |
| - |
| - for address, frames in symfile.frames_by_address.iteritems(): |
| - # SymbolizeAsync() asserts that the type of address is int. We operate |
| - # on longs (since they are raw pointers possibly from 64-bit processes). |
| - # It's OK to cast here because we're passing relative PC, which should |
| - # always fit into int. |
| - symbolizer.SymbolizeAsync(int(address), frames) |
| - |
| - symbolizer.Join() |
| - |
| - |
| - def _SymbolizeMac(self, symfile): |
| - chars_max = int(subprocess.check_output("getconf ARG_MAX", shell=True)) |
| - |
| - # 16 for the address, 2 for "0x", 1 for the space |
| - chars_per_address = 19 |
| - |
| - load_address = (symbolize_trace_macho_reader. |
| - ReadMachOTextLoadAddress(symfile.symbolizable_path)) |
| - assert load_address is not None |
| - |
| - cmd_base = [self.symbolizer_path, '-arch', 'x86_64', '-l', |
| - '0x%x' % load_address, '-o', |
| - symfile.symbolizable_path] |
| - chars_for_other_arguments = len(' '.join(cmd_base)) + 1 |
| - |
| - # The maximum number of inputs that can be processed at once is limited by |
| - # ARG_MAX. This currently evaluates to ~13000 on macOS. |
| - max_inputs = (chars_max - chars_for_other_arguments) / chars_per_address |
| - |
| - all_keys = symfile.frames_by_address.keys() |
| - processed_keys_count = 0 |
| - while len(all_keys): |
| - input_count = min(len(all_keys), max_inputs) |
| - keys_to_process = all_keys[0:input_count] |
| - |
| - cmd = list(cmd_base) |
| - cmd.extend([hex(int(x) + load_address) |
| - for x in keys_to_process]) |
| - output_array = subprocess.check_output(cmd).split('\n') |
| - for i in range(len(keys_to_process)): |
| - for frame in (symfile.frames_by_address.values() |
| - [i + processed_keys_count]): |
| - frame.name = self._matcher.Match(output_array[i]) |
| - processed_keys_count += len(keys_to_process) |
| - all_keys = all_keys[input_count:] |
| - |
| - |
| - def _SymbolizeWin(self, symfile): |
| - """Invoke symbolizer binary on windows and write all input in one go. |
| - |
| - Unlike linux, on windows, symbolization talks through a shared system |
| - service that handles communication with the NT symbol servers. This |
| - creates an explicit serialization (and therefore lock contention) of |
| - any process using the symbol API for files that do not have a local PDB. |
| - |
| - Thus, even though the windows symbolizer binary can be made command-line |
| - compatible with the POSIX addr2line interface, parallelizing the |
| - symbolization does not yield the same performance effects. Running |
| - just one symbolizer seems good enough for now. Can optimize later |
| - if this becomes a bottleneck. |
| - """ |
| - cmd = [self.symbolizer_path, '--functions', '--demangle', '--exe', |
| - symfile.symbolizable_path] |
| - |
| - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, |
| - stderr=sys.stderr) |
| - addrs = ["%x" % relative_pc for relative_pc in |
| - symfile.frames_by_address.keys()] |
| - (stdout_data, stderr_data) = proc.communicate('\n'.join(addrs)) |
| - stdout_data = stdout_data.split('\n') |
| - |
| - # This is known to be in the same order as stderr_data. |
| - for i, addr in enumerate(addrs): |
| - for frame in symfile.frames_by_address[int(addr, 16)]: |
| - # Output of addr2line with --functions is always 2 outputs per |
| - # symbol, function name followed by source line number. Only grab |
| - # the function name as line info is not always available. |
| - frame.name = stdout_data[i * 2] |
| - |
| - |
| - def Symbolize(self, symfile, unsymbolized_name): |
| - if self.is_mac: |
| - self._SymbolizeMac(symfile) |
| - if self.is_win: |
| - self._SymbolizeWin(symfile) |
| - else: |
| - self._SymbolizeLinuxAndAndroid(symfile, unsymbolized_name) |
| - |
| - |
| - def IsSymbolizableFile(self, file_path): |
| - if self.is_win: |
| - extension = os.path.splitext(file_path)[1].lower() |
| - return extension in ['.dll', '.exe'] |
| - else: |
| - result = subprocess.check_output(['file', '-0', file_path]) |
| - type_string = result[result.find('\0') + 1:] |
| - return bool(re.match(r'.*(ELF|Mach-O) (32|64)-bit\b.*', |
| - type_string, re.DOTALL)) |
| - |
| - |
| -class ProcessMemoryMaps(object): |
| +class MemoryMap(object): |
| """Represents 'process_mmaps' trace file entry.""" |
| class Region(object): |
| @@ -221,15 +64,13 @@ class ProcessMemoryMaps(object): |
| return 'Region(0x{:X} - 0x{:X}, {})'.format( |
| self.start_address, self.end_address, self.file_path) |
| - def __init__(self, process_mmaps): |
| - """Parses 'process_mmaps' dictionary.""" |
| - |
| + def __init__(self, process_mmaps_json): |
| regions = [] |
| - for region_value in process_mmaps['vm_regions']: |
| + for region_json in process_mmaps_json['vm_regions']: |
| regions.append(self.Region( |
| - long(region_value['sa'], 16), |
| - long(region_value['sz'], 16), |
| - region_value['mf'])) |
| + long(region_json['sa'], 16), |
| + long(region_json['sz'], 16), |
| + region_json['mf'])) |
| regions.sort() |
| # Copy regions without duplicates and check for overlaps. |
| @@ -259,104 +100,550 @@ class ProcessMemoryMaps(object): |
| return None |
| -class StackFrames(object): |
| - """Represents 'stackFrames' trace file entry.""" |
| +class UnsupportedHeapDumpVersionError(Exception): |
| + def __init__(self, version): |
| + message = 'Unsupported heap dump version: {}'.format(version) |
| + super(UnsupportedHeapDumpVersionError, self).__init__(message) |
| + |
| + |
| +class StringMap(object): |
|
awong
2017/04/20 19:37:39
These classes should have doc strings explaining t
|
| + def __init__(self): |
| + self._modified = False |
| + self._string_jsons = [] |
| + self._string_by_id = {} |
| + self._id_by_string = {} |
| + self._max_string_id = 0 |
| + |
| + @property |
| + def modified(self): |
| + return self._modified |
| + |
| + @property |
| + def string_by_id(self): |
| + return self._string_by_id |
| + |
| + def ParseMore(self, heap_dump_version, strings_json): |
| + if heap_dump_version != Trace.HEAP_DUMP_VERSION_1: |
| + raise UnsupportedHeapDumpVersionError(heap_dump_version) |
| + |
| + self._string_jsons.append(strings_json) |
| + for string_json in strings_json: |
| + self._Insert(string_json['id'], string_json['string']) |
| + |
| + def Clear(self): |
| + if self._string_by_id: |
| + self._modified = True |
|
awong
2017/04/20 22:12:14
Is Clear() not reset?
This looks *almost* like __
|
| + self._string_by_id = {} |
| + self._id_by_string = {} |
| + self._Insert(0, '[null]') |
| + self._max_string_id = 0 |
| + |
| + def AddString(self, string): |
| + string_id = self._id_by_string.get(string) |
| + if string_id is None: |
| + string_id = self._max_string_id + 1 |
| + self._Insert(string_id, string) |
| + self._modified = True |
| + return string_id |
| + |
| + def ApplyModifications(self): |
|
awong
2017/04/20 22:12:14
What are such modifications? Can we use a less gen
|
| + if not self.modified: |
| + return |
| + |
| + assert self._string_jsons, 'no JSON nodes' |
| + |
| + # Serialize into first JSON node, and clear all others. |
|
awong
2017/04/20 22:12:14
Can we get a "why" in this comment?
As a reader,
|
| + |
| + for string_json in self._string_jsons: |
| + string_json[:] = [] |
|
awong
2017/04/20 22:12:14
string_json.clear()?
|
| + string_json = self._string_jsons[0] |
| + for string_id, string in self._string_by_id.iteritems(): |
| + string_json.append({'id': string_id, 'string': string}) |
| + |
| + self._modified = False |
|
awong
2017/04/20 22:12:14
This is confusing. Shouldn't it be true?
|
| + |
| + def _Insert(self, string_id, string): |
| + self._id_by_string[string] = string_id |
| + self._string_by_id[string_id] = string |
| + self._max_string_id = max(self._max_string_id, string_id) |
| + |
| + |
| +class TypeNameMap(object): |
| + def __init__(self): |
| + self._modified = False |
| + self._type_name_jsons = [] |
| + self._name_by_id = {} |
| + self._id_by_name = {} |
| + self._max_type_id = 0 |
| + |
| + @property |
| + def modified(self): |
| + return self._modified |
| + |
| + @property |
| + def name_by_id(self): |
| + return self._name_by_id |
| + |
| + def ParseMore(self, heap_dump_version, type_name_json, string_map): |
| + if heap_dump_version != Trace.HEAP_DUMP_VERSION_1: |
| + raise UnsupportedHeapDumpVersionError(heap_dump_version) |
| + |
| + self._type_name_jsons.append(type_name_json) |
| + for type_json in type_name_json: |
| + self._Insert(type_json['id'], |
| + string_map.string_by_id[type_json['name_sid']]) |
| + |
| + def AddType(self, type_name): |
| + type_id = self._id_by_name.get(type_name) |
| + if type_id is None: |
| + type_id = self._max_type_id + 1 |
| + self._Insert(type_id, type_name) |
| + self._modified = True |
| + return type_id |
| + |
| + def ApplyModifications(self, string_map, force=False): |
| + if not self.modified and not force: |
| + return |
| + |
| + assert self._type_name_jsons, 'no JSON nodes' |
| + |
| + # Serialize into first JSON node, and clear all others. |
| + |
| + for types_json in self._type_name_jsons: |
| + types_json[:] = [] |
|
awong
2017/04/20 22:12:14
types_json.clear()?
|
| + types_json = self._type_name_jsons[0] |
| + for type_id, type_name in self._name_by_id.iteritems(): |
| + types_json.append({ |
| + 'id': type_id, |
| + 'name_sid': string_map.AddString(type_name)}) |
| + |
| + self._modified = False |
|
awong
2017/04/20 22:12:14
Should this be true?
|
| + |
| + def _Insert(self, type_id, type_name): |
| + self._id_by_name[type_name] = type_id |
| + self._name_by_id[type_id] = type_name |
| + self._max_type_id = max(self._max_type_id, type_id) |
| - class PCFrame(object): |
| - def __init__(self, pc, frame): |
| + |
| +class StackFrameMap(object): |
| + class Frame(object): |
| + def __init__(self, frame_id, name, parent_frame_id): |
| self._modified = False |
| - self._pc = pc |
| - self._frame = frame |
| + self._id = frame_id |
| + self._name = name |
| + self._pc = self._ParsePC(name) |
| + self._parent_id = parent_frame_id |
| + self._ext = None |
| @property |
| def modified(self): |
| return self._modified |
| @property |
| + def id(self): |
| + return self._id |
| + |
| + @property |
| def pc(self): |
| return self._pc |
| @property |
| def name(self): |
| - return self._frame['name'] |
| + return self._name |
| @name.setter |
| def name(self, value): |
| self._modified = True |
| - self._frame['name'] = value |
| + self._name = value |
| + |
| + @property |
| + def parent_id(self): |
| + return self._parent_id |
| + |
| + _PC_TAG = 'pc:' |
| + |
| + def _ParsePC(self, name): |
| + if not name.startswith(self._PC_TAG): |
|
awong
2017/04/20 22:12:14
How about invert the logic to remove the not?
|
| + return None |
| + return long(name[len(self._PC_TAG):], 16) |
| + |
| + def _ClearModified(self): |
| + self._modified = False |
| + |
| + def __init__(self): |
|
awong
2017/04/20 22:12:14
Group the __init__?
|
| + self._modified = False |
| + self._heap_dump_version = None |
| + self._stack_frames_jsons = [] |
| + self._frame_by_id = {} |
| + |
| + @property |
| + def modified(self): |
| + return (self._modified or |
| + any(f.modified for f in self._frame_by_id.itervalues())) |
| + |
| + @property |
| + def frame_by_id(self): |
| + return self._frame_by_id |
| + |
| + def ParseMore(self, heap_dump_version, stack_frames_json, string_map): |
| + frame_by_id = {} |
| + if heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + if self._stack_frames_jsons: |
| + raise Exception('Legacy stack frames are expected only once.') |
| + for frame_id, frame_json in stack_frames_json.iteritems(): |
| + frame = self.Frame(frame_id, |
| + frame_json['name'], |
| + frame_json.get('parent')) |
| + frame_by_id[frame.id] = frame |
| + else: |
| + if heap_dump_version != Trace.HEAP_DUMP_VERSION_1: |
| + raise UnsupportedHeapDumpVersionError(heap_dump_version) |
| + for frame_json in stack_frames_json: |
| + frame = self.Frame(frame_json['id'], |
| + string_map.string_by_id[frame_json['name_sid']], |
| + frame_json.get('parent')) |
| + frame_by_id[frame.id] = frame |
| + |
| + self._heap_dump_version = heap_dump_version |
| + self._stack_frames_jsons.append(stack_frames_json) |
| + |
| + self._frame_by_id = frame_by_id |
| + |
| + def ApplyModifications(self, string_map, force=False): |
| + if not self.modified and not force: |
| + return |
| + |
| + assert self._stack_frames_jsons, 'no JSON nodes' |
| + if self._heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + assert string_map is None, \ |
| + 'string_map should not be used with the legacy format' |
| + |
| + # Serialize frames into first JSON node, and clear all others. |
| + |
| + for frames_json in self._stack_frames_jsons: |
| + if self._heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + frames_json.clear() |
| + else: |
| + frames_json[:] = [] |
| + |
| + frames_json = self._stack_frames_jsons[0] |
| + for frame in self._frame_by_id.itervalues(): |
| + if self._heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + frame_json = {'name': frame.name} |
| + frames_json[frame.id] = frame_json |
| + else: |
| + frame_json = { |
| + 'id': frame.id, |
| + 'name_sid': string_map.AddString(frame.name) |
| + } |
| + frames_json.append(frame_json) |
| + if frame.parent_id is not None: |
| + frame_json['parent'] = frame.parent_id |
| + frame._ClearModified() |
| + |
| + self._modified = False |
| - def __init__(self, stack_frames): |
| - """Constructs object using 'stackFrames' dictionary.""" |
| - self._pc_frames = [] |
| - for frame in stack_frames.itervalues(): |
| - pc_frame = self._ParsePCFrame(frame) |
| - if pc_frame: |
| - self._pc_frames.append(pc_frame) |
| + |
| +class HeapProfile(object): |
| + EntryKey = collections.namedtuple( |
| + 'EntryKey', |
| + ['stack_frame_id', 'type_name_id']) |
| + |
| + class Entry(object): |
| + def __init__(self, key, mapped_value_by_name, numeric_value_by_name): |
| + self._key = key |
| + self._mapped_value_by_name = mapped_value_by_name |
| + self._numeric_value_by_name = numeric_value_by_name |
| + |
| + @property |
| + def key(self): |
| + return self._key |
| + |
| + @property |
| + def stack_frame_id(self): |
| + return self._key.stack_frame_id |
| + |
| + @property |
| + def type_name_id(self): |
| + return self._key.type_name_id |
| + |
| + def _AddValuesFrom(self, entry): |
| + self._mapped_value_by_name.clear() |
| + for name, value in entry._numeric_value_by_name.iteritems(): |
| + value += self._numeric_value_by_name.get(name, 0) |
| + self._numeric_value_by_name[name] = value |
| + |
| + def __init__(self, allocator_name, entries_json, mapped_entry_names): |
| + self._modified = False |
| + self._allocator_name = allocator_name |
| + self._entries_json = entries_json |
| + self._entries = [] |
| + for values in zip(*entries_json.itervalues()): |
| + stack_frame_id = None |
| + type_name_id = None |
| + mapped_value_by_name = {} |
| + numeric_value_by_name = {} |
| + for index, name in enumerate(entries_json.iterkeys()): |
| + value = values[index] |
| + if name == 'nodes': |
| + stack_frame_id = value |
| + elif name == 'types': |
| + type_name_id = value |
| + elif name in mapped_entry_names: |
| + mapped_value_by_name[name] = value |
| + else: |
| + numeric_value_by_name[name] = value |
| + entry = self.Entry(self.EntryKey(stack_frame_id, type_name_id), |
| + mapped_value_by_name, numeric_value_by_name) |
| + self._entries.append(entry) |
| + |
| + @property |
| + def modified(self): |
| + return self._modified |
| @property |
| - def pc_frames(self): |
| - return self._pc_frames |
| + def allocator_name(self): |
| + return self._allocator_name |
| + |
| + @property |
| + def entries(self): |
| + return self._entries |
| + |
| + def ApplyModifications(self): |
| + if not self.modified: |
| + return |
| + |
| + mapped_value_names = set() |
| + numeric_value_names = set() |
| + for entry in self._entries: |
| + mapped_value_names.update(entry._mapped_value_by_name.iterkeys()) |
| + numeric_value_names.update(entry._numeric_value_by_name.iterkeys()) |
| + |
| + def _AddJSONValue(name, value): |
| + values = self._entries_json.get(name) |
| + if values is None: |
| + values = [] |
| + self._entries_json[name] = values |
| + values.append(value) |
| + |
| + self._entries_json.clear() |
| + for entry in self._entries: |
| + _AddJSONValue('nodes', entry.stack_frame_id) |
| + _AddJSONValue('types', entry.type_name_id) |
| + for name in mapped_value_names: |
| + value = entry._mapped_value_by_name[name] |
| + _AddJSONValue(name, value) |
| + for name in numeric_value_names: |
| + value = entry._numeric_value_by_name[name] |
| + _AddJSONValue(name, value) |
| + |
| + self._modified = False |
| + |
| + |
| +class MemoryDump(object): |
| + def __init__(self, allocators_json, mapped_entry_names): |
| + self._profiles = [] |
| + for allocator_name, entries_json in allocators_json.iteritems(): |
| + profile = HeapProfile(allocator_name, entries_json, mapped_entry_names) |
| + self._profiles.append(profile) |
| @property |
| def modified(self): |
| - return any(f.modified for f in self._pc_frames) |
| + return any(p.modified for p in self.profiles) |
| - _PC_TAG = 'pc:' |
| + @property |
| + def profiles(self): |
| + return self._profiles |
| - @classmethod |
| - def _ParsePCFrame(self, frame): |
| - name = frame['name'] |
| - if not name.startswith(self._PC_TAG): |
| - return None |
| - pc = long(name[len(self._PC_TAG):], 16) |
| - return self.PCFrame(pc, frame) |
| + def ApplyModifications(self): |
| + for profile in self._profiles: |
| + profile.ApplyModifications() |
| -class Process(object): |
| - """Holds various bits of information about a process in a trace file.""" |
| +class Trace(object): |
| - def __init__(self, pid): |
| - self.pid = pid |
| - self.name = None |
| - self.mmaps = None |
| - self.stack_frames = None |
| + HEAP_DUMP_VERSION_LEGACY = 'Legacy' |
| + HEAP_DUMP_VERSION_1 = 1 |
| + class Process(object): |
| + def __init__(self, pid): |
| + self._pid = pid |
| + self._name = None |
| + self._memory_map = None |
| + self._memory_dumps = [] |
| + self._stack_frame_map = StackFrameMap() |
| + self._type_name_map = TypeNameMap() |
| + self._string_map = StringMap() |
| + self._heap_dump_version = None |
| -def CollectProcesses(trace): |
| - """Parses trace dictionary and returns pid->Process map of all processes |
| - suitable for symbolization (which have both mmaps and stack_frames). |
| - """ |
| + @property |
| + def modified(self): |
| + return (self._stack_frame_map.modified or |
| + self._type_name_map.modified or |
| + any(d.modified for d in self._memory_dumps)) |
| - process_map = {} |
| + @property |
| + def pid(self): |
| + return self._pid |
| - # Android traces produced via 'chrome://inspect/?tracing#devices' are |
| - # just list of events. |
| - events = trace if isinstance(trace, list) else trace['traceEvents'] |
| - for event in events: |
| - name = event.get('name') |
| - if not name: |
| - continue |
| + @property |
| + def name(self): |
|
awong
2017/04/20 22:28:04
For these properties, having a docstring that expl
|
| + return self._name |
| + |
| + @property |
| + def unique_name(self): |
| + name = self._name if self._name else 'UnnamedProcess' |
| + return '{}({})'.format(name, self._pid) |
| + |
| + @property |
| + def memory_map(self): |
| + return self._memory_map |
| + |
| + @property |
| + def memory_dumps(self): |
|
awong
2017/04/20 22:28:04
Why is this one plural?
|
| + return self._memory_dumps |
| - pid = event['pid'] |
| - process = process_map.get(pid) |
| - if process is None: |
| - process = Process(pid) |
| - process_map[pid] = process |
| + @property |
| + def stack_frame_map(self): |
| + return self._stack_frame_map |
| + |
| + @property |
| + def type_name_map(self): |
| + return self._type_name_map |
| + |
| + def ApplyModifications(self): |
| + if self._heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + self._stack_frame_map.ApplyModifications(None) |
| + else: |
| + if self._stack_frame_map.modified or self._type_name_map.modified: |
| + self._string_map.Clear() |
| + self._stack_frame_map.ApplyModifications(self._string_map, force=True) |
| + self._type_name_map.ApplyModifications(self._string_map, force=True) |
| + self._string_map.ApplyModifications() |
| + for dump in self._memory_dumps: |
| + dump.ApplyModifications() |
| + |
| + def __init__(self, trace_json): |
| + self._trace_json = trace_json |
| + self._processes = [] |
| + self._heap_dump_version = None |
| + |
| + # Misc per-process information needed only during parsing. |
| + class ProcessExt(object): |
| + def __init__(self, pid): |
| + self.process = Trace.Process(pid) |
| + self.mapped_entry_names = set() |
| + self.process_mmaps_json = None |
| + self.seen_strings_json = False |
| + |
| + process_ext_by_pid = {} |
| + |
| + # Android traces produced via 'chrome://inspect/?tracing#devices' are |
| + # just list of events. |
| + events = trace_json if isinstance(trace_json, list) \ |
| + else trace_json['traceEvents'] |
| + for event in events: |
| + name = event.get('name') |
| + if not name: |
| + continue |
| + |
| + pid = event['pid'] |
| + process_ext = process_ext_by_pid.get(pid) |
| + if process_ext is None: |
| + process_ext = ProcessExt(pid) |
| + process_ext_by_pid[pid] = process_ext |
| + process = process_ext.process |
| + |
| + phase = event['ph'] |
| + if phase == self._EVENT_PHASE_METADATA: |
| + if name == 'process_name': |
| + process._name = event['args']['name'] |
| + elif name == 'stackFrames': |
| + process._stack_frame_map.ParseMore( |
| + self._UseHeapDumpVersion(self.HEAP_DUMP_VERSION_LEGACY), |
| + event['args']['stackFrames'], |
| + process._string_map) |
| + elif phase == self._EVENT_PHASE_MEMORY_DUMP: |
| + dumps = event['args']['dumps'] |
| + process_mmaps = dumps.get('process_mmaps') |
| + if process_mmaps: |
| + # We want the most recent memory map, so parsing happens later |
| + # once we finished reading all events. |
| + process_ext.process_mmaps_json = process_mmaps |
| + heaps = dumps.get('heaps_v2') |
| + if heaps: |
| + version = self._UseHeapDumpVersion(heaps['version']) |
| + maps = heaps.get('maps') |
| + if maps: |
| + process_ext.mapped_entry_names.update(maps.iterkeys()) |
| + types = maps.get('types') |
| + stack_frames = maps.get('nodes') |
| + strings = maps.get('strings') |
| + if (strings is None and (types or stack_frames) |
| + and not process_ext.seen_strings_json): |
| + # ApplyModifications() for TypeNameMap and StackFrameMap puts |
| + # everything into the first node and depends on StringMap. So |
| + # we need to make sure that 'strings' node is there if any of |
| + # other two nodes present. |
| + strings = [] |
| + maps['strings'] = strings |
| + if strings is not None: |
| + process_ext.seen_strings_json = True |
| + process._string_map.ParseMore(version, strings) |
| + if types: |
| + process._type_name_map.ParseMore( |
| + version, types, process._string_map) |
| + if stack_frames: |
| + process._stack_frame_map.ParseMore( |
| + version, stack_frames, process._string_map) |
| + allocators = heaps.get('allocators') |
| + if allocators: |
| + dump = MemoryDump(allocators, process_ext.mapped_entry_names) |
| + process._memory_dumps.append(dump) |
| + |
| + self._processes = [] |
| + for pe in process_ext_by_pid.itervalues(): |
| + pe.process._heap_dump_version = self._heap_dump_version |
| + if pe.process_mmaps_json: |
| + # Now parse the most recent memory map. |
| + pe.process._memory_map = MemoryMap(pe.process_mmaps_json) |
| + self._processes.append(pe.process) |
| - phase = event['ph'] |
| - if phase == TRACE_EVENT_PHASE_METADATA: |
| - if name == 'process_name': |
| - process.name = event['args']['name'] |
| - elif name == 'stackFrames': |
| - process.stack_frames = StackFrames(event['args']['stackFrames']) |
| - elif phase == TRACE_EVENT_PHASE_MEMORY_DUMP: |
| - process_mmaps = event['args']['dumps'].get('process_mmaps') |
| - if process_mmaps: |
| - # TODO(dskiba): this parses all process_mmaps, but retains only the |
| - # last one. We need to parse only once (lazy parsing?). |
| - process.mmaps = ProcessMemoryMaps(process_mmaps) |
| + @property |
| + def modified(self): |
| + return any(p.modified for p in self._processes) |
| - return [p for p in process_map.itervalues() if p.mmaps and p.stack_frames] |
| + @property |
| + def processes(self): |
| + return self._processes |
| + |
| + @property |
| + def heap_dump_version(self): |
| + return self._heap_dump_version |
| + |
| + def ApplyModifications(self): |
| + for process in self._processes: |
| + process.ApplyModifications() |
| + assert not self.modified, 'still modified' |
| + |
| + def Serialize(self): |
| + return self._trace_json |
| + |
| + # Relevant trace event phases from Chromium's |
| + # src/base/trace_event/common/trace_event_common.h. |
| + _EVENT_PHASE_METADATA = 'M' |
| + _EVENT_PHASE_MEMORY_DUMP = 'v' |
| + |
| + def _UseHeapDumpVersion(self, version): |
| + if self._heap_dump_version is None: |
| + self._heap_dump_version = version |
| + return version |
| + elif self._heap_dump_version != version: |
| + raise Exception( |
| + ("Inconsistent trace file: first saw '{}' heap dump version, " |
| + "then '{}'.").format(self._heap_dump_version, version)) |
| + else: |
| + return version |
| class SymbolizableFile(object): |
| @@ -381,8 +668,12 @@ def ResolveSymbolizableFiles(processes): |
| """ |
| symfile_by_path = {} |
| for process in processes: |
| - for frame in process.stack_frames.pc_frames: |
| - region = process.mmaps.FindRegion(frame.pc) |
| + if not process.memory_map: |
|
awong
2017/04/20 22:28:04
Comment explaining when this can occur?
|
| + continue |
| + for frame in process.stack_frame_map.frame_by_id.itervalues(): |
| + if frame.pc is None: |
| + continue |
| + region = process.memory_map.FindRegion(frame.pc) |
| if region is None: |
| frame.name = '<unresolved>' |
| continue |
| @@ -397,15 +688,154 @@ def ResolveSymbolizableFiles(processes): |
| return symfile_by_path.values() |
| +def FindInSystemPath(binary_name): |
| + paths = os.environ['PATH'].split(os.pathsep) |
| + for path in paths: |
| + binary_path = os.path.join(path, binary_name) |
| + if os.path.isfile(binary_path): |
| + return binary_path |
| + return None |
| + |
| + |
| +class Symbolizer(object): |
| + # Encapsulates platform-specific symbolization logic. |
|
awong
2017/04/20 22:28:04
Turn into docstring.
|
| + def __init__(self): |
| + self.is_mac = sys.platform == 'darwin' |
| + self.is_win = sys.platform == 'win32' |
| + if self.is_mac: |
| + self.binary = 'atos' |
| + self._matcher = symbolize_trace_atos_regex.AtosRegexMatcher() |
| + elif self.is_win: |
| + self.binary = 'addr2line-pdb.exe' |
| + else: |
| + self.binary = 'addr2line' |
| + self.symbolizer_path = FindInSystemPath(self.binary) |
| + |
| + def _SymbolizeLinuxAndAndroid(self, symfile, unsymbolized_name): |
| + def _SymbolizerCallback(sym_info, frames): |
| + # Unwind inline chain to the top. |
| + while sym_info.inlined_by: |
| + sym_info = sym_info.inlined_by |
| + |
| + symbolized_name = sym_info.name if sym_info.name else unsymbolized_name |
| + for frame in frames: |
| + frame.name = symbolized_name |
| + frame.ext.source_path = sym_info.source_path |
| + |
| + symbolizer = elf_symbolizer.ELFSymbolizer(symfile.symbolizable_path, |
| + self.symbolizer_path, |
| + _SymbolizerCallback, |
| + inlines=True) |
| + |
| + for address, frames in symfile.frames_by_address.iteritems(): |
| + # SymbolizeAsync() asserts that the type of address is int. We operate |
| + # on longs (since they are raw pointers possibly from 64-bit processes). |
| + # It's OK to cast here because we're passing relative PC, which should |
| + # always fit into int. |
| + symbolizer.SymbolizeAsync(int(address), frames) |
| + |
| + symbolizer.Join() |
| + |
| + |
| + def _SymbolizeMac(self, symfile): |
| + chars_max = int(subprocess.check_output("getconf ARG_MAX", shell=True)) |
| + |
| + # 16 for the address, 2 for "0x", 1 for the space |
| + chars_per_address = 19 |
| + |
| + load_address = (symbolize_trace_macho_reader. |
| + ReadMachOTextLoadAddress(symfile.symbolizable_path)) |
| + assert load_address is not None |
| + |
| + cmd_base = [self.symbolizer_path, '-arch', 'x86_64', '-l', |
| + '0x%x' % load_address, '-o', |
| + symfile.symbolizable_path] |
| + chars_for_other_arguments = len(' '.join(cmd_base)) + 1 |
| + |
| + # The maximum number of inputs that can be processed at once is limited by |
| + # ARG_MAX. This currently evaluates to ~13000 on macOS. |
| + max_inputs = (chars_max - chars_for_other_arguments) / chars_per_address |
| + |
| + all_keys = symfile.frames_by_address.keys() |
| + processed_keys_count = 0 |
| + while len(all_keys): |
| + input_count = min(len(all_keys), max_inputs) |
| + keys_to_process = all_keys[0:input_count] |
| + cmd = list(cmd_base) |
| + cmd.extend([hex(int(x) + load_address) |
| + for x in keys_to_process]) |
| + output_array = subprocess.check_output(cmd).split('\n') |
| + for i in range(len(keys_to_process)): |
| + for frame in (symfile.frames_by_address.values() |
| + [i + processed_keys_count]): |
| + frame.name = self._matcher.Match(output_array[i]) |
| + processed_keys_count += len(keys_to_process) |
| + all_keys = all_keys[input_count:] |
| + |
| + def _SymbolizeWin(self, symfile): |
| + """Invoke symbolizer binary on windows and write all input in one go. |
| + |
| + Unlike Linux, on Windows, symbolization talks through a shared system |
| + service that handles communication with the NT symbol servers. This |
| + creates an explicit serialization (and therefore lock contention) of |
| + any process using the symbol API for files that do not have a local PDB. |
| + |
| + Thus, even though the Windows symbolizer binary can be made command-line |
| + compatible with the POSIX addr2line interface, parallelizing the |
| + symbolization does not yield the same performance effects. Running |
| + just one symbolizer seems good enough for now. Can optimize later |
| + if this becomes a bottleneck. |
| + """ |
| + cmd = [self.symbolizer_path, '--functions', '--demangle', '--exe', |
| + symfile.symbolizable_path] |
| + |
| + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, |
| + stderr=sys.stderr) |
| + addrs = ["%x" % relative_pc for relative_pc in |
| + symfile.frames_by_address.keys()] |
| + (stdout_data, stderr_data) = proc.communicate('\n'.join(addrs)) |
| + stdout_data = stdout_data.split('\n') |
| + |
| + # This is known to be in the same order as stderr_data. |
| + for i, addr in enumerate(addrs): |
| + for frame in symfile.frames_by_address[int(addr, 16)]: |
| + # Output of addr2line with --functions is always 2 outputs per |
| + # symbol, function name followed by source line number. Only grab |
| + # the function name as line info is not always available. |
| + frame.name = stdout_data[i * 2] |
| + |
| + def Symbolize(self, symfile, unsymbolized_name): |
| + if self.is_mac: |
| + self._SymbolizeMac(symfile) |
| + elif self.is_win: |
| + self._SymbolizeWin(symfile) |
| + else: |
| + self._SymbolizeLinuxAndAndroid(symfile, unsymbolized_name) |
| + |
| + def IsSymbolizableFile(self, file_path): |
| + if self.is_win: |
| + extension = os.path.splitext(file_path)[1].lower() |
| + return extension in ['.dll', '.exe'] |
| + else: |
| + result = subprocess.check_output(['file', '-0', file_path]) |
| + type_string = result[result.find('\0') + 1:] |
| + return bool(re.match(r'.*(ELF|Mach-O) (32|64)-bit\b.*', |
| + type_string, re.DOTALL)) |
| + |
| + |
| def SymbolizeFiles(symfiles, symbolizer): |
| """Symbolizes each file in the given list of SymbolizableFiles |
| and updates stack frames with symbolization results.""" |
| + |
| + if not symfiles: |
| + print 'Nothing to symbolize.' |
| + return |
| + |
| print 'Symbolizing...' |
| def _SubPrintf(message, *args): |
| print (' ' + message).format(*args) |
| - symbolized = False |
| for symfile in symfiles: |
| unsymbolized_name = '<{}>'.format( |
| symfile.path if symfile.path else 'unnamed') |
| @@ -432,9 +862,20 @@ def SymbolizeFiles(symfiles, symbolizer): |
| symfile.path) |
| symbolizer.Symbolize(symfile, unsymbolized_name) |
| - symbolized = True |
| - return symbolized |
| + |
| +# Matches Android library paths, supports both K (/data/app-lib/<>/lib.so) |
| +# as well as L+ (/data/app/<>/lib/<>/lib.so). Library name is available |
| +# via 'name' group. |
| +ANDROID_PATH_MATCHER = re.compile( |
|
awong
2017/04/20 22:28:04
This is hardish to read and matching paths with re
|
| + r'^/data/(?:' |
| + r'app/[^/]+/lib/[^/]+/|' |
| + r'app-lib/[^/]+/|' |
| + r'data/[^/]+/incremental-install-files/lib/' |
| + r')(?P<name>.*\.so)') |
| + |
| +# Subpath of output path where unstripped libraries are stored. |
| +ANDROID_UNSTRIPPED_SUBPATH = 'lib.unstripped' |
| def HaveFilesFromAndroid(symfiles): |
| @@ -455,59 +896,87 @@ def RemapAndroidFiles(symfiles, output_path): |
| symfile.symbolizable_path = 'android://{}'.format(symfile.path) |
| +def Symbolize(options, trace, symbolizer): |
| + symfiles = ResolveSymbolizableFiles(trace.processes) |
| + |
| + # Android trace files don't have any indication they are from Android. |
| + # So we're checking for Android-specific paths. |
| + if HaveFilesFromAndroid(symfiles): |
| + if not options.output_directory: |
| + sys.exit('The trace file appears to be from Android. Please ' |
| + 'specify output directory to properly symbolize it.') |
| + RemapAndroidFiles(symfiles, os.path.abspath(options.output_directory)) |
| + |
| + SymbolizeFiles(symfiles, symbolizer) |
| + |
| + |
| +def OpenTraceFile(file_path, mode): |
| + if file_path.endswith('.gz'): |
| + return gzip.open(file_path, mode + 'b') |
| + else: |
| + return open(file_path, mode + 't') |
| + |
| + |
| # Suffix used for backup files. |
| BACKUP_FILE_TAG = '.BACKUP' |
| def main(): |
| - parser = argparse.ArgumentParser() |
| - parser.add_argument('file', |
| - help='Trace file to symbolize (.json or .json.gz)') |
| - parser.add_argument('--no-backup', |
| - dest='backup', default='true', action='store_false', |
| - help="Don't create {} files".format(BACKUP_FILE_TAG)) |
| - parser.add_argument('--output-directory', |
| - help='The path to the build output directory, such ' + |
| - 'as out/Debug. Only needed for Android.') |
| - options = parser.parse_args() |
| - |
| - trace_file_path = options.file |
| - def _OpenTraceFile(mode): |
| - if trace_file_path.endswith('.gz'): |
| - return gzip.open(trace_file_path, mode + 'b') |
| - else: |
| - return open(trace_file_path, mode + 't') |
| + class MultilineHelpFormatter(argparse.HelpFormatter): |
| + def _split_lines(self, text, width): |
| + extra_lines = [] |
| + if '\n' in text: |
| + lines = text.splitlines() |
| + text = lines[0] |
| + extra_lines = lines[1:] |
| + return super(MultilineHelpFormatter, self)._split_lines(text, width) + \ |
| + extra_lines |
| + |
| + parser = argparse.ArgumentParser(formatter_class=MultilineHelpFormatter) |
| + parser.add_argument( |
| + 'file', |
| + help='Trace file to symbolize (.json or .json.gz)') |
| + |
| + parser.add_argument( |
| + '--no-backup', dest='backup', default='true', action='store_false', |
| + help="Don't create {} files".format(BACKUP_FILE_TAG)) |
| + |
| + parser.add_argument( |
| + '--output-directory', |
| + help='The path to the build output directory, such as out/Debug.') |
| symbolizer = Symbolizer() |
| if symbolizer.symbolizer_path is None: |
| sys.exit("Can't symbolize - no %s in PATH." % symbolizer.binary) |
| + options = parser.parse_args() |
| + |
| + trace_file_path = options.file |
| + |
| print 'Reading trace file...' |
| - with _OpenTraceFile('r') as trace_file: |
| - trace = json.load(trace_file) |
| + with OpenTraceFile(trace_file_path, 'r') as trace_file: |
| + trace = Trace(json.load(trace_file)) |
| - processes = CollectProcesses(trace) |
| - symfiles = ResolveSymbolizableFiles(processes) |
| + Symbolize(options, trace, symbolizer) |
| - # Android trace files don't have any indication they are from Android. |
| - # So we're checking for Android-specific paths. |
| - if HaveFilesFromAndroid(symfiles): |
| - if not options.output_directory: |
| - parser.error('The trace file appears to be from Android. Please ' |
| - "specify output directory (e.g. 'out/Debug') to properly " |
| - 'symbolize it.') |
| - RemapAndroidFiles(symfiles, os.path.abspath(options.output_directory)) |
| + if trace.modified: |
| + trace.ApplyModifications() |
| - if SymbolizeFiles(symfiles, symbolizer): |
| if options.backup: |
| backup_file_path = trace_file_path + BACKUP_FILE_TAG |
| - print 'Backing up trace file to {}...'.format(backup_file_path) |
| + if os.path.exists(backup_file_path): |
| + for i in itertools.count(1): |
| + unique_file_path = '{}{}'.format(backup_file_path, i) |
| + if not os.path.exists(unique_file_path): |
| + backup_file_path = unique_file_path |
| + break |
| + print 'Backing up trace file to {}'.format(backup_file_path) |
| os.rename(trace_file_path, backup_file_path) |
| - print 'Updating trace file...' |
| - with _OpenTraceFile('w') as trace_file: |
| - json.dump(trace, trace_file) |
| + print 'Updating the trace file...' |
| + with OpenTraceFile(trace_file_path, 'w') as trace_file: |
| + json.dump(trace.Serialize(), trace_file) |
| else: |
| - print 'No PCs symbolized - not updating trace file.' |
| + print 'No modifications were made - not updating the trace file.' |
| if __name__ == '__main__': |