Chromium Code Reviews| Index: tracing/bin/symbolize_trace |
| diff --git a/tracing/bin/symbolize_trace b/tracing/bin/symbolize_trace |
| index 7c6f5a4e37b04144fc5bf255bb5509857023654a..851102f9114cbbdbe5a8e36df93e30fa91323591 100755 |
| --- a/tracing/bin/symbolize_trace |
| +++ b/tracing/bin/symbolize_trace |
| @@ -3,10 +3,67 @@ |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| +""" |
| +This script processes trace files and symbolizes stack frames generated by |
| +Chrome's native heap profiler. |
| + |
| +=== Overview === |
| + |
| +Trace file is essentially a giant JSON array of dictionaries (events). |
| +Events have some predefined keys, but otherwise are free to have anything |
| +inside. Trace file contains events from all Chrome processes that were |
| +sampled during tracing period ('pid' is one such predefined key). |
|
Wez
2017/04/29 00:41:21
nit: "(and 'pid' is an example..." reads oddly her
DmitrySkiba
2017/05/02 06:19:59
Done.
|
| + |
| +This script cares only about memory dump events generated by memory-infra |
|
Wez
2017/04/29 00:41:21
nit: Suggest "...dump events in trace files genera
DmitrySkiba
2017/05/02 06:19:59
Done.
|
| +component. |
| + |
| +When Chrome native heap profiling is enabled, some memory dump events |
| +include the following extra information: |
| + |
| +* (Per allocator) Information about live allocations at the moment of the |
| +  memory dump (the information includes backtraces, types / categories, |
| +  sizes, and counts of allocations). There are three allocators in |
| +  Chrome: malloc, blink_gc, and partition_alloc. |
|
Wez
2017/04/29 00:41:21
nit: If these are examples, not an exhaustive list
DmitrySkiba
2017/05/02 06:19:59
This is actually an exhaustive list.
Wez
2017/05/03 00:17:09
OK; in that case I would say "There are three allo
DmitrySkiba
2017/05/04 00:30:55
Acknowledged.
|
| + |
| +* (Per process) Stack frame tree of all functions that called allocators |
| +  above. (Even when backtraces don't reach main(), all frames still form |
| +  a single tree with an implicit root.) |
|
Wez
2017/04/29 00:41:21
nit: If we failed to trace all the way back to mai
DmitrySkiba
2017/05/02 06:19:59
It's still a single tree, just with an implicit ro
Wez
2017/05/03 00:17:10
OK; you could add a brief note that effect here, f
DmitrySkiba
2017/05/04 00:30:55
Acknowledged.
|
| + |
| +This script does the following: |
| + |
| +1. Parses the given trace file. |
| +2. Finds memory dump events and parses stack frame tree for each process. |
| +3. Finds stack frames that have PC addresses instead of function names. |
| +4. Symbolizes these PCs. |
| +5. Rewrites stack frame names (this updates parts of memory dump events). |
|
 Wez
 2017/04/29 00:41:21
 nit: You're missing #5 ;)
 It's also not clear wha
 DmitrySkiba
 2017/05/02 06:19:59
 Done. Added note about script not coalescing such
 Wez
 2017/05/03 00:17:09
 Acknowledged.
|
| +6. Updates the trace file. |
| + |
| +=== Details === |
| + |
| +There are two formats of heap profiler information: legacy and modern. The |
| +main differences are: |
| + |
| +* In the legacy format the stack frame tree is not dumped in memory dump |
| +  events, but in metadata events (one per process). I.e. it's sufficient |
| +  to parse a single metadata event to get the full stack frame tree for |
| +  a process. |
|
Wez
2017/04/29 00:41:21
IIUC the point here is that every "event" in a leg
DmitrySkiba
2017/05/02 06:19:59
Both formats dump live objects per allocator in ea
Wez
2017/05/03 00:17:10
Thanks for adding this detail, however it seems a
DmitrySkiba
2017/05/04 00:30:55
Well, the section is named "Details", and details
|
| + |
| +* In the modern format stack frame tree (also type name and string mappings) |
| + are dumped incrementally. I.e. each memory dump event carries additions to |
| + the stack frame tree that occurred since the previous memory dump event. |
|
Wez
2017/04/29 00:41:21
You might express this as each memory-infra event
DmitrySkiba
2017/05/02 06:19:59
Done.
|
| + To get the full stack frame tree for a process the script needs to parse |
| + all memory dump events. However, when wrappers write modifications back, |
| + they serialize everything into the first node and clear all other nodes |
| + (see ApplyModifications() below). |
|
Wez
2017/04/29 00:41:21
Not sure what you mean about moving everything int
DmitrySkiba
2017/05/02 06:19:59
Explained more.
|
| + |
| +* In the modern format stack frame tree doesn't reference name strings |
| + directly, but through a string mapping table. |
| + |
| +See crbug.com/708930 for more information about the modern format. |
| +""" |
| + |
| import argparse |
| import bisect |
| import collections |
| import gzip |
| +import itertools |
| import json |
| import os |
| import re |
| @@ -26,166 +83,82 @@ import symbolize_trace_atos_regex |
| import symbolize_trace_macho_reader |
| -# Relevant trace event phases from Chromium's |
| -# src/base/trace_event/common/trace_event_common.h. |
| -TRACE_EVENT_PHASE_METADATA = 'M' |
| -TRACE_EVENT_PHASE_MEMORY_DUMP = 'v' |
| +class NodeWrapper(object): |
| + """Wraps one or more event data nodes. |
| + A node is a reference into a trace event JSON. Wrappers parse nodes to |
| + provide convenient APIs and update nodes when asked to propagate changes |
| + back (see ApplyModifications() below). |
| -# Matches Android library paths, supports both K (/data/app-lib/<>/lib.so) |
| -# as well as L+ (/data/app/<>/lib/<>/lib.so). Library name is available |
| -# via 'name' group. |
| -ANDROID_PATH_MATCHER = re.compile( |
| - r'^/data/(?:' |
| - r'app/[^/]+/lib/[^/]+/|' |
| - r'app-lib/[^/]+/|' |
| - r'data/[^/]+/incremental-install-files/lib/' |
| - r')(?P<name>.*\.so)') |
| + Here is an example of legacy metadata event that contains stack frame tree: |
| -# Subpath of output path where unstripped libraries are stored. |
| -ANDROID_UNSTRIPPED_SUBPATH = 'lib.unstripped' |
| + { |
| + "args": { |
| + "stackFrames": { ... } |
| + }, |
| + "cat": "__metadata", |
| + "name": "stackFrames", |
| + "ph": "M", |
| + ... |
| + } |
| + When this event is encountered, a reference to the "stackFrames" dictionary |
| + is obtained and passed down to a specific wrapper class, which knows how to |
| + parse / update the dictionary. |
| -def FindInSystemPath(binary_name): |
| - paths = os.environ['PATH'].split(os.pathsep) |
| - for path in paths: |
| - binary_path = os.path.join(path, binary_name) |
| - if os.path.isfile(binary_path): |
| - return binary_path |
| - return None |
| + There are two parsing patterns depending on whether node is serialized |
| + incrementally: |
| + * If node is not incremental, then parsing is done by __init__(), |
| + see MemoryMap for an example. |
| -class Symbolizer(object): |
| - # Encapsulates platform-specific symbolization logic. |
| - def __init__(self): |
| - self.is_mac = sys.platform == 'darwin' |
| - self.is_win = sys.platform == 'win32' |
| - if self.is_mac: |
| - self.binary = 'atos' |
| - self._matcher = symbolize_trace_atos_regex.AtosRegexMatcher() |
| - elif self.is_win: |
| - self.binary = 'addr2line-pdb.exe' |
| - else: |
| - self.binary = 'addr2line' |
| - self.symbolizer_path = FindInSystemPath(self.binary) |
| + * If node is incremental, then __init__() does nothing, and ParseNext() |
| + is called when next node (from a next event) is encountered. |
| - def _SymbolizeLinuxAndAndroid(self, symfile, unsymbolized_name): |
| - def _SymbolizerCallback(sym_info, frames): |
| - # Unwind inline chain to the top. |
| - while sym_info.inlined_by: |
| - sym_info = sym_info.inlined_by |
| + Some wrappers can also modify nodes they parsed. In such cases they have |
| + additional APIs: |
| - symbolized_name = sym_info.name if sym_info.name else unsymbolized_name |
| - for frame in frames: |
| - frame.name = symbolized_name |
| + * 'modified' flag, which indicates whether the wrapper was changed. |
| - symbolizer = elf_symbolizer.ELFSymbolizer(symfile.symbolizable_path, |
| - self.symbolizer_path, |
| - _SymbolizerCallback, |
| - inlines=True) |
| - |
| - for address, frames in symfile.frames_by_address.iteritems(): |
| - # SymbolizeAsync() asserts that the type of address is int. We operate |
| - # on longs (since they are raw pointers possibly from 64-bit processes). |
| - # It's OK to cast here because we're passing relative PC, which should |
| - # always fit into int. |
| - symbolizer.SymbolizeAsync(int(address), frames) |
| - |
| - symbolizer.Join() |
| - |
| - |
| - def _SymbolizeMac(self, symfile): |
| - chars_max = int(subprocess.check_output("getconf ARG_MAX", shell=True)) |
| - |
| - # 16 for the address, 2 for "0x", 1 for the space |
| - chars_per_address = 19 |
| - |
| - load_address = (symbolize_trace_macho_reader. |
| - ReadMachOTextLoadAddress(symfile.symbolizable_path)) |
| - assert load_address is not None |
| - |
| - cmd_base = [self.symbolizer_path, '-arch', 'x86_64', '-l', |
| - '0x%x' % load_address, '-o', |
| - symfile.symbolizable_path] |
| - chars_for_other_arguments = len(' '.join(cmd_base)) + 1 |
| - |
| - # The maximum number of inputs that can be processed at once is limited by |
| - # ARG_MAX. This currently evalutes to ~13000 on macOS. |
| - max_inputs = (chars_max - chars_for_other_arguments) / chars_per_address |
| - |
| - all_keys = symfile.frames_by_address.keys() |
| - processed_keys_count = 0 |
| - while len(all_keys): |
| - input_count = min(len(all_keys), max_inputs) |
| - keys_to_process = all_keys[0:input_count] |
| - |
| - cmd = list(cmd_base) |
| - cmd.extend([hex(int(x) + load_address) |
| - for x in keys_to_process]) |
| - output_array = subprocess.check_output(cmd).split('\n') |
| - for i in range(len(keys_to_process)): |
| - for frame in (symfile.frames_by_address.values() |
| - [i + processed_keys_count]): |
| - frame.name = self._matcher.Match(output_array[i]) |
| - processed_keys_count += len(keys_to_process) |
| - all_keys = all_keys[input_count:] |
| - |
| - |
| - def _SymbolizeWin(self, symfile): |
| - """Invoke symbolizer binary on windows and write all input in one go. |
| + * 'ApplyModifications' method, which propagates changes made to the wrapper |
| + back to nodes. Successful invocation of ApplyModifications() resets |
| + 'modified' flag. |
| - Unlike linux, on windows, symbolization talks through a shared system |
| - service that handles communication with the NT symbol servers. This |
| - creates an explicit serialization (and therefor lock contention) of |
| - any process using the symbol API for files do not have a local PDB. |
| + """ |
| - Thus, even though the windows symbolizer binary can be make command line |
| - compatible with the POSIX addr2line interface, paralellizing the |
| - symbolization does not yield the same performance effects. Running |
| - just one symbolizer seems good enough for now. Can optimize later |
| - if this becomes a bottleneck. |
| - """ |
| - cmd = [self.symbolizer_path, '--functions', '--demangle', '--exe', |
| - symfile.symbolizable_path] |
| + # def __init__(self, node): |
| + # ... |
| - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, |
| - stderr=sys.stderr) |
| - addrs = ["%x" % relative_pc for relative_pc in |
| - symfile.frames_by_address.keys()] |
| - (stdout_data, stderr_data) = proc.communicate('\n'.join(addrs)) |
| - stdout_data = stdout_data.split('\n') |
| + # def ParseNext(self, node, ...): |
| + # ... |
| - # This is known to be in the same order as stderr_data. |
| - for i, addr in enumerate(addrs): |
| - for frame in symfile.frames_by_address[int(addr, 16)]: |
| - # Output of addr2line with --functions is always 2 outputs per |
| - # symbol, function name followed by source line number. Only grab |
| - # the function name as line info is not always available. |
| - frame.name = stdout_data[i * 2] |
| + # @property |
| + # def modified(self): |
| + # ... |
| + # def ApplyModifications(self, ...): |
| + # ... |
| - def Symbolize(self, symfile, unsymbolized_name): |
| - if self.is_mac: |
| - self._SymbolizeMac(symfile) |
| - if self.is_win: |
| - self._SymbolizeWin(symfile) |
| - else: |
| - self._SymbolizeLinuxAndAndroid(symfile, unsymbolized_name) |
| + pass |
| - def IsSymbolizableFile(self, file_path): |
| - if self.is_win: |
| - extension = os.path.splitext(file_path)[1].lower() |
| - return extension in ['.dll', '.exe'] |
| - else: |
| - result = subprocess.check_output(['file', '-0', file_path]) |
| - type_string = result[result.find('\0') + 1:] |
| - return bool(re.match(r'.*(ELF|Mach-O) (32|64)-bit\b.*', |
| - type_string, re.DOTALL)) |
| +class MemoryMap(NodeWrapper): |
| + """Wraps 'process_mmaps' node. |
| + 'process_mmaps' node contains information about file mappings. |
| -class ProcessMemoryMaps(object): |
| - """Represents 'process_mmaps' trace file entry.""" |
| + "process_mmaps": { |
| + "vm_regions": [ |
| + { |
| + "mf": "<file_path>", |
| + "sa": "<start_address>", |
| + "sz": "<size>", |
| + ... |
| + }, |
| + ... |
| + ] |
| + } |
| + """ |
| class Region(object): |
| def __init__(self, start_address, size, file_path): |
| @@ -221,15 +194,13 @@ class ProcessMemoryMaps(object): |
| return 'Region(0x{:X} - 0x{:X}, {})'.format( |
| self.start_address, self.end_address, self.file_path) |
| - def __init__(self, process_mmaps): |
| - """Parses 'process_mmaps' dictionary.""" |
| - |
| + def __init__(self, process_mmaps_node): |
| regions = [] |
| - for region_value in process_mmaps['vm_regions']: |
| + for region_node in process_mmaps_node['vm_regions']: |
| regions.append(self.Region( |
| - long(region_value['sa'], 16), |
| - long(region_value['sz'], 16), |
| - region_value['mf'])) |
| + long(region_node['sa'], 16), |
| + long(region_node['sz'], 16), |
| + region_node['mf'])) |
| regions.sort() |
| # Copy regions without duplicates and check for overlaps. |
| @@ -259,104 +230,540 @@ class ProcessMemoryMaps(object): |
| return None |
| -class StackFrames(object): |
| - """Represents 'stackFrames' trace file entry.""" |
| +class UnsupportedHeapDumpVersionError(Exception): |
| + """Helper exception class to signal unsupported heap dump version.""" |
| + |
| + def __init__(self, version): |
| + message = 'Unsupported heap dump version: {}'.format(version) |
| + super(UnsupportedHeapDumpVersionError, self).__init__(message) |
| + |
| + |
| +class StringMap(NodeWrapper): |
| + """Wraps all 'strings' nodes for a process. |
| + |
| + 'strings' node contains incremental mappings between integer ids and strings. |
| - class PCFrame(object): |
| - def __init__(self, pc, frame): |
| + "strings": [ |
| + { |
| + "id": <string_id>, |
| + "string": <string> |
| + }, |
| + ... |
| + ] |
| + """ |
| + |
| + def __init__(self): |
| + self._modified = False |
| + self._strings_nodes = [] |
| + self._string_by_id = {} |
| + self._id_by_string = {} |
| + self._max_string_id = 0 |
| + |
| + @property |
| + def modified(self): |
| + """Returns True if the wrapper was modified (see NodeWrapper).""" |
| + return self._modified |
| + |
| + @property |
| + def string_by_id(self): |
| + return self._string_by_id |
| + |
| + def ParseNext(self, heap_dump_version, strings_node): |
| + """Parses and interns next node (see NodeWrapper).""" |
| + |
| + if heap_dump_version != Trace.HEAP_DUMP_VERSION_1: |
| + raise UnsupportedHeapDumpVersionError(heap_dump_version) |
| + |
| + self._strings_nodes.append(strings_node) |
| + for string_node in strings_node: |
| + self._Insert(string_node['id'], string_node['string']) |
| + |
| + def Clear(self): |
| + """Clears all string mappings.""" |
| + if self._string_by_id: |
| + self._modified = True |
| + self._string_by_id = {} |
| + self._id_by_string = {} |
| + self._Insert(0, '[null]') |
| + self._max_string_id = 0 |
| + |
| + def AddString(self, string): |
| + """Adds a string (if it doesn't exist) and returns its integer id.""" |
| + string_id = self._id_by_string.get(string) |
| + if string_id is None: |
| + string_id = self._max_string_id + 1 |
| + self._Insert(string_id, string) |
| + self._modified = True |
| + return string_id |
| + |
| + def ApplyModifications(self): |
| + """Propagates modifications back to nodes (see NodeWrapper).""" |
| + if not self.modified: |
| + return |
| + |
| + assert self._strings_nodes, 'no nodes' |
| + |
| + # Serialize into the first node, and clear all others. |
| + |
| + for strings_node in self._strings_nodes: |
| + del strings_node[:] |
| + strings_node = self._strings_nodes[0] |
| + for string_id, string in self._string_by_id.iteritems(): |
| + strings_node.append({'id': string_id, 'string': string}) |
| + |
| + self._modified = False |
| + |
| + def _Insert(self, string_id, string): |
| + self._id_by_string[string] = string_id |
| + self._string_by_id[string_id] = string |
| + self._max_string_id = max(self._max_string_id, string_id) |
| + |
| + |
| +class TypeNameMap(NodeWrapper): |
| + """Wraps all 'types' nodes for a process. |
| + |
| + 'types' nodes encode mappings between integer type ids and integer |
| + string ids (from 'strings' nodes). |
| + |
| + "types": [ |
| + { |
| + "id": <type_id>, |
| + "name_sid": <name_string_id> |
| + } |
| + ... |
| + ] |
| + |
| + For simplicity string ids are translated into strings during parsing, |
| + and then translated back to ids in ApplyModifications(). |
| + """ |
| + def __init__(self): |
| + self._modified = False |
| + self._type_name_nodes = [] |
| + self._name_by_id = {} |
| + self._id_by_name = {} |
| + self._max_type_id = 0 |
| + |
| + @property |
| + def modified(self): |
| + """Returns True if the wrapper was modified (see NodeWrapper).""" |
| + return self._modified |
| + |
| + @property |
| + def name_by_id(self): |
| + """Returns {id -> name} dict (must not be changed directly).""" |
| + return self._name_by_id |
| + |
| + def ParseNext(self, heap_dump_version, type_name_node, string_map): |
| + """Parses and interns next node (see NodeWrapper). |
| + |
| + |string_map| - A StringMap object to use to translate string ids |
| + to strings. |
| + """ |
| + if heap_dump_version != Trace.HEAP_DUMP_VERSION_1: |
| + raise UnsupportedHeapDumpVersionError(heap_dump_version) |
| + |
| + self._type_name_nodes.append(type_name_node) |
| + for type_node in type_name_node: |
| + self._Insert(type_node['id'], |
| + string_map.string_by_id[type_node['name_sid']]) |
| + |
| + def AddType(self, type_name): |
| + """Adds a type name (if it doesn't exist) and returns its id.""" |
| + type_id = self._id_by_name.get(type_name) |
| + if type_id is None: |
| + type_id = self._max_type_id + 1 |
| + self._Insert(type_id, type_name) |
| + self._modified = True |
| + return type_id |
| + |
| + def ApplyModifications(self, string_map, force=False): |
| + """Propagates modifications back to nodes. |
| + |
| + |string_map| - A StringMap object to use to translate strings to ids. |
| + |force| - Whether to propagate changes regardless of 'modified' flag. |
| + """ |
| + if not self.modified and not force: |
| + return |
| + |
| + assert self._type_name_nodes, 'no nodes' |
| + |
| + # Serialize into the first node, and clear all others. |
| + |
| + for types_node in self._type_name_nodes: |
| + del types_node[:] |
| + types_node = self._type_name_nodes[0] |
| + for type_id, type_name in self._name_by_id.iteritems(): |
| + types_node.append({ |
| + 'id': type_id, |
| + 'name_sid': string_map.AddString(type_name)}) |
| + |
| + self._modified = False |
| + |
| + def _Insert(self, type_id, type_name): |
| + self._id_by_name[type_name] = type_id |
| + self._name_by_id[type_id] = type_name |
| + self._max_type_id = max(self._max_type_id, type_id) |
| + |
| + |
| +class StackFrameMap(NodeWrapper): |
| + """ Wraps stack frame tree nodes for a process. |
| + |
| + For the legacy format this wrapper expects a single 'stackFrames' node |
| + (which comes from metadata event): |
| + |
| + "stackFrames": { |
| + "<frame_id>": { |
| + "name": "<frame_name>" |
| + "parent": "<parent_frame_id>" |
| + }, |
| + ... |
| + } |
| + |
| + For the modern format this wrapper expects several 'nodes' nodes: |
| + |
| + "nodes": [ |
| + { |
| + "id": <frame_id>, |
| + "parent": <parent_frame_id>, |
| + "name_sid": <name_string_id> |
| + }, |
| + ... |
| + ] |
| + |
| + In both formats frame name is a string. Native heap profiler generates |
| + specially formatted frame names (e.g. "pc:10eb78dba") for function |
| + addresses (PCs). Inner Frame class below parses name and extracts PC, |
| + if it's there. |
| + """ |
| + class Frame(object): |
| + def __init__(self, frame_id, name, parent_frame_id): |
| self._modified = False |
| - self._pc = pc |
| - self._frame = frame |
| + self._id = frame_id |
| + self._name = name |
| + self._pc = self._ParsePC(name) |
| + self._parent_id = parent_frame_id |
| + self._ext = None |
| @property |
| def modified(self): |
| + """Returns True if the frame was modified. |
| + |
| + For example changing frame's name sets this flag (since the change |
| + needs to be propagated back to nodes). |
| + """ |
| return self._modified |
| @property |
| + def id(self): |
| + """Frame id (integer).""" |
| + return self._id |
| + |
| + @property |
| def pc(self): |
| + """Parsed (integer) PC of the frame, or None.""" |
| return self._pc |
| @property |
| def name(self): |
| - return self._frame['name'] |
| + """Name of the frame (see above).""" |
| + return self._name |
| @name.setter |
| def name(self, value): |
| + """Changes the name. Doesn't affect value of |pc|.""" |
| self._modified = True |
| - self._frame['name'] = value |
| + self._name = value |
| - def __init__(self, stack_frames): |
| - """Constructs object using 'stackFrames' dictionary.""" |
| - self._pc_frames = [] |
| - for frame in stack_frames.itervalues(): |
| - pc_frame = self._ParsePCFrame(frame) |
| - if pc_frame: |
| - self._pc_frames.append(pc_frame) |
| + @property |
| + def parent_id(self): |
| + """Parent frame id (integer).""" |
| + return self._parent_id |
| - @property |
| - def pc_frames(self): |
| - return self._pc_frames |
| + _PC_TAG = 'pc:' |
| + |
| + def _ParsePC(self, name): |
| + if not name.startswith(self._PC_TAG): |
| + return None |
| + return long(name[len(self._PC_TAG):], 16) |
| + |
| + def _ClearModified(self): |
| + self._modified = False |
| + |
| + def __init__(self): |
| + self._modified = False |
| + self._heap_dump_version = None |
| + self._stack_frames_nodes = [] |
| + self._frame_by_id = {} |
| @property |
| def modified(self): |
| - return any(f.modified for f in self._pc_frames) |
| + """Returns True if the wrapper or any of its frames were modified.""" |
| + return (self._modified or |
| + any(f.modified for f in self._frame_by_id.itervalues())) |
| - _PC_TAG = 'pc:' |
| + @property |
| + def frame_by_id(self): |
| + """Returns {id -> frame} dict (must not be modified directly).""" |
| + return self._frame_by_id |
| - @classmethod |
| - def _ParsePCFrame(self, frame): |
| - name = frame['name'] |
| - if not name.startswith(self._PC_TAG): |
| - return None |
| - pc = long(name[len(self._PC_TAG):], 16) |
| - return self.PCFrame(pc, frame) |
| + def ParseNext(self, heap_dump_version, stack_frames_node, string_map): |
| + """Parses the next stack frames node (see NodeWrapper). |
| + For the modern format |string_map| is used to translate string ids |
| + to strings. |
| + """ |
| -class Process(object): |
| - """Holds various bits of information about a process in a trace file.""" |
| + frame_by_id = {} |
| + if heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + if self._stack_frames_nodes: |
| + raise Exception('Legacy stack frames node is expected only once.') |
| + for frame_id, frame_node in stack_frames_node.iteritems(): |
| + frame = self.Frame(frame_id, |
| + frame_node['name'], |
| + frame_node.get('parent')) |
| + frame_by_id[frame.id] = frame |
| + else: |
| + if heap_dump_version != Trace.HEAP_DUMP_VERSION_1: |
| + raise UnsupportedHeapDumpVersionError(heap_dump_version) |
| + for frame_node in stack_frames_node: |
| + frame = self.Frame(frame_node['id'], |
| + string_map.string_by_id[frame_node['name_sid']], |
| + frame_node.get('parent')) |
| + frame_by_id[frame.id] = frame |
| - def __init__(self, pid): |
| - self.pid = pid |
| - self.name = None |
| - self.mmaps = None |
| - self.stack_frames = None |
| + self._heap_dump_version = heap_dump_version |
| + self._stack_frames_nodes.append(stack_frames_node) |
| + self._frame_by_id = frame_by_id |
| + |
| + def ApplyModifications(self, string_map, force=False): |
| + """Applies modifications back to nodes (see NodeWrapper).""" |
| + |
| + if not self.modified and not force: |
| + return |
| + |
| + assert self._stack_frames_nodes, 'no nodes' |
| + if self._heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + assert string_map is None, \ |
| + 'string_map should not be used with the legacy format' |
| + |
| + # Serialize frames into the first node, clear all others. |
| + |
| + for frames_node in self._stack_frames_nodes: |
| + if self._heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + frames_node.clear() |
| + else: |
| + del frames_node[:] |
| -def CollectProcesses(trace): |
| - """Parses trace dictionary and returns pid->Process map of all processes |
| - suitable for symbolization (which have both mmaps and stack_frames). |
| + frames_node = self._stack_frames_nodes[0] |
| + for frame in self._frame_by_id.itervalues(): |
| + if self._heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + frame_node = {'name': frame.name} |
| + frames_node[frame.id] = frame_node |
| + else: |
| + frame_node = { |
| + 'id': frame.id, |
| + 'name_sid': string_map.AddString(frame.name) |
| + } |
| + frames_node.append(frame_node) |
| + if frame.parent_id is not None: |
| + frame_node['parent'] = frame.parent_id |
| + frame._ClearModified() |
| + |
| + self._modified = False |
| + |
| + |
| +class Trace(NodeWrapper): |
| + """Wrapper for the root trace node (i.e. the trace JSON itself). |
| + |
| + This wrapper parses select nodes from memory-infra events and groups |
| + parsed data per-process (see inner Process class below). |
| """ |
| - process_map = {} |
| + # Indicates legacy heap dump format. |
| + HEAP_DUMP_VERSION_LEGACY = 'Legacy' |
| - # Android traces produced via 'chrome://inspect/?tracing#devices' are |
| - # just list of events. |
| - events = trace if isinstance(trace, list) else trace['traceEvents'] |
| - for event in events: |
| - name = event.get('name') |
| - if not name: |
| - continue |
| + # Indicates variation of a modern heap dump format. |
| + HEAP_DUMP_VERSION_1 = 1 |
| + |
| + class Process(object): |
| + """Collection of per-process data and wrappers.""" |
| + |
| + def __init__(self, pid): |
| + self._pid = pid |
| + self._name = None |
| + self._memory_map = None |
| + self._stack_frame_map = StackFrameMap() |
| + self._type_name_map = TypeNameMap() |
| + self._string_map = StringMap() |
| + self._heap_dump_version = None |
| + |
| + @property |
| + def modified(self): |
| + return self._stack_frame_map.modified or self._type_name_map.modified |
| + |
| + @property |
| + def pid(self): |
| + return self._pid |
| + |
| + @property |
| + def name(self): |
| + return self._name |
| + |
| + @property |
| + def unique_name(self): |
| + """Returns string that includes both process name and its pid.""" |
| + name = self._name if self._name else 'UnnamedProcess' |
| + return '{}({})'.format(name, self._pid) |
| + |
| + @property |
| + def memory_map(self): |
| + return self._memory_map |
| + |
| + @property |
| + def stack_frame_map(self): |
| + return self._stack_frame_map |
| + |
| + @property |
| + def type_name_map(self): |
| + return self._type_name_map |
| + |
| + def ApplyModifications(self): |
| + """Calls ApplyModifications() on contained wrappers.""" |
| + if self._heap_dump_version == Trace.HEAP_DUMP_VERSION_LEGACY: |
| + self._stack_frame_map.ApplyModifications(None) |
| + else: |
| + if self._stack_frame_map.modified or self._type_name_map.modified: |
| + self._string_map.Clear() |
| + self._stack_frame_map.ApplyModifications(self._string_map, force=True) |
| + self._type_name_map.ApplyModifications(self._string_map, force=True) |
| + self._string_map.ApplyModifications() |
| + |
| + def __init__(self, trace_node): |
| + self._trace_node = trace_node |
| + self._processes = [] |
| + self._heap_dump_version = None |
| + |
| + # Misc per-process information needed only during parsing. |
| + class ProcessExt(object): |
| + def __init__(self, pid): |
| + self.process = Trace.Process(pid) |
| + self.mapped_entry_names = set() |
| + self.process_mmaps_node = None |
| + self.seen_strings_node = False |
| + |
| + process_ext_by_pid = {} |
| + |
| + # Android traces produced via 'chrome://inspect/?tracing#devices' are |
| + # just list of events. |
| + events = trace_node if isinstance(trace_node, list) \ |
| + else trace_node['traceEvents'] |
| + for event in events: |
| + name = event.get('name') |
| + if not name: |
| + continue |
| - pid = event['pid'] |
| - process = process_map.get(pid) |
| - if process is None: |
| - process = Process(pid) |
| - process_map[pid] = process |
| + pid = event['pid'] |
| + process_ext = process_ext_by_pid.get(pid) |
| + if process_ext is None: |
| + process_ext = ProcessExt(pid) |
| + process_ext_by_pid[pid] = process_ext |
| + process = process_ext.process |
| + |
| + phase = event['ph'] |
| + if phase == self._EVENT_PHASE_METADATA: |
| + if name == 'process_name': |
| + process._name = event['args']['name'] |
| + elif name == 'stackFrames': |
| + process._stack_frame_map.ParseNext( |
| + self._UseHeapDumpVersion(self.HEAP_DUMP_VERSION_LEGACY), |
| + event['args']['stackFrames'], |
| + process._string_map) |
| + elif phase == self._EVENT_PHASE_MEMORY_DUMP: |
| + dumps = event['args']['dumps'] |
| + process_mmaps = dumps.get('process_mmaps') |
| + if process_mmaps: |
| + # We want the most recent memory map, so parsing happens later |
| + # once we finished reading all events. |
| + process_ext.process_mmaps_node = process_mmaps |
| + heaps = dumps.get('heaps_v2') |
| + if heaps: |
| + version = self._UseHeapDumpVersion(heaps['version']) |
| + maps = heaps.get('maps') |
| + if maps: |
| + process_ext.mapped_entry_names.update(maps.iterkeys()) |
| + types = maps.get('types') |
| + stack_frames = maps.get('nodes') |
| + strings = maps.get('strings') |
| + if (strings is None and (types or stack_frames) |
| + and not process_ext.seen_strings_node): |
| + # ApplyModifications() for TypeNameMap and StackFrameMap puts |
| + # everything into the first node and depends on StringMap. So |
| + # we need to make sure that 'strings' node is there if any of |
| + # other two nodes present. |
| + strings = [] |
| + maps['strings'] = strings |
| + if strings is not None: |
| + process_ext.seen_strings_node = True |
| + process._string_map.ParseNext(version, strings) |
| + if types: |
| + process._type_name_map.ParseNext( |
| + version, types, process._string_map) |
| + if stack_frames: |
| + process._stack_frame_map.ParseNext( |
| + version, stack_frames, process._string_map) |
| + |
| + self._processes = [] |
| + for pe in process_ext_by_pid.itervalues(): |
| + pe.process._heap_dump_version = self._heap_dump_version |
| + if pe.process_mmaps_node: |
| + # Now parse the most recent memory map. |
| + pe.process._memory_map = MemoryMap(pe.process_mmaps_node) |
| + self._processes.append(pe.process) |
| - phase = event['ph'] |
| - if phase == TRACE_EVENT_PHASE_METADATA: |
| - if name == 'process_name': |
| - process.name = event['args']['name'] |
| - elif name == 'stackFrames': |
| - process.stack_frames = StackFrames(event['args']['stackFrames']) |
| - elif phase == TRACE_EVENT_PHASE_MEMORY_DUMP: |
| - process_mmaps = event['args']['dumps'].get('process_mmaps') |
| - if process_mmaps: |
| - # TODO(dskiba): this parses all process_mmaps, but retains only the |
| - # last one. We need to parse only once (lazy parsing?). |
| - process.mmaps = ProcessMemoryMaps(process_mmaps) |
@property
def node(self):
  """The root JSON node this object was constructed around (see __init__)."""
  trace_root = self._trace_node
  return trace_root
| + |
@property
def modified(self):
  """Whether any process carries changes that must be written back.

  If this returns True, ApplyModifications() has to be called before the
  trace JSON is serialized to a file.
  """
  for proc in self._processes:
    if proc.modified:
      return True
  return False
| + |
@property
def processes(self):
  """The list of per-process objects parsed from the trace."""
  return self._processes
| - return [p for p in process_map.itervalues() if p.mmaps and p.stack_frames] |
@property
def heap_dump_version(self):
  """Heap dump version shared by all dumps in the trace (None if unseen)."""
  return self._heap_dump_version
| + |
def ApplyModifications(self):
  """Writes accumulated modifications back into the trace JSON nodes.

  Delegates to each process; afterwards nothing may remain modified.
  """
  for proc in self._processes:
    proc.ApplyModifications()
  assert not self.modified, 'still modified'
| + |
# Relevant trace event phases from Chromium's
# src/base/trace_event/common/trace_event_common.h.
# 'M' marks metadata events (e.g. 'process_name'); 'v' marks periodic
# memory dump events produced by memory-infra.
_EVENT_PHASE_METADATA = 'M'
_EVENT_PHASE_MEMORY_DUMP = 'v'
| + |
| + def _UseHeapDumpVersion(self, version): |
| + if self._heap_dump_version is None: |
| + self._heap_dump_version = version |
| + return version |
| + elif self._heap_dump_version != version: |
| + raise Exception( |
| + ("Inconsistent trace file: first saw '{}' heap dump version, " |
| + "then '{}'.").format(self._heap_dump_version, version)) |
| + else: |
| + return version |
| class SymbolizableFile(object): |
| @@ -381,8 +788,12 @@ def ResolveSymbolizableFiles(processes): |
| """ |
| symfile_by_path = {} |
| for process in processes: |
| - for frame in process.stack_frames.pc_frames: |
| - region = process.mmaps.FindRegion(frame.pc) |
| + if not process.memory_map: |
| + continue |
| + for frame in process.stack_frame_map.frame_by_id.itervalues(): |
| + if frame.pc is None: |
| + continue |
| + region = process.memory_map.FindRegion(frame.pc) |
| if region is None: |
| frame.name = '<unresolved>' |
| continue |
| @@ -397,15 +808,155 @@ def ResolveSymbolizableFiles(processes): |
| return symfile_by_path.values() |
def FindInSystemPath(binary_name):
  """Returns the full path of |binary_name| found via $PATH, or None."""
  for directory in os.environ['PATH'].split(os.pathsep):
    candidate = os.path.join(directory, binary_name)
    if os.path.isfile(candidate):
      return candidate
  return None
| + |
| + |
class Symbolizer(object):
  """Encapsulates platform-specific symbolization logic.

  Selects the external symbolizer binary based on the host platform
  (atos on macOS, addr2line-pdb.exe on Windows, addr2line elsewhere)
  and drives it over the addresses collected in SymbolizableFile objects.
  """

  def __init__(self):
    self.is_mac = sys.platform == 'darwin'
    self.is_win = sys.platform == 'win32'
    if self.is_mac:
      self.binary = 'atos'
      self._matcher = symbolize_trace_atos_regex.AtosRegexMatcher()
    elif self.is_win:
      self.binary = 'addr2line-pdb.exe'
    else:
      self.binary = 'addr2line'
    # May be None if the binary is not on PATH; callers must check before
    # attempting to symbolize.
    self.symbolizer_path = FindInSystemPath(self.binary)

  def _SymbolizeLinuxAndAndroid(self, symfile, unsymbolized_name):
    """Symbolizes an ELF file asynchronously via addr2line.

    Frames whose symbol cannot be resolved get |unsymbolized_name|.
    """
    def _SymbolizerCallback(sym_info, frames):
      # Unwind inline chain to the top.
      while sym_info.inlined_by:
        sym_info = sym_info.inlined_by

      symbolized_name = sym_info.name if sym_info.name else unsymbolized_name
      for frame in frames:
        frame.name = symbolized_name
        frame.ext.source_path = sym_info.source_path

    symbolizer = elf_symbolizer.ELFSymbolizer(symfile.symbolizable_path,
                                              self.symbolizer_path,
                                              _SymbolizerCallback,
                                              inlines=True)

    for address, frames in symfile.frames_by_address.iteritems():
      # SymbolizeAsync() asserts that the type of address is int. We operate
      # on longs (since they are raw pointers possibly from 64-bit processes).
      # It's OK to cast here because we're passing relative PC, which should
      # always fit into int.
      symbolizer.SymbolizeAsync(int(address), frames)

    symbolizer.Join()


  def _SymbolizeMac(self, symfile):
    """Symbolizes a Mach-O file by batching addresses through atos.

    atos takes addresses as command-line arguments, so inputs are chunked
    to stay below the system ARG_MAX limit.
    """
    chars_max = int(subprocess.check_output("getconf ARG_MAX", shell=True))

    # 16 for the address, 2 for "0x", 1 for the space
    chars_per_address = 19

    load_address = (symbolize_trace_macho_reader.
                    ReadMachOTextLoadAddress(symfile.symbolizable_path))
    assert load_address is not None

    cmd_base = [self.symbolizer_path, '-arch', 'x86_64', '-l',
                '0x%x' % load_address, '-o',
                symfile.symbolizable_path]
    chars_for_other_arguments = len(' '.join(cmd_base)) + 1

    # The maximum number of inputs that can be processed at once is limited by
    # ARG_MAX. This currently evaluates to ~13000 on macOS.
    max_inputs = (chars_max - chars_for_other_arguments) / chars_per_address

    all_keys = symfile.frames_by_address.keys()
    processed_keys_count = 0
    while len(all_keys):
      input_count = min(len(all_keys), max_inputs)
      keys_to_process = all_keys[0:input_count]
      cmd = list(cmd_base)
      # atos expects absolute addresses, hence the load-address offset.
      cmd.extend([hex(int(x) + load_address)
                  for x in keys_to_process])
      output_array = subprocess.check_output(cmd).split('\n')
      for i in range(len(keys_to_process)):
        # NOTE: relies on keys() and values() enumerating in the same order,
        # which holds as long as the dict is not modified in between.
        for frame in (symfile.frames_by_address.values()
                      [i + processed_keys_count]):
          frame.name = self._matcher.Match(output_array[i])
      processed_keys_count += len(keys_to_process)
      all_keys = all_keys[input_count:]

  def _SymbolizeWin(self, symfile):
    """Invoke symbolizer binary on windows and write all input in one go.

    Unlike linux, on windows, symbolization talks through a shared system
    service that handles communication with the NT symbol servers. This
    creates an explicit serialization (and therefore lock contention) of
    any process using the symbol API for files that do not have a local
    PDB.

    Thus, even though the windows symbolizer binary can be made
    command-line compatible with the POSIX addr2line interface,
    parallelizing the symbolization does not yield the same performance
    effects. Running just one symbolizer seems good enough for now. Can
    optimize later if this becomes a bottleneck.
    """
    cmd = [self.symbolizer_path, '--functions', '--demangle', '--exe',
           symfile.symbolizable_path]

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                            stderr=sys.stderr)
    addrs = ["%x" % relative_pc for relative_pc in
             symfile.frames_by_address.keys()]
    (stdout_data, stderr_data) = proc.communicate('\n'.join(addrs))
    stdout_data = stdout_data.split('\n')

    # Output lines correspond, in order, to the addresses written to stdin.
    for i, addr in enumerate(addrs):
      for frame in symfile.frames_by_address[int(addr, 16)]:
        # Output of addr2line with --functions is always 2 outputs per
        # symbol, function name followed by source line number. Only grab
        # the function name as line info is not always available.
        frame.name = stdout_data[i * 2]

  def Symbolize(self, symfile, unsymbolized_name):
    """Dispatches |symfile| to the platform-specific symbolization path."""
    if self.is_mac:
      self._SymbolizeMac(symfile)
    elif self.is_win:
      self._SymbolizeWin(symfile)
    else:
      self._SymbolizeLinuxAndAndroid(symfile, unsymbolized_name)

  def IsSymbolizableFile(self, file_path):
    """Returns True if |file_path| looks like a native binary.

    On Windows this checks the extension; elsewhere it runs file(1)
    ('-0' separates the file name from the description with a NUL) and
    matches ELF / Mach-O descriptions.
    """
    if self.is_win:
      extension = os.path.splitext(file_path)[1].lower()
      return extension in ['.dll', '.exe']
    else:
      result = subprocess.check_output(['file', '-0', file_path])
      type_string = result[result.find('\0') + 1:]
      return bool(re.match(r'.*(ELF|Mach-O) (32|64)-bit\b.*',
                           type_string, re.DOTALL))
| + |
| + |
| def SymbolizeFiles(symfiles, symbolizer): |
| """Symbolizes each file in the given list of SymbolizableFiles |
| and updates stack frames with symbolization results.""" |
| + |
| + if not symfiles: |
| + print 'Nothing to symbolize.' |
| + return |
| + |
| print 'Symbolizing...' |
| def _SubPrintf(message, *args): |
| print (' ' + message).format(*args) |
| - symbolized = False |
| for symfile in symfiles: |
| unsymbolized_name = '<{}>'.format( |
| symfile.path if symfile.path else 'unnamed') |
| @@ -432,9 +983,20 @@ def SymbolizeFiles(symfiles, symbolizer): |
| symfile.path) |
| symbolizer.Symbolize(symfile, unsymbolized_name) |
| - symbolized = True |
| - return symbolized |
| + |
# Matches Android library paths, supports both K (/data/app-lib/<>/lib.so)
# as well as L+ (/data/app/<>/lib/<>/lib.so), plus incremental-install
# locations (/data/data/<>/incremental-install-files/lib/lib.so).
# Library name is available via 'name' group.
ANDROID_PATH_MATCHER = re.compile(
    r'^/data/(?:'
    r'app/[^/]+/lib/[^/]+/|'
    r'app-lib/[^/]+/|'
    r'data/[^/]+/incremental-install-files/lib/'
    r')(?P<name>.*\.so)')

# Subpath of output path where unstripped libraries are stored.
ANDROID_UNSTRIPPED_SUBPATH = 'lib.unstripped'
| def HaveFilesFromAndroid(symfiles): |
| @@ -455,59 +1017,87 @@ def RemapAndroidFiles(symfiles, output_path): |
| symfile.symbolizable_path = 'android://{}'.format(symfile.path) |
def Symbolize(options, trace, symbolizer):
  """Resolves symbolizable files from |trace| and symbolizes them.

  Exits with an error if the trace appears to come from Android and no
  build output directory was specified.
  """
  symfiles = ResolveSymbolizableFiles(trace.processes)

  # Android traces carry no explicit platform marker, so detect them by
  # their Android-specific library paths.
  if HaveFilesFromAndroid(symfiles):
    output_directory = options.output_directory
    if not output_directory:
      sys.exit('The trace file appears to be from Android. Please '
               'specify output directory to properly symbolize it.')
    RemapAndroidFiles(symfiles, os.path.abspath(output_directory))

  SymbolizeFiles(symfiles, symbolizer)
| + |
| + |
def OpenTraceFile(file_path, mode):
  """Opens a trace file, transparently handling gzip compression.

  |mode| is 'r' or 'w'. Gzipped traces ('.gz') are opened in binary
  mode, plain JSON traces in text mode.
  """
  if not file_path.endswith('.gz'):
    return open(file_path, mode + 't')
  return gzip.open(file_path, mode + 'b')
| + |
| + |
| # Suffix used for backup files. |
| BACKUP_FILE_TAG = '.BACKUP' |
| def main(): |
| - parser = argparse.ArgumentParser() |
| - parser.add_argument('file', |
| - help='Trace file to symbolize (.json or .json.gz)') |
| - parser.add_argument('--no-backup', |
| - dest='backup', default='true', action='store_false', |
| - help="Don't create {} files".format(BACKUP_FILE_TAG)) |
| - parser.add_argument('--output-directory', |
| - help='The path to the build output directory, such ' + |
| - 'as out/Debug. Only needed for Android.') |
| - options = parser.parse_args() |
| - |
| - trace_file_path = options.file |
| - def _OpenTraceFile(mode): |
| - if trace_file_path.endswith('.gz'): |
| - return gzip.open(trace_file_path, mode + 'b') |
| - else: |
| - return open(trace_file_path, mode + 't') |
| + class MultilineHelpFormatter(argparse.HelpFormatter): |
| + def _split_lines(self, text, width): |
| + extra_lines = [] |
| + if '\n' in text: |
| + lines = text.splitlines() |
| + text = lines[0] |
| + extra_lines = lines[1:] |
| + return super(MultilineHelpFormatter, self)._split_lines(text, width) + \ |
| + extra_lines |
| + |
| + parser = argparse.ArgumentParser(formatter_class=MultilineHelpFormatter) |
| + parser.add_argument( |
| + 'file', |
| + help='Trace file to symbolize (.json or .json.gz)') |
| + |
| + parser.add_argument( |
| + '--no-backup', dest='backup', default='true', action='store_false', |
| + help="Don't create {} files".format(BACKUP_FILE_TAG)) |
| + |
| + parser.add_argument( |
| + '--output-directory', |
| + help='The path to the build output directory, such as out/Debug.') |
| symbolizer = Symbolizer() |
| if symbolizer.symbolizer_path is None: |
| sys.exit("Can't symbolize - no %s in PATH." % symbolizer.binary) |
| + options = parser.parse_args() |
| + |
| + trace_file_path = options.file |
| + |
| print 'Reading trace file...' |
| - with _OpenTraceFile('r') as trace_file: |
| - trace = json.load(trace_file) |
| + with OpenTraceFile(trace_file_path, 'r') as trace_file: |
| + trace = Trace(json.load(trace_file)) |
| - processes = CollectProcesses(trace) |
| - symfiles = ResolveSymbolizableFiles(processes) |
| + Symbolize(options, trace, symbolizer) |
| - # Android trace files don't have any indication they are from Android. |
| - # So we're checking for Android-specific paths. |
| - if HaveFilesFromAndroid(symfiles): |
| - if not options.output_directory: |
| - parser.error('The trace file appears to be from Android. Please ' |
| - "specify output directory (e.g. 'out/Debug') to properly " |
| - 'symbolize it.') |
| - RemapAndroidFiles(symfiles, os.path.abspath(options.output_directory)) |
| + if trace.modified: |
| + trace.ApplyModifications() |
| - if SymbolizeFiles(symfiles, symbolizer): |
| if options.backup: |
| backup_file_path = trace_file_path + BACKUP_FILE_TAG |
| - print 'Backing up trace file to {}...'.format(backup_file_path) |
| + if os.path.exists(backup_file_path): |
| + for i in itertools.count(1): |
| + unique_file_path = '{}{}'.format(backup_file_path, i) |
| + if not os.path.exists(unique_file_path): |
| + backup_file_path = unique_file_path |
| + break |
| + print 'Backing up trace file to {}'.format(backup_file_path) |
| os.rename(trace_file_path, backup_file_path) |
| - print 'Updating trace file...' |
| - with _OpenTraceFile('w') as trace_file: |
| - json.dump(trace, trace_file) |
| + print 'Updating the trace file...' |
| + with OpenTraceFile(trace_file_path, 'w') as trace_file: |
| + json.dump(trace.node, trace_file) |
| else: |
| - print 'No PCs symbolized - not updating trace file.' |
| + print 'No modifications were made - not updating the trace file.' |
| if __name__ == '__main__': |