Chromium Code Reviews| Index: tools/binary_size/analyze.py |
| diff --git a/tools/binary_size/analyze.py b/tools/binary_size/analyze.py |
| new file mode 100755 |
| index 0000000000000000000000000000000000000000..a02ccda24cb3fb88a23b83628360613edab85a8f |
| --- /dev/null |
| +++ b/tools/binary_size/analyze.py |
| @@ -0,0 +1,441 @@ |
| +#!/usr/bin/env python |
| +# Copyright 2017 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +"""Main Python API for analyzing binary size.""" |
| + |
| +import argparse |
| +import ast |
| +import distutils.spawn |
| +import gzip |
| +import logging |
| +import os |
| +import re |
| +import subprocess |
| + |
| +import parsers |
| +import helpers |
| +import symbols |
| + |
| + |
# File format version for .size files. Bump this (and add upgrade logic in
# _LoadResults) whenever the on-disk layout changes.
_SERIALIZATION_VERSION = 1
| + |
| + |
| +def _OpenMaybeGz(path, mode=None): |
| + """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`.""" |
| + if path.endswith('.gz'): |
| + if mode and 'w' in mode: |
| + return gzip.GzipFile(path, mode, 1) |
| + return gzip.open(path, mode) |
| + return open(path, mode or 'r') |
| + |
| + |
| +def _EndsWithMaybeGz(path, suffix): |
| + return path.endswith(suffix) or path.endswith(suffix + '.gz') |
| + |
| + |
| +def _IterLines(s): |
| + prev_idx = -1 |
| + while True: |
| + idx = s.find('\n', prev_idx + 1) |
| + if idx == -1: |
| + return |
| + yield s[prev_idx + 1:idx] |
| + prev_idx = idx |
| + |
| + |
def _UnmangleRemainingSymbols(symbol_group, tool_prefix):
  """Uses c++filt to unmangle any symbols that need it."""
  # Itanium-ABI mangled names all begin with "_Z".
  mangled = [sym for sym in symbol_group
             if sym.name and sym.name.startswith('_Z')]
  if not mangled:
    return

  logging.info('Unmangling %d names', len(mangled))
  filt = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE)
  # One mangled name per input line; c++filt echoes one demangled name per
  # output line, in order.
  output = filt.communicate('\n'.join(sym.name for sym in mangled))[0]
  assert filt.returncode == 0

  for idx, demangled in enumerate(_IterLines(output)):
    mangled[idx].name = demangled
| + |
| + |
def _FindParameterListParen(name):
  """Finds index of the "(" that denotes the start of a parameter list.

  Returns -1 when no such paren exists. Parens and angle brackets that are
  balanced before the candidate "(" (e.g. template arguments) are skipped,
  as are the special cases commented inline.
  """
  # This loops from left-to-right, but the only reason (I think) that this
  # is necessary (rather than reusing _FindLastCharOutsideOfBrackets), is
  # to capture the outer-most function in the case where classes are nested.
  start_idx = 0
  # NOTE(review): the inner loop only ever returns (it has no break), so the
  # outer loop never re-iterates — presumably vestigial.
  while True:
    template_balance_count = 0
    paren_balance_count = 0
    while True:
      idx = name.find('(', start_idx)
      if idx == -1:
        return -1
      # <> balance accumulated between |start_idx| and the candidate "(".
      template_balance_count += (
          name.count('<', start_idx, idx) - name.count('>', start_idx, idx))
      # Special: operators with angle brackets.
      # "operator<" / "operator<<" contribute literal '<' characters that are
      # not template brackets; undo their contribution ("operator<" is 9
      # chars, so name[operator_idx + 9] distinguishes "<<" from "<").
      operator_idx = name.find('operator<', start_idx, idx)
      if operator_idx != -1:
        if name[operator_idx + 9] == '<':
          template_balance_count -= 2
        else:
          template_balance_count -= 1
      else:
        # Same correction for "operator>" / "operator>>".
        operator_idx = name.find('operator>', start_idx, idx)
        if operator_idx != -1:
          if name[operator_idx + 9] == '>':
            template_balance_count += 2
          else:
            template_balance_count += 1

      paren_balance_count += (
          name.count('(', start_idx, idx) - name.count(')', start_idx, idx))
      if template_balance_count == 0 and paren_balance_count == 0:
        # Special case: skip "(anonymous namespace)".
        # (The bounded find checks whether it starts exactly at |idx|.)
        if -1 != name.find('(anonymous namespace)', idx, idx + 21):
          start_idx = idx + 21
          continue
        # Special case: skip "decltype (...)"
        if name[idx - 1] != ' ':
          return idx
        start_idx = idx + 1
        # The skipped "(" was not counted in the ranges above; account for it.
        paren_balance_count += 1
| + |
| + |
| +def _FindLastCharOutsideOfBrackets(name, target_char, prev_idx=None): |
| + paren_balance_count = 0 |
| + template_balance_count = 0 |
| + while True: |
| + idx = name.rfind(target_char, 0, prev_idx) |
| + if idx == -1: |
| + return -1 |
| + # It is much faster to use.find() and.count() than to loop over each |
| + # character. |
| + template_balance_count += ( |
| + name.count('<', idx, prev_idx) - name.count('>', idx, prev_idx)) |
| + paren_balance_count += ( |
| + name.count('(', idx, prev_idx) - name.count(')', idx, prev_idx)) |
| + if template_balance_count == 0 and paren_balance_count == 0: |
| + return idx |
| + prev_idx = idx |
| + |
| + |
def _ParseFunctionSignature(name):
  """Extracts a function name from a function signature.

  See unit tests for example signatures.

  Returns:
    A tuple of (name_without_return_type, name_without_return_type_and_params).
  """
  paren_idx = _FindParameterListParen(name)

  if paren_idx > 0:
    space_idx = paren_idx
    # Special case: const cast operators (see tests).
    # A trailing " const" immediately before the "(" belongs to the name.
    if -1 != name.find(' const', paren_idx - 6, paren_idx):
      space_idx = paren_idx - 6
    # Scan left for the space that separates the return type from the name.
    while True:
      space_idx = _FindLastCharOutsideOfBrackets(name, ' ', space_idx)
      # Special case: "operator new", and "operator<< <template>".
      # If the space found sits right after the "operator" keyword, it is part
      # of the name; step left past the 8-char keyword and keep scanning.
      if -1 == space_idx or (
          -1 == name.find('operator', space_idx - 8, space_idx) and
          -1 == name.find('operator<<', space_idx - 10, space_idx)):
        break
      space_idx -= 8
    return (name[space_idx + 1:], name[space_idx + 1:paren_idx])
  return name, name
| + |
| + |
def _NormalizeNames(symbol_group):
  """Ensures that all names are formatted in a useful way.

  This includes:
  - Assigning of |function_signature| (for functions).
  - Stripping of return types in |function_signature| and |name|.
  - Stripping parameters from |name|.
  - Moving "vtable for" and the like to be suffixes rather than prefixes.
  """
  found_prefixes = set()
  for symbol in symbol_group:
    if not symbol.name or symbol.name.startswith('*'):
      # See comment in _RemoveDuplicatesAndCalculatePadding() about when this
      # can happen.
      continue

    # E.g.: "vtable for FOO" -> "FOO [vtable]". Search only the first 30
    # chars so a " for " deep in a long name isn't mistaken for a prefix.
    idx = symbol.name.find(' for ', 0, 30)
    if idx != -1:
      found_prefixes.add(symbol.name[:idx + 4])
      symbol.name = symbol.name[idx + 5:] + ' [' + symbol.name[:idx] + ']'

    # E.g.: "virtual thunk to FOO" -> "FOO [virtual thunk]".
    idx = symbol.name.find(' to ', 0, 30)
    if idx != -1:
      found_prefixes.add(symbol.name[:idx + 3])
      symbol.name = symbol.name[idx + 4:] + ' [' + symbol.name[:idx] + ']'

    # Strip out return type, and identify where parameter list starts.
    # (Only .text symbols are function signatures.)
    if symbol.section == 't':
      symbol.function_signature, symbol.name = (
          _ParseFunctionSignature(symbol.name))

    # Remove anonymous namespaces (they just harm clustering).
    symbol.name = symbol.name.replace('(anonymous namespace)::', '')

  logging.debug('Found name prefixes of: %r', found_prefixes)
| + |
| + |
| +def _NormalizeObjectPaths(symbol_group): |
| + """Ensures that all paths are formatted in a useful way.""" |
| + for symbol in symbol_group: |
| + if symbol.path: |
| + if symbol.path.startswith('obj/'): |
| + # Convert obj/third_party/... -> third_party/... |
| + symbol.path = symbol.path[4:] |
| + elif symbol.path.startswith('../../'): |
| + # Convert ../../third_party/... -> third_party/... |
| + symbol.path = symbol.path[6:] |
| + if symbol.path.endswith(')'): |
| + # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o |
| + start_idx = symbol.path.index('(') |
| + paren_path = symbol.path[start_idx + 1:-1] |
| + symbol.path = symbol.path[:start_idx] + os.path.sep + paren_path |
| + |
| + |
| +def _RemoveDuplicatesAndCalculatePadding(symbol_group): |
| + """Removes symbols at the same address and calculates the |padding| field. |
| + |
| + Symbols must already be sorted by |address|. |
| + """ |
| + i = 0 |
| + to_remove = set() |
| + all_symbols = symbol_group.symbols |
| + for i in xrange(len(all_symbols)): |
| + prev_symbol = all_symbols[i - 1] |
| + symbol = all_symbols[i] |
| + if prev_symbol.section_name is not symbol.section_name: |
| + continue |
| + if symbol.address > 0 and prev_symbol.address > 0: |
| + # Fold symbols that are at the same address (happens in nm output). |
| + if symbol.address == prev_symbol.address: |
| + symbol.size = max(prev_symbol.size, symbol.size) |
| + to_remove.add(i) |
| + continue |
| + # Even with symbols at the same address removed, overlaps can still |
| + # happen. In this case, padding will be negative (and this is fine). |
| + padding = symbol.address - prev_symbol.end_address |
| + if (symbol.section in 'rd' and padding >= 256 or |
| + symbol.section in 't' and padding >= 64): |
| + # For nm data, this is caused by data that has no associated symbol. |
| + # The linker map file lists them with no name, but with a file. |
| + # Example: |
| + # .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o |
| + # Where as most look like: |
| + # .data.MANGLED_NAME... |
| + logging.debug('Large padding of %d between:\n A) %r\n B) %r' % ( |
| + padding, prev_symbol, symbol)) |
| + continue |
| + symbol.padding = padding |
| + symbol.size += padding |
| + assert symbol.size >= 0, 'Symbol has negative size: %r' % symbol |
| + # Map files have no overlaps, so worth special-casing the no-op case. |
| + if to_remove: |
| + logging.info('Removing %d overlapping symbols', len(to_remove)) |
| + symbol_group.symbols = ( |
| + [s for i, s in enumerate(all_symbols) if i not in to_remove]) |
| + |
| + |
def _PrintStats(result, write_func):
  """Prints out how accurate |result| is."""
  for section in symbols.SECTION_TO_SECTION_NAME:
    section_name = symbols.SECTION_TO_SECTION_NAME[section]
    if section == 'd':
      # The 'd' bucket spans every .data* section, so sum them all.
      expected_size = sum(size
                          for name, size in result.section_sizes.iteritems()
                          if name.startswith('.data'))
    else:
      expected_size = result.section_sizes[section_name]

    def show_one_stat(sym_group):
      # Closes over |section| and |expected_size|; only called within this
      # loop iteration, so the late binding is harmless.
      template = ('Section %s has %.1f%% of %d bytes accounted for from '
                  '%d symbols. %d bytes are unaccounted for. Padding '
                  'accounts for %d bytes\n')
      actual_size = sym_group.size
      write_func(template % (
          section, 100.0 * actual_size / expected_size, actual_size,
          len(sym_group), expected_size - actual_size, sym_group.padding))

    in_section = result.symbol_group.WhereInSection(section)
    show_one_stat(in_section)

    # Also show stats with merge sections (names starting with '*') and
    # unattributed symbols excluded.
    star_syms = in_section.WhereNameMatches(r'^\*')
    attributed_syms = star_syms.Inverted().WhereHasAnyAttribution()
    anonymous_syms = attributed_syms.Inverted()
    if star_syms or anonymous_syms:
      write_func(('Without %d merge sections and %d anonymous entries ('
                  'accounting for %d bytes):\n') % (
                      len(star_syms), len(anonymous_syms),
                      star_syms.size + anonymous_syms.size))
      show_one_stat(attributed_syms)
|
estevenson
2017/03/20 14:13:03
It's a little hard to see just by looking at the output.
agrieve
2017/03/20 19:58:09
Good idea! Done. Looks like:
I 3711 Section r h
|
| + |
| + |
def _SaveResult(result, file_obj):
  """Saves the result to the given file object."""
  # Header: version, section sizes repr, symbol count — one per line.
  file_obj.write('%d\n' % _SERIALIZATION_VERSION)
  file_obj.write('%r\n' % result.section_sizes)
  file_obj.write('%d\n' % len(result.symbol_group))
  # Symbols are written as tab-separated fields, with a bare section-name
  # line emitted whenever the section changes. Only non-derived fields are
  # stored (padding and parsed names are recomputed on load).
  current_section = None
  for sym in result.symbol_group:
    if sym.section_name != current_section:
      current_section = sym.section_name
      file_obj.write('%s\n' % current_section)
    file_obj.write('%x\t%x\t%s\t%s\n' % (
        sym.address, sym.size_without_padding,
        sym.function_signature or sym.name or '',
        sym.path or ''))
| + |
| + |
def _LoadResults(file_obj):
  """Loads a result from the given file.

  Inverse of _SaveResult(). Derived fields (padding, function_signature,
  normalized names) are recomputed rather than stored.
  """
  lines = iter(file_obj)
  actual_version = int(next(lines))
  assert actual_version == _SERIALIZATION_VERSION, (
      'Version mismatch. Need to write some upgrade code.')

  # Second header line is the repr() of the section_sizes dict.
  section_sizes = ast.literal_eval(next(lines))
  num_syms = int(next(lines))
  symbol_list = [None] * num_syms
  section_name = None
  for i in xrange(num_syms):
    line = next(lines)[:-1]
    # A tab-less line introduces a new section (see _SaveResult()).
    if '\t' not in line:
      section_name = intern(line)
      line = next(lines)[:-1]
    # Bypass Symbol.__init__ for speed; all fields are assigned directly.
    new_sym = symbols.Symbol.__new__(symbols.Symbol)
    parts = line.split('\t')
    new_sym.section_name = section_name
    new_sym.address = int(parts[0], 16)
    new_sym.size = int(parts[1], 16)
    new_sym.name = parts[2] or None
    new_sym.path = parts[3] or None
    new_sym.padding = 0  # Derived
    new_sym.function_signature = None  # Derived
    symbol_list[i] = new_sym

  # Recompute derived values (padding and function names).
  result = parsers.ParseResult(symbol_list, section_sizes)
  logging.info('Calculating padding')
  _RemoveDuplicatesAndCalculatePadding(result.symbol_group)
  logging.info('Deriving signatures')
  # Re-parse out function parameters.
  _NormalizeNames(result.symbol_group.WhereInSection('t'))
  return result
| + |
| + |
def AddOptions(parser):
  """Registers analyze.py's command-line arguments on |parser|."""
  parser.add_argument(
      'input_file',
      help='Path to input file. Can be a linker .map file, an '
           'unstripped binary, or a saved result from analyze.py')
  parser.add_argument('--tool-prefix', default='',
                      help='Path prefix for c++filt.')
  parser.add_argument('--output-directory',
                      help='Path to the root build directory.')
| + |
| + |
| +def _DetectToolPrefix(tool_prefix, input_file, output_directory=None): |
| + """Calls Analyze with values from args.""" |
| + if not output_directory: |
| + abs_path = os.path.abspath(input_file) |
| + release_idx = abs_path.find('Release') |
| + if release_idx != -1: |
| + output_directory = os.path.relpath(abs_path[:release_idx], |
| + helpers.SRC_ROOT) + '/Release' |
| + logging.debug('Detected --output-directory=%s', output_directory) |
| + |
| + if not tool_prefix and output_directory: |
| + # Auto-detect from build_vars.txt |
| + build_vars_path = os.path.join(output_directory, 'build_vars.txt') |
| + if os.path.exists(build_vars_path): |
| + with open(build_vars_path) as f: |
| + build_vars = dict(l.rstrip().split('=', 1) for l in f if '=' in l) |
| + logging.debug('Found --tool-prefix from build_vars.txt') |
| + tool_prefix = build_vars['android_tool_prefix'] |
| + |
| + if os.path.sep not in tool_prefix: |
| + full_path = distutils.spawn.find_executable(tool_prefix + 'c++filt') |
| + else: |
| + full_path = tool_prefix + 'c++filt' |
| + |
| + if not os.path.isfile(full_path): |
| + raise Exception('Bad --tool-prefix. Path not found: %s' % full_path) |
| + return tool_prefix |
| + |
| + |
def AnalyzeWithArgs(args):
  """Convenience wrapper: runs Analyze() using parsed command-line |args|."""
  return Analyze(args.input_file, output_directory=args.output_directory,
                 tool_prefix=args.tool_prefix)
| + |
| + |
def Analyze(path, output_directory=None, tool_prefix=''):
  """Analyzes a linker map file, or loads a previously-saved result.

  Args:
    path: Path to a linker .map file or a saved .size file; either may be
        gzip-compressed (.gz suffix).
    output_directory: Path to the root build directory; used to locate
        build_vars.txt for tool detection. Auto-detected when omitted.
    tool_prefix: Path prefix for c++filt. Auto-detected when omitted.

  Returns:
    A result object with |symbol_group| and |section_sizes| (see parsers).

  Raises:
    Exception: When |path| is neither a .map nor a .size file, or when
        c++filt cannot be located.
  """
  if _EndsWithMaybeGz(path, '.size'):
    logging.info('Loading cached results.')
    with _OpenMaybeGz(path) as f:
      result = _LoadResults(f)
  elif not _EndsWithMaybeGz(path, '.map'):
    raise Exception('Expected input to be a .map or a .size')
  else:
    # Verify tool_prefix early.
    tool_prefix = _DetectToolPrefix(tool_prefix, path, output_directory)

    with _OpenMaybeGz(path) as map_file:
      result = parsers.MapFileParser().Parse(map_file)

    # Map file for some reason doesn't unmangle all names.
    logging.info('Calculating padding')
    _RemoveDuplicatesAndCalculatePadding(result.symbol_group)
    # Unmangle prints its own log statement.
    _UnmangleRemainingSymbols(result.symbol_group, tool_prefix)
    # Resolve paths prints its own log statement.
    logging.info('Normalizing names')
    _NormalizeNames(result.symbol_group)
    logging.info('Normalizing paths')
    _NormalizeObjectPaths(result.symbol_group)

  # Stats are relatively expensive to compute; skip them unless INFO logging
  # is enabled.
  if logging.getLogger().isEnabledFor(logging.INFO):
    _PrintStats(result, lambda l: logging.info(l.rstrip()))
  logging.info('Finished analyzing %d symbols', len(result.symbol_group))
  return result
| + |
| + |
def main():
  """Command-line entry point: analyzes the input and saves a .size file."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--output', required=True,
                      help='Path to store results. Must end in .size or '
                           '.size.gz')
  AddOptions(parser)
  helpers.AddCommonOptions(parser)
  args = parser.parse_args()
  # Validate the output path before doing any expensive analysis.
  if not _EndsWithMaybeGz(args.output, '.size'):
    raise Exception('--output must end with .size or .size.gz')
  helpers.HandleCommonOptions(args)

  result = AnalyzeWithArgs(args)
  logging.info('Saving result to %s', args.output)
  with _OpenMaybeGz(args.output, 'wb') as f:
    _SaveResult(result, f)

  logging.info('Done. Peak RAM usage was %d MB.', helpers.GetPeakRamUsage())
| + |
| + |
| +if __name__ == '__main__': |
| + main() |