Chromium Code Reviews| Index: tools/binary_size/analyze.py |
| diff --git a/tools/binary_size/analyze.py b/tools/binary_size/analyze.py |
| new file mode 100755 |
| index 0000000000000000000000000000000000000000..a02ccda24cb3fb88a23b83628360613edab85a8f |
| --- /dev/null |
| +++ b/tools/binary_size/analyze.py |
| @@ -0,0 +1,441 @@ |
| +#!/usr/bin/env python |
| +# Copyright 2017 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +"""Main Python API for analyzing binary size.""" |
| + |
| +import argparse |
| +import ast |
| +import distutils.spawn |
| +import gzip |
| +import logging |
| +import os |
| +import re |
| +import subprocess |
| + |
| +import parsers |
| +import helpers |
| +import symbols |
| + |
| + |
# File format version for .size files. Bump this (and add upgrade logic in
# _LoadResults) whenever the on-disk layout changes.
_SERIALIZATION_VERSION = 1
| + |
| + |
| +def _OpenMaybeGz(path, mode=None): |
| + """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`.""" |
| + if path.endswith('.gz'): |
| + if mode and 'w' in mode: |
| + return gzip.GzipFile(path, mode, 1) |
| + return gzip.open(path, mode) |
| + return open(path, mode or 'r') |
| + |
| + |
| +def _EndsWithMaybeGz(path, suffix): |
| + return path.endswith(suffix) or path.endswith(suffix + '.gz') |
| + |
| + |
| +def _IterLines(s): |
| + prev_idx = -1 |
| + while True: |
| + idx = s.find('\n', prev_idx + 1) |
| + if idx == -1: |
| + return |
| + yield s[prev_idx + 1:idx] |
| + prev_idx = idx |
| + |
| + |
def _UnmangleRemainingSymbols(symbol_group, tool_prefix):
  """Uses c++filt to unmangle any symbols that need it."""
  # Itanium-ABI mangled names all begin with "_Z".
  mangled = [sym for sym in symbol_group
             if sym.name and sym.name.startswith('_Z')]
  if not mangled:
    return

  logging.info('Unmangling %d names', len(mangled))
  filt = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE)
  # One mangled name per input line; c++filt echoes one demangled name per
  # output line, in order.
  output = filt.communicate('\n'.join(sym.name for sym in mangled))[0]
  assert filt.returncode == 0

  for idx, demangled in enumerate(_IterLines(output)):
    mangled[idx].name = demangled
| + |
| + |
def _FindParameterListParen(name):
  """Finds index of the "(" that denotes the start of a parameter list.

  Returns -1 when no such paren exists. Parens and angle brackets that are
  balanced before the candidate "(" (e.g. template arguments) are skipped,
  as are the special cases commented inline.
  """
  # This loops from left-to-right, but the only reason (I think) that this
  # is necessary (rather than reusing _FindLastCharOutsideOfBrackets), is
  # to capture the outer-most function in the case where classes are nested.
  start_idx = 0
  # NOTE(review): the inner loop only ever returns (it has no break), so the
  # outer loop never re-iterates — presumably vestigial.
  while True:
    template_balance_count = 0
    paren_balance_count = 0
    while True:
      idx = name.find('(', start_idx)
      if idx == -1:
        return -1
      # <> balance accumulated between |start_idx| and the candidate "(".
      template_balance_count += (
          name.count('<', start_idx, idx) - name.count('>', start_idx, idx))
      # Special: operators with angle brackets.
      # "operator<" / "operator<<" contribute literal '<' characters that are
      # not template brackets; undo their contribution ("operator<" is 9
      # chars, so name[operator_idx + 9] distinguishes "<<" from "<").
      operator_idx = name.find('operator<', start_idx, idx)
      if operator_idx != -1:
        if name[operator_idx + 9] == '<':
          template_balance_count -= 2
        else:
          template_balance_count -= 1
      else:
        # Same correction for "operator>" / "operator>>".
        operator_idx = name.find('operator>', start_idx, idx)
        if operator_idx != -1:
          if name[operator_idx + 9] == '>':
            template_balance_count += 2
          else:
            template_balance_count += 1

      paren_balance_count += (
          name.count('(', start_idx, idx) - name.count(')', start_idx, idx))
      if template_balance_count == 0 and paren_balance_count == 0:
        # Special case: skip "(anonymous namespace)".
        # (The bounded find checks whether it starts exactly at |idx|.)
        if -1 != name.find('(anonymous namespace)', idx, idx + 21):
          start_idx = idx + 21
          continue
        # Special case: skip "decltype (...)"
        if name[idx - 1] != ' ':
          return idx
        start_idx = idx + 1
        # The skipped "(" was not counted in the ranges above; account for it.
        paren_balance_count += 1
| + |
| + |
| +def _FindLastCharOutsideOfBrackets(name, target_char, prev_idx=None): |
| + paren_balance_count = 0 |
| + template_balance_count = 0 |
| + while True: |
| + idx = name.rfind(target_char, 0, prev_idx) |
| + if idx == -1: |
| + return -1 |
| + # It is much faster to use.find() and.count() than to loop over each |
| + # character. |
| + template_balance_count += ( |
| + name.count('<', idx, prev_idx) - name.count('>', idx, prev_idx)) |
| + paren_balance_count += ( |
| + name.count('(', idx, prev_idx) - name.count(')', idx, prev_idx)) |
| + if template_balance_count == 0 and paren_balance_count == 0: |
| + return idx |
| + prev_idx = idx |
| + |
| + |
def _ParseFunctionSignature(name):
  """Extracts a function name from a function signature.

  See unit tests for example signatures.

  Returns:
    A tuple of (name_without_return_type, name_without_return_type_and_params).
  """
  paren_idx = _FindParameterListParen(name)

  if paren_idx > 0:
    space_idx = paren_idx
    # Special case: const cast operators (see tests).
    # A trailing " const" immediately before the "(" belongs to the name.
    if -1 != name.find(' const', paren_idx - 6, paren_idx):
      space_idx = paren_idx - 6
    # Scan left for the space that separates the return type from the name.
    while True:
      space_idx = _FindLastCharOutsideOfBrackets(name, ' ', space_idx)
      # Special case: "operator new", and "operator<< <template>".
      # If the space found sits right after the "operator" keyword, it is part
      # of the name; step left past the 8-char keyword and keep scanning.
      if -1 == space_idx or (
          -1 == name.find('operator', space_idx - 8, space_idx) and
          -1 == name.find('operator<<', space_idx - 10, space_idx)):
        break
      space_idx -= 8
    return (name[space_idx + 1:], name[space_idx + 1:paren_idx])
  return name, name
| + |
| + |
def _NormalizeNames(symbol_group):
  """Ensures that all names are formatted in a useful way.

  This includes:
  - Assigning of |function_signature| (for functions).
  - Stripping of return types in |function_signature| and |name|.
  - Stripping parameters from |name|.
  - Moving "vtable for" and the like to be suffixes rather than prefixes.
  """
  found_prefixes = set()
  for symbol in symbol_group:
    if not symbol.name or symbol.name.startswith('*'):
      # See comment in _RemoveDuplicatesAndCalculatePadding() about when this
      # can happen.
      continue

    # E.g.: "vtable for FOO" -> "FOO [vtable]". Search only the first 30
    # chars so a " for " deep in a long name isn't mistaken for a prefix.
    idx = symbol.name.find(' for ', 0, 30)
    if idx != -1:
      found_prefixes.add(symbol.name[:idx + 4])
      symbol.name = symbol.name[idx + 5:] + ' [' + symbol.name[:idx] + ']'

    # E.g.: "virtual thunk to FOO" -> "FOO [virtual thunk]".
    idx = symbol.name.find(' to ', 0, 30)
    if idx != -1:
      found_prefixes.add(symbol.name[:idx + 3])
      symbol.name = symbol.name[idx + 4:] + ' [' + symbol.name[:idx] + ']'

    # Strip out return type, and identify where parameter list starts.
    # (Only .text symbols are function signatures.)
    if symbol.section == 't':
      symbol.function_signature, symbol.name = (
          _ParseFunctionSignature(symbol.name))

    # Remove anonymous namespaces (they just harm clustering).
    symbol.name = symbol.name.replace('(anonymous namespace)::', '')

  logging.debug('Found name prefixes of: %r', found_prefixes)
| + |
| + |
| +def _NormalizeObjectPaths(symbol_group): |
| + """Ensures that all paths are formatted in a useful way.""" |
| + for symbol in symbol_group: |
| + if symbol.path: |
| + if symbol.path.startswith('obj/'): |
| + # Convert obj/third_party/... -> third_party/... |
| + symbol.path = symbol.path[4:] |
| + elif symbol.path.startswith('../../'): |
| + # Convert ../../third_party/... -> third_party/... |
| + symbol.path = symbol.path[6:] |
| + if symbol.path.endswith(')'): |
| + # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o |
| + start_idx = symbol.path.index('(') |
| + paren_path = symbol.path[start_idx + 1:-1] |
| + symbol.path = symbol.path[:start_idx] + os.path.sep + paren_path |
| + |
| + |
| +def _RemoveDuplicatesAndCalculatePadding(symbol_group): |
| + """Removes symbols at the same address and calculates the |padding| field. |
| + |
| + Symbols must already be sorted by |address|. |
| + """ |
| + i = 0 |
| + to_remove = set() |
| + all_symbols = symbol_group.symbols |
| + for i in xrange(len(all_symbols)): |
| + prev_symbol = all_symbols[i - 1] |
| + symbol = all_symbols[i] |
| + if prev_symbol.section_name is not symbol.section_name: |
| + continue |
| + if symbol.address > 0 and prev_symbol.address > 0: |
| + # Fold symbols that are at the same address (happens in nm output). |
| + if symbol.address == prev_symbol.address: |
| + symbol.size = max(prev_symbol.size, symbol.size) |
| + to_remove.add(i) |
| + continue |
| + # Even with symbols at the same address removed, overlaps can still |
| + # happen. In this case, padding will be negative (and this is fine). |
| + padding = symbol.address - prev_symbol.end_address |
| + if (symbol.section in 'rd' and padding >= 256 or |
| + symbol.section in 't' and padding >= 64): |
| + # For nm data, this is caused by data that has no associated symbol. |
| + # The linker map file lists them with no name, but with a file. |
| + # Example: |
| + # .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o |
| + # Where as most look like: |
| + # .data.MANGLED_NAME... |
| + logging.debug('Large padding of %d between:\n A) %r\n B) %r' % ( |
| + padding, prev_symbol, symbol)) |
| + continue |
| + symbol.padding = padding |
| + symbol.size += padding |
| + assert symbol.size >= 0, 'Symbol has negative size: %r' % symbol |
| + # Map files have no overlaps, so worth special-casing the no-op case. |
| + if to_remove: |
| + logging.info('Removing %d overlapping symbols', len(to_remove)) |
| + symbol_group.symbols = ( |
| + [s for i, s in enumerate(all_symbols) if i not in to_remove]) |
| + |
| + |
def _PrintStats(result, write_func):
  """Prints out how accurate |result| is."""
  for section in symbols.SECTION_TO_SECTION_NAME:
    section_name = symbols.SECTION_TO_SECTION_NAME[section]
    if section == 'd':
      # The 'd' bucket spans every .data* section, so sum them all.
      expected_size = sum(size
                          for name, size in result.section_sizes.iteritems()
                          if name.startswith('.data'))
    else:
      expected_size = result.section_sizes[section_name]

    def show_one_stat(sym_group):
      # Closes over |section| and |expected_size|; only called within this
      # loop iteration, so the late binding is harmless.
      template = ('Section %s has %.1f%% of %d bytes accounted for from '
                  '%d symbols. %d bytes are unaccounted for. Padding '
                  'accounts for %d bytes\n')
      actual_size = sym_group.size
      write_func(template % (
          section, 100.0 * actual_size / expected_size, actual_size,
          len(sym_group), expected_size - actual_size, sym_group.padding))

    in_section = result.symbol_group.WhereInSection(section)
    show_one_stat(in_section)

    # Also show stats with merge sections (names starting with '*') and
    # unattributed symbols excluded.
    star_syms = in_section.WhereNameMatches(r'^\*')
    attributed_syms = star_syms.Inverted().WhereHasAnyAttribution()
    anonymous_syms = attributed_syms.Inverted()
    if star_syms or anonymous_syms:
      write_func(('Without %d merge sections and %d anonymous entries ('
                  'accounting for %d bytes):\n') % (
                      len(star_syms), len(anonymous_syms),
                      star_syms.size + anonymous_syms.size))
      show_one_stat(attributed_syms)
|
estevenson
2017/03/20 14:13:03
It's a little hard to see just by looking at the output.
agrieve
2017/03/20 19:58:09
Good idea! Done. Looks like:
I 3711 Section r h
|
| + |
| + |
def _SaveResult(result, file_obj):
  """Saves the result to the given file object."""
  # Header: version, section sizes repr, symbol count — one per line.
  file_obj.write('%d\n' % _SERIALIZATION_VERSION)
  file_obj.write('%r\n' % result.section_sizes)
  file_obj.write('%d\n' % len(result.symbol_group))
  # Symbols are written as tab-separated fields, with a bare section-name
  # line emitted whenever the section changes. Only non-derived fields are
  # stored (padding and parsed names are recomputed on load).
  current_section = None
  for sym in result.symbol_group:
    if sym.section_name != current_section:
      current_section = sym.section_name
      file_obj.write('%s\n' % current_section)
    file_obj.write('%x\t%x\t%s\t%s\n' % (
        sym.address, sym.size_without_padding,
        sym.function_signature or sym.name or '',
        sym.path or ''))
| + |
| + |
def _LoadResults(file_obj):
  """Loads a result from the given file.

  Inverse of _SaveResult(). Derived fields (padding, function_signature,
  normalized names) are recomputed rather than stored.
  """
  lines = iter(file_obj)
  actual_version = int(next(lines))
  assert actual_version == _SERIALIZATION_VERSION, (
      'Version mismatch. Need to write some upgrade code.')

  # Second header line is the repr() of the section_sizes dict.
  section_sizes = ast.literal_eval(next(lines))
  num_syms = int(next(lines))
  symbol_list = [None] * num_syms
  section_name = None
  for i in xrange(num_syms):
    line = next(lines)[:-1]
    # A tab-less line introduces a new section (see _SaveResult()).
    if '\t' not in line:
      section_name = intern(line)
      line = next(lines)[:-1]
    # Bypass Symbol.__init__ for speed; all fields are assigned directly.
    new_sym = symbols.Symbol.__new__(symbols.Symbol)
    parts = line.split('\t')
    new_sym.section_name = section_name
    new_sym.address = int(parts[0], 16)
    new_sym.size = int(parts[1], 16)
    new_sym.name = parts[2] or None
    new_sym.path = parts[3] or None
    new_sym.padding = 0  # Derived
    new_sym.function_signature = None  # Derived
    symbol_list[i] = new_sym

  # Recompute derived values (padding and function names).
  result = parsers.ParseResult(symbol_list, section_sizes)
  logging.info('Calculating padding')
  _RemoveDuplicatesAndCalculatePadding(result.symbol_group)
  logging.info('Deriving signatures')
  # Re-parse out function parameters.
  _NormalizeNames(result.symbol_group.WhereInSection('t'))
  return result
| + |
| + |
def AddOptions(parser):
  """Registers analyze.py's command-line arguments on |parser|."""
  parser.add_argument(
      'input_file',
      help='Path to input file. Can be a linker .map file, an '
           'unstripped binary, or a saved result from analyze.py')
  parser.add_argument('--tool-prefix', default='',
                      help='Path prefix for c++filt.')
  parser.add_argument('--output-directory',
                      help='Path to the root build directory.')
| + |
| + |
| +def _DetectToolPrefix(tool_prefix, input_file, output_directory=None): |
| + """Calls Analyze with values from args.""" |
| + if not output_directory: |
| + abs_path = os.path.abspath(input_file) |
| + release_idx = abs_path.find('Release') |
| + if release_idx != -1: |
| + output_directory = os.path.relpath(abs_path[:release_idx], |
| + helpers.SRC_ROOT) + '/Release' |
| + logging.debug('Detected --output-directory=%s', output_directory) |
| + |
| + if not tool_prefix and output_directory: |
| + # Auto-detect from build_vars.txt |
| + build_vars_path = os.path.join(output_directory, 'build_vars.txt') |
| + if os.path.exists(build_vars_path): |
| + with open(build_vars_path) as f: |
| + build_vars = dict(l.rstrip().split('=', 1) for l in f if '=' in l) |
| + logging.debug('Found --tool-prefix from build_vars.txt') |
| + tool_prefix = build_vars['android_tool_prefix'] |
| + |
| + if os.path.sep not in tool_prefix: |
| + full_path = distutils.spawn.find_executable(tool_prefix + 'c++filt') |
| + else: |
| + full_path = tool_prefix + 'c++filt' |
| + |
| + if not os.path.isfile(full_path): |
| + raise Exception('Bad --tool-prefix. Path not found: %s' % full_path) |
| + return tool_prefix |
| + |
| + |
def AnalyzeWithArgs(args):
  """Convenience wrapper: runs Analyze() using parsed command-line |args|."""
  return Analyze(args.input_file, output_directory=args.output_directory,
                 tool_prefix=args.tool_prefix)
| + |
| + |
def Analyze(path, output_directory=None, tool_prefix=''):
  """Analyzes a linker map file, or loads a previously-saved result.

  Args:
    path: Path to a linker .map file or a saved .size file; either may be
        gzip-compressed (.gz suffix).
    output_directory: Path to the root build directory; used to locate
        build_vars.txt for tool detection. Auto-detected when omitted.
    tool_prefix: Path prefix for c++filt. Auto-detected when omitted.

  Returns:
    A result object with |symbol_group| and |section_sizes| (see parsers).

  Raises:
    Exception: When |path| is neither a .map nor a .size file, or when
        c++filt cannot be located.
  """
  if _EndsWithMaybeGz(path, '.size'):
    logging.info('Loading cached results.')
    with _OpenMaybeGz(path) as f:
      result = _LoadResults(f)
  elif not _EndsWithMaybeGz(path, '.map'):
    raise Exception('Expected input to be a .map or a .size')
  else:
    # Verify tool_prefix early.
    tool_prefix = _DetectToolPrefix(tool_prefix, path, output_directory)

    with _OpenMaybeGz(path) as map_file:
      result = parsers.MapFileParser().Parse(map_file)

    # Map file for some reason doesn't unmangle all names.
    logging.info('Calculating padding')
    _RemoveDuplicatesAndCalculatePadding(result.symbol_group)
    # Unmangle prints its own log statement.
    _UnmangleRemainingSymbols(result.symbol_group, tool_prefix)
    # Resolve paths prints its own log statement.
    logging.info('Normalizing names')
    _NormalizeNames(result.symbol_group)
    logging.info('Normalizing paths')
    _NormalizeObjectPaths(result.symbol_group)

  # Stats are relatively expensive to compute; skip them unless INFO logging
  # is enabled.
  if logging.getLogger().isEnabledFor(logging.INFO):
    _PrintStats(result, lambda l: logging.info(l.rstrip()))
  logging.info('Finished analyzing %d symbols', len(result.symbol_group))
  return result
| + |
| + |
def main():
  """Command-line entry point: analyzes the input and saves a .size file."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--output', required=True,
                      help='Path to store results. Must end in .size or '
                           '.size.gz')
  AddOptions(parser)
  helpers.AddCommonOptions(parser)
  args = parser.parse_args()
  # Validate the output path before doing any expensive analysis.
  if not _EndsWithMaybeGz(args.output, '.size'):
    raise Exception('--output must end with .size or .size.gz')
  helpers.HandleCommonOptions(args)

  result = AnalyzeWithArgs(args)
  logging.info('Saving result to %s', args.output)
  with _OpenMaybeGz(args.output, 'wb') as f:
    _SaveResult(result, f)

  logging.info('Done. Peak RAM usage was %d MB.', helpers.GetPeakRamUsage())
| + |
| + |
| +if __name__ == '__main__': |
| + main() |