tools/binary_size/analyze.py - Issue 2724253002: V1 of //tools/binary_size rewrite

Side by Side Diff: tools/binary_size/analyze.py

Issue 2724253002: V1 of //tools/binary_size rewrite (Closed)

Patch Set: Add repl to query.py Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright 2017 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Main Python API for analyzing binary size."""

	7

	8 import argparse

	9 import ast

	10 import distutils.spawn

	11 import gzip

	12 import logging

	13 import os

	14 import re

	15 import subprocess

	16

	17 import parsers

	18 import helpers

	19 import symbols

	20

	21

	22 # File format version for .size files.

	23 _SERIALIZATION_VERSION = 1

	24

	25 _ANONYMOUS_NAMESPACE = '(anonymous namespace)'

	26 _LEN_ANONYMOUS_NAMESPACE = len(_ANONYMOUS_NAMESPACE)

	27 _STARTS_WITH_OPERATOR_PATTERN = re.compile(r'\S*(?::\|^)operator')

	28

	29

	30 def _OpenMaybeGz(path, mode=None):

	31 """Calls `gzip.open()` if \|path\| ends in ".gz", otherwise calls `open()`."""

	32 if path.endswith('.gz'):

	33 if mode and 'w' in mode:

	34 return gzip.GzipFile(path, mode, 1)

	35 return gzip.open(path, mode)

	36 return open(path, mode or 'r')

	37

	38

	39 def _EndsWithMaybeGz(path, suffix):

	40 return path.endswith(suffix) or path.endswith(suffix + '.gz')

	41

	42

	43 def _IterLines(s):

	44 prev_idx = -1

	45 while True:

	46 idx = s.find('\n', prev_idx + 1)

	47 if idx == -1:

	48 return

	49 yield s[prev_idx + 1:idx]

	50 prev_idx = idx

	51

	52

	53 def _UnmangleRemainingSymbols(symbol_group, tool_prefix):

	54 """Uses c++filt to unmangle any symbols that need it."""

	55 to_process = [s for s in symbol_group if s.name and s.name.startswith('_Z')]

	56 if not to_process:

	57 return

	58

	59 logging.info('Unmangling %d names', len(to_process))

	60 proc = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE,

	61 stdout=subprocess.PIPE)

	62 stdout = proc.communicate('\n'.join(s.name for s in to_process))[0]

	63 assert proc.returncode == 0

	64

	65 for i, line in enumerate(_IterLines(stdout)):

	66 to_process[i].name = line

	67

	68

	69 def _FindParameterListParen(name):
	estevenson 2017/03/16 19:49:18 Spent too long trying to figure this out: why can' Spent too long trying to figure this out: why can't you just search from the right for the first '('? Seems like that would work for all your test cases. agrieve 2017/03/20 19:58:08 Well, originally, I decided to look from the front Show quoted text On 2017/03/16 19:49:18, estevenson wrote: > Spent too long trying to figure this out: why can't you just search from the > right for the first '('? Seems like that would work for all your test cases. Well, originally, I decided to look from the front in order to avoid having to deal with ()s within the parameter list. However, since adding the bracket-counting logic, looking from right-to-left and reusing _FindLastSpaceOutsideOfBrackets works just fine! ... Except... When diffing & validating this new approach, I found some names that are from methods of classes which are defined within functions. This puts me back to looking left-to-right, but at least I have a test case now :P (and had to fix up the logic to handle it)
	70 """Finds index of the "(" that denotes the start of a paremeter list."""

	71 # It is much faster to use .find() and .count() than to loop over each

	72 # character.

	73 start_idx = 0

	74 while True:

	75 template_balance_count = 0

	76 paren_balance_count = 0

	77 while True:

	78 idx = name.find('(', start_idx)

	79 if idx == -1:

	80 return -1

	81 template_balance_count += (

	82 name.count('<', start_idx, idx) - name.count('>', start_idx, idx))

	83 paren_balance_count += (

	84 name.count('(', start_idx, idx) - name.count(')', start_idx, idx))

	85 if template_balance_count == 0 and paren_balance_count == 0:

	86 # Special case: skip "(anonymous namespace)".

	87 if -1 != name.find(_ANONYMOUS_NAMESPACE, idx,

	88 idx + _LEN_ANONYMOUS_NAMESPACE):

	89 start_idx = idx + _LEN_ANONYMOUS_NAMESPACE

	90 continue

	91 # Special case: skip "decltype (...)"

	92 if name[idx - 1] != ' ':

	93 return idx

	94 start_idx = idx + 1

	95 paren_balance_count += 1

	96

	97

	98 def _FindLastSpaceOutsideOfBrackets(name, prev_idx=None):

	99 template_balance_count = 0

	100 paren_balance_count = 0

	101 while True:

	102 idx = name.rfind(' ', 0, prev_idx)

	103 if idx == -1:

	104 return -1

	105 template_balance_count += (

	106 name.count('<', idx, prev_idx) - name.count('>', idx, prev_idx))

	107 paren_balance_count += (

	108 name.count('(', idx, prev_idx) - name.count(')', idx, prev_idx))

	109 if template_balance_count == 0 and paren_balance_count == 0:

	110 return idx

	111 prev_idx = idx

	112

	113

	114 def _ParseFunctionSignature(name):

	115 """Extracts a function name from a function signature.

	116

	117 See unit tests for example signatures.

	118

	119 Returns:

	120 A tuple of (name_without_return_type, name_without_return_type_and_params).

	121 """

	122 paren_start = _FindParameterListParen(name)

	123

	124 if paren_start == 0:

	125 logging.warning('Found an odd name %s', name)

	126 elif paren_start > 0:

	127 # Special case: Some operators have odd syntax (see tests).

	128 if _STARTS_WITH_OPERATOR_PATTERN.match(name):

	129 space_index = -1

	130 else:

	131 space_index = _FindLastSpaceOutsideOfBrackets(name, paren_start)

	132 return (name[space_index + 1:], name[space_index + 1:paren_start])

	133 return name, name

	134

	135

	136 def _NormalizeNames(symbol_group):

	137 """Ensures that all names are formatted in a useful way.

	138

	139 This include:
	estevenson 2017/03/16 19:49:18 nit: s/include/includes nit: s/include/includes agrieve 2017/03/20 19:58:08 Done. Show quoted text On 2017/03/16 19:49:18, estevenson wrote: > nit: s/include/includes Done.
	140 - Assigning of \|function_signature\| (for functions).

	141 - Stripping of return types in \|function_signature\| and \|name\|.

	142 - Stripping parameters from \|name\|.

	143 - Moving "vtable for" and the like to be suffixes rather than prefixes.

	144 """

	145 found_prefixes = set()

	146 for symbol in symbol_group:

	147 if not symbol.name or symbol.name.startswith('*'):

	148 # See comment in _RemoveDuplicatesAndCalculatePadding() about when this

	149 # can happen.

	150 continue

	151

	152 # E.g.: vtable for FOO

	153 idx = symbol.name.find(' for ', 0, 30)

	154 if idx != -1:

	155 found_prefixes.add(symbol.name[:idx + 4])

	156 symbol.name = symbol.name[idx + 5:] + ' [' + symbol.name[:idx] + ']'

	157

	158 # E.g.: virtual thunk to FOO

	159 idx = symbol.name.find(' to ', 0, 30)

	160 if idx != -1:

	161 found_prefixes.add(symbol.name[:idx + 3])

	162 symbol.name = symbol.name[idx + 4:] + ' [' + symbol.name[:idx] + ']'

	163

	164 # TODO(agrieve): Store mangled names instead (smaller).

	165 if symbol.section == 't':

	166 symbol.function_signature, symbol.name = (

	167 _ParseFunctionSignature(symbol.name))

	168 logging.debug('Found name prefixes of: %r', found_prefixes)

	169

	170

	171 def _NormalizeObjectPaths(symbol_group):

	172 """Ensures that all paths are formatted in a useful way."""

	173 for symbol in symbol_group:

	174 if symbol.path:

	175 if symbol.path.startswith('obj/'):

	176 # Convert obj/third_party/... -> third_party/...

	177 symbol.path = symbol.path[4:]

	178 elif symbol.path.startswith('../../'):

	179 # Convert ../../third_party/... -> third_party/...

	180 symbol.path = symbol.path[6:]

	181 if symbol.path.endswith(')'):

	182 # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o

	183 start_idx = symbol.path.index('(')

	184 paren_path = symbol.path[start_idx + 1:-1]

	185 symbol.path = symbol.path[:start_idx] + os.path.sep + paren_path

	186

	187

	188 def _RemoveDuplicatesAndCalculatePadding(symbol_group):

	189 """Removes symbols at the same address and calculates the \|padding\| field.

	190

	191 Symbols must already be sorted by \|address\|.

	192 """

	193 i = 0

	194 to_remove = set()

	195 all_symbols = symbol_group.symbols

	196 for i in xrange(len(all_symbols)):

	197 prev_symbol = all_symbols[i - 1]

	198 symbol = all_symbols[i]

	199 if prev_symbol.section_name is not symbol.section_name:

	200 continue

	201 if symbol.address > 0 and prev_symbol.address > 0:

	202 # Fold symbols that are at the same address (happens in nm output).

	203 if symbol.address == prev_symbol.address:

	204 symbol.size = max(prev_symbol.size, symbol.size)

	205 to_remove.add(i)

	206 continue

	207 # Even with symbols at the same address removed, overlaps can still

	208 # happen. In this case, padding will be negative (and this is fine).

	209 padding = symbol.address - prev_symbol.end_address

	210 if (symbol.section in 'rd' and padding >= 256 or
	estevenson 2017/03/16 19:49:18 nit: might be worth adding a comment saying these nit: might be worth adding a comment saying these values were found via manual audit? agrieve 2017/03/20 19:58:08 Done. Show quoted text On 2017/03/16 19:49:18, estevenson wrote: > nit: might be worth adding a comment saying these values were found via manual > audit? Done.
	211 symbol.section in 't' and padding >= 64):

	212 # For nm data, this is caused by data that has no associated symbol.

	213 # The linker map file lists them with no name, but with a file.

	214 # Example:

	215 # .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o

	216 # Where as most look like:

	217 # .data.MANGLED_NAME ...

	218 logging.debug('Large padding of %d between:\n A) %r\n B) %r' % (

	219 padding, prev_symbol, symbol))

	220 continue

	221 symbol.padding = padding

	222 symbol.size += padding

	223 assert symbol.size >= 0, 'Symbol has negative size: %r' % symbol

	224 # Map files have no overlaps, so worth special-casing the no-op case.

	225 if to_remove:

	226 logging.info('Removing %d overlapping symbols', len(to_remove))

	227 symbol_group.symbols = (

	228 [s for i, s in enumerate(all_symbols) if i not in to_remove])

	229

	230

	231 def _PrintStats(result, write_func):

	232 """Prints out how accurate \|result\| is."""

	233 for section in symbols.SECTION_TO_SECTION_NAME:

	234 if section == 'd':

	235 expected_size = sum(v for k, v in result.section_sizes.iteritems()

	236 if k.startswith('.data'))

	237 else:

	238 expected_size = result.section_sizes[

	239 symbols.SECTION_TO_SECTION_NAME[section]]

	240

	241 def show_one_stat(group):

	242 template = ('Section %s has %.1f%% of %d bytes accounted for from '

	243 '%d symbols. %d bytes are unaccounted for. Padding '

	244 'accounts for %d bytes\n')

	245 actual_size = group.size

	246 count = len(group)

	247 padding = group.padding

	248 size_percent = 100.0 * actual_size / expected_size

	249 write_func(template % (section, size_percent, actual_size, count,

	250 expected_size - actual_size, padding))

	251

	252 in_section = result.symbol_group.WhereInSection(section)

	253 show_one_stat(in_section)

	254

	255 star_syms = in_section.WhereNameMatches(r'^\*')

	256 attributed_syms = star_syms.Inverted().WhereHasAnyAttribution()

	257 anonymous_syms = attributed_syms.Inverted()

	258 if star_syms or anonymous_syms:

	259 missing_size = star_syms.size + anonymous_syms.size

	260 write_func(('Without %d merge sections and %d anonymous entries ('

	261 'accounting for %d bytes):\n') % (

	262 len(star_syms), len(anonymous_syms), missing_size))

	263 show_one_stat(attributed_syms)

	264

	265

	266 def _SaveResult(result, file_obj):

	267 """Saves the result to the given file object."""
	estevenson 2017/03/16 19:49:18 It's probably too slow but did you try pickle for It's probably too slow but did you try pickle for this? agrieve 2017/03/20 19:58:08 It was the first thing I tried, and it is sadly 10 Show quoted text On 2017/03/16 19:49:18, estevenson wrote: > It's probably too slow but did you try pickle for this? It was the first thing I tried, and it is sadly 10x slower.
	268 # Store one bucket per line.

	269 file_obj.write('%d\n' % _SERIALIZATION_VERSION)

	270 file_obj.write('%r\n' % result.section_sizes)

	271 file_obj.write('%d\n' % len(result.symbol_group))

	272 prev_section_name = None

	273 # Store symbol fields as tab-separated.

	274 # Store only non-derived fields.

	275 for symbol in result.symbol_group:

	276 if symbol.section_name != prev_section_name:

	277 file_obj.write('%s\n' % symbol.section_name)

	278 prev_section_name = symbol.section_name

	279 # Don't write padding nor name since these are derived values.

	280 file_obj.write('%x\t%x\t%s\t%s\n' % (

	281 symbol.address, symbol.size_without_padding,

	282 symbol.function_signature or symbol.name or '',

	283 symbol.path or ''))

	284

	285

	286 def _LoadResults(file_obj):

	287 """Loads a result from the given file."""

	288 lines = iter(file_obj)

	289 actual_version = int(next(lines))

	290 assert actual_version == _SERIALIZATION_VERSION, (

	291 'Version mismatch. Need to write some upgrade code.')

	292

	293 section_sizes = ast.literal_eval(next(lines))

	294 num_syms = int(next(lines))

	295 symbol_list = [None] * num_syms

	296 section_name = None

	297 for i in xrange(num_syms):

	298 line = next(lines)[:-1]

	299 if '\t' not in line:

	300 section_name = intern(line)

	301 line = next(lines)[:-1]

	302 new_sym = symbols.Symbol.__new__(symbols.Symbol)

	303 parts = line.split('\t')

	304 new_sym.section_name = section_name

	305 new_sym.address = int(parts[0], 16)

	306 new_sym.size = int(parts[1], 16)

	307 new_sym.name = parts[2] or None

	308 new_sym.path = parts[3] or None

	309 new_sym.padding = 0 # Derived

	310 new_sym.function_signature = None # Derived

	311 symbol_list[i] = new_sym

	312

	313 # Recompute derived values (padding and function names).

	314 result = parsers.ParseResult(symbol_list, section_sizes)

	315 logging.info('Calculating padding')

	316 _RemoveDuplicatesAndCalculatePadding(result.symbol_group)
	estevenson 2017/03/16 19:49:18 Does this need to be done, since it's done in Anal Does this need to be done, since it's done in Analyze() anyways? Same with _NormalizeNames(). estevenson 2017/03/20 14:13:02 Nvm, can't read. Show quoted text On 2017/03/16 19:49:18, estevenson wrote: > Does this need to be done, since it's done in Analyze() anyways? Same with > _NormalizeNames(). Nvm, can't read.
	317 logging.info('Deriving signatures')

	318 # Re-parse out function parameters.

	319 _NormalizeNames(result.symbol_group.WhereInSection('t'))

	320 return result

	321

	322

	323 def AddOptions(parser):

	324 parser.add_argument('input_file',

	325 help='Path to input file. Can be a linker .map file, an '

	326 'unstripped binary, or a saved result from '

	327 'analyze.py')

	328 parser.add_argument('--tool-prefix', default='',

	329 help='Path prefix for c++filt.')

	330 parser.add_argument('--output-directory',

	331 help='Path to the root build directory.')

	332

	333

	334 def _DetectToolPrefix(tool_prefix, input_file, output_directory=None):

	335 """Calls Analyze with values from args."""

	336 if not output_directory:

	337 abs_path = os.path.abspath(input_file)

	338 release_idx = abs_path.find('Release')

	339 if release_idx != -1:

	340 output_directory = os.path.relpath(abs_path[:release_idx],

	341 helpers.SRC_ROOT) + '/Release'

	342 logging.debug('Detected --output-directory=%s', output_directory)

	343

	344 if not tool_prefix and output_directory:

	345 # Auto-detect from build_vars.txt

	346 build_vars_path = os.path.join(output_directory, 'build_vars.txt')

	347 if os.path.exists(build_vars_path):

	348 with open(build_vars_path) as f:

	349 build_vars = dict(l.rstrip().split('=', 1) for l in f if '=' in l)

	350 logging.debug('Found --tool-prefix from build_vars.txt')

	351 tool_prefix = build_vars['android_tool_prefix']
	estevenson 2017/03/16 19:49:18 I get an error because of this, I think the "//" n I get an error because of this, I think the "//" needs to be stripped. agrieve 2017/03/20 19:58:08 Changed it to not have // in: https://codereview.c Show quoted text On 2017/03/16 19:49:18, estevenson wrote: > I get an error because of this, I think the "//" needs to be stripped. Changed it to not have // in: https://codereview.chromium.org/2761883002/
	352

	353 if os.path.sep not in tool_prefix:
	estevenson 2017/03/16 19:49:18 When does this happen? When does this happen? agrieve 2017/03/20 19:58:08 It's not uncommon for the tool prefix to be entire Show quoted text On 2017/03/16 19:49:18, estevenson wrote: > When does this happen? It's not uncommon for the tool prefix to be entirely filename based rather than directory based. E.g. we have "arm-linux-gnueabihf-c++filt" on our machines, so the tool_prefix here would be "arm-linux-gnueabihf-".
	354 full_path = distutils.spawn.find_executable(tool_prefix + 'c++filt')

	355 else:

	356 full_path = tool_prefix + 'c++filt'

	357

	358 if not os.path.isfile(full_path):

	359 raise Exception('Bad --tool-prefix. Path not found: %s' % full_path)

	360 return tool_prefix

	361

	362

	363 def AnalyzeWithArgs(args):

	364 return Analyze(args.input_file, args.output_directory, args.tool_prefix)

	365

	366

	367 def Analyze(path, output_directory=None, tool_prefix=''):

	368 if _EndsWithMaybeGz(path, '.size'):

	369 logging.info('Loading cached results.')

	370 with _OpenMaybeGz(path) as f:

	371 result = _LoadResults(f)

	372 elif not _EndsWithMaybeGz(path, '.map'):

	373 raise Exception('Expected input to be a .map or a .size')

	374 else:

	375 # Verify tool_prefix early.

	376 tool_prefix = _DetectToolPrefix(tool_prefix, path, output_directory)

	377

	378 with _OpenMaybeGz(path) as map_file:

	379 result = parsers.MapFileParser().Parse(map_file)

	380

	381 # Map file for some reason doesn't unmangle all names.

	382 logging.info('Calculating padding')

	383 _RemoveDuplicatesAndCalculatePadding(result.symbol_group)

	384 # Unmangle prints its own log statement.

	385 _UnmangleRemainingSymbols(result.symbol_group, tool_prefix)

	386 # Resolve paths prints its own log statement.

	387 logging.info('Normalizing names')

	388 _NormalizeNames(result.symbol_group)

	389 logging.info('Normalizing paths')

	390 _NormalizeObjectPaths(result.symbol_group)

	391

	392 if logging.getLogger().isEnabledFor(logging.INFO):

	393 _PrintStats(result, lambda l: logging.info(l.rstrip()))

	394 logging.info('Finished analyzing %d symbols', len(result.symbol_group))

	395 return result

	396

	397

	398 def main():

	399 parser = argparse.ArgumentParser()

	400 parser.add_argument('--output', required=True,

	401 help='Path to store results. Must end in .size or '

	402 '.size.gz')

	403 AddOptions(parser)

	404 helpers.AddCommonOptions(parser)

	405 args = parser.parse_args()

	406 if not _EndsWithMaybeGz(args.output, '.size'):

	407 raise Exception('--output must end with .size or .size.gz')

	408 helpers.HandleCommonOptions(args)

	409

	410 result = AnalyzeWithArgs(args)

	411 logging.info('Saving result to %s', args.output)

	412 with _OpenMaybeGz(args.output, 'wb') as f:

	413 _SaveResult(result, f)

	414

	415 logging.info('Done. Peak RAM usage was %d MB.', helpers.GetPeakRamUsage())

	416

	417

	418 if __name__ == '__main__':

	419 main()

OLD	NEW

« tools/binary_size/README.md ('K') | « tools/binary_size/README.md ('k') | tools/binary_size/create_html_breakdown.py » ('j') | tools/binary_size/helpers.py » ('J')