tools/binary_size/analyze.py - Issue 2724253002: V1 of //tools/binary_size rewrite

Side by Side Diff: tools/binary_size/analyze.py

Issue 2724253002: V1 of //tools/binary_size rewrite (Closed)

Patch Set: README tweaks, more cases for function parsing Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright 2017 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Main Python API for analyzing binary size."""

	7

	8 import argparse

	9 import ast

	10 import distutils.spawn

	11 import gzip

	12 import logging

	13 import os

	14 import re

	15 import subprocess

	16

	17 import parsers

	18 import helpers

	19 import symbols

	20

	21

# File format version for .size files. _SaveResult() writes it as the first
# line of the file and _LoadResults() asserts that it matches on load.
_SERIALIZATION_VERSION = 1

	24

	25

	26 def _OpenMaybeGz(path, mode=None):

	27 """Calls `gzip.open()` if \|path\| ends in ".gz", otherwise calls `open()`."""

	28 if path.endswith('.gz'):

	29 if mode and 'w' in mode:

	30 return gzip.GzipFile(path, mode, 1)

	31 return gzip.open(path, mode)

	32 return open(path, mode or 'r')

	33

	34

	35 def _EndsWithMaybeGz(path, suffix):

	36 return path.endswith(suffix) or path.endswith(suffix + '.gz')

	37

	38

	39 def _IterLines(s):

	40 prev_idx = -1

	41 while True:

	42 idx = s.find('\n', prev_idx + 1)

	43 if idx == -1:

	44 return

	45 yield s[prev_idx + 1:idx]

	46 prev_idx = idx

	47

	48

	49 def _UnmangleRemainingSymbols(symbol_group, tool_prefix):

	50 """Uses c++filt to unmangle any symbols that need it."""

	51 to_process = [s for s in symbol_group if s.name and s.name.startswith('_Z')]

	52 if not to_process:

	53 return

	54

	55 logging.info('Unmangling %d names', len(to_process))

	56 proc = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE,

	57 stdout=subprocess.PIPE)

	58 stdout = proc.communicate('\n'.join(s.name for s in to_process))[0]

	59 assert proc.returncode == 0

	60

	61 for i, line in enumerate(_IterLines(stdout)):

	62 to_process[i].name = line

	63

	64

	65 def _FindParameterListParen(name):

	66 """Finds index of the "(" that denotes the start of a paremeter list."""

	67 # This loops from left-to-right, but the only reason (I think) that this

	68 # is necessary (rather than reusing _FindLastCharOutsideOfBrackets), is

	69 # to capture the outer-most function in the case where classes are nested.

	70 start_idx = 0

	71 while True:

	72 template_balance_count = 0

	73 paren_balance_count = 0

	74 while True:

	75 idx = name.find('(', start_idx)

	76 if idx == -1:

	77 return -1

	78 template_balance_count += (

	79 name.count('<', start_idx, idx) - name.count('>', start_idx, idx))

	80 # Special: operators with angle brackets.

	81 operator_idx = name.find('operator<', start_idx, idx)

	82 if operator_idx != -1:

	83 if name[operator_idx + 9] == '<':

	84 template_balance_count -= 2

	85 else:

	86 template_balance_count -= 1

	87 else:

	88 operator_idx = name.find('operator>', start_idx, idx)

	89 if operator_idx != -1:

	90 if name[operator_idx + 9] == '>':

	91 template_balance_count += 2

	92 else:

	93 template_balance_count += 1

	94

	95 paren_balance_count += (

	96 name.count('(', start_idx, idx) - name.count(')', start_idx, idx))

	97 if template_balance_count == 0 and paren_balance_count == 0:

	98 # Special case: skip "(anonymous namespace)".

	99 if -1 != name.find('(anonymous namespace)', idx, idx + 21):

	100 start_idx = idx + 21

	101 continue

	102 # Special case: skip "decltype (...)"

	103 if name[idx - 1] != ' ':

	104 return idx

	105 start_idx = idx + 1

	106 paren_balance_count += 1

	107

	108

	109 def _FindLastCharOutsideOfBrackets(name, target_char, prev_idx=None):

	110 paren_balance_count = 0

	111 template_balance_count = 0

	112 while True:

	113 idx = name.rfind(target_char, 0, prev_idx)

	114 if idx == -1:

	115 return -1

	116 # It is much faster to use.find() and.count() than to loop over each

	117 # character.

	118 template_balance_count += (

	119 name.count('<', idx, prev_idx) - name.count('>', idx, prev_idx))

	120 paren_balance_count += (

	121 name.count('(', idx, prev_idx) - name.count(')', idx, prev_idx))

	122 if template_balance_count == 0 and paren_balance_count == 0:

	123 return idx

	124 prev_idx = idx

	125

	126

	127 def _ParseFunctionSignature(name):

	128 """Extracts a function name from a function signature.

	129

	130 See unit tests for example signatures.

	131

	132 Returns:

	133 A tuple of (name_without_return_type, name_without_return_type_and_params).

	134 """

	135 paren_idx = _FindParameterListParen(name)

	136

	137 if paren_idx > 0:

	138 space_idx = paren_idx

	139 # Special case: const cast operators (see tests).

	140 if -1 != name.find(' const', paren_idx - 6, paren_idx):

	141 space_idx = paren_idx - 6

	142 while True:

	143 space_idx = _FindLastCharOutsideOfBrackets(name, ' ', space_idx)

	144 # Special case: "operator new", and "operator<< <template>".

	145 if -1 == space_idx or (

	146 -1 == name.find('operator', space_idx - 8, space_idx) and

	147 -1 == name.find('operator<<', space_idx - 10, space_idx)):

	148 break

	149 space_idx -= 8

	150 return (name[space_idx + 1:], name[space_idx + 1:paren_idx])

	151 return name, name

	152

	153

	154 def _NormalizeNames(symbol_group):

	155 """Ensures that all names are formatted in a useful way.

	156

	157 This include:

	158 - Assigning of \|function_signature\| (for functions).

	159 - Stripping of return types in \|function_signature\| and \|name\|.

	160 - Stripping parameters from \|name\|.

	161 - Moving "vtable for" and the like to be suffixes rather than prefixes.

	162 """

	163 found_prefixes = set()

	164 for symbol in symbol_group:

	165 if not symbol.name or symbol.name.startswith('*'):

	166 # See comment in _RemoveDuplicatesAndCalculatePadding() about when this

	167 # can happen.

	168 continue

	169

	170 # E.g.: vtable for FOO

	171 idx = symbol.name.find(' for ', 0, 30)

	172 if idx != -1:

	173 found_prefixes.add(symbol.name[:idx + 4])

	174 symbol.name = symbol.name[idx + 5:] + ' [' + symbol.name[:idx] + ']'

	175

	176 # E.g.: virtual thunk to FOO

	177 idx = symbol.name.find(' to ', 0, 30)

	178 if idx != -1:

	179 found_prefixes.add(symbol.name[:idx + 3])

	180 symbol.name = symbol.name[idx + 4:] + ' [' + symbol.name[:idx] + ']'

	181

	182 # Strip out return type, and identify where parameter list starts.

	183 if symbol.section == 't':

	184 symbol.function_signature, symbol.name = (

	185 _ParseFunctionSignature(symbol.name))

	186

	187 # Remove anonymous namespaces (they just harm clustering).

	188 symbol.name = symbol.name.replace('(anonymous namespace)::', '')

	189

	190 logging.debug('Found name prefixes of: %r', found_prefixes)

	191

	192

	193 def _NormalizeObjectPaths(symbol_group):

	194 """Ensures that all paths are formatted in a useful way."""

	195 for symbol in symbol_group:

	196 if symbol.path:

	197 if symbol.path.startswith('obj/'):

	198 # Convert obj/third_party/... -> third_party/...

	199 symbol.path = symbol.path[4:]

	200 elif symbol.path.startswith('../../'):

	201 # Convert ../../third_party/... -> third_party/...

	202 symbol.path = symbol.path[6:]

	203 if symbol.path.endswith(')'):

	204 # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o

	205 start_idx = symbol.path.index('(')

	206 paren_path = symbol.path[start_idx + 1:-1]

	207 symbol.path = symbol.path[:start_idx] + os.path.sep + paren_path

	208

	209

	210 def _RemoveDuplicatesAndCalculatePadding(symbol_group):

	211 """Removes symbols at the same address and calculates the \|padding\| field.

	212

	213 Symbols must already be sorted by \|address\|.

	214 """

	215 i = 0

	216 to_remove = set()

	217 all_symbols = symbol_group.symbols

	218 for i in xrange(len(all_symbols)):

	219 prev_symbol = all_symbols[i - 1]

	220 symbol = all_symbols[i]

	221 if prev_symbol.section_name is not symbol.section_name:

	222 continue

	223 if symbol.address > 0 and prev_symbol.address > 0:

	224 # Fold symbols that are at the same address (happens in nm output).

	225 if symbol.address == prev_symbol.address:

	226 symbol.size = max(prev_symbol.size, symbol.size)

	227 to_remove.add(i)

	228 continue

	229 # Even with symbols at the same address removed, overlaps can still

	230 # happen. In this case, padding will be negative (and this is fine).

	231 padding = symbol.address - prev_symbol.end_address

	232 if (symbol.section in 'rd' and padding >= 256 or

	233 symbol.section in 't' and padding >= 64):

	234 # For nm data, this is caused by data that has no associated symbol.

	235 # The linker map file lists them with no name, but with a file.

	236 # Example:

	237 # .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o

	238 # Where as most look like:

	239 # .data.MANGLED_NAME...

	240 logging.debug('Large padding of %d between:\n A) %r\n B) %r' % (

	241 padding, prev_symbol, symbol))

	242 continue

	243 symbol.padding = padding

	244 symbol.size += padding

	245 assert symbol.size >= 0, 'Symbol has negative size: %r' % symbol

	246 # Map files have no overlaps, so worth special-casing the no-op case.

	247 if to_remove:

	248 logging.info('Removing %d overlapping symbols', len(to_remove))

	249 symbol_group.symbols = (

	250 [s for i, s in enumerate(all_symbols) if i not in to_remove])

	251

	252

	253 def _PrintStats(result, write_func):

	254 """Prints out how accurate \|result\| is."""

	255 for section in symbols.SECTION_TO_SECTION_NAME:

	256 if section == 'd':

	257 expected_size = sum(v for k, v in result.section_sizes.iteritems()

	258 if k.startswith('.data'))

	259 else:

	260 expected_size = result.section_sizes[

	261 symbols.SECTION_TO_SECTION_NAME[section]]

	262

	263 def show_one_stat(group):

	264 template = ('Section %s has %.1f%% of %d bytes accounted for from '

	265 '%d symbols. %d bytes are unaccounted for. Padding '

	266 'accounts for %d bytes\n')

	267 actual_size = group.size

	268 count = len(group)

	269 padding = group.padding

	270 size_percent = 100.0 * actual_size / expected_size

	271 write_func(template % (section, size_percent, actual_size, count,

	272 expected_size - actual_size, padding))

	273

	274 in_section = result.symbol_group.WhereInSection(section)

	275 show_one_stat(in_section)

	276

	277 star_syms = in_section.WhereNameMatches(r'^\*')

	278 attributed_syms = star_syms.Inverted().WhereHasAnyAttribution()

	279 anonymous_syms = attributed_syms.Inverted()

	280 if star_syms or anonymous_syms:

	281 missing_size = star_syms.size + anonymous_syms.size

	282 write_func(('Without %d merge sections and %d anonymous entries ('

	283 'accounting for %d bytes):\n') % (

	284 len(star_syms), len(anonymous_syms), missing_size))

	285 show_one_stat(attributed_syms)
	estevenson 2017/03/20 14:13:03
	It's a little hard to see just by looking at the output that:

	  Section r has ...
	  Without x merge sections ... :
	  Section r has ...

	are all really part of one section. Maybe indenting the second and third lines would help indicate that they're all related/part of the same section, just analyzed differently. This might not play well with your logging function, though.

	agrieve 2017/03/20 19:58:09
	On 2017/03/20 14:13:03, estevenson wrote:
	> It's a little hard to see just by looking at the output that:
	>
	> Section r has ...
	> Without x merge sections ... :
	> Section r has ...
	>
	> Are all really part of one section. Maybe indenting the second and third lines
	> would help indicate that they're all related/part of the same section just
	> analyzed differently. This might not play well with your logging function
	> though..

	Good idea! Done. Looks like:

	I 3711 Section r has 100.0% of 6163796 bytes accounted for from 19417 symbols. 0 bytes are unaccounted for. Padding accounts for 1801 bytes
	I 3734 + Without 11 merge sections and 0 anonymous entries (accounting for 2811683 bytes):
	I 3735 + Section r has 54.4% of 3352113 bytes accounted for from 19406 symbols. 2811683 bytes are unaccounted for. Padding accounts for 1798 bytes
	I 3750 Section b has 100.0% of 1928954 bytes accounted for from 19690 symbols. 78 bytes are unaccounted for. Padding accounts for 924 bytes
	I 3773 Section d has 100.0% of 1924264 bytes accounted for from 26403 symbols. 0 bytes are unaccounted for. Padding accounts for 2274 bytes
	I 3813 Section t has 99.6% of 36678104 bytes accounted for from 245934 symbols. 132140 bytes are unaccounted for. Padding accounts for 45608 bytes
	I 3900 + Without 8 merge sections and 0 anonymous entries (accounting for 445014 bytes):
	I 3916 + Section t has 98.4% of 36233090 bytes accounted for from 245926 symbols. 577154 bytes are unaccounted for. Padding accounts for 45610 bytes
	286

	287

	288 def _SaveResult(result, file_obj):

	289 """Saves the result to the given file object."""

	290 # Store one bucket per line.

	291 file_obj.write('%d\n' % _SERIALIZATION_VERSION)

	292 file_obj.write('%r\n' % result.section_sizes)

	293 file_obj.write('%d\n' % len(result.symbol_group))

	294 prev_section_name = None

	295 # Store symbol fields as tab-separated.

	296 # Store only non-derived fields.

	297 for symbol in result.symbol_group:

	298 if symbol.section_name != prev_section_name:

	299 file_obj.write('%s\n' % symbol.section_name)

	300 prev_section_name = symbol.section_name

	301 # Don't write padding nor name since these are derived values.

	302 file_obj.write('%x\t%x\t%s\t%s\n' % (

	303 symbol.address, symbol.size_without_padding,

	304 symbol.function_signature or symbol.name or '',

	305 symbol.path or ''))

	306

	307

	308 def _LoadResults(file_obj):

	309 """Loads a result from the given file."""

	310 lines = iter(file_obj)

	311 actual_version = int(next(lines))

	312 assert actual_version == _SERIALIZATION_VERSION, (

	313 'Version mismatch. Need to write some upgrade code.')

	314

	315 section_sizes = ast.literal_eval(next(lines))

	316 num_syms = int(next(lines))

	317 symbol_list = [None] * num_syms

	318 section_name = None

	319 for i in xrange(num_syms):

	320 line = next(lines)[:-1]

	321 if '\t' not in line:

	322 section_name = intern(line)

	323 line = next(lines)[:-1]

	324 new_sym = symbols.Symbol.__new__(symbols.Symbol)

	325 parts = line.split('\t')

	326 new_sym.section_name = section_name

	327 new_sym.address = int(parts[0], 16)

	328 new_sym.size = int(parts[1], 16)

	329 new_sym.name = parts[2] or None

	330 new_sym.path = parts[3] or None

	331 new_sym.padding = 0 # Derived

	332 new_sym.function_signature = None # Derived

	333 symbol_list[i] = new_sym

	334

	335 # Recompute derived values (padding and function names).

	336 result = parsers.ParseResult(symbol_list, section_sizes)

	337 logging.info('Calculating padding')

	338 _RemoveDuplicatesAndCalculatePadding(result.symbol_group)

	339 logging.info('Deriving signatures')

	340 # Re-parse out function parameters.

	341 _NormalizeNames(result.symbol_group.WhereInSection('t'))

	342 return result

	343

	344

	345 def AddOptions(parser):

	346 parser.add_argument('input_file',

	347 help='Path to input file. Can be a linker .map file, an '

	348 'unstripped binary, or a saved result from '

	349 'analyze.py')

	350 parser.add_argument('--tool-prefix', default='',

	351 help='Path prefix for c++filt.')

	352 parser.add_argument('--output-directory',

	353 help='Path to the root build directory.')

	354

	355

	356 def _DetectToolPrefix(tool_prefix, input_file, output_directory=None):

	357 """Calls Analyze with values from args."""

	358 if not output_directory:

	359 abs_path = os.path.abspath(input_file)

	360 release_idx = abs_path.find('Release')

	361 if release_idx != -1:

	362 output_directory = os.path.relpath(abs_path[:release_idx],

	363 helpers.SRC_ROOT) + '/Release'

	364 logging.debug('Detected --output-directory=%s', output_directory)

	365

	366 if not tool_prefix and output_directory:

	367 # Auto-detect from build_vars.txt

	368 build_vars_path = os.path.join(output_directory, 'build_vars.txt')

	369 if os.path.exists(build_vars_path):

	370 with open(build_vars_path) as f:

	371 build_vars = dict(l.rstrip().split('=', 1) for l in f if '=' in l)

	372 logging.debug('Found --tool-prefix from build_vars.txt')

	373 tool_prefix = build_vars['android_tool_prefix']

	374

	375 if os.path.sep not in tool_prefix:

	376 full_path = distutils.spawn.find_executable(tool_prefix + 'c++filt')

	377 else:

	378 full_path = tool_prefix + 'c++filt'

	379

	380 if not os.path.isfile(full_path):

	381 raise Exception('Bad --tool-prefix. Path not found: %s' % full_path)

	382 return tool_prefix

	383

	384

	385 def AnalyzeWithArgs(args):

	386 return Analyze(args.input_file, args.output_directory, args.tool_prefix)

	387

	388

	389 def Analyze(path, output_directory=None, tool_prefix=''):

	390 if _EndsWithMaybeGz(path, '.size'):

	391 logging.info('Loading cached results.')

	392 with _OpenMaybeGz(path) as f:

	393 result = _LoadResults(f)

	394 elif not _EndsWithMaybeGz(path, '.map'):

	395 raise Exception('Expected input to be a .map or a .size')

	396 else:

	397 # Verify tool_prefix early.

	398 tool_prefix = _DetectToolPrefix(tool_prefix, path, output_directory)

	399

	400 with _OpenMaybeGz(path) as map_file:

	401 result = parsers.MapFileParser().Parse(map_file)

	402

	403 # Map file for some reason doesn't unmangle all names.

	404 logging.info('Calculating padding')

	405 _RemoveDuplicatesAndCalculatePadding(result.symbol_group)

	406 # Unmangle prints its own log statement.

	407 _UnmangleRemainingSymbols(result.symbol_group, tool_prefix)

	408 # Resolve paths prints its own log statement.

	409 logging.info('Normalizing names')

	410 _NormalizeNames(result.symbol_group)

	411 logging.info('Normalizing paths')

	412 _NormalizeObjectPaths(result.symbol_group)

	413

	414 if logging.getLogger().isEnabledFor(logging.INFO):

	415 _PrintStats(result, lambda l: logging.info(l.rstrip()))

	416 logging.info('Finished analyzing %d symbols', len(result.symbol_group))

	417 return result

	418

	419

	420 def main():

	421 parser = argparse.ArgumentParser()

	422 parser.add_argument('--output', required=True,

	423 help='Path to store results. Must end in .size or '

	424 '.size.gz')

	425 AddOptions(parser)

	426 helpers.AddCommonOptions(parser)

	427 args = parser.parse_args()

	428 if not _EndsWithMaybeGz(args.output, '.size'):

	429 raise Exception('--output must end with .size or .size.gz')

	430 helpers.HandleCommonOptions(args)

	431

	432 result = AnalyzeWithArgs(args)

	433 logging.info('Saving result to %s', args.output)

	434 with _OpenMaybeGz(args.output, 'wb') as f:

	435 _SaveResult(result, f)

	436

	437 logging.info('Done. Peak RAM usage was %d MB.', helpers.GetPeakRamUsage())

	438

	439

	440 if __name__ == '__main__':

	441 main()

OLD	NEW

« tools/binary_size/README.md ('K') | « tools/binary_size/README.md ('k') | tools/binary_size/analyze_test.py » ('j') | tools/binary_size/helpers.py » ('J')